From 3d158cdc8dad62dfed45d5d808ae9f14f16e4dae Mon Sep 17 00:00:00 2001 From: wchen61 <183351030@qq.com> Date: Fri, 15 Nov 2024 16:52:20 +0800 Subject: [PATCH 001/255] Add default value to avoid Falcon crash (#5363) (#10347) Signed-off-by: wchen61 --- vllm/model_executor/models/falcon.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index dcfcb6694feb5..b3dbf063ac298 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -250,6 +250,9 @@ def __init__( self.mlp = FalconMLP(config, quant_config) self.config = config + if (not hasattr(config, "num_ln_in_parallel_attn")): + config.num_ln_in_parallel_attn = None + if (config.num_ln_in_parallel_attn is None and config.new_decoder_architecture): config.num_ln_in_parallel_attn = 2 From b311efd0bd84faffcb1fe47aaa27ffd8c53688be Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 15 Nov 2024 17:34:17 +0800 Subject: [PATCH 002/255] [Misc] Fix import error in tensorizer tests and cleanup some code (#10349) Signed-off-by: DarkLight1337 --- tests/tensorizer_loader/test_tensorizer.py | 70 ++++++++++--------- vllm/engine/llm_engine.py | 3 - vllm/entrypoints/llm.py | 3 - .../tool_parsers/abstract_tool_parser.py | 17 +++-- vllm/inputs/preprocess.py | 9 +-- vllm/utils.py | 20 ++++++ vllm/v1/engine/llm_engine.py | 3 - 7 files changed, 67 insertions(+), 58 deletions(-) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 32591ecfe6774..edd079bc7a389 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -8,10 +8,12 @@ import openai import pytest import torch +from huggingface_hub import snapshot_download from tensorizer import EncryptionParams from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs +# yapf conflicts with isort for this docstring # yapf: disable from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, TensorSerializer, @@ -20,13 +22,14 @@ open_stream, serialize_vllm_model, tensorize_vllm_model) +# yapf: enable +from vllm.utils import import_from_path from ..conftest import VllmRunner -from ..utils import RemoteOpenAIServer +from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip -# yapf conflicts with isort for this docstring - +EXAMPLES_PATH = VLLM_PATH / "examples" prompts = [ "Hello, my name is", @@ -94,8 +97,8 @@ def test_can_deserialize_s3(vllm_runner): num_readers=1, s3_endpoint="object.ord1.coreweave.com", )) as loaded_hf_model: - deserialized_outputs = loaded_hf_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_hf_model.generate( + prompts, sampling_params) # noqa: E501 assert deserialized_outputs @@ -111,23 +114,21 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = TensorizerConfig( - tensorizer_uri=model_path, - encryption_keyfile=key_path - ) + config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, + encryption_keyfile=key_path) serialize_vllm_model(get_torch_model(vllm_model), config_for_serializing) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - with vllm_runner( - model_ref, - load_format="tensorizer", - model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501 + with vllm_runner(model_ref, + load_format="tensorizer", + model_loader_extra_config=config_for_deserializing + ) as loaded_vllm_model: # noqa: E501 - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) # noqa: E501 assert outputs == deserialized_outputs @@ -156,14 +157,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): - from huggingface_hub import snapshot_download - - from examples.multilora_inference import (create_test_prompts, - process_requests) + multilora_inference = import_from_path( + "examples.multilora_inference", + EXAMPLES_PATH / "multilora_inference.py", + ) model_ref = "meta-llama/Llama-2-7b-hf" lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = create_test_prompts(lora_path) + test_prompts = multilora_inference.create_test_prompts(lora_path) # Serialize model before deserializing and binding LoRA adapters with vllm_runner(model_ref, ) as vllm_model: @@ -186,7 +187,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): max_num_seqs=50, max_model_len=1000, ) as loaded_vllm_model: - process_requests(loaded_vllm_model.model.llm_engine, test_prompts) + multilora_inference.process_requests( + loaded_vllm_model.model.llm_engine, test_prompts) assert loaded_vllm_model @@ -217,8 +219,11 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Start OpenAI API server openai_args = [ - "--dtype", "float16", "--load-format", - "tensorizer", "--model-loader-extra-config", + "--dtype", + "float16", + "--load-format", + "tensorizer", + "--model-loader-extra-config", json.dumps(model_loader_extra_config), ] @@ -251,8 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner): torch.cuda.empty_cache() -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") def test_tensorizer_with_tp_path_without_template(vllm_runner): with pytest.raises(ValueError): model_ref = "EleutherAI/pythia-1.4b" @@ -271,10 +275,9 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): ) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires 2 GPUs") -def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, - tmp_path): +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( + vllm_runner, tmp_path): model_ref = "EleutherAI/pythia-1.4b" # record outputs from un-sharded un-tensorized model with vllm_runner( @@ -313,13 +316,12 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, disable_custom_all_reduce=True, enforce_eager=True, model_loader_extra_config=tensorizer_config) as loaded_vllm_model: - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) assert outputs == deserialized_outputs - @retry_until_skip(3) def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): gc.collect() @@ -337,8 +339,8 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=config) as loaded_vllm_model: - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) # noqa: E501 assert outputs == deserialized_outputs diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f5299746d845d..aa9c7893c4cfe 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2002,9 +2002,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) - def is_encoder_decoder_model(self): - return self.input_preprocessor.is_encoder_decoder_model() - def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest]): if is_encoder_decoder_inputs(inputs): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3ab467e649b57..4b33fc1458ee3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -964,6 +964,3 @@ def _run_engine( # This is necessary because some requests may be finished earlier than # its previous requests. return sorted(outputs, key=lambda x: int(x.request_id)) - - def _is_encoder_decoder_model(self): - return self.llm_engine.is_encoder_decoder_model() diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 5ce31bd4d941b..aa7c201098935 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,5 +1,3 @@ -import importlib -import importlib.util import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Type, Union @@ -9,7 +7,7 @@ ExtractedToolCallInformation) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import is_list_of +from vllm.utils import import_from_path, is_list_of logger = init_logger(__name__) @@ -149,13 +147,14 @@ def _register(module): @classmethod def import_tool_parser(cls, plugin_path: str) -> None: """ - Import a user defined tool parser by the path of the tool parser define + Import a user-defined tool parser by the path of the tool parser define file. """ module_name = os.path.splitext(os.path.basename(plugin_path))[0] - spec = importlib.util.spec_from_file_location(module_name, plugin_path) - if spec is None or spec.loader is None: - logger.error("load %s from %s failed.", module_name, plugin_path) + + try: + import_from_path(module_name, plugin_path) + except Exception: + logger.exception("Failed to load module '%s' from %s.", + module_name, plugin_path) return - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index fdf28615fda10..aacff87df6d79 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,7 +67,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: model config is unavailable. ''' - if not self.is_encoder_decoder_model(): + if not self.model_config.is_encoder_decoder: print_warning_once("Using None for decoder start token id because " "this is not an encoder/decoder model.") return None @@ -632,7 +632,7 @@ def preprocess( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return self._process_encoder_decoder_prompt( @@ -660,7 +660,7 @@ async def preprocess_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return await self._process_encoder_decoder_prompt_async( @@ -679,6 +679,3 @@ async def preprocess_async( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - - def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder diff --git a/vllm/utils.py b/vllm/utils.py index 1b02cbff79f78..111460a29de47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -5,6 +5,7 @@ import enum import gc import getpass +import importlib.util import inspect import ipaddress import os @@ -1539,6 +1540,25 @@ def is_in_doc_build() -> bool: return False +def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): + """ + Import a Python file according to its file path. + + Based on the official recipe: + https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly + """ + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ModuleNotFoundError(f"No module named '{module_name}'") + + assert spec.loader is not None + + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + # create a library to hold the custom op vllm_lib = Library("vllm", "FRAGMENT") # noqa diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4ebfff9584267..75a77be750acd 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -163,9 +163,6 @@ def step(self) -> List[RequestOutput]: def get_model_config(self): pass - def is_encoder_decoder_model(self): - pass - def start_profile(self): pass From 26908554b2ecc8f76fa57942566629ec5713ef5b Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Fri, 15 Nov 2024 02:22:57 -0800 Subject: [PATCH 003/255] [Doc] Remove float32 choice from --lora-dtype (#10348) Signed-off-by: Xin Yang --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4afc61c8d0c4c..dbbcd6e95b791 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -601,7 +601,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--lora-dtype', type=str, default=EngineArgs.lora_dtype, - choices=['auto', 'float16', 'bfloat16', 'float32'], + choices=['auto', 'float16', 'bfloat16'], help=('Data type for LoRA. If auto, will default to ' 'base model dtype.')) parser.add_argument( From 1d65ec7eeb35f03eb87ed080094f1aa5ff2ae3d3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 15 Nov 2024 18:34:58 +0800 Subject: [PATCH 004/255] [Bugfix] Fix fully sharded LoRA bug (#10352) Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 23 ++++++++++++----------- vllm/lora/layers.py | 15 ++++++++------- vllm/worker/worker.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 04fc635828d4d..3443c3feb4d2a 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -165,15 +165,14 @@ class MergedColumnParallelLinearWithShardedLoRA( def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_a[0] is None or lora_a[1] is None: - return lora_a + #NOTE: lora_a contains 2 subloras, and each sublora could be None. output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size lora_a = [ - lora_a[0][:, - output_start_idx:output_start_idx + output_shard_size], - lora_a[1][:, - output_start_idx:output_start_idx + output_shard_size], + lora_a[0][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[0] is not None else None, + lora_a[1][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[1] is not None else None, ] return lora_a @@ -261,14 +260,16 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_a[0] is None or lora_a[1] is None or lora_a[2] is None: - return lora_a + # NOTE: lora_a contains 3 subloras, and each sublora could be None. shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] lora_a = [ - lora_a[0][:, start_idx[0]:start_idx[0] + shard_size[0]], - lora_a[1][:, start_idx[1]:start_idx[1] + shard_size[1]], - lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]], + lora_a[0][:, start_idx[0]:start_idx[0] + + shard_size[0]] if lora_a[0] is not None else None, + lora_a[1][:, start_idx[1]:start_idx[1] + + shard_size[1]] if lora_a[1] is not None else None, + lora_a[2][:, start_idx[2]:start_idx[2] + + shard_size[2]] if lora_a[2] is not None else None, ] return lora_a diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 7429c60e0222d..6afe80219fe07 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -685,26 +685,27 @@ def slice_lora_a( def slice_lora_b( self, lora_b: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_b[0] is None or lora_b[1] is None: - return lora_b + #NOTE: lora_b contains 2 subloras, and each sublora could be None. shard_size = self.output_dim start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = [ - lora_b[0][:, start_idx:end_idx], - lora_b[1][:, start_idx:end_idx], + lora_b[0][:, start_idx:end_idx] if lora_b[0] is not None else None, + lora_b[1][:, start_idx:end_idx] if lora_b[1] is not None else None, ] return lora_b def slice_bias( self, bias: List[Union[torch.Tensor, None]]) -> List[Union[torch.Tensor, None]]: - if bias[0] is None or bias[1] is None: - return bias + # NOTE : each bias could be None. shard_size = self.output_dim start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + bias = [ + bias[0][start_idx:end_idx] if bias[0] is not None else None, + bias[1][start_idx:end_idx] if bias[1] is not None else None + ] return bias def set_lora( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d8c8011a585d8..d3ca6d9d0b17e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -232,7 +232,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: logger.info( "Memory profiling results: total_gpu_memory=%.2fGiB" " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" - " memory_usage_post_profile=%.2fGib" + " memory_usage_post_profile=%.2fGiB" " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), (total_gpu_memory - free_memory_pre_profile) / (1024**3), From f2056f726d9b0f257bc0e79938a9a6f483ce9e2d Mon Sep 17 00:00:00 2001 From: shangmingc Date: Fri, 15 Nov 2024 20:40:30 +0800 Subject: [PATCH 005/255] [Misc] Fix some help info of arg_utils to improve readability (#10362) --- vllm/engine/arg_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dbbcd6e95b791..d73f95f59c71f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -272,10 +272,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--allowed-local-media-path', type=str, - help="Allowing API requests to read local images or videos" - "from directories specified by the server file system." - "This is a security risk." - "Should only be enabled in trusted environments") + help="Allowing API requests to read local images or videos " + "from directories specified by the server file system. " + "This is a security risk. " + "Should only be enabled in trusted environments.") parser.add_argument('--download-dir', type=nullable_str, default=EngineArgs.download_dir, @@ -340,7 +340,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'scaling factors. This should generally be supplied, when ' 'KV cache dtype is FP8. Otherwise, KV cache scaling factors ' 'default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version' + 'FP8_E5M2 (without scaling) is only supported on cuda version ' 'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead ' 'supported for common inference criteria.') parser.add_argument('--max-model-len', @@ -446,9 +446,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'this argument can be seen as a virtual way to increase ' 'the GPU memory size. For example, if you have one 24 GB ' 'GPU and set this to 10, virtually you can think of it as ' - 'a 34 GB GPU. Then you can load a 13B model with BF16 weight,' + 'a 34 GB GPU. Then you can load a 13B model with BF16 weight, ' 'which requires at least 26GB GPU memory. Note that this ' - 'requires fast CPU-GPU interconnect, as part of the model is' + 'requires fast CPU-GPU interconnect, as part of the model is ' 'loaded from CPU memory to GPU memory on the fly in each ' 'model forward pass.') parser.add_argument( @@ -468,7 +468,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=None, help='If specified, ignore GPU profiling result and use this number' - 'of GPU blocks. Used for testing preemption.') + ' of GPU blocks. Used for testing preemption.') parser.add_argument('--max-num-batched-tokens', type=int, default=EngineArgs.max_num_batched_tokens, @@ -514,7 +514,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--hf-overrides', type=json.loads, default=EngineArgs.hf_overrides, - help='Extra arguments for the HuggingFace config.' + help='Extra arguments for the HuggingFace config. ' 'This should be a JSON string that will be ' 'parsed into a dictionary.') parser.add_argument('--enforce-eager', @@ -572,7 +572,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--mm-processor-kwargs', default=None, type=json.loads, - help=('Overrides for the multimodal input mapping/processing,' + help=('Overrides for the multimodal input mapping/processing, ' 'e.g., image processor. For example: {"num_crops": 4}.')) # LoRA related configs @@ -822,9 +822,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "of the provided names. The model name in the model " "field of a response will be the first name in this " "list. If not specified, the model name will be the " - "same as the `--model` argument. Noted that this name(s)" + "same as the `--model` argument. Noted that this name(s) " "will also be used in `model_name` tag content of " - "prometheus metrics, if multiple names provided, metrics" + "prometheus metrics, if multiple names provided, metrics " "tag will take the first one.") parser.add_argument('--qlora-adapter-name-or-path', type=str, From 3a763ba0c3a92fdde78e855ded94f9ff29e02088 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 15 Nov 2024 05:55:51 -0800 Subject: [PATCH 006/255] [core][misc] keep compatibility for old-style classes (#10356) Signed-off-by: youkaichao --- vllm/model_executor/model_loader/loader.py | 40 +++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 5bcae37961195..140b61fe6d56a 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -94,18 +94,34 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) signatures = inspect.signature(model_class.__init__) - # collect all kw-only parameters - kw_only_params = [ - param.name for param in signatures.parameters.values() - if param.kind == inspect.Parameter.KEYWORD_ONLY - ] - assert "vllm_config" in kw_only_params and "prefix" in kw_only_params, \ - ("vLLM model class must accept `vllm_config` and `prefix` as kw-only " - "arguments. Possibly you have an old-style model class registered from " - "out of tree and it is used for new vLLM version. " - "Please check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " - "for the design and update the model class accordingly.") - return model_class(vllm_config=vllm_config, prefix=prefix) + all_params = [param.name for param in signatures.parameters.values()] + if "vllm_config" in all_params and "prefix" in all_params: + # new-style model class + return model_class(vllm_config=vllm_config, prefix=prefix) + msg = ("vLLM model class should accept `vllm_config` and `prefix` as " + "input arguments. Possibly you have an old-style model class" + " registered from out of tree and it is used for new vLLM version. " + "Check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " + "for the design and update the model class accordingly.") + logger.warning(msg) + logger.warning( + "Trying to guess the arguments for old-style model class %s", + model_class) + # try to be compatible with old-style model class + kwargs = {} + if "prefix" in all_params: + kwargs["prefix"] = prefix + if "config" in all_params: + kwargs["config"] = model_config.hf_config + if "cache_config" in all_params: + kwargs["cache_config"] = vllm_config.cache_config + if "quant_config" in all_params: + kwargs["quant_config"] = vllm_config.quant_config + if "lora_config" in all_params: + kwargs["lora_config"] = vllm_config.lora_config + if "scheduler_config" in all_params: + kwargs["scheduler_config"] = vllm_config.scheduler_config + return model_class(**kwargs) class BaseModelLoader(ABC): From 691a3ec0475ba1fe4255bc975d02cc7a4392bf2c Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Fri, 15 Nov 2024 15:50:40 +0100 Subject: [PATCH 007/255] [Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (#10363) Signed-off-by: Guillaume Calmettes --- requirements-common.txt | 4 ++-- vllm/transformers_utils/tokenizers/mistral.py | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index acb766d25a2d9..c68004d27626b 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -17,7 +17,7 @@ pillow # Required for image processing prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer -lm-format-enforcer == 0.10.6 +lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.8.0 # required for compressed-tensors \ No newline at end of file +compressed-tensors == 0.8.0 # required for compressed-tensors diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index b1cb9a15b943b..83b3c37d6f04c 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -174,18 +174,29 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, revision=revision) return tokenizer_file - # the following attributes are set to fit VLLM's design + # the following attributes are set to fit VLLM's design and are used + # by the guided structured output backends. @property def all_special_tokens_extended(self) -> List[str]: - return [] + # tekken defines its own extended special tokens list + if hasattr(self.tokenizer, "SPECIAL_TOKENS"): + special_tokens = self.tokenizer.SPECIAL_TOKENS + else: + special_tokens = list(SpecialTokens) + return [ + s.value if isinstance(s, SpecialTokens) else s + for s in special_tokens + ] @property def all_special_tokens(self) -> List[str]: - return [] + return self.all_special_tokens_extended @property def all_special_ids(self) -> List[int]: - return [] + return [ + self.all_special_tokens.index(t) for t in self.all_special_tokens + ] @property def bos_token_id(self) -> int: From 79ee45b42822d750ead6121c8c741c8a947bfeaf Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 15 Nov 2024 17:31:18 +0100 Subject: [PATCH 008/255] [Misc] Bump up test_fused_moe tolerance (#10364) Signed-off-by: ElizaWszola --- tests/kernels/test_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 17428ebfc2e28..8b23b62826053 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -45,7 +45,7 @@ def test_fused_moe( score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) - torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) @pytest.mark.parametrize("dtype", From a6221a144af772fd1a68fe7e627935dc53e81738 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 15 Nov 2024 09:48:07 -0800 Subject: [PATCH 009/255] [Misc] bump mistral common version (#10367) Signed-off-by: simon-mo --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index c68004d27626b..f62ad66a1ecc4 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -26,7 +26,7 @@ pyzmq msgspec gguf == 0.10.0 importlib_metadata -mistral_common[opencv] >= 1.4.4 +mistral_common[opencv] >= 1.5.0 pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 From c76ac49d266e27aa3fea84ef2df1f813d24c91c7 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 15 Nov 2024 12:47:40 -0800 Subject: [PATCH 010/255] [Docs] Add Nebius as sponsors (#10371) Signed-off-by: simon-mo --- README.md | 1 + docs/source/community/sponsors.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 6530886ed7de2..0ef073210d070 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 52fbf9a577c7e..c6f83b3a92ca0 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox From a067f85e08f6604b328a16efe3ead4629e0ead5b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 15 Nov 2024 16:13:53 -0500 Subject: [PATCH 011/255] [Frontend] Add --version flag to CLI (#10369) Signed-off-by: Russell Bryant --- vllm/scripts.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/scripts.py b/vllm/scripts.py index 4e4c071784287..a51c21cfa29e7 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -9,6 +9,7 @@ from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam +import vllm.version from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.openai.api_server import run_server from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -143,6 +144,11 @@ def main(): env_setup() parser = FlexibleArgumentParser(description="vLLM CLI") + parser.add_argument('-v', + '--version', + action='version', + version=vllm.version.__version__) + subparsers = parser.add_subparsers(required=True, dest="subparser") serve_parser = subparsers.add_parser( From 3e8d14d8a1e3e54655f79d7bb3481cde02943281 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 15 Nov 2024 16:20:20 -0500 Subject: [PATCH 012/255] [Doc] Move PR template content to docs (#10159) Signed-off-by: Russell Bryant --- .github/PULL_REQUEST_TEMPLATE.md | 71 +--------------- .github/scripts/cleanup_pr_body.sh | 25 +++++- docs/source/contributing/overview.rst | 114 +++++++++++++++++++++++--- 3 files changed, 126 insertions(+), 84 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index be0afc6305044..51a73c857ccb2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** - ---- - -
- - PR Checklist (Click to Expand) - -

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.

- -

PR Title and Classification

-

Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:

-
    -
  • [Bugfix] for bug fixes.
  • -
  • [CI/Build] for build or continuous integration improvements.
  • -
  • [Doc] for documentation fixes and improvements.
  • -
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • -
  • [Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • -
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • -
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • -
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • -
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.
  • -
-

Note: If the PR spans more than one category, please include all relevant prefixes.

- -

Code Quality

- -

The PR need to meet the following code quality standards:

- -
    -
  • We adhere to Google Python style guide and Google C++ style guide.
  • -
  • Pass all linter checks. Please use format.sh to format your code.
  • -
  • The code need to be well-documented to ensure future contributors can easily understand the code.
  • -
  • Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.
  • -
  • Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.
  • -
- -

Adding or changing kernels

-

Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.

-
    -
  • Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators and The Custom Operators Manual
  • -
  • Custom operations that return Tensors require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.
  • -
  • Use torch.libary.opcheck() to test the function registration and meta-function for any registered ops. See tests/kernels for examples.
  • -
  • When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.
  • -
  • If a new custom type is needed, see the following document: Custom Class Support in PT2. -
- -

Notes for Large Changes

-

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not go through the PR.

- -

What to Expect for the Reviews

- -

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

- -
    -
  • After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
  • -
  • After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • -
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • -
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. -
  • -
- -

Thank You

- -

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!

- - -
- - +**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh index 3b2da7b9f8966..3246c6f9bc4b7 100755 --- a/.github/scripts/cleanup_pr_body.sh +++ b/.github/scripts/cleanup_pr_body.sh @@ -15,19 +15,36 @@ NEW=/tmp/new_pr_body.txt gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" cp "${OLD}" "${NEW}" -# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" -sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}" - # Remove "FIX #xxxx (*link existing issues this PR will resolve*)" sed -i '/FIX #xxxx.*$/d' "${NEW}" # Remove "FILL IN THE PR DESCRIPTION HERE" sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" +# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" +sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" + +# Remove HTML
section that includes text of "PR Checklist (Click to Expand)" +python3 - <.*?.*?PR Checklist \(Click to Expand\).*?.*?
', re.DOTALL) +content = re.sub(pattern, '', content) + +with open("${NEW}", "w") as file: + file.write(content) +EOF + # Run this only if ${NEW} is different than ${OLD} if ! cmp -s "${OLD}" "${NEW}"; then - echo "Updating PR body" gh pr edit --body-file "${NEW}" "${PR_NUMBER}" + echo + echo "Updated PR body:" + echo + cat "${NEW}" else echo "No changes needed" fi diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.rst index ac2d2b2fe4103..4cea0afdaea74 100644 --- a/docs/source/contributing/overview.rst +++ b/docs/source/contributing/overview.rst @@ -41,15 +41,6 @@ Testing Contribution Guidelines ======================= -DCO and Signed-off-by ----------------------- - -When contributing changes to this project, you must agree to the `DCO `_. -Commits must include a ``Signed-off-by:`` header which certifies agreement with -the terms of the `DCO `_. - -Using ``-s`` with ``git commit`` will automatically add this header. - Issues ------ @@ -61,7 +52,110 @@ If you encounter a bug or have a feature request, please `search existing issues Pull Requests & Code Reviews ---------------------------- -Please check the PR checklist in the `PR template `_ for a detailed guide for contribution. +Thank you for your contribution to vLLM! Before submitting the pull request, +please ensure the PR meets the following criteria. This helps vLLM maintain the +code quality and improve the efficiency of the review process. + +DCO and Signed-off-by +^^^^^^^^^^^^^^^^^^^^^ + +When contributing changes to this project, you must agree to the `DCO `_. +Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +PR Title and Classification +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Only specific types of PRs will be reviewed. The PR title is prefixed +appropriately to indicate the type of change. Please use one of the following: + +- ``[Bugfix]`` for bug fixes. +- ``[CI/Build]`` for build or continuous integration improvements. +- ``[Doc]`` for documentation fixes and improvements. +- ``[Model]`` for adding a new model or improving an existing model. Model name + should appear in the title. +- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, + ``LLM`` class, etc.) +- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. +- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, + ``AsyncLLMEngine``, ``Scheduler``, etc.) +- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., ``[Hardware][AMD]``). +- ``[Misc]`` for PRs that do not fit the above categories. Please use this + sparingly. + +.. note:: + If the PR spans more than one category, please include all relevant prefixes. + +Code Quality +^^^^^^^^^^^^ + +The PR needs to meet the following code quality standards: + +- We adhere to `Google Python style guide + `_ and `Google C++ style guide + `_. +- Pass all linter checks. Please use `format.sh + `_ to format your + code. +- The code needs to be well-documented to ensure future contributors can easily + understand the code. +- Include sufficient tests to ensure the project stays correct and robust. This + includes both unit tests and integration tests. +- Please add documentation to ``docs/source/`` if the PR modifies the + user-facing behaviors of vLLM. It helps vLLM users understand and utilize the + new features or changes. + +Adding or Changing Kernels +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. + +- Make sure custom ops are registered following PyTorch guidelines: + `Custom C++ and CUDA Operators `_ + and `The Custom Operators Manual `_. +- Custom operations that return ``Tensors`` require meta-functions. + Meta-functions should be implemented and registered in Python so that dynamic + dims can be handled automatically. See above documents for a description of + meta-functions. +- Use `torch.library.opcheck() `_ + to test the function registration and meta-function for any registered ops. + See ``tests/kernels`` for examples. +- When changing the C++ signature of an existing op, the schema must be updated + to reflect the changes. +- If a new custom type is needed, see the following document: + `Custom Class Support in PT2 `_. + +Notes for Large Changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Please keep the changes as concise as possible. For major architectural changes +(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue +(RFC) discussing the technical design and justification. Otherwise, we will tag +it with ``rfc-required`` and might not go through the PR. + +What to Expect for the Reviews +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The goal of the vLLM team is to be a *transparent reviewing machine*. We would +like to make the review process transparent and efficient and make sure no +contributor feels confused or frustrated. However, the vLLM team is small, so we +need to prioritize some PRs over others. Here is what you can expect from the +review process: + +- After the PR is submitted, the PR will be assigned to a reviewer. Every + reviewer will pick up the PRs based on their expertise and availability. +- After the PR is assigned, the reviewer will provide status updates every 2-3 + days. If the PR is not reviewed within 7 days, please feel free to ping the + reviewer or the vLLM team. +- After the review, the reviewer will put an ``action-required`` label on the PR + if there are changes required. The contributor should address the comments and + ping the reviewer to re-review the PR. +- Please respond to all comments within a reasonable time frame. If a comment + isn't clear or you disagree with a suggestion, feel free to ask for + clarification or discuss the suggestion. Thank You --------- From 4f168f69a3e856bda3f30e02fcee7db2a01ff32b Mon Sep 17 00:00:00 2001 From: Michael Green <59619482+mikegre-google@users.noreply.github.com> Date: Fri, 15 Nov 2024 21:26:17 +0000 Subject: [PATCH 013/255] [Docs] Misc updates to TPU installation instructions (#10165) --- .../getting_started/tpu-installation.rst | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 75ab2b6ba02dc..22cc684a1c778 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -44,15 +44,18 @@ Requirements Provision Cloud TPUs ==================== -You can provision Cloud TPUs using the `Cloud TPU API `_` -or the `queued resources `_` -API. This section shows how to create TPUs using the queued resource API. -For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. -`Queued resources `_ -enable you to request Cloud TPU resources in a queued manner. When you request -queued resources, the request is added to a queue maintained by the Cloud TPU -service. When the requested resource becomes available, it's assigned to your -Google Cloud project for your immediate exclusive use. +You can provision Cloud TPUs using the `Cloud TPU API `_ +or the `queued resources `_ +API. This section shows how to create TPUs using the queued resource API. For +more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. + +.. note:: + In all of the following commands, replace the ALL CAPS parameter names with + appropriate values. See the parameter descriptions table for more information. Provision a Cloud TPU with the queued resource API -------------------------------------------------- @@ -68,6 +71,7 @@ Create a TPU v5e with 4 TPU chips: --runtime-version RUNTIME_VERSION \ --service-account SERVICE_ACCOUNT + .. list-table:: Parameter descriptions :header-rows: 1 @@ -81,12 +85,13 @@ Create a TPU v5e with 4 TPU chips: * - PROJECT_ID - Your Google Cloud project * - ZONE - - The `zone `_ where you - want to create your Cloud TPU. + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, followed by a - '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU - with 4 cores. For more information, see `TPU versions `_. + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. * - RUNTIME_VERSION - The TPU VM runtime version to use. For more information see `TPU VM images `_. * - SERVICE_ACCOUNT @@ -98,7 +103,15 @@ Connect to your TPU using SSH: .. code-block:: bash - gcloud compute tpus tpu-vm ssh TPU_NAME + gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE + +Install Miniconda + +.. code-block:: bash + + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + bash Miniconda3-latest-Linux-x86_64.sh + source ~/.bashrc Create and activate a Conda environment for vLLM: @@ -162,9 +175,11 @@ Run the Docker image with the following command: .. note:: - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. - The compilation time may take 20~30 minutes in the first run. - However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the + possible input shapes and compiles an XLA graph for each shape. The + compilation time may take 20~30 minutes in the first run. However, the + compilation time reduces to ~5 minutes afterwards because the XLA graphs are + cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). .. tip:: @@ -173,7 +188,8 @@ Run the Docker image with the following command: .. code-block:: console from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + ImportError: libopenblas.so.0: cannot open shared object file: No such + file or directory Install OpenBLAS with the following command: From 32e46e000f77499f4dd7c0bed194e33856f2df24 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 16 Nov 2024 13:35:40 +0800 Subject: [PATCH 014/255] [Frontend] Automatic detection of chat content format from AST (#9919) Signed-off-by: DarkLight1337 --- .../serving/openai_compatible_server.md | 18 +- tests/entrypoints/openai/test_serving_chat.py | 3 +- tests/entrypoints/test_chat_utils.py | 619 +++++++++++------- vllm/config.py | 2 - vllm/engine/arg_utils.py | 10 - vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/chat_utils.py | 246 ++++++- vllm/entrypoints/llm.py | 44 +- vllm/entrypoints/openai/api_server.py | 13 +- vllm/entrypoints/openai/cli_args.py | 17 +- vllm/entrypoints/openai/protocol.py | 71 +- vllm/entrypoints/openai/run_batch.py | 2 + vllm/entrypoints/openai/serving_chat.py | 40 +- vllm/entrypoints/openai/serving_embedding.py | 12 +- vllm/entrypoints/openai/serving_engine.py | 17 +- .../openai/serving_tokenization.py | 20 +- 16 files changed, 788 insertions(+), 350 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 78965813b1213..79d032bf8b211 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -172,12 +172,20 @@ completion = client.chat.completions.create( ] ) ``` -Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like -`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which -format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify -between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match -this, unless explicitly specified. +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the +request. vLLM provides best-effort support to detect this automatically, which is logged as a string like +*"Detected the chat template content format to be..."*, and internally converts incoming requests to match +the detected format, which can be one of: + +- `"string"`: A string. + - Example: `"Hello world"` +- `"openai"`: A list of dictionaries, similar to OpenAI schema. + - Example: `[{"type": "text", "text": "Hello world!"}]` + +If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument +to override which format to use. ## Command line arguments for the server diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index e969d33775d86..93660e6118ca8 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -26,7 +26,6 @@ class MockModelConfig: tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" - chat_template_text_format = "string" max_model_len = 100 tokenizer_revision = None multimodal_config = MultiModalConfig() @@ -49,6 +48,7 @@ async def _async_serving_chat_init(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) @@ -70,6 +70,7 @@ def test_serving_chat_should_set_correct_max_tokens(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5fa466f8f041f..72477e048eafa 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -6,15 +6,24 @@ from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (parse_chat_messages, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, + parse_chat_messages, + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from ..utils import VLLM_PATH + +EXAMPLES_DIR = VLLM_PATH / "examples" + PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3" +QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" +LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" @pytest.fixture(scope="function") @@ -26,7 +35,6 @@ def phi3v_model_config(): trust_remote_code=True, dtype="bfloat16", seed=0, - chat_template_text_format="string", limit_mm_per_prompt={ "image": 2, }) @@ -94,19 +102,24 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -121,19 +134,24 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -147,24 +165,29 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -181,24 +204,29 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -214,27 +242,31 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to <|image_2|>?" - }] - }], phi3v_model_config, phi3v_tokenizer) - + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to <|image_2|>?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -249,26 +281,35 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to the other one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to the other one?" # noqa: E501 + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -285,34 +326,39 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "text", - "text": "What about this one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about this one?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [ { @@ -335,7 +381,6 @@ def test_parse_chat_messages_context_text_format( phi3v_model_config, phi3v_tokenizer, ): - phi3v_model_config.chat_template_text_format = "openai" conversation, mm_data = parse_chat_messages( [{ "role": "user", @@ -349,7 +394,11 @@ def test_parse_chat_messages_context_text_format( }, { "role": "user", "content": "What about this one?" - }], phi3v_model_config, phi3v_tokenizer) + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="openai", + ) assert conversation == [ { @@ -389,29 +438,34 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_rejects_too_many_images_across_messages( @@ -427,39 +481,44 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about these two?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about these two?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_multiple_images_uncommon_input( @@ -467,17 +526,22 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - "What's in these images?", { - "image_url": image_url - }, { - "image_url": image_url - } - ] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -495,16 +559,21 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - "image_url": image_url - }] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + "image_url": image_url + }] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 1) assert conversation == [{ 'role': @@ -524,26 +593,31 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - { - 'type': 'text', - 'text': 'The content of the first image is:' - }, - { - "image_url": image_url - }, - { - 'type': 'text', - 'text': 'The content of the second image is:' - }, - { - "image_url": image_url - }, - ] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + { + "image_url": image_url + }, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + { + "image_url": image_url + }, + ] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 2) assert conversation == [{ 'role': @@ -626,6 +700,7 @@ def get_conversation(is_hf: bool): vllm_conversation, model_config, tokenizer_group, + content_format="openai", ) vllm_result = apply_hf_chat_template( @@ -636,3 +711,89 @@ def get_conversation(is_hf: bool): ) assert hf_result == vllm_result + + +# yapf: disable +@pytest.mark.parametrize( + ("model", "expected_format"), + [(PHI3V_MODEL_ID, "string"), + (QWEN2VL_MODEL_ID, "openai"), + (ULTRAVOX_MODEL_ID, "string"), + (MLLAMA_MODEL_ID, "openai"), + (LLAMA_GUARD_MODEL_ID, "openai")], +) +# yapf: enable +def test_resolve_content_format_hf_defined(model, expected_format): + tokenizer_group = TokenizerGroup( + model, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + tokenizer = tokenizer_group.tokenizer + + chat_template = tokenizer.chat_template + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + None, # Test detecting the tokenizer's chat_template + "auto", + tokenizer, + ) + + assert resolved_format == expected_format + + +# yapf: disable +@pytest.mark.parametrize( + ("template_path", "expected_format"), + [("template_alpaca.jinja", "string"), + ("template_baichuan.jinja", "string"), + ("template_blip2.jinja", "string"), + ("template_chatglm.jinja", "string"), + ("template_chatglm2.jinja", "string"), + ("template_chatml.jinja", "string"), + ("template_falcon_180b.jinja", "string"), + ("template_falcon.jinja", "string"), + ("template_inkbot.jinja", "string"), + ("template_llava.jinja", "string"), + ("template_vlm2vec.jinja", "openai"), + ("tool_chat_template_granite_20b_fc.jinja", "string"), + ("tool_chat_template_hermes.jinja", "string"), + ("tool_chat_template_internlm2_tool.jinja", "string"), + ("tool_chat_template_llama3.1_json.jinja", "string"), + ("tool_chat_template_llama3.2_json.jinja", "string"), + ("tool_chat_template_mistral_parallel.jinja", "string"), + ("tool_chat_template_mistral.jinja", "string")], +) +# yapf: enable +def test_resolve_content_format_examples(template_path, expected_format): + tokenizer_group = TokenizerGroup( + PHI3V_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + dummy_tokenizer = tokenizer_group.tokenizer + dummy_tokenizer.chat_template = None + + chat_template = load_chat_template(EXAMPLES_DIR / template_path) + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + chat_template, + "auto", + dummy_tokenizer, + ) + + assert resolved_format == expected_format diff --git a/vllm/config.py b/vllm/config.py index 1c190da1d327e..64b2f75e092de 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -155,7 +155,6 @@ def __init__( limit_mm_per_prompt: Optional[Mapping[str, int]] = None, use_async_output_proc: bool = True, config_format: ConfigFormat = ConfigFormat.AUTO, - chat_template_text_format: str = "string", hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, override_neuron_config: Optional[Dict[str, Any]] = None, @@ -216,7 +215,6 @@ def __init__( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) self.use_async_output_proc = use_async_output_proc - self.chat_template_text_format = chat_template_text_format self.mm_processor_kwargs = mm_processor_kwargs # Set enforce_eager to False if the value is unset. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d73f95f59c71f..92fa87c7fa45b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -90,7 +90,6 @@ class EngineArgs: task: TaskOption = "auto" skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' - chat_template_text_format: str = 'string' trust_remote_code: bool = False allowed_local_media_path: str = "" download_dir: Optional[str] = None @@ -258,14 +257,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'fast tokenizer if available.\n* "slow" will ' 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') - parser.add_argument( - '--chat-template-text-format', - type=str, - default=EngineArgs.chat_template_text_format, - choices=['string', 'openai'], - help='The format to render text content within a chat template. ' - '"string" will keep the content field as a string whereas ' - '"openai" will parse content in the current OpenAI format.') parser.add_argument('--trust-remote-code', action='store_true', help='Trust remote code from huggingface.') @@ -894,7 +885,6 @@ def create_model_config(self) -> ModelConfig: # We know this is not None because we set it in __post_init__ tokenizer=cast(str, self.tokenizer), tokenizer_mode=self.tokenizer_mode, - chat_template_text_format=self.chat_template_text_format, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, dtype=self.dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index aa9c7893c4cfe..9a2d73a020c8f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -262,8 +262,7 @@ def __init__( "num_scheduler_steps=%d, chunked_prefill_enabled=%s " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, " - "chat_template_text_format=%s, mm_processor_kwargs=%s, " - "pooler_config=%r)", + "mm_processor_kwargs=%s, pooler_config=%r)", VLLM_VERSION, model_config.model, speculative_config, @@ -296,7 +295,6 @@ def __init__( cache_config.enable_prefix_caching, model_config.use_async_output_proc, use_cached_outputs, - model_config.chat_template_text_format, model_config.mm_processor_kwargs, model_config.pooler_config, ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3ca460c47c3bd..abee5ac46391c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -2,12 +2,14 @@ import codecs import json from abc import ABC, abstractmethod -from collections import defaultdict +from collections import defaultdict, deque from functools import lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) +import jinja2.nodes +import transformers.utils.chat_template_utils as hf_chat_utils # yapf conflicts with isort for this block # yapf: disable from openai.types.chat import (ChatCompletionAssistantMessageParam, @@ -153,6 +155,199 @@ class ConversationMessage(TypedDict, total=False): """The tool calls generated by the model, such as function calls.""" +# Passed in by user +ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] + +# Used internally +_ChatTemplateContentFormat = Literal["string", "openai"] + + +def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool: + if isinstance(node, jinja2.nodes.Name): + return node.ctx == "load" and node.name == varname + + return False + + +def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool: + if isinstance(node, jinja2.nodes.Getitem): + return (_is_var_access(node.node, varname) + and isinstance(node.arg, jinja2.nodes.Const) + and node.arg.value == key) + + if isinstance(node, jinja2.nodes.Getattr): + return _is_var_access(node.node, varname) and node.attr == key + + return False + + +def _is_var_or_elems_access( + node: jinja2.nodes.Node, + varname: str, + key: Optional[str] = None, +) -> bool: + if isinstance(node, jinja2.nodes.Filter): + return (node.node is not None + and _is_var_or_elems_access(node.node, varname, key)) + if isinstance(node, jinja2.nodes.Test): + return _is_var_or_elems_access(node.node, varname, key) + + if (isinstance(node, jinja2.nodes.Getitem) + and isinstance(node.arg, jinja2.nodes.Slice)): + return _is_var_or_elems_access(node.node, varname, key) + + # yapf: disable + return ( + _is_attr_access(node, varname, key) if key + else _is_var_access(node, varname) + ) # yapf: enable + + +def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str): + # Global variable that is implicitly defined at the root + yield root, varname + + # Iterative BFS + related_varnames = deque([varname]) + while related_varnames: + related_varname = related_varnames.popleft() + + for assign_ast in root.find_all(jinja2.nodes.Assign): + lhs = assign_ast.target + rhs = assign_ast.node + + if _is_var_or_elems_access(rhs, related_varname): + assert isinstance(lhs, jinja2.nodes.Name) + yield assign_ast, lhs.name + + # Avoid infinite looping for self-assignment + if lhs.name != related_varname: + related_varnames.append(lhs.name) + + +# NOTE: The proper way to handle this is to build a CFG so that we can handle +# the scope in which each variable is defined, but that is too complicated +def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node): + messages_varnames = [ + varname + for _, varname in _iter_nodes_assign_var_or_elems(root, "messages") + ] + + # Search for {%- for message in messages -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in messages_varnames: + if _is_var_or_elems_access(loop_iter, varname): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _iter_nodes_assign_content_item(root: jinja2.nodes.Node): + message_varnames = [ + varname for _, varname in _iter_nodes_assign_messages_item(root) + ] + + # Search for {%- for content in message['content'] -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in message_varnames: + if _is_var_or_elems_access(loop_iter, varname, "content"): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]: + try: + jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template) + return jinja_compiled.environment.parse(chat_template) + except Exception: + logger.exception("Error when compiling Jinja template") + return None + + +def _detect_content_format( + chat_template: str, + *, + default: _ChatTemplateContentFormat, +) -> _ChatTemplateContentFormat: + jinja_ast = _try_extract_ast(chat_template) + if jinja_ast is None: + return default + + try: + next(_iter_nodes_assign_content_item(jinja_ast)) + except StopIteration: + return "string" + except Exception: + logger.exception("Error when parsing AST of Jinja template") + return default + else: + return "openai" + + +def _resolve_chat_template_content_format( + chat_template: Optional[str], + given_format: ChatTemplateContentFormatOption, + tokenizer: AnyTokenizer, +) -> _ChatTemplateContentFormat: + if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): + tokenizer_chat_template = tokenizer.chat_template + else: + tokenizer_chat_template = None + + jinja_text: Optional[str] + if isinstance(tokenizer_chat_template, str) and chat_template is None: + jinja_text = tokenizer_chat_template + elif (isinstance(tokenizer_chat_template, dict) + and chat_template in tokenizer_chat_template): + jinja_text = tokenizer_chat_template[chat_template] + else: + jinja_text = load_chat_template(chat_template, is_literal=True) + + detected_format = ("string" if jinja_text is None else + _detect_content_format(jinja_text, default="string")) + + return detected_format if given_format == "auto" else given_format + + +@lru_cache +def resolve_chat_template_content_format( + chat_template: Optional[str], + given_format: ChatTemplateContentFormatOption, + tokenizer: AnyTokenizer, +) -> _ChatTemplateContentFormat: + detected_format = _resolve_chat_template_content_format( + chat_template, + given_format, + tokenizer, + ) + + logger.info( + "Detected the chat template content format to be '%s'. " + "You can set `--chat-template-content-format` to override this.", + detected_format, + ) + + if given_format != "auto" and given_format != detected_format: + logger.warning( + "You specified `--chat-template-content-format %s` " + "which is different from the detected format '%s'. " + "If our automatic detection is incorrect, please consider " + "opening a GitHub issue so that we can improve it: " + "https://github.com/vllm-project/vllm/issues/new/choose", + given_format, + detected_format, + ) + + return detected_format + + ModalityStr = Literal["image", "audio", "video"] _T = TypeVar("_T") @@ -407,12 +602,23 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]): def load_chat_template( - chat_template: Optional[Union[Path, str]]) -> Optional[str]: + chat_template: Optional[Union[Path, str]], + *, + is_literal: bool = False, +) -> Optional[str]: if chat_template is None: return None + + if is_literal: + if isinstance(chat_template, Path): + raise TypeError("chat_template is expected to be read directly " + "from its value") + + return codecs.decode(chat_template, "unicode_escape") + try: with open(chat_template) as f: - resolved_chat_template = f.read() + return f.read() except OSError as e: if isinstance(chat_template, Path): raise @@ -426,10 +632,7 @@ def load_chat_template( # If opening a file fails, set chat template to be args to # ensure we decode so our escape are interpreted correctly - resolved_chat_template = codecs.decode(chat_template, "unicode_escape") - - logger.info("Using supplied chat template:\n%s", resolved_chat_template) - return resolved_chat_template + return load_chat_template(chat_template, is_literal=True) # TODO: Let user specify how to insert multimodal tokens into prompt @@ -464,7 +667,6 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _AudioParser = partial(cast, ChatCompletionContentPartAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) -MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} # Define a mapping from part types to their corresponding parsing functions. MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { @@ -542,18 +744,12 @@ def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], mm_tracker: BaseMultiModalItemTracker, - chat_template_text_format: str, + *, + wrap_dicts: bool, ) -> List[ConversationMessage]: content: List[Union[str, Dict[str, str]]] = [] mm_parser = mm_tracker.create_parser() - model_config = mm_tracker.model_config - - wrap_dicts = (chat_template_text_format == "openai" - or (model_config.task == "embedding" - and model_config.is_multimodal_model) - or (model_config.hf_config.model_type - in MODEL_KEEP_MULTI_MODAL_CONTENT)) for part in parts: parse_res = _parse_chat_message_content_part( @@ -578,9 +774,11 @@ def _parse_chat_message_content_parts( def _parse_chat_message_content_part( - part: ChatCompletionContentPartParam, - mm_parser: BaseMultiModalContentParser, - wrap_dicts: bool) -> Optional[Union[str, Dict[str, str]]]: + part: ChatCompletionContentPartParam, + mm_parser: BaseMultiModalContentParser, + *, + wrap_dicts: bool, +) -> Optional[Union[str, Dict[str, str]]]: """Parses a single part of a conversation. If wrap_dicts is True, structured dictionary pieces for texts and images will be wrapped in dictionaries, i.e., {"type": "text", "text", ...} and @@ -629,7 +827,7 @@ def _parse_chat_message_content_part( def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, - chat_template_text_format: str, + content_format: _ChatTemplateContentFormat, ) -> List[ConversationMessage]: role = message["role"] content = message.get("content") @@ -645,7 +843,7 @@ def _parse_chat_message_content( role, content, # type: ignore mm_tracker, - chat_template_text_format, + wrap_dicts=(content_format == "openai"), ) for result_msg in result: @@ -684,6 +882,7 @@ def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, + content_format: _ChatTemplateContentFormat, ) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: conversation: List[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -692,7 +891,7 @@ def parse_chat_messages( sub_messages = _parse_chat_message_content( msg, mm_tracker, - model_config.chat_template_text_format, + content_format, ) conversation.extend(sub_messages) @@ -706,6 +905,7 @@ def parse_chat_messages_futures( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, + content_format: _ChatTemplateContentFormat, ) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: conversation: List[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) @@ -714,7 +914,7 @@ def parse_chat_messages_futures( sub_messages = _parse_chat_message_content( msg, mm_tracker, - model_config.chat_template_text_format, + content_format, ) conversation.extend(sub_messages) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4b33fc1458ee3..86b0b6893f1d9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -13,9 +13,11 @@ TaskOption) from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption, apply_hf_chat_template, apply_mistral_chat_template, - parse_chat_messages) + parse_chat_messages, + resolve_chat_template_content_format) from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger @@ -523,6 +525,7 @@ def chat( use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", add_generation_prompt: bool = True, continue_final_message: bool = False, tools: Optional[List[Dict[str, Any]]] = None, @@ -539,9 +542,11 @@ def chat( to the OpenAI API. Args: - messages: A list of conversations or a single conversation. - - Each conversation is represented as a list of messages. - - Each message is a dictionary with 'role' and 'content' keys. + messages: A list of conversations or a single conversation. + + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. + sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it @@ -551,11 +556,19 @@ def chat( lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. If not provided, the model's default chat template will be used. + chat_template_content_format: The format to render message content. + + - "string" will render the content as a string. + Example: ``"Who are you?"`` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. + Example: ``[{"type": "text", "text": "Who are you?"}]`` + add_generation_prompt: If True, adds a generation template to each message. continue_final_message: If True, continues the final message in - the conversation instead of starting a new one. Cannot be `True` - if `add_generation_prompt` is also `True`. + the conversation instead of starting a new one. Cannot be + ``True`` if ``add_generation_prompt`` is also ``True``. mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. @@ -576,17 +589,26 @@ def chat( cast(List[ChatCompletionMessageParam], messages) ] + tokenizer = self.get_tokenizer() + model_config = self.llm_engine.get_model_config() + resolved_content_format = resolve_chat_template_content_format( + chat_template, + chat_template_content_format, + tokenizer, + ) + prompts: List[Union[TokensPrompt, TextPrompt]] = [] for msgs in list_of_messages: - tokenizer = self.get_tokenizer() - model_config = self.llm_engine.get_model_config() - # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. conversation, mm_data = parse_chat_messages( - msgs, model_config, tokenizer) + msgs, + model_config, + tokenizer, + content_format=resolved_content_format, + ) prompt_data: Union[str, List[int]] if isinstance(tokenizer, MistralTokenizer): @@ -737,7 +759,7 @@ def encode( generation, if any. Returns: - A list of `EmbeddingRequestOutput` objects containing the + A list of ``EmbeddingRequestOutput`` objects containing the generated embeddings in the same order as the input prompts. Note: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b13f6a228b4c6..b0fe061f5db4a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -29,6 +29,7 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -529,6 +530,9 @@ def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats + resolved_chat_template = load_chat_template(args.chat_template) + logger.info("Using supplied chat template:\n%s", resolved_chat_template) + state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -537,7 +541,8 @@ def init_app_state( lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, @@ -557,7 +562,8 @@ def init_app_state( model_config, base_model_paths, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) if model_config.task == "embedding" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, @@ -565,7 +571,8 @@ def init_app_state( base_model_paths, lora_modules=args.lora_modules, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index eb08a89293370..24c206a1261f2 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -7,10 +7,11 @@ import argparse import json import ssl -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str -from vllm.entrypoints.chat_utils import validate_chat_template +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + validate_chat_template) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -132,6 +133,18 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="The file path to the chat template, " "or the template in single-line form " "for the specified model") + parser.add_argument( + '--chat-template-content-format', + type=str, + default="auto", + choices=get_args(ChatTemplateContentFormatOption), + help='The format to render message content within a chat template.' + '\n\n' + '* "string" will render the content as a string. ' + 'Example: "Hello World"\n' + '* "openai" will render the content as a list of dictionaries, ' + 'similar to OpenAI schema. ' + 'Example: [{"type": "text", "text": "Hello world!"}]') parser.add_argument("--response-role", type=nullable_str, default="assistant", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 820aefd8800d9..b7b064ae01f05 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,9 +5,8 @@ from typing import Any, Dict, List, Literal, Optional, Union import torch -from openai.types.chat import ChatCompletionContentPartParam from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Annotated, Required, TypedDict +from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.pooling_params import PoolingParams @@ -35,26 +34,6 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max -class CustomChatCompletionMessageParam(TypedDict, total=False): - """Enables custom roles in the Chat Completion API.""" - role: Required[str] - """The role of the message's author.""" - - content: Union[str, List[ChatCompletionContentPartParam]] - """The contents of the message.""" - - name: str - """An optional name for the participant. - - Provides the model information to differentiate between participants of the - same role. - """ - - tool_call_id: Optional[str] - - tool_calls: Optional[List[dict]] - - class OpenAIBaseModel(BaseModel): # OpenAI API does not allow extra fields model_config = ConfigDict(extra="forbid") @@ -1054,16 +1033,56 @@ class TokenizeCompletionRequest(OpenAIBaseModel): model: str prompt: str - add_special_tokens: bool = Field(default=True) + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt."), + ) class TokenizeChatRequest(OpenAIBaseModel): model: str messages: List[ChatCompletionMessageParam] - add_generation_prompt: bool = Field(default=True) - continue_final_message: bool = Field(default=False) - add_special_tokens: bool = Field(default=False) + add_generation_prompt: bool = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + continue_final_message: bool = Field( + default=False, + description= + ("If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + "This allows you to \"prefill\" part of the model's response for it. " + "Cannot be used at the same time as `add_generation_prompt`."), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)."), + ) + chat_template: Optional[str] = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), + ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the template renderer. " + "Will be accessible by the chat template."), + ) @model_validator(mode="before") @classmethod diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 1b422a93263b2..00cdb3b6839f5 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -222,6 +222,7 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, + chat_template_content_format="auto", enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( @@ -230,6 +231,7 @@ async def main(args): base_model_paths, request_logger=request_logger, chat_template=None, + chat_template_content_format="auto", ) if model_config.task == "embedding" else None tracker = BatchProgressTracker() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 77cae00ae827f..2eef909eb9319 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,7 +10,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + ConversationMessage) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -38,20 +39,23 @@ class OpenAIServingChat(OpenAIServing): - def __init__(self, - engine_client: EngineClient, - model_config: ModelConfig, - base_model_paths: List[BaseModelPath], - response_role: str, - *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - return_tokens_as_token_ids: bool = False, - enable_auto_tools: bool = False, - tool_parser: Optional[str] = None, - enable_prompt_tokens_details: bool = False): + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + response_role: str, + *, + lora_modules: Optional[List[LoRAModulePath]], + prompt_adapters: Optional[List[PromptAdapterPath]], + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, + return_tokens_as_token_ids: bool = False, + enable_auto_tools: bool = False, + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -61,8 +65,8 @@ def __init__(self, return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role - self.use_tool_use_model_template = False - self.chat_template = load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format # set up tool use self.enable_auto_tools: bool = enable_auto_tools @@ -120,6 +124,7 @@ async def create_chat_completion( ) = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) + tool_parser = self.tool_parser # validation for OpenAI tools @@ -157,6 +162,7 @@ async def create_chat_completion( tokenizer, request.messages, chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self.chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, tool_dicts=tool_dicts, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index bbe7db8f13231..74ad7389784fc 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,7 +1,7 @@ import asyncio import base64 import time -from typing import AsyncGenerator, List, Literal, Optional, Union, cast +from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -9,7 +9,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, EmbeddingRequest, @@ -77,7 +77,8 @@ def __init__( *, request_logger: Optional[RequestLogger], chat_template: Optional[str], - ): + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -85,7 +86,8 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - self.chat_template = load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format async def create_embedding( self, @@ -144,6 +146,8 @@ async def create_embedding( tokenizer, request.messages, chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, truncate_prompt_tokens=truncate_prompt_tokens, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index fa315fa516632..cae2877ea7e99 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -11,14 +11,16 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption, ConversationMessage, apply_hf_chat_template, apply_mistral_chat_template, - parse_chat_messages_futures) + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.logger import RequestLogger -# yapf conflicts with isort for this block -# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, DetokenizeRequest, @@ -426,7 +428,8 @@ async def _preprocess_chat( request: ChatLikeRequest, tokenizer: AnyTokenizer, messages: List[ChatCompletionMessageParam], - chat_template: Optional[str] = None, + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, add_generation_prompt: bool = True, continue_final_message: bool = False, tool_dicts: Optional[List[Dict[str, Any]]] = None, @@ -437,10 +440,16 @@ async def _preprocess_chat( add_special_tokens: bool = False, ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], List[TokensPrompt]]: + resolved_content_format = resolve_chat_template_content_format( + chat_template, + chat_template_content_format, + tokenizer, + ) conversation, mm_data_future = parse_chat_messages_futures( messages, self.model_config, tokenizer, + content_format=resolved_content_format, ) _chat_template_kwargs: Dict[str, Any] = dict( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 1fd82304f7a4d..59b3b1311f881 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,8 +1,8 @@ -from typing import List, Optional, Union +from typing import Final, List, Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -33,7 +33,8 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], - ): + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -41,12 +42,8 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - # If this is None we use the tokenizer's default chat template - # the list of commonly-used chat template names for HF named templates - hf_chat_templates: List[str] = ['default', 'tool_use'] - self.chat_template = chat_template \ - if chat_template in hf_chat_templates \ - else load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format async def create_tokenize( self, @@ -75,9 +72,12 @@ async def create_tokenize( request, tokenizer, request.messages, - chat_template=self.chat_template, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, + chat_template_kwargs=request.chat_template_kwargs, add_special_tokens=request.add_special_tokens, ) else: From 755b85359be910fabe39a75299439fc11beb57d4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 15 Nov 2024 21:46:27 -0800 Subject: [PATCH 015/255] [doc] add doc for the plugin system (#10372) Signed-off-by: youkaichao --- docs/source/design/class_hierarchy.rst | 2 + docs/source/design/plugin_system.rst | 62 ++++++++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/models/adding_model.rst | 25 +++-------- vllm/plugins/__init__.py | 16 +++++-- 5 files changed, 84 insertions(+), 22 deletions(-) create mode 100644 docs/source/design/plugin_system.rst diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst index 15f0c8ccf77ee..58a888b17ba53 100644 --- a/docs/source/design/class_hierarchy.rst +++ b/docs/source/design/class_hierarchy.rst @@ -1,3 +1,5 @@ +.. _class_hierarchy: + vLLM's Class Hierarchy ======================= diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst new file mode 100644 index 0000000000000..bfca702b9267a --- /dev/null +++ b/docs/source/design/plugin_system.rst @@ -0,0 +1,62 @@ +.. _plugin_system: + +vLLM's Plugin System +==================== + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +How Plugins Work in vLLM +------------------------ + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`class_hierarchy`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. + +How vLLM Discovers Plugins +-------------------------- + +vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: + +.. code-block:: python + + # inside `setup.py` file + from setuptools import setup + + setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + + # inside `vllm_add_dummy_model.py` file + def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava") + +For more information on adding entry points to your package, please check the `official documentation `__. + +Every plugin has three parts: + +1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. + +2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. + +3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. + +What Can Plugins Do? +-------------------- + +Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. + +Guidelines for Writing Plugins +------------------------------ + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +Compatibility Guarantee +----------------------- + +vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index a2abd2995b1cc..3b2698a8845ed 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -158,6 +158,7 @@ Documentation design/class_hierarchy design/huggingface_integration + design/plugin_system design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index c6d88cc38e99b..a70ebf99c746f 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -102,11 +102,11 @@ This method should load the weights from the HuggingFace's checkpoint file and a Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration --------------------------------------------- +-------------------------------- -We also provide a way to integrate a model without modifying the vLLM codebase. Step 2, 3, 4 are still required, but you can skip step 1 and 5. +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. -Just add the following lines in your code: +To register the model, use the following code: .. code-block:: python @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: .. code-block:: python @@ -123,19 +123,8 @@ If your model imports modules that initialize CUDA, consider instead lazy-import ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") .. important:: - If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that :ref:`here `. -If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - - if __name__ == '__main__': - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') - -Save the above code in a file and run it with :code:`python your_file.py `. +.. note:: + Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 8373e11cfff9f..9fca724599012 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -27,16 +27,24 @@ def load_general_plugins(): allowed_plugins = envs.VLLM_PLUGINS discovered_plugins = entry_points(group='vllm.general_plugins') + logger.info("Available plugins:") + for plugin in discovered_plugins: + logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, + plugin.group) + if allowed_plugins is None: + logger.info("all available plugins will be loaded.") + logger.info("set environment variable VLLM_PLUGINS to control" + " which plugins to load.") + else: + logger.info("plugins to load: %s", allowed_plugins) for plugin in discovered_plugins: - logger.info("Found general plugin: %s", plugin.name) if allowed_plugins is None or plugin.name in allowed_plugins: try: func = plugin.load() func() - logger.info("Loaded general plugin: %s", plugin.name) + logger.info("plugin %s loaded.", plugin.name) except Exception: - logger.exception("Failed to load general plugin: %s", - plugin.name) + logger.exception("Failed to load plugin %s", plugin.name) _torch_compile_backend: Optional[Union[Callable, str]] = None From 2f427c2d163b5c6d5923a8808e9d786e170944ce Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Nov 2024 01:23:20 -0800 Subject: [PATCH 016/255] [misc][plugin] improve log messages (#10386) Signed-off-by: youkaichao --- vllm/plugins/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 9fca724599012..7b1bbb14c5302 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -27,6 +27,9 @@ def load_general_plugins(): allowed_plugins = envs.VLLM_PLUGINS discovered_plugins = entry_points(group='vllm.general_plugins') + if len(discovered_plugins) == 0: + logger.info("No plugins found.") + return logger.info("Available plugins:") for plugin in discovered_plugins: logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, From 1d754726265d52773653e53e1a18f6eb63122480 Mon Sep 17 00:00:00 2001 From: rasmith Date: Sat, 16 Nov 2024 03:55:05 -0600 Subject: [PATCH 017/255] [BugFix] [Kernel] Fix GPU SEGV occuring in fused_moe kernel (#10385) Signed-off-by: Randall Smith --- vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 340da32263c1c..e6f9f01ef0f74 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -105,16 +105,18 @@ def fused_moe_kernel( num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: return - offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) token_mask = offs_token < num_valid_tokens - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N offs_k = tl.arange(0, BLOCK_SIZE_K) a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak) - off_experts = tl.load(expert_ids_ptr + pid_m) + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) if use_int8_w8a16: From 8b6725b0cf4ee5f363218f4bc341970c80297ccf Mon Sep 17 00:00:00 2001 From: Jaehyun An Date: Sat, 16 Nov 2024 19:15:40 +0900 Subject: [PATCH 018/255] [Misc] Update benchmark to support image_url file or http (#10287) Signed-off-by: rbbang --- benchmarks/benchmark_serving.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bdb8ea8e2a5dc..e9fc037a46965 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -251,6 +251,19 @@ def sample_hf_requests( "url": f"data:image/jpeg;base64,{image_base64}" }, } + elif "image" in data and isinstance(data["image"], str): + if (data["image"].startswith("http://") or \ + data["image"].startswith("file://")): + image_url = data["image"] + else: + image_url = f"file://{data['image']}" + + mm_content = { + "type": "image_url", + "image_url": { + "url": image_url + }, + } else: mm_content = None From b98d89efd4b1a09c11c4d0cf30c9af0e93514764 Mon Sep 17 00:00:00 2001 From: Sky Lee <46676799+skylee-01@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:33:01 +0800 Subject: [PATCH 019/255] [Misc] Medusa supports custom bias (#10361) --- vllm/model_executor/models/medusa.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index de5b2d89c0962..b05360b55466b 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -14,11 +14,14 @@ class ResidualBlock(nn.Module): - def __init__(self, hidden_size: int, num_layers: int) -> None: + def __init__(self, config: VllmConfig, hidden_size: int, + num_layers: int) -> None: super().__init__() self.layers = nn.ModuleList([ - nn.Linear(hidden_size, hidden_size, bias=False) + nn.Linear(hidden_size, + hidden_size, + bias=getattr(config, "medusa_fc_bias", False)) for _ in range(num_layers) ]) self.act = nn.SiLU() @@ -49,7 +52,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() self.config = config self.blocks = nn.ModuleList([ - ResidualBlock(hidden_size=self.config.hidden_size, + ResidualBlock(config=config, + hidden_size=self.config.hidden_size, num_layers=self.config.num_hidden_layers) for _ in range(self.config.num_heads) ]) From 361c29e1740e0b2186f8cca3ed96ad235a8a960a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Sun, 17 Nov 2024 02:10:00 +0800 Subject: [PATCH 020/255] [Bugfix] Fix M-RoPE position calculation when chunked prefill is enabled (#10388) Signed-off-by: imkero --- .../vision_language/test_qwen2_vl.py | 136 +++++++++++++++++- .../model_executor/layers/rotary_embedding.py | 3 +- vllm/worker/model_runner.py | 1 + 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 718c675b86fb4..71b6ba4dca435 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -18,6 +18,7 @@ IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" +MODEL_HIDDEN_SIZE = 1536 def qwen2_vl_chat_template(*query): @@ -230,7 +231,7 @@ def batch_make_video_embeddings( return result -def run_test( +def run_embedding_input_test( vllm_runner: Type[VllmRunner], inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], model: str, @@ -326,7 +327,7 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, [], ) for image, prompt in zip(images, IMAGE_PROMPTS)] - run_test( + run_embedding_input_test( vllm_runner, inputs_per_case, model, @@ -371,7 +372,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, [], )] - run_test( + run_embedding_input_test( vllm_runner, inputs_per_case, model, @@ -416,7 +417,134 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, [rescale_video_size(video, factor) for factor in size_factors], ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] - run_test( + run_embedding_input_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +def run_chunked_prefill_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Compare inference result between + chunked prefill disabled and chunked prefill enabled + """ + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=4, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + with vllm_runner( + model, + task="generate", + max_model_len=4000, + max_num_seqs=4, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enable_chunked_prefill=True, + # should be small enough to ensure prefilling is chunked + max_num_batched_tokens=32, + mm_processor_kwargs={ + "max_pixels": 16 * 28 * 28, + }) as vllm_model_chunked: + outputs_per_case_chunked = [ + vllm_model_chunked.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) for prompts, images, videos in inputs + ] + + for outputs, \ + outputs_chunked \ + in zip(outputs_per_case, + outputs_per_case_chunked): + check_logprobs_close( + outputs_0_lst=outputs, + outputs_1_lst=outputs_chunked, + name_0="non_chunked", + name_1="chunked", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [1]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts, + model: str, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + """ + Test Qwen2-VL's chunked prefill with M-RoPE + """ + prompts = [ + qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt) + for prompt in example_prompts[:1] + ] + + # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs, + # so an image is included in the inputs + # 2. however, Qwen2-VL currently won't work properly + # when chunked prefill is enabled and there are some multi-modal inputs, + # here use a hacky way: provide a **zero-length** image to make it happy + # + # and finally we achieved: + # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests + zero_len_image = { + "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)), + "image_grid_thw": torch.tensor([[0, 0, 0]]) + } + images = [zero_len_image] * len(prompts) + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [ + (prompts, images, []), + ] + + run_chunked_prefill_test( vllm_runner, inputs_per_case, model, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 63ceec63e8317..b01e4c61fe101 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -847,6 +847,7 @@ def get_input_positions( vision_end_token_id: int, spatial_merge_size: int, context_len: int = 0, + seq_len: Optional[int] = None, ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" @@ -921,7 +922,7 @@ def get_input_positions( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:] + llm_positions = llm_positions[:, context_len:seq_len] mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 042f9f07eace6..22ee3f9f863e4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -700,6 +700,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, spatial_merge_size=hf_config.vision_config. spatial_merge_size, context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], ) seq_data.mrope_position_delta = mrope_position_delta From 661a34fd4fdd700a29b2db758e23e4e243e7ff18 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 16 Nov 2024 10:45:26 -0800 Subject: [PATCH 021/255] [V1] Add code owners for V1 (#10397) Signed-off-by: Woosuk Kwon --- .github/CODEOWNERS | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index cd721971d01d6..3cb91fc0f8232 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,13 +3,16 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -CMakeLists.txt @tlrmchlsmth @WoosukKwon +/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth + +# vLLM V1 +/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic # Test ownership /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo From 4fd937502827a7e06c54ded1f9d9b70ff640e222 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Nov 2024 18:02:14 -0800 Subject: [PATCH 022/255] [2/N][torch.compile] make compilation cfg part of vllm cfg (#10383) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 8 +- tests/compile/piecewise/test_toy_llama.py | 22 +- tests/compile/test_basic_correctness.py | 2 +- tests/compile/test_full_graph.py | 2 +- tests/compile/test_fusion.py | 2 +- tests/compile/test_wrapper.py | 4 +- tests/compile/utils.py | 2 +- .../model_executor/test_enabled_custom_ops.py | 52 ++--- tests/tpu/test_compilation.py | 2 +- tests/tpu/test_custom_dispatcher.py | 2 +- vllm/compilation/backends.py | 20 +- vllm/compilation/config.py | 159 --------------- vllm/compilation/decorators.py | 10 +- vllm/compilation/fusion.py | 2 +- vllm/compilation/inductor_pass.py | 2 +- vllm/compilation/levels.py | 8 - vllm/compilation/wrapper.py | 11 +- vllm/config.py | 189 ++++++++++++++++++ vllm/envs.py | 13 -- vllm/model_executor/custom_op.py | 27 +-- vllm/model_executor/model_loader/loader.py | 7 +- vllm/platforms/interface.py | 20 +- vllm/platforms/tpu.py | 21 +- vllm/plugins/__init__.py | 30 ++- vllm/v1/worker/gpu_model_runner.py | 10 +- vllm/worker/model_runner.py | 7 +- vllm/worker/tpu_model_runner.py | 8 +- 27 files changed, 359 insertions(+), 283 deletions(-) delete mode 100644 vllm/compilation/config.py delete mode 100644 vllm/compilation/levels.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index c631850ecdedb..45f56cbbd4b16 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -11,8 +11,8 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig +from vllm.config import CompilationLevel, VllmConfig +from vllm.plugins import set_current_vllm_config from vllm.utils import direct_register_custom_op global_counter = 0 @@ -82,7 +82,9 @@ def test_simple_piecewise_compile(): os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) - model = SillyModel(vllm_config=VllmConfig(), prefix='') + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + model = SillyModel(vllm_config=vllm_config, prefix='') inputs = torch.randn(100).cuda() diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index c363a587a818e..8032304e95806 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -15,12 +15,10 @@ from torch.library import Library from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.config import CompilationConfig from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig -from vllm.plugins import set_compilation_config +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.plugins import set_compilation_config, set_current_vllm_config from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -272,9 +270,11 @@ def run_model(llama_config, CompilationLevel.NO_COMPILATION) set_compilation_config(None) - model = LlamaModel(config=llama_config, - vllm_config=VllmConfig(), - prefix="").eval().cuda() + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + model = LlamaModel(config=llama_config, + vllm_config=vllm_config, + prefix="").eval().cuda() B = 16 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() @@ -395,9 +395,11 @@ def benchmark(): else: set_compilation_config(None) - model = LlamaModel(config=llama_config, - vllm_config=VllmConfig(), - prefix="").eval().cuda().to(torch.bfloat16) + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + model = LlamaModel(config=llama_config, + vllm_config=vllm_config, + prefix="").eval().cuda().to(torch.bfloat16) B = 256 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 833589ba5dc9f..08747ebc58b75 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -3,7 +3,7 @@ import pytest -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index f00334934cb46..4dfdfe21a67df 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,6 +1,6 @@ import pytest -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from ..utils import fork_new_process_for_each_test from .utils import TEST_MODELS, check_full_graph_support diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index e4d3defafb951..4db79b070fd8d 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -3,10 +3,10 @@ from compressed_tensors.quantization import FP8_DTYPE import vllm.envs as envs -from vllm.compilation.config import CompilationConfig from vllm.compilation.fusion import (FusionPass, find_auto_fn, find_auto_fn_maybe) from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.config import CompilationConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( apply_fp8_linear) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 3668c1fab6b89..74f66baaa5ea1 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -3,6 +3,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.config import CompilationLevel class MyMod(torch.nn.Module): @@ -18,7 +19,8 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): def __init__(self, model): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") - super().__init__(compiled_callable) + super().__init__(compiled_callable, + compilation_level=CompilationLevel.DYNAMO_ONCE) def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): # this is the function to be compiled diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 222c63a342a4b..729f10676888b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -4,7 +4,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from vllm.platforms import current_platform TEST_MODELS = [ diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index af267f804ffa7..c3219bc50646b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -3,11 +3,13 @@ import pytest +from vllm.config import CompilationConfig, VllmConfig from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.plugins import set_current_vllm_config # Registered subclass for test @@ -51,42 +53,40 @@ class Relu3(ReLUSquaredActivation): ]) def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], default_on: bool): - os.environ["VLLM_CUSTOM_OPS"] = env os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level) + vllm_config = VllmConfig(compilation_config=CompilationConfig( + custom_ops=env.split(","))) + with set_current_vllm_config(vllm_config): + assert CustomOp.default_on() == default_on - # Reset default_on (computed once): - CustomOp.default_on.cache_clear() + ops_enabled = [bool(x) for x in ops_enabled] - assert CustomOp.default_on() == default_on + assert RMSNorm(1024).enabled() == ops_enabled[0] + assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] - ops_enabled = [bool(x) for x in ops_enabled] + assert SiluAndMul().enabled() == ops_enabled[1] + assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] - assert RMSNorm(1024).enabled() == ops_enabled[0] - assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] + assert GeluAndMul().enabled() == ops_enabled[2] + assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] - assert SiluAndMul().enabled() == ops_enabled[1] - assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] + # If registered, subclasses should follow their own name + assert Relu3().enabled() == ops_enabled[3] + assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] - assert GeluAndMul().enabled() == ops_enabled[2] - assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] + # Unregistered subclass + class SiluAndMul2(SiluAndMul): + pass - # If registered, subclasses should follow their own name - assert Relu3().enabled() == ops_enabled[3] - assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] - - # Unregistered subclass - class SiluAndMul2(SiluAndMul): - pass - - # Subclasses should not require registration - assert SiluAndMul2().enabled() == SiluAndMul().enabled() + # Subclasses should not require registration + assert SiluAndMul2().enabled() == SiluAndMul().enabled() @pytest.mark.parametrize( "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"]) def test_enabled_ops_invalid(env: str): - os.environ["VLLM_CUSTOM_OPS"] = env - CustomOp.default_on.cache_clear() - - with pytest.raises(AssertionError): - RMSNorm(1024).enabled() + with pytest.raises(Exception): # noqa + vllm_config = VllmConfig(compilation_config=CompilationConfig( + custom_ops=env.split(","))) + with set_current_vllm_config(vllm_config): + RMSNorm(1024).enabled() diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index 86d9af88e49ea..941abe17a3378 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -5,7 +5,7 @@ import depyf -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel # disable custom dispatcher, let Dynamo takes over # all the control diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 923d0f1680802..53b10c06135a1 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,6 +1,6 @@ import os -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from ..utils import compare_two_settings diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 5682faa158069..22c613931f082 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -10,13 +10,12 @@ import torch.fx as fx import vllm.envs as envs +from vllm.config import CompilationConfig, CompilationLevel from vllm.logger import init_logger from vllm.utils import combine_fx_passes, weak_ref_tensors -from .config import CompilationConfig from .counter import compilation_counter from .fusion import FusionPass -from .levels import CompilationLevel from .reshapes import RedundantReshapesPass logger = init_logger(__name__) @@ -392,7 +391,10 @@ class VllmBackend: sym_tensor_indices: List[int] input_buffers: List[torch.Tensor] - def __init__(self, post_grad_passes: Sequence[Callable] = ()): + def __init__( + self, + compilation_configs: CompilationConfig, + ): global global_graph_pool if global_graph_pool is None: global_graph_pool = torch.cuda.graph_pool_handle() @@ -401,11 +403,13 @@ def __init__(self, post_grad_passes: Sequence[Callable] = ()): # streams, it might not be safe to share a global pool. # only investigate this when we use multiple streams self.graph_pool = global_graph_pool - self.post_grad_passes = post_grad_passes + self.post_grad_passes = [] self.sym_tensor_indices = [] self.input_buffers = [] + self.compilation_configs = compilation_configs + # `torch.compile` is JIT compiled, so we don't need to # do anything here @@ -437,10 +441,10 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: assert not self._called, "VllmBackend can only be called once" self.graph = graph - # config is read now, because only here can + # config is updated now, because only here can # we get the sizes to capture for cudagraph # from compilation context - self.compilation_configs = CompilationConfig.select_and_init_config() + self.compilation_configs.init_during_runtime() self.add_passes_to_config() self.split_gm, self.piecewise_graphs = split_graph( @@ -688,4 +692,6 @@ def select_default_backend(level: int) -> Union[str, Callable]: return backend_str assert level == CompilationLevel.PIECEWISE - return VllmBackend() + from vllm.plugins import get_current_vllm_config + compilation_config = get_current_vllm_config().compilation_config + return VllmBackend(compilation_config) diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py deleted file mode 100644 index 3e663505c627d..0000000000000 --- a/vllm/compilation/config.py +++ /dev/null @@ -1,159 +0,0 @@ -import copy -from pathlib import Path -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field, PrivateAttr - -import vllm.envs as envs -from vllm.logger import init_logger - -from .compile_context import get_compile_context - -logger = init_logger(__name__) - - -class CompilationConfig(BaseModel): - """ - Configuration for compilation. - It has two parts: - - CudaGraph capture: - - use_cudagraph: whether to use cudagraph inside compilation. - - False: cudagraph inside compilation is not used. - - True: cudagraph inside compilation is used. It requires - that all input buffers have fixed addresses. - Note that this is orthogonal to the cudagraph capture out - side of compilation. - TODO: move outside cudagraph logic into compilation. - torch.compile will handle cudagraph capture logic in the future. - - cudagraph_capture_sizes: sizes to capture cudagraph. - - None: capture sizes are inferred from compilation context. - - List[int]: capture sizes are specified. - - cudagraph_num_of_warmups: number of warmup runs for cudagraph. - It means the first several runs will be treated as warmup runs. - Only after that, the execution will be recorded, and the recorded - cudagraph will be used for subsequent runs. - - cudagraph_copy_inputs: whether to copy input tensors for - cudagraph. If the caller can guarantee that the same input buffers - are always used, it can set this to False. Otherwise, it should - set this to True, and the compiler will copy the input to an - internally managed buffer. Default is False. - - Inductor compilation: - - use_inductor: whether to use inductor compilation. - - False: inductor compilation is not used. graph runs in eager. - - True: inductor compilation is used. one graph for symbolic shape - is compiled. In addition, compile for different sizes specified - in inductor_compile_sizes, using configurations - in inductor_compile_config. - - inductor_compile_sizes: sizes to compile for inductor. - - inductor_specialize_for_cudagraph_no_more_than: an optional integer - to specialize inductor for cudagraph sizes no more than the - specified size. It is useful when we want to specialize inductor - with a subset of cudagraph sizes. - - inductor_compile_config: additional configurations for inductor. - - None: use default configurations. - - inductor_passes: additional passes for inductor. It is a dictionary - from pass name to pass function qualified name. We use function - name because the config uses json format. If we pass the config - from Python, functions can also be passed directly via Python object - constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` - - Custom inductor passes: - - dump_graph_stages: list of stages for which we want to dump the graph. - Each pass defines its own stages (before, after, maybe in-between). - - dump_graph_dir: directory to dump the graph. Default is . - - enable_fusion: whether to enable the custom fusion pass. - TODO better pass enabling system. - - Why we have different sizes for cudagraph and inductor: - - cudagraph: a cudagraph captured for a specific size can only be used - for the same size. We need to capture all the sizes we want to use. - - inductor: a graph compiled by inductor for a general shape can be used - for different sizes. Inductor can also compile for specific sizes, - where it can have more information to optimize the graph with fully - static shapes. However, we find the general shape compilation is - sufficient for most cases. It might be beneficial to compile for - certain small batchsizes, where inductor is good at optimizing. - """ - use_inductor: bool = True - inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None - inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) - inductor_compile_config: Dict = Field(default_factory=dict) - inductor_passes: Dict[str, str] = Field(default_factory=dict) - - use_cudagraph: bool = False - non_cudagraph_ops: List[str] = Field(default_factory=list) - cudagraph_num_of_warmups: int = 0 - cudagraph_capture_sizes: Optional[List[int]] = None - cudagraph_copy_inputs: bool = False - - dump_graph_stages: List[str] = Field(default_factory=list) - dump_graph_dir: Path = Field(default=Path(".")) - enable_fusion: bool = True - - # not configurable, computed after init - compile_sizes: List[int] = PrivateAttr - capture_sizes: List[int] = PrivateAttr - - def model_post_init(self, __context: Any) -> None: - for k, v in self.inductor_passes.items(): - if not isinstance(v, str): - assert callable(v), ( - f"pass {k} should be a function or a qualified name") - self.inductor_compile_config[k] = v - continue - - # resolve function from qualified name - names = v.split(".") - module = ".".join(names[:-1]) - func_name = names[-1] - func = __import__(module).__dict__[func_name] - self.inductor_compile_config[k] = func - - def init_during_runtime(self): - """To complete the initialization of config, - we need to know the compile context, which is only available - during the first run of the model. - """ - context = get_compile_context() - context = copy.deepcopy(context) if context is not None else [] - sizes_to_specialize: List[int] = context - if self.cudagraph_capture_sizes is None: - self.capture_sizes = sizes_to_specialize - else: - self.capture_sizes = self.cudagraph_capture_sizes - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - sizes_to_specialize, self.cudagraph_capture_sizes) - if self.inductor_specialize_for_cudagraph_no_more_than is not None: - assert self.inductor_compile_sizes is None, ( - "inductor_compile_sizes should be None when " - "inductor_specialize_for_cudagraph_no_more_than is not None") - self.compile_sizes = [ - x for x in self.capture_sizes - if x <= self.inductor_specialize_for_cudagraph_no_more_than - ] - else: - assert self.inductor_compile_sizes is not None, ( - "inductor_compile_sizes should not be None when " - "inductor_specialize_for_cudagraph_no_more_than is None") - self.compile_sizes = self.inductor_compile_sizes - - @staticmethod - def select_and_init_config() -> "CompilationConfig": - """The order of selecting config is: - 1. Use the config specified in environment variable. - 2. Use the config specified in plugins. - 3. Use the default config. - """ - config_path = envs.VLLM_TORCH_COMPILE_CONFIG - if config_path is not None: - with open(config_path) as json_file: - config = CompilationConfig.model_validate_json( - json_file.read()) - else: - from vllm.plugins import get_compilation_config - predefined_config = get_compilation_config() - config = predefined_config if predefined_config is not None else ( - CompilationConfig()) - - config.init_during_runtime() - return config diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index ca1e96a33c014..4b78491bc5a48 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -3,10 +3,8 @@ import torch -import vllm.envs as envs -from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import VllmConfig +from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo @@ -126,12 +124,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. - self.do_not_compile = envs.VLLM_TORCH_COMPILE_LEVEL in [ + self.do_not_compile = \ + vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS ] or not supports_dynamo() if self.do_not_compile: return - TorchCompileWrapperWithCustomDispatcher.__init__(self) + TorchCompileWrapperWithCustomDispatcher.__init__( + self, compilation_level=vllm_config.compilation_config.level) cls.__init__ = __init__ # type: ignore diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index eb43604b1399b..e6a3afef85e1b 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -6,8 +6,8 @@ from torch._inductor.pattern_matcher import (Match, PatternMatcherPass, fwd_only, register_replacement) -from vllm.compilation.config import CompilationConfig from vllm.compilation.inductor_pass import InductorPass +from vllm.config import CompilationConfig from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index b23351fa19759..8082a08b40019 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -2,7 +2,7 @@ import torch -from vllm.compilation.config import CompilationConfig +from vllm.config import CompilationConfig # yapf: disable from vllm.distributed import get_tensor_model_parallel_rank as get_tp_rank from vllm.distributed import ( diff --git a/vllm/compilation/levels.py b/vllm/compilation/levels.py deleted file mode 100644 index 19a3a2b526870..0000000000000 --- a/vllm/compilation/levels.py +++ /dev/null @@ -1,8 +0,0 @@ -# constants for the levels of the compilation process - - -class CompilationLevel: - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 7366ed4d16b0b..2a1aecc11ce26 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -8,8 +8,7 @@ import torch import vllm.envs as envs - -from .levels import CompilationLevel +from vllm.config import CompilationLevel class TorchCompileWrapperWithCustomDispatcher: @@ -25,7 +24,9 @@ class TorchCompileWrapperWithCustomDispatcher: `torch.compile` over the forward method. """ - def __init__(self, compiled_callable: Optional[Callable] = None): + def __init__(self, + compiled_callable: Optional[Callable] = None, + compilation_level: int = 0): if compiled_callable is None: # default compilation settings @@ -38,7 +39,7 @@ def __init__(self, compiled_callable: Optional[Callable] = None): backend = get_torch_compile_backend() if backend is None: from vllm.compilation.backends import select_default_backend - backend = select_default_backend(envs.VLLM_TORCH_COMPILE_LEVEL) + backend = select_default_backend(compilation_level) compiled_callable = torch.compile( self.forward, @@ -54,7 +55,7 @@ def __init__(self, compiled_callable: Optional[Callable] = None): # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = \ - envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.DYNAMO_ONCE + compilation_level >= CompilationLevel.DYNAMO_ONCE def __call__(self, *args, **kwargs): """Implement the dispatch logic here, beyond the torch.compile level. diff --git a/vllm/config.py b/vllm/config.py index 64b2f75e092de..7e37edbe594b1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,10 +3,12 @@ import json import warnings from dataclasses import dataclass, field, replace +from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, Literal, Mapping, Optional, Set, Tuple, Type, Union) import torch +from pydantic import BaseModel, Field, PrivateAttr from transformers import PretrainedConfig import vllm.envs as envs @@ -2052,6 +2054,185 @@ def __post_init__(self): f"installed. Original error:\n{otel_import_error_traceback}") +class CompilationLevel: + # constants for the levels of the compilation process + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + PIECEWISE = 3 + + +class CompilationConfig(BaseModel): + """ + Configuration for compilation. + It has three parts: + - Top-level Compilation control: + - level: the level of compilation. + - 0: no compilation. + - 1: dynamo as is. + - 2: dynamo once. + - 3: piecewise compilation. + - custom_ops: fine-grained control over which custom ops to enable/disable. + Use 'all' to enable all, 'none' to disable all. + Also specify a list of custom op names to enable (prefixed with a '+'), + or disable (prefixed with a '-'). + Examples: + - 'all,-op1' to enable all except op1 + - 'none,+op1,+op2' to enable only op1 and op2 + By default, all custom ops are enabled when running without Inductor + and disabled when running with Inductor (compile_level >= Inductor). + - CudaGraph capture: + - use_cudagraph: whether to use cudagraph inside compilation. + - False: cudagraph inside compilation is not used. + - True: cudagraph inside compilation is used. It requires + that all input buffers have fixed addresses. + Note that this is orthogonal to the cudagraph capture out + side of compilation. + TODO: move outside cudagraph logic into compilation. + torch.compile will handle cudagraph capture logic in the future. + - cudagraph_capture_sizes: sizes to capture cudagraph. + - None: capture sizes are inferred from compilation context. + - List[int]: capture sizes are specified. + - cudagraph_num_of_warmups: number of warmup runs for cudagraph. + It means the first several runs will be treated as warmup runs. + Only after that, the execution will be recorded, and the recorded + cudagraph will be used for subsequent runs. + - cudagraph_copy_inputs: whether to copy input tensors for + cudagraph. If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True, and the compiler will copy the input to an + internally managed buffer. Default is False. + - Inductor compilation: + - use_inductor: whether to use inductor compilation. + - False: inductor compilation is not used. graph runs in eager. + - True: inductor compilation is used. one graph for symbolic shape + is compiled. In addition, compile for different sizes specified + in inductor_compile_sizes, using configurations + in inductor_compile_config. + - inductor_compile_sizes: sizes to compile for inductor. + - inductor_specialize_for_cudagraph_no_more_than: an optional integer + to specialize inductor for cudagraph sizes no more than the + specified size. It is useful when we want to specialize inductor + with a subset of cudagraph sizes. + - inductor_compile_config: additional configurations for inductor. + - None: use default configurations. + - inductor_passes: additional passes for inductor. It is a dictionary + from pass name to pass function qualified name. We use function + name because the config uses json format. If we pass the config + from Python, functions can also be passed directly via Python object + constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` + - custom inductor passes: + - dump_graph_stages: list of stages for which we want to dump the graph. + Each pass defines its own stages (before, after, maybe in-between). + - dump_graph_dir: directory to dump the graph. Default is . + - enable_fusion: whether to enable the custom fusion pass. + TODO better pass enabling system. + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. + """ # noqa + level: int = 0 + custom_ops: List[str] = Field(default_factory=list) + + use_inductor: bool = True + inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None + inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) + inductor_compile_config: Dict = Field(default_factory=dict) + inductor_passes: Dict[str, str] = Field(default_factory=dict) + + use_cudagraph: bool = False + non_cudagraph_ops: List[str] = Field(default_factory=list) + cudagraph_num_of_warmups: int = 0 + cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_copy_inputs: bool = False + + dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_dir: Path = Field(default=Path(".")) + enable_fusion: bool = True + + # not configurable, computed after init + compile_sizes: List[int] = PrivateAttr + capture_sizes: List[int] = PrivateAttr + + def model_post_init(self, __context: Any) -> None: + self.level = envs.VLLM_TORCH_COMPILE_LEVEL + + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be a function or a qualified name") + self.inductor_compile_config[k] = v + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func + + def init_during_runtime(self): + """To complete the initialization of config, + we need to know the compile context, which is only available + during the first run of the model. + """ + from vllm.compilation.compile_context import get_compile_context + context = get_compile_context() + context = copy.deepcopy(context) if context is not None else [] + sizes_to_specialize: List[int] = context + if self.cudagraph_capture_sizes is None: + self.capture_sizes = sizes_to_specialize + else: + self.capture_sizes = self.cudagraph_capture_sizes + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + sizes_to_specialize, self.cudagraph_capture_sizes) + if self.inductor_specialize_for_cudagraph_no_more_than is not None: + assert self.inductor_compile_sizes is None, ( + "inductor_compile_sizes should be None when " + "inductor_specialize_for_cudagraph_no_more_than is not None") + self.compile_sizes = [ + x for x in self.capture_sizes + if x <= self.inductor_specialize_for_cudagraph_no_more_than + ] + else: + assert self.inductor_compile_sizes is not None, ( + "inductor_compile_sizes should not be None when " + "inductor_specialize_for_cudagraph_no_more_than is None") + self.compile_sizes = self.inductor_compile_sizes + + @staticmethod + def select_and_init_config() -> "CompilationConfig": + """The order of selecting config is: + 1. Use the config specified in environment variable. + 2. Use the config specified in plugins. + 3. Use the default config. + """ + config_path = envs.VLLM_TORCH_COMPILE_CONFIG + if config_path is not None: + with open(config_path) as json_file: + config = CompilationConfig.model_validate_json( + json_file.read()) + else: + from vllm.plugins import get_compilation_config + predefined_config = get_compilation_config() + config = predefined_config if predefined_config is not None else ( + CompilationConfig()) + + return config + + @dataclass class VllmConfig: """Dataclass which contains all vllm-related configuration. This @@ -2073,6 +2254,8 @@ class VllmConfig: observability_config: Optional[ObservabilityConfig] = None prompt_adapter_config: Optional[PromptAdapterConfig] = None quant_config: Optional[QuantizationConfig] = None + compilation_config: CompilationConfig = field(default=None, + init=True) # type: ignore @staticmethod def _get_quantization_config( @@ -2133,6 +2316,12 @@ def __post_init__(self): self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + if self.compilation_config is None: + self.compilation_config = CompilationConfig.select_and_init_config( + ) + + current_platform.check_and_update_config(self) + def __str__(self): return ("model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " diff --git a/vllm/envs.py b/vllm/envs.py index f320e35971f94..716e835a555f1 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -69,7 +69,6 @@ VLLM_SKIP_P2P_CHECK: bool = False VLLM_TORCH_COMPILE_LEVEL: int = 0 VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None - VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ENABLE_V1_MULTIPROCESSING: bool = False @@ -217,18 +216,6 @@ def get_default_config_root(): "VLLM_TORCH_COMPILE_CONFIG": lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None), - # Fine-grained control over which custom ops to enable/disable. - # Use 'all' to enable all, 'none' to disable all. - # Also specify a list of custom op names to enable (prefixed with a '+'), - # or disable (prefixed with a '-'). - # Examples: - # - 'all,-op1' to enable all except op1 - # - 'none,+op1,+op2' to enable only op1 and op2 - # By default, all custom ops are enabled when running without Inductor - # and disabled when running with Inductor (compile_level >= Inductor). - "VLLM_CUSTOM_OPS": - lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","), - # local rank of the process in the distributed setting, used to determine # the GPU device id "LOCAL_RANK": diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 24d75f4df4e02..6ae7d7cf6964f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,12 +1,10 @@ -from functools import lru_cache from typing import Dict, Type import torch.nn as nn -import vllm.envs as envs -from vllm.compilation.levels import CompilationLevel from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.plugins import get_current_vllm_config from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -87,6 +85,8 @@ def dispatch_forward(self): @classmethod def enabled(cls) -> bool: # if no name, then it was not registered + compilation_config = get_current_vllm_config().compilation_config + custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): print_warning_once( f"Custom op {cls.__name__} was not registered, " @@ -94,22 +94,25 @@ def enabled(cls) -> bool: f"It will be enabled/disabled based on the global settings.") return CustomOp.default_on() - enabled = f"+{cls.name}" in envs.VLLM_CUSTOM_OPS - disabled = f"-{cls.name}" in envs.VLLM_CUSTOM_OPS + enabled = f"+{cls.name}" in custom_ops + disabled = f"-{cls.name}" in custom_ops assert not (enabled and disabled), f"Cannot enable and disable {cls.name}" return (CustomOp.default_on() or enabled) and not disabled - # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE - # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. @staticmethod - @lru_cache def default_on() -> bool: - count_none = envs.VLLM_CUSTOM_OPS.count("none") - count_all = envs.VLLM_CUSTOM_OPS.count("all") - assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" - return envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE and \ + """ + On by default if level < CompilationLevel.PIECEWISE + Specifying 'all' or 'none' in custom_op takes precedence. + """ + from vllm.config import CompilationLevel + compilation_config = get_current_vllm_config().compilation_config + custom_ops = compilation_config.custom_ops + count_none = custom_ops.count("none") + count_all = custom_ops.count("all") + return compilation_config.level < CompilationLevel.PIECEWISE and \ not count_none > 0 or count_all > 0 # Dictionary of all custom ops (classes, indexed by registered name). diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 140b61fe6d56a..0f8b81c3ef40c 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -42,6 +42,7 @@ safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.plugins import set_current_vllm_config from vllm.utils import is_pin_memory_available @@ -97,7 +98,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - return model_class(vllm_config=vllm_config, prefix=prefix) + with set_current_vllm_config(vllm_config): + return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " "input arguments. Possibly you have an old-style model class" " registered from out of tree and it is used for new vLLM version. " @@ -121,7 +123,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - return model_class(**kwargs) + with set_current_vllm_config(vllm_config): + return model_class(**kwargs) class BaseModelLoader(ABC): diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 81d8bdae2383c..970c0d1be617e 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,10 +1,15 @@ import enum import random -from typing import NamedTuple, Optional, Tuple, Union +from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union import numpy as np import torch +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + class PlatformEnum(enum.Enum): CUDA = enum.auto() @@ -129,6 +134,19 @@ def seed_everything(cls, seed: int) -> None: np.random.seed(seed) torch.manual_seed(seed) + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + """ + Check and update the configuration for the current platform. + + It can raise an exception if the configuration is not compatible with + the current platform, or it can update the configuration to make it + compatible with the current platform. + + The config is passed by reference, so it can be modified in place. + """ + pass + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 8d0ce47df4040..c2e22bfc09f22 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,18 +1,16 @@ import os +from typing import TYPE_CHECKING import torch -import vllm.envs as envs -from vllm.compilation.levels import CompilationLevel from vllm.plugins import set_torch_compile_backend from .interface import Platform, PlatformEnum -if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE) - -assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE,\ - "TPU does not support Inductor." +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None set_torch_compile_backend("openxla") @@ -31,3 +29,12 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod def inference_mode(cls): return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + from vllm.config import CompilationLevel + compilation_config = vllm_config.compilation_config + if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: + compilation_config.level = CompilationLevel.DYNAMO_ONCE + assert compilation_config.level < CompilationLevel.PIECEWISE,\ + "TPU does not support Inductor." diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 7b1bbb14c5302..c20b9ec891d5d 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,11 +1,11 @@ import logging +from contextlib import contextmanager from typing import TYPE_CHECKING, Callable, Optional, Union import vllm.envs as envs if TYPE_CHECKING: - from vllm.compilation.config import CompilationConfig - from vllm.config import VllmConfig + from vllm.config import CompilationConfig, VllmConfig else: CompilationConfig = None VllmConfig = None @@ -72,3 +72,29 @@ def set_compilation_config(config: Optional[CompilationConfig]): def get_compilation_config() -> Optional[CompilationConfig]: return _compilation_config + + +_current_vllm_config: Optional[VllmConfig] = None + + +@contextmanager +def set_current_vllm_config(vllm_config: VllmConfig): + """ + Temporarily set the current VLLM config. + Used during model initialization. + We save the current VLLM config in a global variable, + so that all modules can access it, e.g. custom ops + can access the VLLM config to determine how to dispatch. + """ + global _current_vllm_config + old_vllm_config = _current_vllm_config + try: + _current_vllm_config = vllm_config + yield + finally: + _current_vllm_config = old_vllm_config + + +def get_current_vllm_config() -> VllmConfig: + assert _current_vllm_config is not None, "Current VLLM config is not set." + return _current_vllm_config diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index eebd1de96537f..d60f93a44f6dd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,4 +1,3 @@ -import os import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple @@ -8,11 +7,8 @@ import torch.distributed import torch.nn as nn -from vllm import envs from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.config import CompilationConfig -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -99,7 +95,7 @@ def __init__( pin_memory=self.pin_memory, ) - self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL + self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. @@ -517,9 +513,9 @@ def load_model(self) -> None: # CUDA graphs do not work properly with the custom CUDA kernels. # FIXME(woosuk): Disable inductor to reduce the compilation time # and avoid any potential issues with the inductor. - os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( + custom_ops=["none"], use_cudagraph=True, non_cudagraph_ops=["vllm.unified_v1_flash_attention"], use_inductor=True, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 22ee3f9f863e4..fd89f95445565 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -19,8 +19,7 @@ from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig +from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_pp_group from vllm.distributed.parallel_state import graph_capture @@ -1142,8 +1141,8 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. " "This may lead to less accurate results!") - if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \ - and supports_dynamo(): + if self.vllm_config.compilation_config.level ==\ + CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): from vllm.plugins import get_torch_compile_backend backend = get_torch_compile_backend() or "eager" self.model = torch.compile( diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a721186137328..d7a641857a613 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -140,7 +140,7 @@ def load_model(self) -> None: model = get_model(vllm_config=self.vllm_config) model = model.eval() xm.wait_device_ops() - self.model = ModelWrapper(model) + self.model = ModelWrapper(model, self.vllm_config) def _dummy_run( self, @@ -669,13 +669,15 @@ def execute_model( class ModelWrapper(TorchCompileWrapperWithCustomDispatcher): - def __init__(self, model: nn.Module): + def __init__(self, model: nn.Module, vllm_config: VllmConfig): self.model = model compiled_callable = torch.compile(self.forward, backend="openxla", fullgraph=True, dynamic=False) - super().__init__(compiled_callable) + super().__init__( + compiled_callable, + compilation_level=vllm_config.compilation_config.level) def __call__(self, *args, is_prompt: bool, **kwargs): if len(self.compiled_codes) < 3 or not self.use_custom_dispatcher: From 643ecf7b11a3e74c838f438cfc1b3e59c018853b Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:18:46 -0800 Subject: [PATCH 023/255] [V1] Refactor model executable interface for all text-only language models (#10374) Signed-off-by: Roger Wang --- vllm/model_executor/models/arctic.py | 16 ++++++++++++++-- vllm/model_executor/models/baichuan.py | 16 ++++++++++++++-- vllm/model_executor/models/bloom.py | 17 ++++++++++++++--- vllm/model_executor/models/commandr.py | 16 ++++++++++++++-- vllm/model_executor/models/dbrx.py | 16 ++++++++++++++-- vllm/model_executor/models/deepseek.py | 16 ++++++++++++++-- vllm/model_executor/models/deepseek_v2.py | 16 ++++++++++++++-- vllm/model_executor/models/eagle.py | 13 ++++++++++--- vllm/model_executor/models/exaone.py | 7 ++++++- vllm/model_executor/models/falcon.py | 16 ++++++++++++++-- vllm/model_executor/models/gemma.py | 7 ++++++- vllm/model_executor/models/gemma2.py | 12 ++++++++++-- vllm/model_executor/models/gpt2.py | 7 +++++-- vllm/model_executor/models/gpt_bigcode.py | 17 +++++++++++++---- vllm/model_executor/models/gpt_j.py | 16 ++++++++++++++-- vllm/model_executor/models/gpt_neox.py | 16 ++++++++++++++-- vllm/model_executor/models/granite.py | 7 ++++++- vllm/model_executor/models/granitemoe.py | 16 ++++++++++++++-- vllm/model_executor/models/internlm2.py | 9 +++++++-- vllm/model_executor/models/jais.py | 14 ++++++++++++-- vllm/model_executor/models/jamba.py | 16 ++++++++++++++-- vllm/model_executor/models/mamba.py | 15 +++++++++++++-- vllm/model_executor/models/minicpm.py | 7 ++++++- vllm/model_executor/models/mixtral.py | 16 ++++++++++++++-- vllm/model_executor/models/mixtral_quant.py | 16 ++++++++++++++-- vllm/model_executor/models/mpt.py | 16 ++++++++++++++-- vllm/model_executor/models/nemotron.py | 7 ++++++- vllm/model_executor/models/olmo.py | 19 +++++++++++++------ vllm/model_executor/models/olmoe.py | 16 ++++++++++++++-- vllm/model_executor/models/orion.py | 16 ++++++++++++++-- vllm/model_executor/models/persimmon.py | 8 +++++++- vllm/model_executor/models/phi.py | 16 ++++++++++++++-- vllm/model_executor/models/phi3_small.py | 19 +++++++++++-------- vllm/model_executor/models/phimoe.py | 16 ++++++++++++++-- vllm/model_executor/models/qwen.py | 16 ++++++++++++++-- vllm/model_executor/models/qwen2.py | 2 +- vllm/model_executor/models/qwen2_cls.py | 7 ++++++- vllm/model_executor/models/qwen2_moe.py | 16 ++++++++++++++-- vllm/model_executor/models/qwen2_rm.py | 7 ++++++- vllm/model_executor/models/solar.py | 4 +++- vllm/model_executor/models/stablelm.py | 16 ++++++++++++++-- vllm/model_executor/models/starcoder2.py | 16 ++++++++++++++-- vllm/model_executor/models/xverse.py | 16 ++++++++++++++-- 43 files changed, 483 insertions(+), 90 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 9ee2a2cc09a24..d52418ee0f6f1 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -389,6 +389,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -396,9 +399,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -439,6 +446,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -446,9 +456,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index aabbd31192a40..01ce7c42cd391 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -284,6 +284,9 @@ def __init__( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -291,9 +294,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -363,6 +370,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -370,9 +380,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 84adf574af5e2..cf2eee8172769 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -251,6 +251,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.word_embeddings_layernorm(self.word_embeddings(input_ids)) + def forward( self, input_ids: torch.Tensor, @@ -258,10 +261,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.word_embeddings(input_ids) - hidden_states = self.word_embeddings_layernorm(hidden_states) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -301,6 +307,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -308,9 +317,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index cd5c1d6844716..fbb09a64cde9b 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -280,6 +280,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -287,9 +290,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -354,6 +361,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + @torch.no_grad() def forward( self, @@ -362,9 +372,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index fff8710f6b475..3952ff31e5cec 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -321,6 +321,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.d_model)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -328,9 +331,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.wte(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors hidden_states = intermediate_tensors["hidden_states"] @@ -376,6 +383,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -383,9 +393,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index a9bf1440c4d60..36dfea5a65656 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -353,6 +353,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -360,9 +363,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: hidden_states = intermediate_tensors["hidden_states"] @@ -401,6 +408,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -408,9 +418,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 4fb1eed15a2e7..1e32fe60c7a5b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -445,6 +445,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -452,9 +455,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -495,6 +502,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -502,9 +512,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 85c51e8404584..f138d13630263 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -78,6 +78,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def sampler(self): return self.model.sampler + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -86,11 +89,14 @@ def forward( attn_metadata: AttentionMetadata, previous_hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - tok_embeds = self.model.model.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) + inputs_embeds = self.fc( - torch.cat([tok_embeds, previous_hidden_states], dim=-1)) + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) inputs_embeds[positions == 0] = 0 # masking inputs at position=0 @@ -100,7 +106,8 @@ def forward( positions=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors) + intermediate_tensors=intermediate_tensors, + ) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index cd3e7da657e0e..52dd603ca558d 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -479,6 +479,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -486,9 +489,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index b3dbf063ac298..e97abe949ccdb 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -367,6 +367,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.word_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -374,9 +377,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.word_embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] for i in range(self.start_layer, self.end_layer): @@ -432,6 +439,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.LongTensor, @@ -439,9 +449,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 55baba809e58f..ace13664c6ea6 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -390,6 +390,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -397,9 +400,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index eeb3fd98a7eac..a60b4e73a76d4 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -272,6 +272,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: Optional[torch.Tensor], @@ -285,7 +288,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.get_input_embeddings(input_ids) hidden_states *= self.normalizer residual = None else: @@ -414,6 +417,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -421,9 +427,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index cc85693f99526..fa0fdad28d161 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -209,6 +209,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -220,7 +223,7 @@ def forward( ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) + inputs_embeds = self.get_input_embeddings(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -262,7 +265,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer.make_empty_intermediate_tensors) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.transformer.wte(input_ids) + return self.transformer.get_input_embeddings(input_ids) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ab25c66c3a887..b2fc79d0d36dc 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -218,6 +218,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -225,11 +228,12 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) + hidden_states = inputs_embeds + self.wpe(position_ids) else: hidden_states = intermediate_tensors["hidden_states"] @@ -285,6 +289,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -292,9 +299,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index a83d03480dde1..cec3fd12a67d6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -201,6 +201,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -208,9 +211,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.wte(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] for i in range(self.start_layer, self.end_layer): @@ -250,6 +257,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -257,9 +267,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 794b141bfa4aa..11f286d6bcba0 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -214,6 +214,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_in(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -221,9 +224,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_in(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] for i in range(self.start_layer, self.end_layer): @@ -262,6 +269,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.gpt_neox.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.gpt_neox.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -269,9 +279,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.gpt_neox(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index d1e6e31f2b8d1..cb2583e69d88d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -409,6 +409,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -416,9 +419,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 2ed115c56af45..f437dd521a7d5 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -277,6 +277,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -284,9 +287,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) hidden_states *= self.embedding_multiplier residual = None else: @@ -366,6 +373,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -373,9 +383,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 21fa6983063b8..19bfe16e4d5fc 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -290,7 +290,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.tok_embeddings(input_ids) + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -335,6 +335,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -342,9 +345,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 65800c44e5a93..ee49ffb3cd87f 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -250,6 +250,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -257,9 +260,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[IntermediateTensors, torch.Tensor]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) if self.wpe is not None: position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds @@ -311,6 +316,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -318,9 +326,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[IntermediateTensors, torch.Tensor]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 88fb8d5cf555a..5612dd6886385 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -292,6 +292,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -299,8 +302,12 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] @@ -381,12 +388,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[KVCache], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: max_batch_size = (_get_graph_batch_size( @@ -409,7 +420,8 @@ def forward(self, mamba_cache_tensors[1], state_indices_tensor) hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, mamba_cache_params) + attn_metadata, mamba_cache_params, + inputs_embeds) return hidden_states def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 55c575e22a0f6..ac0d265a961f0 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -106,15 +106,22 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None for i in range(len(self.layers)): @@ -168,12 +175,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.backbone.get_input_embeddings(input_ids) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[KVCache], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: max_batch_size = (_get_graph_batch_size( @@ -194,7 +205,7 @@ def forward(self, state_indices_tensor) hidden_states = self.backbone(input_ids, positions, attn_metadata, - mamba_cache_params) + mamba_cache_params, inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 2db953329fd91..6b67266c53362 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -504,6 +504,9 @@ def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = MiniCPMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -511,9 +514,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 3eb2f60fd4fc7..eebf5bab5a288 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -281,6 +281,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -288,9 +291,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -363,6 +370,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -370,9 +380,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 95cfb6f54dc10..af2e9586988df 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -318,6 +318,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -325,9 +328,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -368,6 +375,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -375,9 +385,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index e15c0fe8db060..3c74ef2448abb 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -237,6 +237,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.d_model)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -244,9 +247,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.wte(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -283,6 +290,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -290,9 +300,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index e09d7088a69ce..eb45beae7d21a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -440,6 +440,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -447,9 +450,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 3467ae5896494..98d4e1ec320a4 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -248,6 +248,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -255,17 +258,16 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: """ :param input_ids: A tensor of shape `(batch_size, seq_len)`. """ if get_pp_group().is_first_rank: - # Get embeddings of input. - # shape: (batch_size, seq_len, d_model) - inputs_embeds = self.embed_tokens(input_ids) - - # embed positions - hidden_states = inputs_embeds + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -315,6 +317,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -322,6 +327,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model( input_ids=input_ids, @@ -329,6 +335,7 @@ def forward( kv_caches=kv_caches, attn_metadata=attn_metadata, intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, ) return hidden_states diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 3d31919edd862..f4eebab8c98dd 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -269,6 +269,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -276,9 +279,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -326,6 +333,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -333,9 +343,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 38821c8288347..39d659c49cbcf 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -237,6 +237,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): "hidden_states", ], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -244,9 +247,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -286,6 +293,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -293,9 +303,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 2e34a7cc30873..62c509153a111 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -235,6 +235,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -248,7 +251,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -282,6 +285,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 262f6996fc374..a2ab0d74c48db 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -218,6 +218,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -225,9 +228,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -303,6 +310,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -310,9 +320,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 8a5fb6d303e60..2139cec441807 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -324,11 +324,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) def forward( self, @@ -337,9 +334,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) if (self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0): hidden_states = hidden_states * self.mup_embedding_multiplier @@ -397,8 +398,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.dummy_token_indices = None - def get_input_embeddings(self): - return self.model.embed_tokens + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) def set_input_embeddings(self, value): self.model.embed_tokens = value @@ -433,6 +434,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: output_hidden_states = self.model( input_ids=input_ids, @@ -440,6 +442,7 @@ def forward( kv_caches=kv_caches, attn_metadata=attn_metadata, intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, ) output_hidden_states = output_hidden_states return output_hidden_states diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 6d71a8949111b..b7e70f8fa2c6d 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -465,6 +465,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -472,9 +475,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -560,6 +567,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -567,9 +577,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 3d26ede722dd1..447632cefcd9a 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -578,6 +578,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config) if hasattr( config, "visual") else None + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -586,6 +589,7 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], pixel_values: Optional[QwenImageInputs], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: img_pos = None # If pixel / visual embeddings are provided, this is a visual model @@ -606,6 +610,10 @@ def forward( ) if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) hidden_states = self.wte(input_ids) # Merge the image embeddings into the hidden states if actually have # visual features and the corresponding image tokens @@ -915,6 +923,9 @@ def _get_image_input_type( ) return None + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -922,7 +933,8 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - pixel_values: Optional[torch.Tensor] = None + pixel_values: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: input_ids = None @@ -932,7 +944,7 @@ def forward( hidden_states = self.transformer(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, - pixel_values) + pixel_values, inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 431e397e1e10d..8f10df808c216 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -309,7 +309,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 120403e948686..07eb330620a43 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -72,6 +72,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=True) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -79,9 +82,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) logits, _ = self.score(hidden_states) return logits diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 51c0cd5664fd2..249d94b5d95e9 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -344,6 +344,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -351,9 +354,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -395,6 +402,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -402,9 +412,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 55843d8325348..6db467af334f5 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -85,6 +85,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -92,9 +95,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) logits, _ = self.score(hidden_states) return logits diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 4f03ca501fb68..affb2c975ce4a 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -456,9 +456,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 1125f9e9f9617..99acce596602e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -218,6 +218,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -225,9 +228,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -265,6 +272,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -272,9 +282,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index ce7a7957f52c4..0ef940acebb93 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -221,6 +221,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -228,9 +231,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -273,6 +280,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -280,9 +290,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 153527da20d75..51172d8782a70 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -252,6 +252,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -259,9 +262,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: hidden_states = intermediate_tensors["hidden_states"] @@ -335,6 +342,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -342,9 +352,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( From 905d0f0af4e2c07893e36778da9ab02bde01ace8 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Sun, 17 Nov 2024 00:58:22 -0600 Subject: [PATCH 024/255] [CI/Build] Fix IDC hpu [Device not found] issue (#10384) Signed-off-by: Chendi Xue --- .buildkite/run-hpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index 4505dc7a9373c..fa4f74fca7a11 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file From cf349c4a97adb36354bdc2b14448ea55279d1575 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 17 Nov 2024 15:12:04 +0800 Subject: [PATCH 025/255] [Bugfix][CPU] Fix CPU embedding runner with tensor parallel (#10394) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/worker/cpu_embedding_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index 7053075bf4d8f..d0b8fec48d74f 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -66,6 +66,10 @@ def execute_model( hidden_states = model_executable(**execute_model_kwargs) + # Only perform pooling in the driver worker. + if not self.is_driver_worker: + return [] + return [ self.model.pooler(hidden_states=hidden_states, pooling_metadata=model_input.pooling_metadata) From 8d74b5aee9e780852de870c936b59707835e84f5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Nov 2024 23:14:23 -0800 Subject: [PATCH 026/255] [platforms] refactor cpu code (#10402) Signed-off-by: youkaichao --- vllm/executor/cpu_executor.py | 68 +---------------------------------- vllm/platforms/cpu.py | 60 +++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 4ceb5a837dd7f..1542a2ae367eb 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -2,9 +2,6 @@ from functools import partial from typing import Any, Awaitable, List, Optional, Set, Tuple, Union -import vllm.envs as envs -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) @@ -13,7 +10,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port, +from vllm.utils import (get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async) from vllm.worker.worker_base import WorkerWrapperBase @@ -57,13 +54,6 @@ def _init_executor(self) -> None: os.environ["LOCAL_WORLD_SIZE"] = str( self.parallel_config.tensor_parallel_size) - self.model_config = _verify_and_get_model_config(self.model_config) - self.cache_config = _verify_and_get_cache_config(self.cache_config) - self.scheduler_config = _verify_and_get_scheduler_config( - self.scheduler_config) - self.parallel_config = _verify_and_get_parallel_config( - self.parallel_config) - # Multiprocessing-based executor does not support multi-node setting. # Since it only works for single node, we can use the loopback address # 127.0.0.1 for communication. @@ -313,62 +303,6 @@ async def check_health_async(self) -> None: self.check_health() -def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if not config.enforce_eager: - logger.warning( - "CUDA graph is not supported on CPU, fallback to the eager " - "mode.") - config.enforce_eager = True - return config - - -def _verify_and_get_scheduler_config( - config: SchedulerConfig) -> SchedulerConfig: - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if config.chunked_prefill_enabled: - logger.warning("Chunked prefill is not supported on CPU, disable it.") - config.chunked_prefill_enabled = False - - return config - - -def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if config.enable_prefix_caching: - logger.warning("Prefix caching is not supported on CPU, disable it.") - config.enable_prefix_caching = False - - kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE - - if kv_cache_space >= 0: - if kv_cache_space == 0: - config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore - logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) " - "for CPU backend is not set, using 4 by default.") - else: - config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore - else: - raise RuntimeError( - "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" - f" {kv_cache_space}, expect a positive integer value.") - - return config - - -def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig: - if (config.distributed_executor_backend is not None - and config.distributed_executor_backend != "mp"): - logger.warning( - "%s is not supported on CPU, fallback to mp distributed executor " - "backend.", config.distributed_executor_backend) - config.distributed_executor_backend = "mp" - return config - - def _driver_method_invoker(driver, method: str, *args, **kwargs): return getattr(driver, method)(*args, **kwargs) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 5243f59203afc..42bee31dfb0e9 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,8 +1,19 @@ +from typing import TYPE_CHECKING + import psutil import torch +from vllm.logger import init_logger + from .interface import Platform, PlatformEnum +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + class CpuPlatform(Platform): _enum = PlatformEnum.CPU @@ -18,3 +29,52 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod def inference_mode(cls): return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + import vllm.envs as envs + from vllm.utils import GiB_bytes + model_config = vllm_config.model_config + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on CPU, fallback to the eager " + "mode.") + model_config.enforce_eager = True + + cache_config = vllm_config.cache_config + + if cache_config.enable_prefix_caching: + logger.warning( + "Prefix caching is not supported on CPU, disable it.") + cache_config.enable_prefix_caching = False + + kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE + + if kv_cache_space >= 0: + if kv_cache_space == 0: + cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore + logger.warning( + "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) " + "for CPU backend is not set, using 4 by default.") + else: + cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa + else: + raise RuntimeError( + "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" + f" {kv_cache_space}, expect a positive integer value.") + + scheduler_config = vllm_config.scheduler_config + if scheduler_config.chunked_prefill_enabled: + logger.warning( + "Chunked prefill is not supported on CPU, disable it.") + scheduler_config.chunked_prefill_enabled = False + + parallel_config = vllm_config.parallel_config + if (parallel_config.distributed_executor_backend is not None + and parallel_config.distributed_executor_backend != "mp"): + logger.warning(("%s is not supported on CPU, fallback to mp " + "distributed executor backend."), + parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend = "mp" From 76aab90ab68476c353ad58019fd51fd18622056a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sun, 17 Nov 2024 16:44:44 +0800 Subject: [PATCH 027/255] [Hardware] [HPU]add `mark_step` for hpu (#10239) Signed-off-by: Kunshang Ji --- vllm/worker/hpu_model_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 1ff30d685c6b1..99cf9a7e67256 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -272,6 +272,19 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): return indices, offsets +def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"): + if module.__class__.__name__.endswith(suffix): + + def forward_hook(module, args, output): + htorch.core.mark_step() + return output + + module.register_forward_hook(forward_hook) + + for child_name, child_module in module.named_children(): + modify_decoder_layer(child_module) + + class HpuModelAdapter: def __init__(self, model, block_size, dtype, enforce_eager): @@ -636,6 +649,7 @@ def load_model(self) -> None: else: self.model = self.model.to("hpu") htcore.mark_step() + modify_decoder_layer(self.model) torch.hpu.synchronize() with HabanaMemoryProfiler() as m_wrap: From 80d85c5d7bc33ce0ae210ebad3c45e4361b57640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Sun, 17 Nov 2024 16:50:24 +0800 Subject: [PATCH 028/255] [Bugfix] Fix mrope_position_delta in non-last prefill chunk (#10403) Signed-off-by: imkero --- vllm/model_executor/layers/rotary_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b01e4c61fe101..117fe086e5e87 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -922,9 +922,9 @@ def get_input_positions( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:seq_len] mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] return llm_positions.tolist(), mrope_position_delta From d1557e66d3227355e5aed8018a945a5e6a733147 Mon Sep 17 00:00:00 2001 From: wchen61 Date: Sun, 17 Nov 2024 19:32:40 +0800 Subject: [PATCH 029/255] =?UTF-8?q?[Misc]=20Enhance=20offline=5Finference?= =?UTF-8?q?=20to=20support=20user-configurable=20paramet=E2=80=A6=20(#1039?= =?UTF-8?q?2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: wchen61 --- examples/offline_inference.py | 98 ++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 20 deletions(-) diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 9b758fa2479f6..391ac6b9b6b03 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -1,22 +1,80 @@ +from dataclasses import asdict + from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def get_prompts(num_prompts: int): + # The default sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + if num_prompts != len(prompts): + prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts] + + return prompts + + +def main(args): + # Create prompts + prompts = get_prompts(args.num_prompts) + + # Create a sampling params object. + sampling_params = SamplingParams(n=args.n, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + max_tokens=args.max_tokens) + + # Create an LLM. + # The default model is 'facebook/opt-125m' + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**asdict(engine_args)) + + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == '__main__': + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + group = parser.add_argument_group("SamplingParams options") + group.add_argument("--num-prompts", + type=int, + default=4, + help="Number of prompts used for inference") + group.add_argument("--max-tokens", + type=int, + default=16, + help="Generated output length for sampling") + group.add_argument('--n', + type=int, + default=1, + help='Number of generated sequences per prompt') + group.add_argument('--temperature', + type=float, + default=0.8, + help='Temperature for text generation') + group.add_argument('--top-p', + type=float, + default=0.95, + help='top_p for text generation') + group.add_argument('--top-k', + type=int, + default=-1, + help='top_k for text generation') -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - -# Create an LLM. -llm = LLM(model="facebook/opt-125m") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + args = parser.parse_args() + main(args) From c4e464333eac5a46e1cc2701e095a44057c82927 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 18 Nov 2024 09:07:46 +0800 Subject: [PATCH 030/255] [Misc] Add uninitialized params tracking for `AutoWeightsLoader` (#10327) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 12 +++++++++++- vllm/model_executor/models/arctic.py | 8 ++++++-- vllm/model_executor/models/baichuan.py | 8 ++++++-- vllm/model_executor/models/bert.py | 8 ++++++-- vllm/model_executor/models/blip.py | 12 ++++++++---- vllm/model_executor/models/blip2.py | 7 ++++--- vllm/model_executor/models/bloom.py | 8 ++++++-- vllm/model_executor/models/chameleon.py | 8 ++++++-- vllm/model_executor/models/chatglm.py | 10 ++++++++-- vllm/model_executor/models/clip.py | 11 ++++++++--- vllm/model_executor/models/commandr.py | 4 +++- vllm/model_executor/models/dbrx.py | 8 ++++++-- vllm/model_executor/models/decilm.py | 8 ++++++-- vllm/model_executor/models/deepseek.py | 8 ++++++-- vllm/model_executor/models/deepseek_v2.py | 8 ++++++-- vllm/model_executor/models/exaone.py | 9 +++++++-- vllm/model_executor/models/falcon.py | 8 ++++++-- vllm/model_executor/models/florence2.py | 17 +++++++++++------ vllm/model_executor/models/fuyu.py | 8 +++++--- vllm/model_executor/models/gemma.py | 4 +++- vllm/model_executor/models/gemma2.py | 9 ++++++--- vllm/model_executor/models/gpt2.py | 8 ++++++-- vllm/model_executor/models/gpt_bigcode.py | 8 ++++++-- vllm/model_executor/models/gpt_j.py | 8 ++++++-- vllm/model_executor/models/gpt_neox.py | 8 ++++++-- vllm/model_executor/models/granite.py | 9 +++++++-- vllm/model_executor/models/granitemoe.py | 8 +++++--- .../models/idefics2_vision_model.py | 11 ++++++++--- vllm/model_executor/models/idefics3.py | 7 ++++--- vllm/model_executor/models/intern_vit.py | 8 ++++++-- vllm/model_executor/models/internlm2.py | 8 ++++++-- vllm/model_executor/models/internvl.py | 7 ++++--- vllm/model_executor/models/jais.py | 8 ++++++-- vllm/model_executor/models/jamba.py | 8 ++++++-- vllm/model_executor/models/llama.py | 15 ++++++++++----- vllm/model_executor/models/llava.py | 7 ++++--- vllm/model_executor/models/llava_next.py | 7 ++++--- vllm/model_executor/models/llava_next_video.py | 7 ++++--- vllm/model_executor/models/llava_onevision.py | 7 ++++--- vllm/model_executor/models/mamba.py | 8 ++++++-- vllm/model_executor/models/medusa.py | 9 +++++++-- vllm/model_executor/models/minicpm.py | 8 ++++++-- vllm/model_executor/models/minicpmv.py | 14 +++++++++----- vllm/model_executor/models/mixtral.py | 8 ++++++-- vllm/model_executor/models/mixtral_quant.py | 8 ++++++-- vllm/model_executor/models/mllama.py | 9 ++++++--- vllm/model_executor/models/mlp_speculator.py | 8 ++++++-- vllm/model_executor/models/mpt.py | 8 ++++++-- vllm/model_executor/models/nemotron.py | 8 ++++++-- vllm/model_executor/models/olmo.py | 8 ++++++-- vllm/model_executor/models/olmoe.py | 8 ++++++-- vllm/model_executor/models/opt.py | 8 ++++++-- vllm/model_executor/models/orion.py | 8 ++++++-- vllm/model_executor/models/paligemma.py | 7 ++++--- vllm/model_executor/models/persimmon.py | 8 ++++++-- vllm/model_executor/models/phi.py | 8 ++++++-- vllm/model_executor/models/phi3_small.py | 8 ++++++-- vllm/model_executor/models/phi3v.py | 9 ++++++--- vllm/model_executor/models/phimoe.py | 8 ++++++-- vllm/model_executor/models/pixtral.py | 12 ++++++++---- vllm/model_executor/models/qwen.py | 8 ++++++-- vllm/model_executor/models/qwen2.py | 18 ++++++++++++------ vllm/model_executor/models/qwen2_audio.py | 9 +++++++-- vllm/model_executor/models/qwen2_cls.py | 7 ++++--- vllm/model_executor/models/qwen2_moe.py | 8 ++++++-- vllm/model_executor/models/qwen2_rm.py | 7 ++++--- vllm/model_executor/models/qwen2_vl.py | 8 ++++++-- vllm/model_executor/models/siglip.py | 11 ++++++++--- vllm/model_executor/models/solar.py | 9 +++++++-- vllm/model_executor/models/stablelm.py | 8 ++++++-- vllm/model_executor/models/starcoder2.py | 8 ++++++-- vllm/model_executor/models/ultravox.py | 7 ++++--- vllm/model_executor/models/utils.py | 11 ++++++----- vllm/model_executor/models/xverse.py | 8 ++++++-- 74 files changed, 454 insertions(+), 185 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 0f8b81c3ef40c..d9ce85949e4ee 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -334,7 +334,17 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: with target_device: model = _initialize_model(vllm_config=vllm_config) - model.load_weights(self._get_all_weights(model_config, model)) + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights( + self._get_all_weights(model_config, model)) + # We only enable strict check for non-quantiized models + # that have loaded weights tracking currently. + if model_config.quantization is None and loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError( + "Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index d52418ee0f6f1..e58ad19cab54c 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,5 +1,5 @@ """Inference-only Snowflake Arctic model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -480,7 +480,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -518,6 +519,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. " @@ -573,3 +575,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 01ce7c42cd391..3749a16a38994 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -18,7 +18,7 @@ # limitations under the License. """Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -404,13 +404,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -449,6 +451,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class BaichuanForCausalLM(BaiChuanBaseForCausalLM): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 42dd6119e76f1..d8301a36acb01 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -337,7 +337,8 @@ def forward( return self.encoder(hidden_states, kv_caches, attn_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -346,6 +347,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "pooler" in name: continue @@ -368,6 +370,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class BertEmbeddingModel(nn.Module): diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index e612010677364..6db6462e97f3f 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,6 +1,6 @@ """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -415,7 +415,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -423,6 +424,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: @@ -440,8 +442,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -450,3 +452,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 03dc1d15ab697..7d7639b4a92ce 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -692,6 +692,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index cf2eee8172769..1060d418474ef 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -16,7 +16,7 @@ # limitations under the License. """Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -341,8 +341,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue @@ -371,3 +373,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 7b59c818e0b60..8f91abffaea90 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -1034,7 +1034,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1044,6 +1045,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -1111,3 +1113,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 70e9b607b0642..81e56381eabd8 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -3,7 +3,8 @@ """Inference-only ChatGLM model compatible with THUDM weights.""" from argparse import Namespace from array import array -from typing import Dict, Iterable, List, Mapping, Optional, Tuple, TypedDict +from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict) import torch from PIL import Image @@ -645,7 +646,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: # Merge two ColumnParallelLinear into one MergedColumnParallelLinear merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { "transformer.vision.linear_proj.merged_proj.weight": { @@ -655,6 +657,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): } params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: is_weight_to_be_merge = False for _, merged_weight_dict in merged_weights_dict.items(): @@ -677,6 +680,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) for combined_name, merged_weight_dict in merged_weights_dict.items(): if combined_name in params_dict: @@ -686,3 +690,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, combined_weight) + loaded_params.add(combined_name) + return loaded_params diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 2d81b9266826b..184758f4a8a45 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,6 +1,6 @@ """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np import torch @@ -483,7 +483,8 @@ def device(self): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -491,6 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: @@ -508,8 +510,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue + name = name.replace(weight_name, param_name) - param = params_dict[name.replace(weight_name, param_name)] + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -518,3 +521,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index fbb09a64cde9b..9fd083e5a02a9 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -402,7 +402,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -447,3 +448,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 3952ff31e5cec..eab338800249e 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -417,13 +417,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: expert_params_mapping = [( "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, weight_name in expert_params_mapping: if weight_name not in name: @@ -447,3 +449,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index b38fd9fa49c21..c551853956b92 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only DeciLM model compatible with HuggingFace weights.""" -from typing import Iterable, Tuple +from typing import Iterable, Set, Tuple import torch @@ -57,7 +57,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): delattr(config, "num_key_value_heads_per_layer") super().__init__(vllm_config=vllm_config) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -67,6 +68,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -97,6 +99,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params def _degroup_weight(self, loaded_weight: torch.Tensor) -> torch.Tensor: hidden_size = self.config.hidden_size diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 36dfea5a65656..8c5ad9904e925 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Deepseek model.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -442,7 +442,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -453,6 +454,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -487,3 +489,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 1e32fe60c7a5b..d2c4ca0bf85e9 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only DeepseekV2 model.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -550,7 +550,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -566,6 +567,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -623,3 +625,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 52dd603ca558d..9d739d0479548 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only Exaone model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -513,7 +513,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -523,6 +524,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -543,6 +545,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -576,6 +579,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e97abe949ccdb..2aa4b67d99894 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -18,7 +18,7 @@ """PyTorch Falcon model.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -473,7 +473,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -483,6 +484,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight" and self.tie_word_embeddings: # Falcon uses tied embeddings except Falcon-11b. @@ -519,3 +521,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 971a71180164b..d3a9ff6915b84 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.nn as nn @@ -156,7 +156,8 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -165,12 +166,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -183,6 +185,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class Florence2ForConditionalGeneration(nn.Module): @@ -248,10 +252,11 @@ def sample( ) -> SamplerOutput: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: skip_prefixes = [ 'image_projection', "vision_tower", "image_proj_norm", "image_pos_embed", "visual_temporal_embed" ] loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 31fc098a8bb3f..7b46907ac83ab 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,8 @@ """ PyTorch Fuyu model.""" import math from array import array -from typing import Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict) import torch import torch.nn as nn @@ -354,6 +355,7 @@ def sample( next_tokens = self.language_model.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index ace13664c6ea6..64e03b30bf2f1 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -424,7 +424,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -469,3 +470,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): logger.warning( "Some weights are not initialized from checkpoints: %s", unloaded_params) + return loaded_params diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index a60b4e73a76d4..4ba39223cc07f 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -312,7 +312,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -354,6 +355,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): logger.warning( "Some weights are not initialized from checkpoints: %s", unloaded_params) + return loaded_params class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): @@ -451,13 +453,14 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - loader.load_weights(weights) + return loader.load_weights(weights) class Gemma2EmbeddingModel(nn.Module, SupportsPP): diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fa0fdad28d161..1c61408ae1dd9 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -298,8 +298,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final @@ -328,3 +330,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index b2fc79d0d36dc..50a143cb1b600 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -323,8 +323,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: continue @@ -344,3 +346,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, 'v') else: weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index cec3fd12a67d6..d5defc60764e6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -291,7 +291,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -301,6 +302,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue @@ -330,3 +332,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 11f286d6bcba0..0bb5e2f9b95f9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -303,8 +303,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): @@ -337,3 +339,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index cb2583e69d88d..c1e2e87f08ec3 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only IBM Granite model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -455,7 +455,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -465,6 +466,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -485,6 +487,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -518,6 +521,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index f437dd521a7d5..a91a18816995f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GraniteMoe model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -419,7 +419,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -452,4 +453,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): pass else: new_weights[n] = p - mixtral.MixtralForCausalLM.load_weights(self, new_weights.items()) + return mixtral.MixtralForCausalLM.load_weights(self, + new_weights.items()) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index b21bc2a3f9ce1..16192928beb1f 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -15,7 +15,7 @@ # limitations under the License. """PyTorch Idefics2 model.""" -from typing import Iterable, Optional, Tuple +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -331,7 +331,8 @@ def forward( last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -339,11 +340,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -352,3 +355,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 0cecc754e916f..5d176b2a4e416 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -15,7 +15,7 @@ import math from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple, - Optional, Tuple, TypedDict, Union) + Optional, Set, Tuple, TypedDict, Union) import torch import torch.utils.checkpoint @@ -751,9 +751,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) def get_mm_mapping(self) -> MultiModelKeys: """ diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 9761635d2a6c2..bd91a0806ae5c 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -5,7 +5,7 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from functools import partial -from typing import Iterable, Optional, Tuple +from typing import Iterable, Optional, Set, Tuple import torch import torch.nn as nn @@ -469,10 +469,14 @@ def forward( return encoder_outputs - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 19bfe16e4d5fc..94b819b5d9366 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -369,13 +369,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -402,3 +404,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 92579e3aae949..7ea2f9be2191d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -6,7 +6,7 @@ # -------------------------------------------------------- import re from functools import cached_property, partial -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -663,6 +663,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index ee49ffb3cd87f..41db85b678456 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -19,7 +19,7 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -350,8 +350,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final @@ -382,3 +384,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 5612dd6886385..f83f0fce7275f 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,5 +1,5 @@ """Inference-only Jamba model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -462,7 +462,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -479,6 +480,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -534,6 +536,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params def _is_moe_layer(name: str): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e53631ef19f31..2b40e9ec73fad 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -350,7 +350,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -360,6 +361,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -375,6 +377,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -390,7 +393,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) - break else: # Skip loading extra bias for GPTQ models. @@ -408,6 +410,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should @@ -577,13 +581,14 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - loader.load_weights( + return loader.load_weights( self.maybe_remap_mistral(name, loaded_weight) for name, loaded_weight in weights) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index b13bcfa676811..e7d3161a7cb2d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, +from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) import torch @@ -547,6 +547,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index dd2fa6cac969f..37e2227a52dcd 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -654,6 +654,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 5d5598d07bfde..e2880c76cf43d 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,6 +1,6 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -445,10 +445,11 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now ignore_unexpected_prefixes=["image_newline"], ) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index a5b2108177830..705ca1e4ab6e6 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,6 +1,6 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -887,6 +887,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ac0d265a961f0..405b8f7787ba8 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,5 +1,5 @@ """PyTorch MAMBA model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -243,8 +243,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") @@ -256,3 +258,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index b05360b55466b..b4ed6538bddac 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.nn as nn @@ -156,8 +156,10 @@ def generate_proposals( sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() weights_map = {} @@ -181,9 +183,12 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) if self.token_map is not None: self.token_map.to(device=self.lm_heads[0].weight.device) assert (self.truncated_vocab_size == self.orig_vocab_size) or (self.token_map is not None) + + return loaded_params diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6b67266c53362..b92bff4d7c28c 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -21,7 +21,7 @@ # limitations under the License. """Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -539,7 +539,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -556,6 +557,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -606,3 +608,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index fd8eda997f76f..99bf1d42d0355 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -24,7 +24,7 @@ import re from functools import partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Tuple, TypedDict, Union) + Set, Tuple, TypedDict, Union) import torch import torch.types @@ -602,7 +602,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -612,6 +613,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: @@ -630,10 +632,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if is_pp_missing_parameter( - name.replace(weight_name, param_name), self): + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): continue - param = params_dict[name.replace(weight_name, param_name)] + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -646,6 +648,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params def get_mm_mapping(self) -> MultiModelKeys: """ diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index eebf5bab5a288..0faffb4f1b00c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -404,7 +404,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -421,6 +422,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -478,3 +480,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index af2e9586988df..ddd6afcf6a1b6 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np import torch @@ -409,7 +409,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -418,6 +419,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -448,3 +450,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index db7ee7b2d8537..41f62b37f3bd9 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -13,7 +13,7 @@ # limitations under the License. """PyTorch Mllama model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -1427,7 +1427,8 @@ def forward( return outputs - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1437,7 +1438,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params = set() + updated_params: Set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1457,6 +1458,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + updated_params.add(name) + return updated_params def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 4d7e82880041d..f2aa2653c4f5c 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, List, Tuple +from typing import Iterable, List, Set, Tuple import torch import torch.nn as nn @@ -188,11 +188,15 @@ def generate_proposals( return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: param = params_dict.get(name.replace("speculator.", "")) if param is not None: weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 3c74ef2448abb..8716e92b0f1c2 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,6 +1,6 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -324,8 +324,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: @@ -336,3 +338,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index eb45beae7d21a..ceab299a7950a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Nemotron model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -474,7 +474,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -482,6 +483,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -522,3 +524,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 98d4e1ec320a4..dc138e2e636ad 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -356,7 +356,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -366,6 +367,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -402,3 +404,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index f4eebab8c98dd..ab87695d8e650 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -364,7 +364,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -383,6 +384,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -455,3 +457,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 997fe642439e6..db85a494980a7 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OPT model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -394,7 +394,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -402,6 +403,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name and self.config.tie_word_embeddings: continue @@ -431,3 +433,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 39d659c49cbcf..b01734af8ddd8 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -3,7 +3,7 @@ # Copyright (c) OrionStar Inc. # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -327,7 +327,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -337,6 +338,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -368,3 +370,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index eea229359255e..dd5256eb87ab3 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,4 +1,4 @@ -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -295,6 +295,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 62c509153a111..3b8199f4f1661 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -19,7 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only persimmon model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -324,8 +324,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -358,3 +360,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index a2ab0d74c48db..0a117bf16c9b3 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -34,7 +34,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -345,7 +345,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -353,6 +354,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -383,3 +385,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 2139cec441807..a78e4d355a314 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -457,9 +457,11 @@ def sample( sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -471,3 +473,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4db65edc174f1..2e583bb08e87a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -15,7 +15,7 @@ import itertools import re from functools import cached_property, lru_cache -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -744,7 +744,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embed_tokens.wte": "embed_tokens", @@ -759,5 +760,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # The HF config doesn't specify whether these are tied, # so we detect it this way - if "embed_tokens" not in autoloaded_weights: + if "embed_tokens.weight" not in autoloaded_weights: self.embed_tokens = self.language_model.model.embed_tokens + autoloaded_weights.add("embed_tokens.weight") + return autoloaded_weights diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index b7e70f8fa2c6d..e475d286bd7ea 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only PhiMoE model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -598,7 +598,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -613,6 +614,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -666,3 +668,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index a3e30ea2dd299..307febde7eef0 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, fields from functools import cached_property from itertools import tee -from typing import Iterable, List, Mapping, Optional, Tuple, Union +from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union import numpy import torch @@ -1053,7 +1053,8 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1063,6 +1064,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: @@ -1075,8 +1077,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -1085,3 +1087,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 447632cefcd9a..3978c176a2144 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -8,7 +8,7 @@ import re from functools import partial from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Tuple, TypedDict, Union) + Optional, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -964,13 +964,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -999,6 +1001,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class QWenLLM(QWenBaseModel): diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 8f10df808c216..370cff5fa153f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -332,7 +332,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -342,6 +343,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -372,6 +374,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): @@ -494,13 +498,14 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - loader.load_weights(weights) + return loader.load_weights(weights) class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): @@ -564,7 +569,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d30950361ad89..a4965f34b1ca8 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -20,7 +20,8 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import lru_cache -from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import librosa import numpy as np @@ -420,7 +421,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -430,6 +432,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -463,3 +466,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 07eb330620a43..dc5dabf6fc38b 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -4,7 +4,7 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-Classification model compatible with HF weights.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -97,7 +97,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 249d94b5d95e9..96a9bc451f4df 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn.functional as F @@ -436,7 +436,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -455,6 +456,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -532,3 +534,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 6db467af334f5..988d682d36be3 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -3,7 +3,7 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -110,7 +110,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 2335baf459771..ef6b52db6e17d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -23,7 +23,7 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import partial from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Tuple, Type, TypedDict, Union) + Optional, Set, Tuple, Type, TypedDict, Union) import torch import torch.nn as nn @@ -1333,7 +1333,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -1343,6 +1344,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "gate_proj", 0), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -1392,3 +1394,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index acaf4afdecfe5..c9e09b879843a 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -2,7 +2,7 @@ within a vision language model.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np import torch @@ -594,7 +594,8 @@ def forward( interpolate_pos_encoding=interpolate_pos_encoding, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -602,6 +603,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: @@ -619,8 +621,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue + name = name.replace(weight_name, param_name) - param = params_dict[name.replace(weight_name, param_name)] + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -629,3 +632,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index affb2c975ce4a..6d6fafc5ab0eb 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -21,7 +21,7 @@ # limitations under the License. """Inference-only Solar model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -477,7 +477,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -487,6 +488,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -502,6 +504,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -535,6 +538,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 99acce596602e..e11d2e916730a 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -18,7 +18,7 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -306,7 +306,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -316,6 +317,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -347,3 +349,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0ef940acebb93..74c66042226de 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Starcoder2 model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -314,7 +314,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -323,6 +324,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -346,3 +348,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 9fde22c016de0..512adbc7db35e 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,7 @@ import math from functools import cached_property, lru_cache -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union, cast) import numpy as np @@ -504,10 +504,11 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) - loader.load_weights(weights, mapper=hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 1d51885f9094a..7a4fcce95603d 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Protocol, Tuple, Union, overload) + Optional, Protocol, Set, Tuple, Union, overload) import torch import torch.nn as nn @@ -172,8 +172,9 @@ def _load_module( if module != self.module: module_load_weights = getattr(module, "load_weights", None) if callable(module_load_weights): - module_load_weights(weights) - return + loaded_params = module_load_weights(weights) + yield from map(lambda x: self._get_qualname(base_prefix, x), + loaded_params) child_modules = dict(module.named_children()) child_params = dict(module.named_parameters(recurse=False)) @@ -222,11 +223,11 @@ def load_weights( weights: Iterable[Tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> List[str]: + ) -> Set[str]: if mapper is not None: weights = mapper.apply(weights) - autoloaded_weights = list(self._load_module("", self.module, weights)) + autoloaded_weights = set(self._load_module("", self.module, weights)) return autoloaded_weights diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 51172d8782a70..bc37a997eabb5 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -19,7 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Xverse model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -376,7 +376,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -385,6 +386,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if ("rotary_emb.inv_freq" in name or "rotary_emb.cos_cached" in name @@ -413,3 +415,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params From 47826cacf0e037b4e109f0b2d8d594e47def500e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Mon, 18 Nov 2024 05:29:26 +0200 Subject: [PATCH 031/255] [Bugfix] Ignore ray reinit error when current platform is ROCm or XPU (#10375) Signed-off-by: Hollow Man --- vllm/executor/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 41dd59bc65ec5..4f28efd639084 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -234,7 +234,7 @@ def initialize_ray_cluster( if current_platform.is_rocm() or current_platform.is_xpu(): # Try to connect existing ray instance and create a new one if not found try: - ray.init("auto") + ray.init("auto", ignore_reinit_error=True) except ConnectionError: logger.warning( "No existing RAY instance detected. " From 51bb12d17b374d5c4521cd01e5b066fd2419a8fa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 17 Nov 2024 23:57:20 -0800 Subject: [PATCH 032/255] [4/N][torch.compile] clean up set_torch_compile_backend (#10401) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 16 ++-------------- vllm/compilation/wrapper.py | 11 +++-------- vllm/config.py | 31 ++++++++++++++++++++++++++++++- vllm/platforms/tpu.py | 7 +++---- vllm/plugins/__init__.py | 14 +------------- vllm/utils.py | 9 +++++++++ vllm/worker/model_runner.py | 3 +-- 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 22c613931f082..0cf1e3a95fcba 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -2,15 +2,14 @@ import dataclasses import operator from contextlib import ExitStack -from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, - Union) +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch import torch import torch.fx as fx import vllm.envs as envs -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig from vllm.logger import init_logger from vllm.utils import combine_fx_passes, weak_ref_tensors @@ -684,14 +683,3 @@ def __call__(self, *args) -> Any: entry.cudagraph.replay() return entry.output - - -def select_default_backend(level: int) -> Union[str, Callable]: - if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: - backend_str = "eager" - return backend_str - assert level == CompilationLevel.PIECEWISE - - from vllm.plugins import get_current_vllm_config - compilation_config = get_current_vllm_config().compilation_config - return VllmBackend(compilation_config) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 2a1aecc11ce26..0143d0301ca1a 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -32,14 +32,9 @@ def __init__(self, # default compilation settings # compiling the forward method - # choose the compile backend - - # if the user has set the backend, use it - from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() - if backend is None: - from vllm.compilation.backends import select_default_backend - backend = select_default_backend(compilation_level) + from vllm.plugins import get_current_vllm_config + backend = get_current_vllm_config( + ).compilation_config.init_backend() compiled_callable = torch.compile( self.forward, diff --git a/vllm/config.py b/vllm/config.py index 7e37edbe594b1..14017bbdb3cf2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -22,7 +22,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - identity, print_warning_once) + identity, print_warning_once, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -2072,6 +2072,13 @@ class CompilationConfig(BaseModel): - 1: dynamo as is. - 2: dynamo once. - 3: piecewise compilation. + - backend: the backend for compilation. It needs to be a string. + - "" (empty string): use the default backend. + - "eager"/"openxla"/...: use the specified backend registered in PyTorch. + - "full.module.name": a qualified name which can be used to import the backend function. + We use string to avoid serialization issues when using compilation in a distributed setting. + When the compilation level is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). + When the compilation level is 3, the backend is used for the piecewise compilation (it sees a part of the graph). - custom_ops: fine-grained control over which custom ops to enable/disable. Use 'all' to enable all, 'none' to disable all. Also specify a list of custom op names to enable (prefixed with a '+'), @@ -2139,6 +2146,7 @@ class CompilationConfig(BaseModel): certain small batchsizes, where inductor is good at optimizing. """ # noqa level: int = 0 + backend: str = "" custom_ops: List[str] = Field(default_factory=list) use_inductor: bool = True @@ -2182,6 +2190,27 @@ def model_post_init(self, __context: Any) -> None: func = __import__(module).__dict__[func_name] self.inductor_compile_config[k] = func + def init_backend(self) -> Union[str, Callable]: + if self.level == CompilationLevel.NO_COMPILATION: + raise ValueError("No compilation level is set.") + + from torch._dynamo.backends.registry import list_backends + torch_backends = list_backends(exclude_tags=tuple()) + if self.level in [ + CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE + ]: + if self.backend == "": + return "eager" + if self.backend in torch_backends: + return self.backend + return resolve_obj_by_qualname(self.backend) + + # TODO: pass user-specified backend to piecewise compilation + # merge with the config use_inductor + assert self.level == CompilationLevel.PIECEWISE + from vllm.compilation.backends import VllmBackend + return VllmBackend(self) + def init_during_runtime(self): """To complete the initialization of config, we need to know the compile context, which is only available diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c2e22bfc09f22..643db835c85ff 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -3,8 +3,6 @@ import torch -from vllm.plugins import set_torch_compile_backend - from .interface import Platform, PlatformEnum if TYPE_CHECKING: @@ -12,8 +10,6 @@ else: VllmConfig = None -set_torch_compile_backend("openxla") - class TpuPlatform(Platform): _enum = PlatformEnum.TPU @@ -38,3 +34,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: compilation_config.level = CompilationLevel.DYNAMO_ONCE assert compilation_config.level < CompilationLevel.PIECEWISE,\ "TPU does not support Inductor." + + if compilation_config.backend == "": + compilation_config.backend = "openxla" diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index c20b9ec891d5d..a0c73a752b5e8 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,6 +1,6 @@ import logging from contextlib import contextmanager -from typing import TYPE_CHECKING, Callable, Optional, Union +from typing import TYPE_CHECKING, Optional import vllm.envs as envs @@ -50,18 +50,6 @@ def load_general_plugins(): logger.exception("Failed to load plugin %s", plugin.name) -_torch_compile_backend: Optional[Union[Callable, str]] = None - - -def set_torch_compile_backend(backend: Union[Callable, str]): - global _torch_compile_backend - _torch_compile_backend = backend - - -def get_torch_compile_backend() -> Optional[Union[Callable, str]]: - return _torch_compile_backend - - _compilation_config: Optional[CompilationConfig] = None diff --git a/vllm/utils.py b/vllm/utils.py index 111460a29de47..5d0514cd9d168 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1600,3 +1600,12 @@ def direct_register_custom_op( my_lib.impl(op_name, op_func, "CUDA") if fake_impl is not None: my_lib._register_fake(op_name, fake_impl) + + +def resolve_obj_by_qualname(qualname: str) -> Any: + """ + Resolve an object by its fully qualified name. + """ + module_name, obj_name = qualname.rsplit(".", 1) + module = importlib.import_module(module_name) + return getattr(module, obj_name) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fd89f95445565..fb5813651680b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1143,8 +1143,7 @@ def load_model(self) -> None: if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() or "eager" + backend = self.vllm_config.compilation_config.init_backend() self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From c7dec926f6f1beaed759b8689373926e68867358 Mon Sep 17 00:00:00 2001 From: lkchen Date: Mon, 18 Nov 2024 00:06:16 -0800 Subject: [PATCH 033/255] [VLM] Report multi_modal_placeholders in output (#10407) Signed-off-by: Linkun Chen --- .../vision_language/test_pixtral.py | 79 ++++++++++++++++++- vllm/model_executor/models/pixtral.py | 16 +++- vllm/outputs.py | 30 +++++-- 3 files changed, 115 insertions(+), 10 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index d8a98a0f84d3b..6233860747b9c 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -8,13 +8,17 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import pytest +from mistral_common.multimodal import download_image from mistral_common.protocol.instruct.messages import ImageURLChunk from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from mistral_common.tokens.tokenizers.multimodal import image_from_chunk +from transformers import AutoProcessor -from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt +from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams, + TextPrompt, TokensPrompt) from vllm.multimodal import MultiModalDataBuiltins +from vllm.multimodal.inputs import PlaceholderRange from vllm.sequence import Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test @@ -49,6 +53,20 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: }] +def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: + return [{ + "role": + "user", + "content": [{ + "type": "text", + "content": PROMPT, + }, *({ + "type": "image", + "image": download_image(url) + } for url in urls)], + }] + + def _create_engine_inputs(urls: List[str]) -> TokensPrompt: msg = _create_msg_format(urls) @@ -70,6 +88,23 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt: return engine_inputs +def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: + msg = _create_msg_format_hf(urls) + + tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b") + prompt = tokenizer.apply_chat_template(msg) + + images = [] + for chunk in msg[0]["content"]: + if chunk["type"] == "image": + images.append(chunk["image"]) + + mm_data = MultiModalDataBuiltins(image=images) + engine_inputs = TextPrompt(prompt=prompt, multi_modal_data=mm_data) + + return engine_inputs + + MSGS = [ _create_msg_format(IMG_URLS[:1]), _create_msg_format(IMG_URLS[:2]), @@ -191,3 +226,45 @@ def test_model_engine(vllm_runner, model: str, dtype: str) -> None: outputs_1_lst=logprobs, name_0="h100_ref", name_1="output") + + +@large_gpu_test(min_gb=24) +@pytest.mark.parametrize( + "prompt,expected_ranges", + [(_create_engine_inputs_hf(IMG_URLS[:1]), [{ + "offset": 10, + "length": 494 + }]), + (_create_engine_inputs_hf(IMG_URLS[1:4]), [{ + "offset": 10, + "length": 266 + }, { + "offset": 276, + "length": 1056 + }, { + "offset": 1332, + "length": 418 + }])]) +def test_multi_modal_placeholders( + vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None: + with vllm_runner( + "mistral-community/pixtral-12b", + max_model_len=8192, + limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, + ) as vllm_model: + outputs = vllm_model.model.generate(prompt) + + assert len(outputs) == 1, f"{len(outputs)=}" + output: RequestOutput = outputs[0] + assert hasattr(output, + "multi_modal_placeholders"), f"{output.__dict__=}" + assert "image" in output.multi_modal_placeholders, \ + f"{output.multi_modal_placeholders.keys()=}" + image_placeholder_ranges: list[ + PlaceholderRange] = output.multi_modal_placeholders["image"] + assert len(image_placeholder_ranges) == len( + expected_ranges), f"{image_placeholder_ranges=}" + for real_range, expected_range in zip(image_placeholder_ranges, + expected_ranges): + assert real_range == expected_range, \ + f"{real_range=} {expected_range=}" diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 307febde7eef0..d44a538d56b8c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -30,6 +30,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData @@ -773,15 +774,28 @@ def input_processor_for_pixtral_hf( replace_tokens[-1] = image_end_id replace_tokens_list.append(replace_tokens) + reverse_offsets: List[int] = [] # Backward iteration for replacement without affecting known indices for placeholder_idx, replace_tokens in zip(reversed(placeholder_indices), reversed(replace_tokens_list)): + reverse_offsets.append( + len(new_token_ids) - placeholder_idx + len(replace_tokens)) new_token_ids[placeholder_idx:placeholder_idx + 1] = replace_tokens + placeholder_ranges: List[PlaceholderRange] = [] + for reverse_offset, replace_tokens in zip(reversed(reverse_offsets), + replace_tokens_list): + placeholder_ranges.append( + PlaceholderRange( + offset=len(new_token_ids) - reverse_offset, + length=len(replace_tokens), + )) + # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) class PixtralHFMLP(nn.Module): diff --git a/vllm/outputs.py b/vllm/outputs.py index badf50d0602d6..4ae9b377ae693 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -5,6 +5,7 @@ from typing import Union from vllm.lora.request import LoRARequest +from vllm.multimodal.inputs import MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) @@ -103,10 +104,13 @@ def __init__( encoder_prompt: Optional[str] = None, encoder_prompt_token_ids: Optional[List[int]] = None, num_cached_tokens: Optional[int] = None, + *, + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, ) -> None: self.request_id = request_id self.prompt = prompt self.prompt_token_ids = prompt_token_ids + self.multi_modal_placeholders = multi_modal_placeholders or {} self.prompt_logprobs = prompt_logprobs self.outputs = outputs self.finished = finished @@ -275,17 +279,26 @@ def from_seq_group( finished_time = time.time() if finished else None seq_group.set_finished_time(finished_time) - init_args = (seq_group.request_id, prompt, prompt_token_ids, - prompt_logprobs, outputs, finished, seq_group.metrics, - seq_group.lora_request, encoder_prompt, - encoder_prompt_token_ids, num_cached_tokens) + init_kwargs = { + "request_id": seq_group.request_id, + "prompt": prompt, + "prompt_token_ids": prompt_token_ids, + "prompt_logprobs": prompt_logprobs, + "outputs": outputs, + "finished": finished, + "metrics": seq_group.metrics, + "lora_request": seq_group.lora_request, + "encoder_prompt": encoder_prompt, + "encoder_prompt_token_ids": encoder_prompt_token_ids, + "num_cached_tokens": num_cached_tokens, + "multi_modal_placeholders": seq_group.multi_modal_placeholders + } if use_cache: request_output = seq_group.cached_request_output - request_output.__init__(*init_args) # type: ignore - + request_output.__init__(**init_kwargs) # type: ignore else: - request_output = cls(*init_args) + request_output = cls(**init_kwargs) # type: ignore return request_output @@ -300,7 +313,8 @@ def __repr__(self) -> str: f"finished={self.finished}, " f"metrics={self.metrics}, " f"lora_request={self.lora_request}, " - f"num_cached_tokens={self.num_cached_tokens})") + f"num_cached_tokens={self.num_cached_tokens}, " + f"multi_modal_placeholders={self.multi_modal_placeholders})") class EmbeddingRequestOutput: From 01aae1cc68d6013dd91e87418a6d82fa02c58457 Mon Sep 17 00:00:00 2001 From: Maybewuss <38156589+Maybewuss@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:05:36 +0800 Subject: [PATCH 034/255] [Model] Remove redundant softmax when using PoolingType.STEP (#10415) --- vllm/model_executor/layers/pooler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 6fee57a0a03eb..bfe2d7d0f382e 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -118,14 +118,13 @@ def forward( if returned_token_ids is not None and len(returned_token_ids) > 0: hidden_states = hidden_states[:, returned_token_ids] - logits = hidden_states.softmax(dim=-1) step_tag_id = self.step_tag_id offset = 0 pooled_data_lst = [] for prompt_len, seq_data_i in zip( prompt_lens, pooling_metadata.seq_data.values()): - pooled_data_i = logits[offset:offset + prompt_len] + pooled_data_i = hidden_states[offset:offset + prompt_len] if step_tag_id is not None: token_ids = torch.tensor(seq_data_i.prompt_token_ids) pooled_data_i = pooled_data_i[token_ids == step_tag_id] From 5be4e52b6522113f7276e60b32cb5c1f912de6fd Mon Sep 17 00:00:00 2001 From: B-201 Date: Mon, 18 Nov 2024 20:57:10 +0800 Subject: [PATCH 035/255] [Model][LoRA]LoRA support added for glm-4v (#10418) Signed-off-by: B-201 --- vllm/model_executor/models/chatglm.py | 98 +++++++++++++++++++++------ 1 file changed, 79 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 81e56381eabd8..625e31bb0d368 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -30,6 +30,7 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs @@ -574,25 +575,8 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) -@INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) -class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, - SupportsMultiModal): - packed_modules_mapping = { - "query_key_value": ["query_key_value"], - "dense_h_to_4h": ["dense_h_to_4h"] - } - # LoRA specific attributes - supported_lora_modules = [ - "query_key_value", - "dense", - "dense_h_to_4h", - "dense_4h_to_h", - ] - embedding_modules = {} - embedding_padding_modules = [] +class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP, + SupportsMultiModal): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -692,3 +676,79 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, combined_weight) loaded_params.add(combined_name) return loaded_params + + +class ChatGLM(ChatGLMBaseModel): + packed_modules_mapping = { + "query_key_value": ["query_key_value"], + "dense_h_to_4h": ["dense_h_to_4h"] + } + # LoRA specific attributes + supported_lora_modules = [ + "query_key_value", + "dense", + "dense_h_to_4h", + "dense_4h_to_h", + ] + + embedding_modules = {} + embedding_padding_modules = [] + + +class ChatGLMV(ChatGLMBaseModel): + packed_modules_mapping = { + "query_key_value": ["query_key_value"], + "dense_h_to_4h": ["dense_h_to_4h"], + "merged_proj": ["gate_proj", "dense_h_to_4h"] + } + # LoRA specific attributes + supported_lora_modules = [ + "query_key_value", + "dense", + "dense_h_to_4h", + "dense_4h_to_h", + # vision + "fc1", + "fc2", + "merged_proj", + "linear_proj" + ] + + embedding_modules = {} + embedding_padding_modules = [] + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="transformer.encoder", + connector="transformer.vision.linear_proj", + tower_model="transformer.vision.transformer") + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(mm_input_mapper_for_glmv) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_glmv_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_glmv) +@INPUT_REGISTRY.register_input_processor(input_processor_for_glmv) +class ChatGLMForCausalLM(ChatGLMBaseModel, SupportsLoRA, SupportsPP, + SupportsMultiModal): + # Ensure that the LoRA support check passes when the class is not + # initialized, but set all these attributes to empty. + packed_modules_mapping = {} + supported_lora_modules = [] + embedding_modules = {} + embedding_padding_modules = [] + + def __new__( + cls, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: + config = vllm_config.model_config.hf_config + # Initialize VL + if hasattr(config, "visual"): + return ChatGLM(vllm_config=vllm_config, prefix=prefix) + # Initialize LLM + else: + return ChatGLMV(vllm_config=vllm_config, prefix=prefix) From e7ebb662d777a9617644428031c1cf80c38939ba Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 18 Nov 2024 21:45:21 +0800 Subject: [PATCH 036/255] [Model] Remove transformers attention porting in VITs (#10414) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/blip.py | 66 +++++++++++++----------- vllm/model_executor/models/clip.py | 65 ++++++++++++----------- vllm/model_executor/models/intern_vit.py | 32 ++++++++---- vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 2 +- vllm/model_executor/models/siglip.py | 63 ++++++++++++---------- vllm/model_executor/models/utils.py | 11 ++-- 7 files changed, 139 insertions(+), 102 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 6db6462e97f3f..6af59697160a0 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,10 +4,11 @@ import torch import torch.nn as nn +import torch.nn.functional as F from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from transformers.models.blip.modeling_blip import BlipAttention +from vllm.attention.selector import _Backend from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -21,11 +22,7 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -try: - from xformers import ops as xops - USE_XFORMERS_OPS = True -except ImportError: - USE_XFORMERS_OPS = False +from .utils import get_vit_attn_backend def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -168,7 +165,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -class BlipParallelAttention(nn.Module): +class BlipAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( @@ -208,6 +205,12 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + # Detect attention implementation. + self.attn_backend = get_vit_attn_backend(support_fa=False) + if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: + raise RuntimeError( + f"BLIP does not support {self.attn_backend} backend now.") + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() @@ -231,11 +234,26 @@ def forward( self.num_heads_per_partition, self.head_dim) - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) + if self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + elif self.attn_backend == _Backend.TORCH_SDPA: + query_states, key_states, value_states = (x.transpose(1, 2) + for x in (query_states, + key_states, + value_states)) + out = F.scaled_dot_product_attention(query_states, + key_states, + value_states, + dropout_p=self.dropout, + scale=self.scale) + out = out.transpose(1, 2) + out = out.view(bsz, tgt_len, -1) attn_output, _ = self.projection(out) @@ -285,18 +303,11 @@ def __init__( super().__init__() # fallback to sdpa attention if tp unavailable - num_heads = config.num_attention_heads - tp_size = get_tensor_model_parallel_world_size() - if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = BlipParallelAttention( - config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - else: - # Blip doesn't have SDPA attention implemented in transformers - # use eager attention instead for cpu backend - self.self_attn = BlipAttention(config) + self.self_attn = BlipAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = BlipMLP(config, @@ -374,11 +385,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - - tp_size = get_tensor_model_parallel_world_size() - num_heads = config.num_attention_heads - self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 - self.config = config self.embeddings = BlipVisionEmbeddings(config) @@ -422,7 +428,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ] if self.shard_weight else [] + ] params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() layer_count = len(self.encoder.layers) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 184758f4a8a45..7f638506f9fb2 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -5,10 +5,11 @@ import numpy as np import torch import torch.nn as nn +import torch.nn.functional as F from PIL import Image from transformers import CLIPVisionConfig -from transformers.models.clip.modeling_clip import CLIPSdpaAttention +from vllm.attention.selector import _Backend from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -23,11 +24,7 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -try: - from xformers import ops as xops - USE_XFORMERS_OPS = True -except ImportError: - USE_XFORMERS_OPS = False +from .utils import get_vit_attn_backend def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -197,7 +194,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -class CLIPParallelAttention(nn.Module): +class CLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( @@ -237,6 +234,12 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + # Detect attention implementation. + self.attn_backend = get_vit_attn_backend(support_fa=False) + if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: + raise RuntimeError( + f"CLIP does not support {self.attn_backend} backend now.") + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() @@ -261,11 +264,26 @@ def forward( self.num_heads_per_partition, self.head_dim) - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) + if self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + elif self.attn_backend == _Backend.TORCH_SDPA: + query_states, key_states, value_states = (x.transpose(1, 2) + for x in (query_states, + key_states, + value_states)) + out = F.scaled_dot_product_attention(query_states, + key_states, + value_states, + dropout_p=self.dropout, + scale=self.scale) + out = out.transpose(1, 2) + out = out.view(bsz, tgt_len, -1) attn_output, _ = self.out_proj(out) @@ -311,17 +329,11 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - - num_heads = config.num_attention_heads - tp_size = get_tensor_model_parallel_world_size() - if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = CLIPParallelAttention( - config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - else: - self.self_attn = CLIPSdpaAttention(config) + self.self_attn = CLIPAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = CLIPMLP(config, @@ -461,11 +473,6 @@ def __init__( prefix: str = "", ) -> None: super().__init__() - - tp_size = get_tensor_model_parallel_world_size() - num_heads = config.num_attention_heads - self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 - self.vision_model = CLIPVisionTransformer( config=config, quant_config=quant_config, @@ -490,7 +497,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ] if self.shard_weight else [] + ] params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index bd91a0806ae5c..c4346fcb3bd2a 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -12,6 +12,7 @@ import torch.nn.functional as F from transformers import PretrainedConfig +from vllm.attention.selector import _Backend from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -24,11 +25,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -try: - from xformers import ops as xops - USE_XFORMERS_OPS = True -except ImportError: - USE_XFORMERS_OPS = False +from .utils import get_vit_attn_backend NORM2FN = { 'rms_norm': RMSNorm, @@ -186,6 +183,11 @@ def __init__( prefix=f"{prefix}.proj", ) + self.attn_backend = get_vit_attn_backend(support_fa=False) + if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: + raise RuntimeError( + f"InternViT does not support {self.attn_backend} backend now.") + def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) @@ -211,11 +213,21 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: k = k.view(B, N, self.num_heads_per_partition, self.head_dim) v = v.view(B, N, self.num_heads_per_partition, self.head_dim) - x = xops.memory_efficient_attention_forward(q, k, v, scale=self.scale) - x = x.view(B, N, -1) + if self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops - x, _ = self.proj(x) - return x + out = xops.memory_efficient_attention_forward(q, + k, + v, + scale=self.scale) + elif self.attn_backend == _Backend.TORCH_SDPA: + q, k, v = (x.transpose(1, 2) for x in (q, k, v)) + out = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + out = out.transpose(1, 2) + + out = out.view(B, N, -1) + out, _ = self.proj(out) + return out class InternSdpaAttention(nn.Module): @@ -362,7 +374,7 @@ def _init_attn( tp_size = get_tensor_model_parallel_world_size() num_heads = config.num_attention_heads - if USE_XFORMERS_OPS and (num_heads + num_dummy_heads) % tp_size == 0: + if (num_heads + num_dummy_heads) % tp_size == 0: return InternParallelAttention(config, quant_config=quant_config, num_dummy_heads=num_dummy_heads, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 035a1e2ab7b02..a7c90a3f5031b 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -187,7 +187,7 @@ def __init__( ) # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend() + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS }: diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ef6b52db6e17d..a929b9323b245 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -260,7 +260,7 @@ def __init__( prefix=f"{prefix}.proj") # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend() + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) if self.attn_backend not in { _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS }: diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index c9e09b879843a..c58ad99692900 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -6,11 +6,12 @@ import numpy as np import torch +import torch.nn.functional as F from PIL import Image from torch import nn from transformers import SiglipVisionConfig -from transformers.models.siglip.modeling_siglip import SiglipSdpaAttention +from vllm.attention.selector import _Backend from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -27,11 +28,7 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -try: - from xformers import ops as xops - USE_XFORMERS_OPS = True -except ImportError: - USE_XFORMERS_OPS = False +from .utils import get_vit_attn_backend def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -254,7 +251,7 @@ def forward(self, return embeddings -class SiglipParallelAttention(nn.Module): +class SiglipAttention(nn.Module): def __init__( self, @@ -293,6 +290,11 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + self.attn_backend = get_vit_attn_backend(support_fa=False) + if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: + raise RuntimeError( + f"SIGLIP does not support {self.attn_backend} backend now.") + def forward( self, hidden_states: torch.Tensor, @@ -313,11 +315,26 @@ def forward( self.num_heads_per_partition, self.head_dim) - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) + if self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + elif self.attn_backend == _Backend.TORCH_SDPA: + query_states, key_states, value_states = (x.transpose(1, 2) + for x in (query_states, + key_states, + value_states)) + out = F.scaled_dot_product_attention(query_states, + key_states, + value_states, + dropout_p=self.dropout, + scale=self.scale) + out = out.transpose(1, 2) + out = out.view(batch_size, q_len, -1) attn_output, _ = self.out_proj(out) @@ -372,17 +389,11 @@ def __init__( self.embed_dim = config.hidden_size - num_heads = config.num_attention_heads - tp_size = get_tensor_model_parallel_world_size() - if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = SiglipParallelAttention( - config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - else: - self.self_attn = SiglipSdpaAttention(config) - + self.self_attn = SiglipAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( @@ -569,10 +580,6 @@ def __init__( ) -> None: super().__init__() - num_heads = config.num_attention_heads - tp_size = get_tensor_model_parallel_world_size() - self.shard_weight = USE_XFORMERS_OPS and num_heads % tp_size == 0 - self.vision_model = SiglipVisionTransformer( config, quant_config, @@ -601,7 +608,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ] if self.shard_weight else [] + ] params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 7a4fcce95603d..03226f42ee053 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -587,7 +587,11 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: return llm(*args, **kwargs) -def get_vit_attn_backend() -> _Backend: +def get_vit_attn_backend(support_fa: bool = False) -> _Backend: + """ + Get the available attention backend for Vision Transformer. + """ + # TODO(Isotr0py): Remove `support_fa` after support FA for all ViTs attn. selected_backend: Optional[_Backend] = get_global_forced_attn_backend() if selected_backend is None: backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND @@ -596,7 +600,7 @@ def get_vit_attn_backend() -> _Backend: if selected_backend is None: # For Volta and Turing GPUs, use xformers instead. device_available = current_platform.has_device_capability(80) - if device_available: + if device_available and support_fa: from transformers.utils import is_flash_attn_2_available if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN @@ -606,7 +610,8 @@ def get_vit_attn_backend() -> _Backend: "so we use xformers backend instead. You can run " "`pip install flash-attn` to use flash-attention backend.") selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu(): + elif current_platform.is_cpu() or current_platform.is_rocm(): + # ROCM doesn't support xformers selected_backend = _Backend.TORCH_SDPA else: selected_backend = _Backend.XFORMERS From 4186be8111e20c64d0cbcbdebbdd1081e77f1075 Mon Sep 17 00:00:00 2001 From: B-201 Date: Mon, 18 Nov 2024 23:08:30 +0800 Subject: [PATCH 037/255] [Doc] Update doc for LoRA support in GLM-4V (#10425) Signed-off-by: B-201 --- docs/source/models/supported_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 96a513d42753b..e902d393f2f70 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -446,7 +446,7 @@ Text Generation - GLM-4V - T + I - :code:`THUDM/glm-4v-9b` etc. - - + - ✅︎ - ✅︎ * - :code:`H2OVLChatModel` - H2OVL From 7851b45196aff994277ec832c0cf5bec0073f08e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 18 Nov 2024 07:20:06 -0800 Subject: [PATCH 038/255] [5/N][torch.compile] torch.jit.script --> torch.compile (#10406) Signed-off-by: youkaichao --- vllm/model_executor/layers/rejection_sampler.py | 2 +- vllm/model_executor/layers/vocab_parallel_embedding.py | 4 ++-- vllm/model_executor/models/phi3_small.py | 4 ++-- vllm/worker/model_runner.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 2e9a0e170693b..3ab0ba9e9f5c2 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -368,7 +368,7 @@ def _smallest_positive_value(self) -> float: # Note that we always sample with replacement. # probs will be modified in place, but this is fine, as we pass # in a copy already. -@torch.jit.script +@torch.compile(dynamic=True) def _multinomial( probs: torch.Tensor, num_samples: int, diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 52771f50a7a23..30548e656c557 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -133,13 +133,13 @@ def __post_init__(self): assert self.num_added_elements <= self.num_added_elements_padded -@torch.jit.script +@torch.compile(dynamic=True) def get_masked_input_and_mask( input_: torch.Tensor, org_vocab_start_index: int, org_vocab_end_index: int, num_org_vocab_padding: int, added_vocab_start_index: int, added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: - # torch.jit.script will fuse all of the pointwise ops below + # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index a78e4d355a314..f71cbd1264c45 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -54,12 +54,12 @@ def weight_loader(self, param: torch.nn.Parameter, return load_column_parallel_weight(param, loaded_weight) -@torch.jit.script +@torch.compile(dynamic=True) def quick_gelu(x): return x * torch.sigmoid(1.702 * x) -@torch.jit.script +@torch.compile(dynamic=True) def gegelu(input, limit: Optional[float] = None): a_gelu, a_linear = input[..., ::2], input[..., 1::2] if limit is not None: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fb5813651680b..ed0360fb7f727 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1769,7 +1769,7 @@ def capture( # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.jit.script + # Note one iteration is not enough for torch.compile for _ in range(_NUM_WARMUP_ITERS): self.model( input_ids=input_ids, From 31894a21559436f4a9d72f751e8bd7ba4ab18613 Mon Sep 17 00:00:00 2001 From: ismael-dm Date: Mon, 18 Nov 2024 18:52:12 +0100 Subject: [PATCH 039/255] [Doc] Add documentation for Structured Outputs (#9943) Signed-off-by: ismael-dm --- docs/source/index.rst | 1 + docs/source/models/structured_outputs.rst | 173 ++++++++++++++++++ .../offline_inference_structured_outputs.py | 78 ++++++++ ...enai_chat_completion_structured_outputs.py | 94 ++++++++++ 4 files changed, 346 insertions(+) create mode 100644 docs/source/models/structured_outputs.rst create mode 100644 examples/offline_inference_structured_outputs.py create mode 100644 examples/openai_chat_completion_structured_outputs.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 3b2698a8845ed..b04acbbce4169 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -101,6 +101,7 @@ Documentation models/engine_args models/lora models/vlm + models/structured_outputs models/spec_decode models/performance diff --git a/docs/source/models/structured_outputs.rst b/docs/source/models/structured_outputs.rst new file mode 100644 index 0000000000000..ff4ff7169fc5f --- /dev/null +++ b/docs/source/models/structured_outputs.rst @@ -0,0 +1,173 @@ +.. _structured_outputs: + +Structured Outputs +================== + +vLLM supports the generation of structured outputs using `outlines `_ or `lm-format-enforcer `_ as backends for the guided decoding. +This document shows you some examples of the different options that are available to generate structured outputs. + + +Online Inference (OpenAI API) +----------------------------- + +You can generate structured outputs using the OpenAI’s `Completions `_ and `Chat `_ API. + +The following parameters are supported, which must be added as extra parameters: + +- ``guided_choice``: the output will be exactly one of the choices. +- ``guided_regex``: the output will follow the regex pattern. +- ``guided_json``: the output will follow the JSON schema. +- ``guided_grammar``: the output will follow the context free grammar. +- ``guided_whitespace_pattern``: used to override the default whitespace pattern for guided json decoding. +- ``guided_decoding_backend``: used to select the guided decoding backend to use. + +You can see the complete list of supported parameters on the `OpenAI Compatible Server `_ page. + +Now let´s see an example for each of the cases, starting with the ``guided_choice``, as it´s the easiest one: + +.. code-block:: python + + from openai import OpenAI + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", + ) + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={"guided_choice": ["positive", "negative"]}, + ) + print(completion.choices[0].message.content) + + +The next example shows how to use the ``guided_regex``. The idea is to generate an email address, given a simple regex template: + +.. code-block:: python + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", + } + ], + extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, + ) + print(completion.choices[0].message.content) + +One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. +For this we can use the ``guided_json`` parameter in two different ways: + +- Using directly a `JSON Schema `_ +- Defining a `Pydantic model `_ and then extracting the JSON Schema from it (which is normally an easier option). + +The next example shows how to use the ``guided_json`` parameter with a Pydantic model: + +.. code-block:: python + + from pydantic import BaseModel + from enum import Enum + + class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + + class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + + json_schema = CarDescription.model_json_schema() + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", + } + ], + extra_body={"guided_json": json_schema}, + ) + print(completion.choices[0].message.content) + +.. tip:: + While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. + This can improve the results notably in most cases. + + +Finally we have the ``guided_grammar``, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. +It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: + +.. code-block:: python + + simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", + } + ], + extra_body={"guided_grammar": simplified_sql_grammar}, + ) + print(completion.choices[0].message.content) + +The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py `_. + + +Offline Inference +----------------- + +Offline inference allows for the same types of guided decoding. +To use it, we´ll need to configure the guided decoding using the class ``GuidedDecodingParams`` inside ``SamplingParams``. +The main available options inside ``GuidedDecodingParams`` are: + +- ``json`` +- ``regex`` +- ``choice`` +- ``grammar`` +- ``backend`` +- ``whitespace_pattern`` + +These parameters can be used in the same way as the parameters from the Online Inference examples above. +One example for the usage of the ``choices`` parameter is shown below: + +.. code-block:: python + + from vllm import LLM, SamplingParams + from vllm.sampling_params import GuidedDecodingParams + + llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") + + guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) + sampling_params = SamplingParams(guided_decoding=guided_decoding_params) + outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, + ) + print(outputs[0].outputs[0].text) + +A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. \ No newline at end of file diff --git a/examples/offline_inference_structured_outputs.py b/examples/offline_inference_structured_outputs.py new file mode 100644 index 0000000000000..00d864606eeff --- /dev/null +++ b/examples/offline_inference_structured_outputs.py @@ -0,0 +1,78 @@ +from enum import Enum + +from pydantic import BaseModel + +from vllm import LLM, SamplingParams +from vllm.sampling_params import GuidedDecodingParams + +llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) + +# Guided decoding by Choice (list of possible options) +guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) + +# Guided decoding by Regex +guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n") +sampling_params = SamplingParams(guided_decoding=guided_decoding_params, + stop=["\n"]) +prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") +outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) +print(outputs[0].outputs[0].text) + + +# Guided decoding by JSON using Pydantic schema +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +json_schema = CarDescription.model_json_schema() + +guided_decoding_params = GuidedDecodingParams(json=json_schema) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") +outputs = llm.generate( + prompts=prompt, + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) + +# Guided decoding by Grammar +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" +guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") +outputs = llm.generate( + prompts=prompt, + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) diff --git a/examples/openai_chat_completion_structured_outputs.py b/examples/openai_chat_completion_structured_outputs.py new file mode 100644 index 0000000000000..8c059c7ca07ce --- /dev/null +++ b/examples/openai_chat_completion_structured_outputs.py @@ -0,0 +1,94 @@ +from enum import Enum + +from openai import OpenAI +from pydantic import BaseModel + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", +) + +# Guided decoding by Choice (list of possible options) +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": "Classify this sentiment: vLLM is wonderful!" + }], + extra_body={"guided_choice": ["positive", "negative"]}, +) +print(completion.choices[0].message.content) + +# Guided decoding by Regex +prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={ + "guided_regex": "\w+@\w+\.com\n", + "stop": ["\n"] + }, +) +print(completion.choices[0].message.content) + + +# Guided decoding by JSON using Pydantic schema +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +json_schema = CarDescription.model_json_schema() + +prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_json": json_schema}, +) +print(completion.choices[0].message.content) + +# Guided decoding by Grammar +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" + +prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_grammar": simplified_sql_grammar}, +) +print(completion.choices[0].message.content) From 4f686d139f6acb31ea31eaf57ed1bb3920a77682 Mon Sep 17 00:00:00 2001 From: Andrew Nesbitt Date: Mon, 18 Nov 2024 17:52:42 +0000 Subject: [PATCH 040/255] Fix open_collective value in FUNDING.yml (#10426) Signed-off-by: Andrew Nesbitt --- .github/FUNDING.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 71f4e520135d4..d1f6105a47166 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1,2 @@ github: [vllm-project] -open_collective: [vllm] +open_collective: vllm From 281cc4b3cd2f6c84c2cd8272ef83d97edd1c323a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 18 Nov 2024 13:04:14 -0500 Subject: [PATCH 041/255] [Model][Bugfix] Support TP for PixtralHF ViT (#10405) Signed-off-by: mgoin --- vllm/model_executor/models/pixtral.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d44a538d56b8c..f7f46770057e2 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -17,6 +17,7 @@ from vllm.attention import AttentionMetadata from vllm.config import ModelConfig, VllmConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_and_mul_fn @@ -843,17 +844,20 @@ def __init__( self.config = config assert not config.hidden_size % config.num_attention_heads - self.n_heads = config.num_attention_heads + self.total_num_heads = config.num_attention_heads + tp_size = get_tensor_model_parallel_world_size() + self.n_heads = divide(config.num_attention_heads, tp_size) self.head_dim = config.hidden_size // config.num_attention_heads self.qkv_proj = QKVParallelLinear( hidden_size=config.hidden_size, head_size=self.head_dim, - total_num_heads=self.n_heads, + total_num_heads=self.total_num_heads, bias=False, quant_config=quant_config, prefix=f"{prefix}.qkv_proj", ) + assert self.total_num_heads * self.head_dim == config.hidden_size self.o_proj = RowParallelLinear( input_size=config.hidden_size, output_size=config.hidden_size, From 6b2d25efc78f21867ca37e3f707c5a94f906478f Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Tue, 19 Nov 2024 02:18:05 +0800 Subject: [PATCH 042/255] [Hardware][XPU] AWQ/GPTQ support for xpu backend (#10107) Signed-off-by: yan ma --- .../quantization/supported_hardware.rst | 8 +- tests/quantization/test_ipex_quant.py | 10 +- vllm/model_executor/layers/linear.py | 2 +- .../layers/quantization/gptq.py | 1 - .../layers/quantization/gptq_marlin.py | 4 + .../layers/quantization/ipex_quant.py | 169 +++++++++++++----- vllm/model_executor/model_loader/loader.py | 4 +- 7 files changed, 146 insertions(+), 52 deletions(-) diff --git a/docs/source/quantization/supported_hardware.rst b/docs/source/quantization/supported_hardware.rst index 9bf0cdb80376d..09f8e7112cf0c 100644 --- a/docs/source/quantization/supported_hardware.rst +++ b/docs/source/quantization/supported_hardware.rst @@ -27,7 +27,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✅︎ - ✗ - - ✗ + - ✅︎ - ✅︎ - ✗ - ✗ @@ -38,8 +38,8 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✅︎ - ✗ - - ✗ - - ✗ + - ✅︎ + - ✅︎ - ✗ - ✗ * - Marlin (GPTQ/AWQ/FP8) @@ -129,4 +129,4 @@ Notes: Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. diff --git a/tests/quantization/test_ipex_quant.py b/tests/quantization/test_ipex_quant.py index d541efcefcac3..68a73f0f8ab48 100644 --- a/tests/quantization/test_ipex_quant.py +++ b/tests/quantization/test_ipex_quant.py @@ -1,5 +1,5 @@ """Test model set-up and inference for quantized HF models supported - on the CPU backend using IPEX (including AWQ). + on the CPU/GPU backend using IPEX (including AWQ/GPTQ). Validating the configuration and printing results for manual checking. @@ -11,13 +11,15 @@ from vllm.platforms import current_platform MODELS = [ - "casperhansen/llama-3-8b-instruct-awq", + "AMead10/Llama-3.2-1B-Instruct-AWQ", + "shuyuej/Llama-3.2-1B-Instruct-GPTQ", # with g_idx ] DTYPE = ["bfloat16"] -@pytest.mark.skipif(not current_platform.is_cpu(), - reason="only supports the CPU backend.") +@pytest.mark.skipif(not current_platform.is_cpu() + and not current_platform.is_xpu(), + reason="only supports Intel CPU/XPU backend.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", DTYPE) def test_ipex_quant(vllm_runner, model, dtype): diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 94f30412e43b3..e1f8a6e36d781 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -27,7 +27,7 @@ "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", - "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod" + "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod" ] diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 0aa605e62454e..abafad0f1047e 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -210,7 +210,6 @@ def create_weights( def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # for torch.compile - layer.qweight = Parameter(layer.qweight.data, requires_grad=False) layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False) layer.qweight = Parameter(layer.qweight.data, requires_grad=False) layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 1f72e3afbbce5..a3e58bf1b2a4c 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -23,6 +23,7 @@ PackedColumnParameter, PackedvLLMParameter, RowvLLMParameter) +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types logger = init_logger(__name__) @@ -134,6 +135,9 @@ def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): sym = quant_config.get("sym") desc_act = quant_config.get("desc_act") + if not current_platform.is_cuda(): + return False + if quant_method != "gptq": return False diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 330c2ad195d78..c16a962134d06 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -2,21 +2,26 @@ import torch -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.quantization.awq import AWQLinearMethod +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.awq import (AWQLinearMethod, + is_layer_skipped_awq) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.platforms import current_platform +MIN_IPEX_VERSION = "2.5.0" + class IPEXConfig(QuantizationConfig): - """INT8 quantization config class using IPEX for the CPU backend, - including AWQ. + """INT8 quantization config class using IPEX for the CPU/XPU backend, + including AWQ, GPTQ. """ IPEX_QUANT_METHOD_MAP = { "awq": 1, - "gptq": 2, + "gptq": 0, } def __init__( @@ -24,29 +29,30 @@ def __init__( method: str, weight_bits: int, group_size: int, + modules_to_not_convert: Optional[List[str]] = None, + desc_act: Optional[bool] = None, + lm_head_quantized: Optional[bool] = None, ) -> None: self.method = method self.weight_bits = weight_bits self.group_size = group_size + self.modules_to_not_convert = modules_to_not_convert or [] + self.desc_act = desc_act + self.lm_head_quantized = lm_head_quantized self.pack_factor = 32 // self.weight_bits if self.weight_bits not in [4]: raise ValueError(f"IPEX quantization supports weight bits [4], " f"but got {self.weight_bits}.") - if self.method == "awq": - self.quant_method = IPEXAWQLinearMethod - else: - raise ValueError(f"IPEX quantization supports [awq], " + if self.method not in ["awq", "gptq"]: + raise ValueError(f"IPEX quantization supports [awq, gptq], " f"but got {self.method}.") def __repr__(self) -> str: - return (f"IPEXConfig(method={self.method}" + return (f"IPEXConfig(method={self.method}," f"weight_bits={self.weight_bits}, " - f"group_size={self.group_size}") - - def get_ipex_quant_method_id(self) -> int: - return IPEXConfig.IPEX_QUANT_METHOD_MAP[self.method] + f"group_size={self.group_size})") @classmethod def get_name(cls) -> str: @@ -70,19 +76,32 @@ def get_config_filenames() -> List[str]: @classmethod def from_config(cls, config: Dict[str, Any]) -> "IPEXConfig": method = cls.get_from_keys(config, ["quant_method"]).lower() - weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) - group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) - return cls(method, weight_bits, group_size) + if method == "awq": + weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) + group_size = cls.get_from_keys(config, + ["q_group_size", "group_size"]) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) + return cls(method, weight_bits, group_size, modules_to_not_convert, + False, False) + # otherwise for gptq + weight_bits = cls.get_from_keys(config, ["bits"]) + group_size = cls.get_from_keys(config, ["group_size"]) + lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], + default=False) + desc_act = cls.get_from_keys_or(config, ["desc_act"], default=False) + return cls(method, weight_bits, group_size, [], desc_act, + lm_head_quantized) @classmethod def override_quantization_method(cls, hf_quant_cfg, user_quant) -> Optional[str]: - if not current_platform.is_cpu(): + if not current_platform.is_cpu() and not current_platform.is_xpu(): return None quant_method = hf_quant_cfg.get("quant_method", "").lower() - if quant_method in ["awq"]: + if quant_method in ["awq", "gptq"]: return cls.get_name() return None @@ -90,12 +109,81 @@ def override_quantization_method(cls, hf_quant_cfg, def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["LinearMethodBase"]: if isinstance(layer, LinearBase): - return self.quant_method(self) + if self.method == "awq": + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() + return IPEXAWQLinearMethod(self) + if self.method == "gptq": + return IPEXGPTQLinearMethod(self) return None +class IPEXGPTQLinearMethod(GPTQLinearMethod): + """GPTQ linear method using IPEX for the CPU/XPU backend. + """ + + def __init__(self, quant_config: IPEXConfig): + self.quant_config = quant_config # type: ignore + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + bias = layer.bias if not layer.skip_bias_add else None + + try: + import intel_extension_for_pytorch as ipex + if ipex.__version__ < MIN_IPEX_VERSION: + raise ImportError( + "intel_extension_for_pytorch version is " + "wrong. Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}.") + except ImportError as err: + raise ImportError( + "Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via " + f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`" + " to use IPEX-AWQ linear method.") from err + # Using the compute dtype (lowp_mode) as INT8 to leverage instructions + # with better performance. + lowp_mode = ipex.quantization.WoqLowpMode.INT8 + # The weight will be de-packed from INT4 to INT8. + weight_dtype = ipex.quantization.WoqWeightDtype.INT4 + # The float activation will be quantized (dynamic, per-token) to INT8. + act_quant_mode = ipex.quantization.WoqActQuantMode.PER_BATCH_IC_BLOCK + + qconfig = ipex.quantization.get_weight_only_quant_qconfig_mapping( + weight_dtype=weight_dtype, + lowp_mode=lowp_mode, + act_quant_mode=act_quant_mode, + group_size=self.quant_config.group_size, + ) + layer.ipex_output_size = layer.qweight.shape[-1] + g_idx = layer.g_idx if self.quant_config.desc_act else None + layer.ipex_qlinear = ipex.llm.quantization.woq_linear. \ + IPEXWeightOnlyQuantizedLinear.from_weight( + layer.qweight, + layer.scales, + layer.qzeros, + layer.qweight.size(0), + layer.ipex_output_size, + qconfig=qconfig, + g_idx=g_idx, + bias=bias, + group_size=self.quant_config.group_size, + quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["gptq"] + ) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + reshaped_x = x.reshape(-1, x.shape[-1]) + out = layer.ipex_qlinear(reshaped_x) + if bias is not None: + out.add_(bias) + return out.reshape(x.shape[:-1] + (layer.ipex_output_size, )) + + class IPEXAWQLinearMethod(AWQLinearMethod): - """AWQ linear method using IPEX for the CPU backend. + """AWQ linear method using IPEX for the CPU/XPU backend. """ def __init__(self, quant_config: IPEXConfig): @@ -108,15 +196,16 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: try: import intel_extension_for_pytorch as ipex - if ipex.__version__ < "2.4.0": - raise ImportError("intel_extension_for_pytorch version is " - "wrong. Please install " - "intel_extension_for_pytorch>=2.4.0.") + if ipex.__version__ < MIN_IPEX_VERSION: + raise ImportError( + "intel_extension_for_pytorch version is " + "wrong. Please install " + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION}.") except ImportError as err: raise ImportError( "Please install " - "intel_extension_for_pytorch>=2.4.0 via " - "`pip install intel_extension_for_pytorch>=2.4.0`" + f"intel_extension_for_pytorch>={MIN_IPEX_VERSION} via " + f"`pip install intel_extension_for_pytorch>={MIN_IPEX_VERSION}`" " to use IPEX-AWQ linear method.") from err # Using the compute dtype (lowp_mode) as INT8 to leverage instructions @@ -136,19 +225,18 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.ipex_output_size = layer.qweight.size( 1) * self.quant_config.pack_factor - layer.ipex_qlinear = ipex.nn.modules.weight_only_quantization.\ - WeightOnlyQuantizedLinear.from_weight( - layer.qweight, - layer.scales, - layer.qzeros, - layer.qweight.size(0), - layer.ipex_output_size, - qconfig=qconfig, - bias=bias, - group_size=self.quant_config.group_size, - quant_method= - self.quant_config.get_ipex_quant_method_id() # type: ignore - ) + layer.ipex_qlinear = ipex.llm.quantization.woq_linear. \ + IPEXWeightOnlyQuantizedLinear.from_weight( + layer.qweight, + layer.scales, + layer.qzeros, + layer.qweight.size(0), + layer.ipex_output_size, + qconfig=qconfig, + bias=bias, + group_size=self.quant_config.group_size, + quant_method=IPEXConfig.IPEX_QUANT_METHOD_MAP["awq"] # type: ignore + ) def apply(self, layer: torch.nn.Module, @@ -156,5 +244,4 @@ def apply(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: reshaped_x = x.reshape(-1, x.shape[-1]) out = layer.ipex_qlinear(reshaped_x) - return out.reshape(x.shape[:-1] + (layer.ipex_output_size, )) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d9ce85949e4ee..b41c23704b7ff 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -29,6 +29,8 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ReplicatedLinear, RowParallelLinear) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizeMethodBase) from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) @@ -348,7 +350,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) - if quant_method is not None: + if isinstance(quant_method, QuantizeMethodBase): # When quant methods need to process weights after loading # (for repacking, quantizing, etc), they expect parameters # to be on the global target device. This scope is for the From c2170a5b395acb9f5f4ce8425c3be18aacb67513 Mon Sep 17 00:00:00 2001 From: Angus Wang Date: Mon, 18 Nov 2024 11:39:40 -0800 Subject: [PATCH 043/255] [Kernel] Explicitly specify other value in tl.load calls (#9014) Signed-off-by: Angus Wang --- .../blocksparse_attention_kernel.py | 13 ++++++++++--- vllm/lora/ops/bgmv_expand.py | 4 +++- vllm/lora/ops/bgmv_expand_slice.py | 8 +++++++- vllm/lora/ops/sgmv_expand.py | 5 ++++- vllm/lora/ops/sgmv_expand_slice.py | 5 ++++- .../layers/quantization/awq_triton.py | 14 +++++++------- 6 files changed, 35 insertions(+), 14 deletions(-) diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py index ec1c37c5bcb0e..727a470ba6d0e 100644 --- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +++ b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py @@ -157,19 +157,22 @@ def _fwd_kernel_inner( k = tl.load( k_ptrs + start_n * stride_kt, mask=offs_n[None, :] + start_n < k_seqlen, + other=0.0, ) else: k = tl.load( k_ptrs + start_n * stride_kt, mask=(offs_n[None, :] + start_n < k_seqlen) & (offs_d[:, None] < D_HEAD), + other=0.0, ) else: if EVEN_D: k = tl.load(k_ptrs + start_n * stride_kt) else: k = tl.load(k_ptrs + start_n * stride_kt, - mask=offs_d[:, None] < D_HEAD) + mask=offs_d[:, None] < D_HEAD, + other=0.0) qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) @@ -200,19 +203,22 @@ def _fwd_kernel_inner( v = tl.load( v_ptrs + start_n * stride_vt, mask=offs_n[:, None] + start_n < k_seqlen, + other=0.0, ) else: v = tl.load( v_ptrs + start_n * stride_vt, mask=(offs_n[:, None] + start_n < k_seqlen) & (offs_d[None, :] < D_HEAD), + other=0.0, ) else: if EVEN_D: v = tl.load(v_ptrs + start_n * stride_vt) else: v = tl.load(v_ptrs + start_n * stride_vt, - mask=offs_d[None, :] < D_HEAD) + mask=offs_d[None, :] < D_HEAD, + other=0.0) acc += tl.dot(p, v) @@ -318,12 +324,13 @@ def _fwd_kernel_batch_inference( q = tl.load( Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd, mask=offs_m[:, None] < q_seqlen, + other=0.0, ) else: q = tl.load( Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd, mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD), - other=0, + other=0.0, ) sparse_crow_ptr = (layout_crow_ptr + off_h * layout_crow_stride_h + diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index 6a32387a6f36c..f176259fddc78 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -75,7 +75,9 @@ def _bgmv_expand_kernel( other=0.0, ) # [BLOCK_N,BLOCK_K] if ADD_INPUTS: - tiled_out = tl.load(c_ptr + current_n * cn_stride, mask=c_mask) + tiled_out = tl.load(c_ptr + current_n * cn_stride, + mask=c_mask, + other=0.0) accumulator = tl.sum(tiled_a * tiled_b, 1) + tiled_out else: accumulator = tl.sum(tiled_a * tiled_b, 1) diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 73628fd20d327..2c6ed96c253f0 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -78,7 +78,13 @@ def _bgmv_expand_slice_kernel( ) # [BLOCK_N,BLOCK_K] if ADD_INPUTS: - tiled_out = tl.load(c_ptr + current_n * cn_stride, mask=c_mask) + # explicitly pass in other=None to tell triton that masked values + # can be uninitialized. This is OK because the later tl.store + # operation uses the same mask, eliminating the risk of garbage + # values propagating + tiled_out = tl.load(c_ptr + current_n * cn_stride, + mask=c_mask, + other=None) accumulator = tl.sum(tiled_a * tiled_b, 1) + tiled_out else: accumulator = tl.sum(tiled_a * tiled_b, 1) diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index 4910cb4061298..ee2cd2e05e2ee 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -88,7 +88,10 @@ def _sgmv_expand_kernel( c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < N) if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) + # explicitly pass in other=None to tell triton that masked values + # can be uninitialized. This is OK because the later tl.store operation + # uses the same mask, eliminating the risk of garbage values propagating + tiled_out = tl.load(c_ptr, mask=c_mask, other=None) tiled_c += tiled_out tl.store(c_ptr, tiled_c, mask=c_mask) diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 844f5cec39e93..5244fa14913a4 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -94,7 +94,10 @@ def _sgmv_expand_slice_kernel( c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] < (slice_offset + N)) if ADD_INPUTS: - tiled_out = tl.load(c_ptr, mask=c_mask) + # explicitly pass in other=None to tell triton that masked values + # can be uninitialized. This is OK because the later tl.store operation + # uses the same mask, eliminating the risk of garbage values propagating + tiled_out = tl.load(c_ptr, mask=c_mask, other=None) tiled_c += tiled_out tl.store(c_ptr, tiled_c, mask=c_mask) diff --git a/vllm/model_executor/layers/quantization/awq_triton.py b/vllm/model_executor/layers/quantization/awq_triton.py index bbb7fc8ad5087..ace8f4a348812 100644 --- a/vllm/model_executor/layers/quantization/awq_triton.py +++ b/vllm/model_executor/layers/quantization/awq_triton.py @@ -42,7 +42,7 @@ def awq_dequantize_kernel( result_masks = result_masks_y[:, None] & result_masks_x[None, :] # Load the weights. - iweights = tl.load(qweight_ptr + offsets, masks) + iweights = tl.load(qweight_ptr + offsets, masks, 0.0) iweights = tl.interleave(iweights, iweights) iweights = tl.interleave(iweights, iweights) iweights = tl.interleave(iweights, iweights) @@ -71,7 +71,7 @@ def awq_dequantize_kernel( zero_masks = zero_masks_y[:, None] & zero_masks_x[None, :] # Load the zeros. - zeros = tl.load(zeros_ptr + zero_offsets, zero_masks) + zeros = tl.load(zeros_ptr + zero_offsets, zero_masks, 0.0) zeros = tl.interleave(zeros, zeros) zeros = tl.interleave(zeros, zeros) zeros = tl.interleave(zeros, zeros) @@ -91,7 +91,7 @@ def awq_dequantize_kernel( scale_masks = scale_masks_y[:, None] & scale_masks_x[None, :] # Load the scales. - scales = tl.load(scales_ptr + scale_offsets, scale_masks) + scales = tl.load(scales_ptr + scale_offsets, scale_masks, 0.0) scales = tl.broadcast_to(scales, (BLOCK_SIZE_Y, BLOCK_SIZE_X * 8)) # Dequantize. @@ -165,10 +165,10 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, for k in range(0, tl.cdiv(K, BLOCK_SIZE_K * SPLIT_K)): masks_k = offsets_k < K masks_a = masks_am[:, None] & masks_k[None, :] - a = tl.load(a_ptrs, mask=masks_a) + a = tl.load(a_ptrs, mask=masks_a, other=0.0) masks_b = masks_k[:, None] & masks_bn[None, :] - b = tl.load(b_ptrs, mask=masks_b) + b = tl.load(b_ptrs, mask=masks_b, other=0.0) b = tl.interleave(b, b) b = tl.interleave(b, b) b = tl.interleave(b, b) @@ -181,7 +181,7 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, masks_zk = offsets_szk < K // group_size masks_z = masks_zk[:, None] & masks_zn[None, :] zeros_ptrs = zeros_ptr + offsets_z - zeros = tl.load(zeros_ptrs, mask=masks_z) + zeros = tl.load(zeros_ptrs, mask=masks_z, other=0.0) zeros = tl.interleave(zeros, zeros) zeros = tl.interleave(zeros, zeros) zeros = tl.interleave(zeros, zeros) @@ -191,7 +191,7 @@ def awq_gemm_kernel(a_ptr, b_ptr, c_ptr, zeros_ptr, scales_ptr, M, N, K, masks_sk = offsets_szk < K // group_size masks_s = masks_sk[:, None] & masks_sn[None, :] scales_ptrs = scales_ptr + offsets_s - scales = tl.load(scales_ptrs, mask=masks_s) + scales = tl.load(scales_ptrs, mask=masks_s, other=0.0) scales = tl.broadcast_to(scales, (BLOCK_SIZE_K, BLOCK_SIZE_N)) b = (b >> shifts) & 0xF From 96d999fbe8d610fa4c5b7cad6bb0d0158d1d5b8b Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Mon, 18 Nov 2024 14:59:29 -0500 Subject: [PATCH 044/255] [Kernel] Initial Machete W4A8 support + Refactors (#9855) Signed-off-by: Lucas Wilkinson --- benchmarks/kernels/benchmark_machete.py | 519 +++++++++---- benchmarks/kernels/graph_machete_bench.py | 5 +- benchmarks/kernels/weight_shapes.py | 6 + csrc/cutlass_extensions/cute_utils.cuh | 4 +- .../epilogue}/broadcast_load_epilogue_c2x.hpp | 1 + .../epilogue}/broadcast_load_epilogue_c3x.hpp | 0 .../epilogue/scaled_mm_epilogues_c2x.hpp | 317 ++++++++ .../epilogue/scaled_mm_epilogues_c3x.hpp | 315 ++++++++ .../vllm_cutlass_library_extension.py | 29 + .../vllm_numeric_conversion.cuh | 239 +++++- csrc/cutlass_extensions/vllm_type_utils.cuh | 42 + .../cutlass_w8a8/scaled_mm_c2x.cu | 53 +- .../cutlass_w8a8/scaled_mm_c2x.cuh | 302 -------- .../cutlass_w8a8/scaled_mm_c3x.cu | 312 +------- csrc/quantization/machete/generate.py | 732 ++++++++++-------- .../quantization/machete/machete_mainloop.cuh | 25 +- .../machete/machete_mm_kernel.cuh | 206 +++-- .../machete/machete_mm_launcher.cuh | 90 +-- .../machete/machete_prepack_kernel.cuh | 63 +- .../machete/machete_prepack_launcher.cuh | 15 +- .../machete/machete_prepacked_layout.cuh | 54 +- csrc/quantization/machete/machete_pytorch.cu | 120 ++- csrc/torch_bindings.cpp | 35 +- tests/kernels/test_machete_gemm.py | 284 ------- tests/kernels/test_machete_mm.py | 406 ++++++++++ vllm/_custom_ops.py | 75 +- .../layers/quantization/kernels/machete.py | 16 +- .../layers/quantization/utils/quant_utils.py | 45 +- 28 files changed, 2616 insertions(+), 1694 deletions(-) rename csrc/{quantization/cutlass_w8a8 => cutlass_extensions/epilogue}/broadcast_load_epilogue_c2x.hpp (99%) rename csrc/{quantization/cutlass_w8a8 => cutlass_extensions/epilogue}/broadcast_load_epilogue_c3x.hpp (100%) create mode 100644 csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp create mode 100644 csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp create mode 100644 csrc/cutlass_extensions/vllm_type_utils.cuh delete mode 100644 tests/kernels/test_machete_gemm.py create mode 100644 tests/kernels/test_machete_mm.py diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index 665b50bf18cf0..a0342d08f1db8 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -2,8 +2,10 @@ import copy import itertools import math +import os import pickle as pkl import time +from dataclasses import dataclass from itertools import product from typing import Callable, Iterable, List, Optional, Tuple @@ -15,11 +17,12 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales) + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales, + marlin_zero_points) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - gptq_pack, pack_rows, quantize_weights) + pack_rows, quantize_weights) from vllm.scalar_type import ScalarType, scalar_types from vllm.utils import FlexibleArgumentParser @@ -27,149 +30,349 @@ DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] DEFAULT_TP_SIZES = [1] +NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False) + +if NVTX_PROFILE: + import nvtx + + +def terse_type_name(dt): + return { + torch.bfloat16: "bf16", + torch.float16: "fp16", + torch.int8: "int8", + torch.float8_e4m3fn: "fp8", + torch.bfloat16: "bf16", + torch.float: "float", + torch.int: "int", + }[dt] + + +@dataclass +class BenchmarkTensors: + w_ref: torch.Tensor + a: torch.Tensor + + w_q: torch.Tensor + group_size: Optional[int] + wtype: ScalarType + w_g_s: torch.Tensor + w_g_zp: Optional[torch.Tensor] + w_ch_s: Optional[torch.Tensor] + w_tok_s: Optional[torch.Tensor] + + +@dataclass +class TypeConfig: + act_type: torch.dtype + weight_type: ScalarType + output_type: Optional[torch.dtype] + group_scale_type: Optional[torch.dtype] + group_zero_type: Optional[torch.dtype] + channel_scale_type: Optional[torch.dtype] + token_scale_type: Optional[torch.dtype] + + +def rand_data(shape, dtype=torch.float16, scale=1): + if dtype.is_floating_point: + return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype) + else: + return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") + + +def quantize_and_pack(atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False): + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, + wtype, + group_size=group_size, + zero_points=zero_points, + # to match how the kernel applies zps + ref_zero_points_after_scales=True) -def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor: w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) - w_q = w_q.t().contiguous().t() # make col major - return ops.machete_prepack_B(w_q, wtype) + return w_ref, w_q, w_s, w_zp -def make_bench_tensors( - atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int, - k: int -) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor, - torch.tensor]]]: - assert wtype.is_integer(), "TODO: support floating point weights" +def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, + group_size: Optional[int]) -> List[BenchmarkTensors]: + m, n, k = shape # we want to make sure that weights don't fit into L2 cache between runs so # we construct enough weights to exceed L2 cache, which is 50mb on a H100 # so we target total weight size > 2*50mb - num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits)) - - a = torch.randn((m, k), device="cuda", dtype=atype) * 5 - weights = [ - torch.randn((k, n), device="cuda", dtype=atype) - for _ in range(num_weights) - ] - quanitized_weights = [ - quantize_weights(w, wtype, group_size) for w in weights - ] - - return a, quanitized_weights + num_weights = math.ceil(2 * 50 * 1024**2 * 8 / + (k * n * types.weight_type.size_bits)) + + a = rand_data((m, k), types.act_type, scale=5) + + benchmark_tensors: List[BenchmarkTensors] = [] + for _ in range(num_weights): + w = rand_data((k, n), types.act_type, scale=5) + + if types.group_scale_type is not None: + w = w.to(types.group_scale_type) + if w.dtype.itemsize == 1: + w = w.to(torch.float16) + + w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( + a.dtype, w, types.weight_type, types.group_scale_type, group_size, + types.group_zero_type is not None) + + if not a.dtype.is_floating_point: + aiinfo = torch.iinfo(a.dtype) + w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) + + w_ref = w_ref.to(torch.float32) + + w_ch_s = None if types.channel_scale_type is None else\ + rand_data((n,), types.channel_scale_type) + w_tok_s = None if types.token_scale_type is None else\ + rand_data((m,), types.token_scale_type) + + benchmark_tensors.append( + BenchmarkTensors(w_ref=w_ref, + a=a, + w_q=w_q_packed, + wtype=types.weight_type, + w_g_s=w_s, + w_g_zp=w_zp, + group_size=group_size, + w_ch_s=w_ch_s, + w_tok_s=w_tok_s)) + + return benchmark_tensors + + +def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable: + a = bt.a + w = bt.w_ref.to(bt.a.dtype) # use float reference tensor + if a.dtype not in [torch.float16, torch.bfloat16]: + a = a.to(torch.float16) + w = w.to(torch.float16) + return lambda: torch.matmul(a, w) + + +def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: + if bt.w_ch_s is not None and bt.w_tok_s is not None: + scale_a = bt.w_tok_s.to(torch.float32) + scale_b = bt.w_ch_s.to(torch.float32) + else: + scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() + return lambda: ops.cutlass_scaled_mm( + bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16) + + +def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: + device = bt.a.device + + workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + if bt.w_g_zp is None: + w_zp = torch.empty(0, dtype=torch.int, device=device) + else: + w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0], + bt.w_ref.shape[1], bt.wtype.size_bits) + + if bt.group_size is None: + w_s = torch.tensor([], device="cuda", dtype=torch.half) + else: + w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0], + bt.w_ref.shape[1], bt.group_size) + + sort_indices = torch.empty(0, dtype=torch.int, device=device) + g_idx = torch.empty(0, dtype=torch.int, device=device) + w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0], + bt.w_ref.shape[1], bt.wtype.size_bits) + + if bt.a.dtype.is_floating_point: + assert bt.w_ch_s is None + assert bt.w_tok_s is None + assert bt.group_size is not None + + fn = lambda: ops.gptq_marlin_gemm(a=bt.a, + b_q_weight=w_q, + b_scales=w_s, + b_zeros=w_zp, + g_idx=g_idx, + perm=sort_indices, + workspace=workspace.scratch, + b_q_type=bt.wtype, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + is_k_full=True) + else: + assert bt.a.dtype == torch.int8 + assert bt.wtype == scalar_types.uint4b8 + + if bt.w_ch_s is not None: + s_ch = bt.w_ch_s.to(torch.float32) + else: + s_ch = torch.ones(bt.w_ref.shape[1], + dtype=torch.float32, + device=device) + + if bt.w_tok_s is not None: + s_tok = bt.w_tok_s.to(torch.float32) + else: + s_tok = torch.ones(bt.a.shape[0], + dtype=torch.float32, + device=device) + + fn = lambda: ops.marlin_qqq_gemm(a=bt.a, + b_q_weight=w_q, + s_group=w_s, + s_tok=s_tok, + s_ch=s_ch, + workspace=workspace.scratch, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0]) + + return fn + + +def machete_create_bench_fn(bt: BenchmarkTensors, + out_type=torch.dtype, + schedule=None) -> Callable: + w_q = bt.w_q.t().contiguous().t() # make col major + w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype, + None if bt.w_g_s is None else bt.w_g_s.dtype) + + w_g_zp = bt.w_g_zp + if w_g_zp is not None: + w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype)) + + return lambda: ops.machete_mm( + a=bt.a, + b_q=bt.w_q, + b_type=bt.wtype, + b_group_scales=bt.w_g_s, + b_group_zeros=w_g_zp, + b_group_size=bt.group_size, + b_channel_scales=bt.w_ch_s, + a_token_scales=bt.w_tok_s, + out_type=out_type, + schedule=schedule, + ) # impl - # bench -def bench_fn(label: str, sub_label: str, description: str, - fn: Callable) -> TMeasurement: - min_run_time = 1 - return TBenchmark.Timer( - stmt="fn()", + +def bench_fns(label: str, sub_label: str, description: str, + fns: List[Callable]): + + min_run_time = 1 if not NVTX_PROFILE else 0.1 + res = TBenchmark.Timer( + stmt=""" + for fn in fns: + fn() + """, globals={ - "fn": fn + "fns": fns }, label=label, sub_label=sub_label, description=description, ).blocked_autorange(min_run_time=min_run_time) + if NVTX_PROFILE: + with nvtx.annotate("mm-bench"), nvtx.annotate( + f"{label}|{sub_label}|{description}"): + fns[0]() -def loop_over_weights( - a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor, - torch.tensor, torch.tensor]], - fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor], - None]): - for w_ref, w_q, w_s, _ in weights: - fn(a, w_ref, w_q, w_s) + return res _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None -def bench(atype: torch.dtype, - wtype: ScalarType, +def bench(types: TypeConfig, group_size: int, m: int, k: int, n: int, label: str, sub_label: str, - benchmark_marlinv1: bool = True, - sweep_schedules: bool = True) -> Iterable[TMeasurement]: - global _SWEEP_SCHEDULES_RESULTS - - a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k) - sub_label += f", L={len(weights)}" - - weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp) - for w_ref, w_q, w_s, w_zp in weights] + sweep_schedules: bool = True) -> List[TMeasurement]: + benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) + sub_label += f", L={len(benchmark_tensors)}" + + name_type_string = f"W{types.weight_type}"+\ + f"-A{terse_type_name(types.act_type)}" + if types.group_scale_type is not None: + name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" + if types.group_zero_type is not None: + name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}" + if group_size is not None: + name_type_string += f"-G{group_size}" + if types.channel_scale_type is not None: + name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}" + if types.token_scale_type is not None: + name_type_string += f"-TS{terse_type_name(types.token_scale_type)}" timers = [] # pytorch impl timers.append( - bench_fn( - label, sub_label, "torch.matmul", lambda: loop_over_weights( - a, - weights, - lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref), - ))) + bench_fns( + label, sub_label, "torch.matmul (fp16)", + [torch_matmul_f16_create_bench_fn(bt) + for bt in benchmark_tensors])) - if benchmark_marlinv1: - w_ref = weights[0][0] - - w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device) - sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device) - g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device) - - def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor: - w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape) - return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape, - wtype.size_bits) - - def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor: - return marlin_permute_scales(w_s, *w_ref.shape, group_size) - - weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q), - marlinv1_permute_scales(w_s), w_zp) - for w_ref, w_q, w_s, w_zp in weights] - - workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL) - - # marlinv1 + if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: + timers.append( + bench_fns( + label, sub_label, + f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [ + cutlass_scaled_mm_create_bench_fn(bt) + for bt in benchmark_tensors + ])) + + if types.act_type != torch.float8_e4m3fn: timers.append( - bench_fn( - label, sub_label, "marlin_orig", lambda: loop_over_weights( - a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops. - gptq_marlin_gemm(a, - w_q, - w_s, - w_zp_empty, - g_idx, - sort_indices, - workspace.scratch, - wtype, - size_m=a.shape[0], - size_n=w_ref.shape[1], - size_k=w_ref.shape[0], - is_k_full=True)))) + bench_fns(label, sub_label, f"marlin ({name_type_string})", + [marlin_create_bench_fn(bt) + for bt in benchmark_tensors])) # machete timers.append( - bench_fn( - label, sub_label, "machete_heuristic", lambda: loop_over_weights( - a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm( - a, w_q, wtype, b_scales=w_s, b_group_size=group_size)))) + bench_fns(label, sub_label, f"machete ({name_type_string})", [ + machete_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ])) if sweep_schedules: + global _SWEEP_SCHEDULES_RESULTS + print("Finding best schedule for machete") best = None best_schedule = None - schedules = ops.machete_supported_schedules(wtype) + schedules = ops.machete_supported_schedules( + a_type=types.act_type, + b_type=types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_zero_type, + token_scales_type=types.token_scale_type, + channel_scales_type=types.channel_scale_type, + out_type=types.output_type) + + if schedules is None or len(schedules) == 0: + raise ValueError("No schedules found to sweep") + for schedule in reversed(schedules): schedule_M = int(schedule.split("_")[0].split("x")[1]) @@ -177,16 +380,11 @@ def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor: if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: continue - def run(a, _, w_q, w_s, schedule=schedule): - ops.machete_gemm(a, - w_q, - wtype, - w_s, - b_group_size=group_size, - schedule=schedule) - - res = bench_fn(label, sub_label, "machete_best", - lambda: loop_over_weights(a, weights_machete, run)) + res = bench_fns(label, sub_label, "machete_best", [ + machete_create_bench_fn( + bt, out_type=types.output_type, schedule=schedule) + for bt in benchmark_tensors + ]) results_row = { "M": m, @@ -213,25 +411,33 @@ def run(a, _, w_q, w_s, schedule=schedule): # runner -def print_timers(timers: Iterable[TMeasurement]): +def print_timers(timers: List[TMeasurement]): compare = TBenchmark.Compare(timers) compare.print() -def run(dtype: torch.dtype, sweep_schedules: bool, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: +def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + types = TypeConfig( + act_type=args.act_type, + weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ + else scalar_types.uint4, + output_type=args.out_type, + group_scale_type=args.group_scale_type, + group_zero_type=args.group_zero_type, + channel_scale_type=args.channel_scale_type, + token_scale_type=args.token_scale_type, + ) - results = [] + results: List[TMeasurement] = [] for m, k, n in MKNs: - timers = bench(dtype, - scalar_types.uint4b8, - 128, + timers = bench(types, + args.group_size, m, k, n, - f"{dtype}-gemm", + f"{args.act_type}-gemm", f"MKN=({m}x{k}x{n})", - sweep_schedules=sweep_schedules) + sweep_schedules=args.sweep_schedules) print_timers(timers) results.extend(timers) @@ -240,7 +446,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool, # output makers def make_output( - data: Iterable[TMeasurement], + data: List[TMeasurement], MKNs: Iterable[Tuple[int, int, int]], base_description: str, timestamp=None, @@ -262,7 +468,6 @@ def run_square_bench(args): dim_sizes = list( range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, args.sweep_schedules, MKNs) make_output(data, MKNs, f"square_bench-{args.dtype}") @@ -306,33 +511,49 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: for k, n in KNs: MKNs.append((m, k, n)) - data = run(args.dtype, args.sweep_schedules, MKNs) + data = run(args, MKNs) model_bench_data.append(data) + type_string = f"{args.act_type}" + # Print all results for data, model_tp in zip(model_bench_data, models_tps): model, tp_size = model_tp - print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print(f"== Results {type_string} {model}-TP{tp_size} ====") print_timers(data) - timestamp = int(time.time()) + timestr = time.strftime("%Y%m%d-%H%M%S") - all_data = [] + all_results = [] for d in model_bench_data: - all_data.extend(d) + all_results.extend(d) + # pickle all data - with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: - pkl.dump(all_data, f) + with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: + args_dict = vars(args) + args_dict.pop("func") + pkl.dump({ + "args": args_dict, + "results": all_results, + }, f) if __name__ == "__main__": def to_torch_dtype(dt): - if dt == "bfloat16": - return torch.bfloat16 - if dt == "float16": - return torch.float16 - raise ValueError("unsupported dtype") + return { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "int8": torch.int8, + "float8_e4m3fn": torch.float8_e4m3fn, + "int": torch.int, + "float": torch.float, + }[dt] + + class ToTorchDtype(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, to_torch_dtype(values)) parser = FlexibleArgumentParser( description=""" @@ -352,12 +573,42 @@ def to_torch_dtype(dt): """, # noqa: E501 formatter_class=argparse.RawTextHelpFormatter, ) - parser.add_argument( - "--dtype", - type=to_torch_dtype, + "--act-type", + action=ToTorchDtype, required=True, - help="Available options are ['bfloat16', 'float16']", + choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'], + ) + parser.add_argument( + "--group-scale-type", + action=ToTorchDtype, + choices=['bfloat16', 'float16'], + ) + parser.add_argument( + "--group-zero-type", + type=to_torch_dtype, + choices=['bfloat16', 'float16'], + ) + parser.add_argument( + "--channel-scale-type", + action=ToTorchDtype, + choices=['float'], + ) + parser.add_argument( + "--token-scale-type", + action=ToTorchDtype, + choices=['float'], + ) + parser.add_argument( + "--out-type", + action=ToTorchDtype, + choices=['bfloat16', 'float16'], + ) + parser.add_argument( + "--group-size", + type=int, + help="Available options are ['None', '-1', '128'], default=128", + default=128, ) parser.add_argument( "--sweep-schedules", diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py index de608fd05af70..7d0bd84150a27 100644 --- a/benchmarks/kernels/graph_machete_bench.py +++ b/benchmarks/kernels/graph_machete_bench.py @@ -20,10 +20,11 @@ args = parser.parse_args() with open(args.filename, 'rb') as f: - data: List[TMeasurement] = pickle.load(f) + data = pickle.load(f) + raw_results: List[TMeasurement] = data["results"] results = defaultdict(lambda: list()) - for v in data: + for v in raw_results: result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) if result is not None: KN = result.group(1) diff --git a/benchmarks/kernels/weight_shapes.py b/benchmarks/kernels/weight_shapes.py index 25ec9d6028627..51f24f3ba1774 100644 --- a/benchmarks/kernels/weight_shapes.py +++ b/benchmarks/kernels/weight_shapes.py @@ -40,4 +40,10 @@ ([8192, 57344], 1), ([28672, 8192], 0), ], + "meta-llama/Llama-3.1-405b-hf": [ + ([16384, 18432], 1), + ([16384, 16384], 0), + ([16384, 106496], 1), + ([53248, 16384], 0), + ], } diff --git a/csrc/cutlass_extensions/cute_utils.cuh b/csrc/cutlass_extensions/cute_utils.cuh index 1842fab8b2cac..f61fe3ceb978a 100644 --- a/csrc/cutlass_extensions/cute_utils.cuh +++ b/csrc/cutlass_extensions/cute_utils.cuh @@ -20,9 +20,9 @@ CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) { // is the layout f(x) = x template CUTE_HOST_DEVICE static constexpr bool is_identity_layout() { - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { return true; - else { + } else { constexpr auto coalesced_layout = coalesce(Layout{}); if constexpr (rank(coalesced_layout) == 1 && stride<0>(coalesced_layout) == 1) { diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp similarity index 99% rename from csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp rename to csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp index d407d66ab2aa6..7aa87feb4cce2 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp +++ b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp @@ -52,6 +52,7 @@ // clang-format off #include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp" +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" #include "cute/tensor.hpp" namespace cutlass::epilogue::threadblock { diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp similarity index 100% rename from csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp rename to csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp new file mode 100644 index 0000000000000..c69e87999ae71 --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -0,0 +1,317 @@ +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp" + +/* + This file defines custom epilogues for fusing channel scales, token scales, + bias, and activation zero-points onto a GEMM operation using the + CUTLASS 2.x API, for sm80 (Ampere) NVIDIA GPUs. + + Epilogues must contain a public type named EVTCompute of type Sm80EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace vllm::c2x { + +using namespace cute; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + + template + using ColOrScalarLoad = + cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< + OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = + cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + template + using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast< + OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; + + template + using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + template + using RowOrZeroLoad = + cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + // it would technically work but no use case as data_ptr is never nullptr + static_assert(!std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(c10::optional const& tensor) { + static_assert(std::is_same_v>); + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : protected ScaledEpilogueBase { + protected: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzp + : protected ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowOrZeroLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzpToken + : protected ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowOrZeroLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::threadblock::Sm80EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +}; // namespace vllm::c2x \ No newline at end of file diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp new file mode 100644 index 0000000000000..95764ecddc79f --- /dev/null +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -0,0 +1,315 @@ +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" + +/* + This file defines custom epilogues for fusing channel scales, token scales, + bias, and activation zero-points onto a GEMM operation using the + CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later. + + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace vllm::c3x { + +using namespace cute; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + template + using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<1>, Int<0>>>; + + // Don't want to support nullptr by default + template + using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // Don't want to support nullptr by default + template + using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + static_assert(!std::is_same_v> && + !std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(c10::optional const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch.scaled_mm_. + + A and B may be both either int8 or fp8_e4m3. A can be + quantized per-tensor or per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzp + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzpToken + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +}; // namespace vllm::c3x \ No newline at end of file diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index 4fcfcd311aa91..a5beea1a35e49 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -35,6 +35,35 @@ class MixedInputKernelScheduleType(enum.Enum): } } +VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { + **DataTypeSize, # type: ignore + **{ + VLLMDataType.u4b8: 4, + VLLMDataType.u8b128: 8, + } +} + +VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { + VLLMDataType.u4b8: "vllm::kU4B8", + VLLMDataType.u8b128: "vllm::kU8B128", + DataType.u4: "vllm::kU4", + DataType.u8: "vllm::kU8", + DataType.s4: "vllm::kS4", + DataType.s8: "vllm::kS8", + DataType.f16: "vllm::kFloat16", + DataType.bf16: "vllm::kBfloat16", +} + +VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { + DataType.u8: "at::ScalarType::Byte", + DataType.s8: "at::ScalarType::Char", + DataType.e4m3: "at::ScalarType::Float8_e4m3fn", + DataType.s32: "at::ScalarType::Int", + DataType.f16: "at::ScalarType::Half", + DataType.bf16: "at::ScalarType::BFloat16", + DataType.f32: "at::ScalarType::Float", +} + VLLMKernelScheduleTag: Dict[Union[ MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore diff --git a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh index 2ad914f8e9868..90f226cf64c0a 100644 --- a/csrc/cutlass_extensions/vllm_numeric_conversion.cuh +++ b/csrc/cutlass_extensions/vllm_numeric_conversion.cuh @@ -3,6 +3,7 @@ #include "cutlass/numeric_conversion.h" #include "cutlass_extensions/vllm_custom_types.cuh" #include "cutlass_extensions/cute_utils.cuh" +#include "cutlass_extensions/vllm_type_utils.cuh" // this file extends: // https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h @@ -28,8 +29,19 @@ struct InterleavedNumericArrayConverter { CUTLASS_DEVICE static result_type convert(source_type const& source) { - CUTE_INVALID_CONTROL_PATH( - "InterleavedNumericArrayConverter not implemented\n"); + if (cute::elect_one_sync()) { + if constexpr (std::is_same_v) { + printf( + "Convert %s <= %s (N = %d, IlvBlkLayout = void), not implemented\n", + nameof_v, nameof_v, N); + } else { + printf( + "Convert %s <= %s (N = %d, size(IlvBlkLayout{}) = %d), not " + "implemented\n", + nameof_v, nameof_v, N, size(IlvBlkLayout{})); + } + __brkpt(); + } return {}; } @@ -56,11 +68,6 @@ struct InterleavedNumericArrayConverter< result_type operator()(source_type const& s) const { return convert(s); } }; -// TODO (LucasWilkinson): Implement -// for Array <= Array - -// .... - template struct ArrayConverterPacked32Bit { using result_type = Array; @@ -86,14 +93,16 @@ struct ArrayConverterPacked32Bit { using ScalarConverter = NumericConverter; template - CUTLASS_DEVICE static uint32_t to_reg(PackedSrc const& source) { + CUTLASS_DEVICE static auto to_regs(PackedSrc const& src) { if constexpr (sizeof(PackedSrc) == 1) { - return static_cast(reinterpret_cast(source)); + return Array{reinterpret_cast(src)}; } else if constexpr (sizeof(PackedSrc) == 2) { - return static_cast(reinterpret_cast(source)); + return Array{reinterpret_cast(src)}; + } else if constexpr (sizeof(PackedSrc) == 4) { + return Array{reinterpret_cast(src)}; } else { - static_assert(sizeof(PackedSrc) == 4); - return reinterpret_cast(source); + static_assert(sizeof(PackedSrc) == 8); + return reinterpret_cast const&>(src); } } @@ -110,7 +119,7 @@ struct ArrayConverterPacked32Bit { static_assert(std::is_same_v); static_assert(std::is_same_v); - return RegConvert32bit::template convert(to_reg(source)); + return RegConvert32bit::template convert(to_regs(source)); } friend class detail::VectorizedConverter; @@ -140,6 +149,131 @@ struct ArrayConverterPacked32Bit { } }; +// Convert 8 4bit values packed into a 32bit register to 8 8bit values packed +// into 2 32bit register. +template +CUTLASS_DEVICE cutlass::AlignedArray lut_4bit_to_8bit_convert( + uint32_t src) { + cutlass::AlignedArray r; + // Determines if the value is in the top half of the LUT if set or + // (i.e. LUT[8:15]) in the bottom half (i.e. LUT[0:7]) if not set. Then move + // into bit position 0x4 of each nibble so when or'd with final_prmt_base it + // selects the correct candidate. When elements in final_prmt_base + // are >= 0x4, the high candidate is selected (i.e. LUT[8:15]), when elements + // are < 0x4, the low candidate is selected (i.e. LUT[0:7]) + uint32_t high_bit = (src & 0x88888888) >> 1; + + // `high_bit` is OR'd with 0x31203120 to find the correct value in the LUT + // (selects correct high or low candidate) + const uint32_t final_prmt_base = 0x32103210; + + // Ignore the high bit when indexing into LUT, for each 4bit value + // we index into both the high and low candidates then use + // high_bit | final_prmt_base to select the correct candidate + uint32_t lut_idx = (src & 0x77777777); + + auto pack = [](uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) | + (uint32_t(d) << 24); + }; + + static constexpr uint32_t LOW_0 = pack(LUT0, LUT1, LUT2, LUT3); + static constexpr uint32_t LOW_1 = pack(LUT4, LUT5, LUT6, LUT7); + static constexpr uint32_t HIGH_0 = pack(LUT8, LUT9, LUT10, LUT11); + static constexpr uint32_t HIGH_1 = pack(LUT12, LUT13, LUT14, LUT15); + + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 2; ++ii, lut_idx >>= 16, high_bit >>= 16) { + uint32_t final_prmt_idx = final_prmt_base | high_bit; + + // This uses a look up table to convert packed int4s to packed int8s, + // using the int4 value as the index to prmt. It first select both the + // high and low candidates, then uses the high bit (i.e. `high_bit`) to + // select the correct candidate. + asm volatile( + "{\n" + " .reg .b32 low, high;\n" + " prmt.b32 low, %1, %2, %5;\n" + " prmt.b32 high, %3, %4, %5;\n" + " prmt.b32 %0, low, high, %6;\n" + "}\n" + : "=r"(r[ii]) + : "n"(LOW_0), "n"(LOW_1), "n"(HIGH_0), "n"(HIGH_1), "r"(lut_idx), + "r"(final_prmt_idx)); + } + + return r; +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as int8s + auto r = lut_4bit_to_8bit_convert<0xF8, 0xF9, 0xFA, 0xFB, // + 0xFC, 0xFD, 0xFE, 0xFF, // + 0x00, 0x01, 0x02, 0x03, // + 0x04, 0x05, 0x06, 0x07>(src_[0]); + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as fp8s + auto r = lut_4bit_to_8bit_convert<0xD0, 0xCE, 0xCC, 0xCA, // + 0xC8, 0xC4, 0xC0, 0xB8, // + 0x00, 0x38, 0x40, 0x44, // + 0x48, 0x4A, 0x4C, 0x4E>(src_[0]); + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + // for Array <= Array template struct NumericArrayConverter { @@ -148,7 +282,8 @@ struct NumericArrayConverter { struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -249,7 +384,8 @@ struct InterleavedNumericArrayConverter, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -338,7 +474,8 @@ struct InterleavedNumericArrayConverter, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -417,7 +554,8 @@ struct NumericArrayConverter { struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; // Hold output FP16s in reg. We need 1 reg for every 2 elements using RegArray = cutlass::AlignedArray { private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; PackedResultType r; // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of @@ -513,7 +652,8 @@ struct NumericArrayConverter { private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src_reg) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src_reg = src_[0]; // Hold output BF16s in reg. We need 1 reg for every 2 elements using RegArray = cutlass::AlignedArray, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -671,7 +812,8 @@ struct InterleavedNumericArrayConverter, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -788,6 +930,61 @@ struct NumericArrayConverter { #endif +// for Array <= Array +// FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + struct RegConvert { + // FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 + template + CUTLASS_DEVICE static PackedResultType convert( + Array src) { + // Hold output int8s in reg. We need 1 reg for every 4 elements + using RegArray = cutlass::AlignedArray< + uint32_t, std::max(PackedResultType::kElements / 4, size_t(1))>; + RegArray r; + + static constexpr uint32_t MAGIC_BIAS_ = 0x64806480; + auto MAGIC_BIAS = *reinterpret_cast(&MAGIC_BIAS_); + + *reinterpret_cast(&src[0]) = + __hadd2(*reinterpret_cast(&src[0]), MAGIC_BIAS); + + if constexpr (src_regs > 1) { + *reinterpret_cast(&src[1]) = + __hadd2(*reinterpret_cast(&src[1]), MAGIC_BIAS); + } + + static_assert(PackedResultType::kElements <= 4); + uint32_t uint8s; + static constexpr uint32_t MASK_0246 = 0x6420; + static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080; + asm volatile("prmt.b32 %0,%1,%2,%3;\n" + : "=r"(uint8s) + : "r"(src[0]), "r"((src_regs > 1) ? src[1] : src[0]), + "n"(MASK_0246)); + + uint32_t int8s = (uint8s ^ UINT8s_TO_INT8s_MASK); + + return reinterpret_cast(int8s); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/csrc/cutlass_extensions/vllm_type_utils.cuh b/csrc/cutlass_extensions/vllm_type_utils.cuh new file mode 100644 index 0000000000000..500ed508c8303 --- /dev/null +++ b/csrc/cutlass_extensions/vllm_type_utils.cuh @@ -0,0 +1,42 @@ +#include "cutlass/bfloat16.h" +#include "cutlass/half.h" +#include "cuda_bf16.h" + +#include "cutlass_extensions/vllm_custom_types.cuh" + +namespace cutlass { + +template +struct nameof { + static constexpr char const* value = "unknown"; +}; + +template +inline constexpr auto nameof_v = nameof::value; + +#define NAMEOF_TYPE(T) \ + template <> \ + struct nameof { \ + static constexpr char const* value = #T; \ + }; + +NAMEOF_TYPE(float_e4m3_t) +NAMEOF_TYPE(float_e5m2_t) +NAMEOF_TYPE(half_t) +NAMEOF_TYPE(nv_bfloat16) +NAMEOF_TYPE(bfloat16_t) +NAMEOF_TYPE(float) + +NAMEOF_TYPE(int4b_t) +NAMEOF_TYPE(int8_t) +NAMEOF_TYPE(int32_t) +NAMEOF_TYPE(int64_t) + +NAMEOF_TYPE(vllm_uint4b8_t) +NAMEOF_TYPE(uint4b_t) +NAMEOF_TYPE(uint8_t) +NAMEOF_TYPE(vllm_uint8b128_t) +NAMEOF_TYPE(uint32_t) +NAMEOF_TYPE(uint64_t) + +}; // namespace cutlass \ No newline at end of file diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index ee801e16573d4..dbb72e8bbd3f5 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -8,6 +8,10 @@ #include "scaled_mm_c2x_sm89_fp8_dispatch.cuh" #include "scaled_mm_c2x_sm89_int8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp" + +using namespace vllm; + /* This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). @@ -22,12 +26,11 @@ void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm75_dispatch( + return cutlass_gemm_sm75_dispatch( out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm75_dispatch( + return cutlass_gemm_sm75_dispatch( out, a, b, std::forward(epilogue_args)...); } } @@ -42,10 +45,10 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales); } } @@ -61,10 +64,10 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } @@ -78,12 +81,11 @@ void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm80_dispatch( + return cutlass_gemm_sm80_dispatch( out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm80_dispatch( + return cutlass_gemm_sm80_dispatch( out, a, b, std::forward(epilogue_args)...); } } @@ -98,10 +100,10 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales); } } @@ -117,10 +119,10 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } @@ -134,13 +136,12 @@ void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm89_int8_dispatch( + return cutlass_gemm_sm89_int8_dispatch( out, a, b, std::forward(epilogue_args)...); } else { assert(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm89_int8_dispatch( + return cutlass_gemm_sm89_int8_dispatch( out, a, b, std::forward(epilogue_args)...); } } else { @@ -148,13 +149,13 @@ void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm89_fp8_dispatch< - cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue>( + return cutlass_gemm_sm89_fp8_dispatch( out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm89_fp8_dispatch( + return cutlass_gemm_sm89_fp8_dispatch( out, a, b, std::forward(epilogue_args)...); } } @@ -170,10 +171,10 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales); } } @@ -189,10 +190,10 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh index 6329ff63623e2..d03242f44ab1d 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh @@ -21,7 +21,6 @@ #include "cutlass/epilogue/threadblock/fusion/visitors.hpp" #include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" -#include "broadcast_load_epilogue_c2x.hpp" #include "common.hpp" // clang-format on @@ -71,307 +70,6 @@ struct enable_sm89_to_sm90 : Kernel { #endif } }; - -/* - * This class provides the common load descriptors for the - * ScaledEpilogue[...] classes - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; - - template - using ColOrScalarLoad = - cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< - OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; - - template - using RowOrScalarLoad = - cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< - OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; - - template - using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast< - OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; - - template - using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast< - OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; - - template - using RowOrZeroLoad = - cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast< - OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; - - // This utility function constructs the arguments for the load descriptors - // from a tensor. It can handle both row and column, as well as row/column or - // scalar cases. - template - static auto args_from_tensor(torch::Tensor const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = static_cast(tensor.data_ptr()); - if constexpr (std::is_same_v> || - std::is_same_v>) { - return Arguments{data_ptr, tensor.numel() != 1}; - } else { - // it would technically work but no use case as data_ptr is never nullptr - static_assert(!std::is_same_v>); - return Arguments{data_ptr}; - } - } - - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. - template - static auto args_from_tensor(c10::optional const& tensor) { - static_assert(std::is_same_v>); - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; - return Arguments{data_ptr}; - } -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; - } -}; - -/* - * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. - * This bias can also be used in the per-tensor azp case, where the activation - * zero point (azp) is used to compute an azp correction term, - * which is folded into the bias. - * - * The bias tensor must be per-output channel. - * ScaleA and ScaleB can be per-tensor or per-token/per-channel. - */ -template -struct ScaledEpilogueBias - : protected ScaledEpilogueBase { - protected: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; - } -}; - -/* - * This epilogue directly supports per-tensor azp in int32 form. - * As opposed to the per-token epilogue below, this epilogue only has an azp_adj - * term, which should already be multiplied with the scalar azp. - * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzp - : protected ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowOrZeroLoad; - - // This is the full AZP term, azp * J @ B, shape (1,n) - using AzpWithAdj = typename SUPER::template RowLoad; - - // Compute float(accum - azp_adj), both operands are int32_t - using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - -/* - * This epilogue supports per-token azp by computing and applying - * the correction term using a rank-1 update. If the term were materialized, - * it would require O(m*n) space, and this way it only requires O(m+n) space. - * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero - * point for each row of A. - * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzpToken - : protected ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowOrZeroLoad; - - // Per-token azp term, shape (m,1) - using Azp = typename SUPER::template ColLoad; - - // This is the AZP adjustment term, J @ B, shape (1,n) - using AzpAdj = typename SUPER::template RowLoad; - - // Compute azp * azp_adj - using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, int32_t, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::threadblock::Sm80EVT; - - // Compute float(accum - azp*azp_adj), all operands are int32_t - using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAcc = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_args = SUPER::template args_from_tensor(azp); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - template typename ArchGuard, typename ElementAB_, typename ElementD_, template typename Epilogue_, typename TileShape, diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 292c9e4b34e1c..33581a63d4c3d 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -23,11 +23,12 @@ #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" -#include "broadcast_load_epilogue_c3x.hpp" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" #include "common.hpp" // clang-format on using namespace cute; +using namespace vllm; /* This file defines quantized GEMM operations using the CUTLASS 3.x API, for @@ -56,305 +57,6 @@ struct enable_sm90_or_later : Kernel { #endif } }; - -/* - * This class provides the common load descriptors for the - * ScaledEpilogue[...] classes - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::fusion::Sm90AccFetch; - - template - using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>>; - - template - using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>>; - - // Don't want to support nullptr by default - template - using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; - - // Don't want to support nullptr by default - template - using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; - - // This utility function constructs the arguments for the load descriptors - // from a tensor. It can handle both row and column, as well as row/column or - // scalar cases. - template - static auto args_from_tensor(torch::Tensor const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = static_cast(tensor.data_ptr()); - if constexpr (std::is_same_v> || - std::is_same_v>) { - return Arguments{data_ptr, tensor.numel() != 1}; - } else { - static_assert(!std::is_same_v> && - !std::is_same_v>); - return Arguments{data_ptr}; - } - } - - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. - template - static auto args_from_tensor(c10::optional const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; - static_assert(std::is_same_v> || - std::is_same_v>); - return Arguments{data_ptr}; - } -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch.scaled_mm_. - - A and B may be both either int8 or fp8_e4m3. A can be - quantized per-tensor or per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; - } -}; - -/* - * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. - * This bias can also be used in the per-tensor azp case, where the activation - * zero point (azp) is used to compute an azp correction term, - * which is folded into the bias. - * - * The bias tensor must be per-output channel. - * ScaleA and ScaleB can be per-tensor or per-token/per-channel. - */ -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - - using ArgumentType = typename EVTCompute::Arguments; - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; - } -}; - -/* - * This epilogue directly supports per-tensor azp in int32 form. - * As opposed to the per-token epilogue below, this epilogue only has an azp_adj - * term, which should already be multiplied with the scalar azp. - * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzp - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - // This is the full AZP term, azp * J @ B, shape (1,n) - using AzpWithAdj = typename SUPER::template RowLoad; - - // Compute float(accum - azp_adj), both operands are int32_t - using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - -/* - * This epilogue supports per-token azp by computing and applying - * the correction term using a rank-1 update. If the term were materialized, - * it would require O(m*n) space, and this way it only requires O(m+n) space. - * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero - * point for each row of A. - * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzpToken - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - // Per-token azp term, shape (m,1) - using Azp = typename SUPER::template ColLoad; - - // This is the AZP adjustment term, J @ B, shape (1,n) - using AzpAdj = typename SUPER::template RowLoad; - - // Compute azp * azp_adj - using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, int32_t, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::fusion::Sm90EVT; - - // Compute float(accum - azp*azp_adj), all operands are int32_t - using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAcc = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_args = SUPER::template args_from_tensor(azp); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - template typename Epilogue_, typename TileShape, typename ClusterShape, typename KernelSchedule, @@ -721,11 +423,11 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == c.dtype(), "currently bias dtype must match output dtype ", c.dtype()); - return cutlass_scaled_mm_sm90_epilogue( + return cutlass_scaled_mm_sm90_epilogue( c, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm90_epilogue(c, a, b, a_scales, - b_scales); + return cutlass_scaled_mm_sm90_epilogue( + c, a, b, a_scales, b_scales); } } @@ -740,10 +442,10 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm90_epilogue( + return cutlass_scaled_mm_sm90_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm90_epilogue( + return cutlass_scaled_mm_sm90_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index d126af1849024..ac63afe79a255 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -3,8 +3,10 @@ import os import shutil from collections.abc import Iterable -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from copy import deepcopy +from dataclasses import dataclass, fields +from functools import reduce +from typing import Dict, List, Optional, Tuple, Union import jinja2 # yapf conflicts with isort for this block @@ -14,7 +16,10 @@ MixedInputKernelScheduleType, TileSchedulerTag, TileSchedulerType, VLLMDataType, - VLLMDataTypeNames, VLLMDataTypeTag, + VLLMDataTypeNames, + VLLMDataTypeSize, VLLMDataTypeTag, + VLLMDataTypeTorchDataTypeTag, + VLLMDataTypeVLLMScalarTypeTag, VLLMKernelScheduleTag) # yapf: enable @@ -27,49 +32,125 @@ #include "../machete_mm_launcher.cuh" namespace machete { -using GemmDispatcher_ = GemmDispatcher< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - {{DataTypeTag[type_config.element_b_scale]}}, // Scales - {{DataTypeTag[type_config.element_b_zeropoint]}}>; // Zeropoints - -{% for s in schedules %}extern torch::Tensor -impl_{{type_name}}_sch_{{ gen_sch_name(s) }}(PyTorchArguments args); -{% endfor %} -template <> -torch::Tensor GemmDispatcher_::dispatch(PyTorchArguments args) { + +{% for impl_config in impl_configs %} +{% set type_sig = gen_type_sig(impl_config.types) -%} +{% for s in impl_config.schedules %} +extern torch::Tensor impl_{{type_sig}}_sch_{{gen_sch_sig(s)}}(MMArgs); +{%- endfor %} + +torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) { [[maybe_unused]] auto M = args.A.size(0); [[maybe_unused]] auto N = args.B.size(1); [[maybe_unused]] auto K = args.A.size(1); - if (!args.schedule) { - {%- for cond, s in heuristic %} + if (!args.maybe_schedule) { + {%- for cond, s in impl_config.heuristic %} {%if cond is not none%}if ({{cond}}) {%- else %}else {%- endif %} - return impl_{{ type_name }}_sch_{{ gen_sch_name(s) }}(args);{% endfor %} + return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args);{% endfor %} } - {% for s in schedules %} - if (*args.schedule == "{{ gen_sch_name(s) }}") { - return impl_{{ type_name }}_sch_{{ gen_sch_name(s) }}(args); - } - {% endfor %} + {%- for s in impl_config.schedules %} + if (*args.maybe_schedule == "{{ gen_sch_sig(s) }}") + return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args); + {%- endfor %} TORCH_CHECK_NOT_IMPLEMENTED(false, "machete_gemm(..) is not implemented for " - "schedule = ", *args.schedule); + "schedule = ", *args.maybe_schedule); } +{%- endfor %} + -template <> -std::vector GemmDispatcher_::supported_schedules() { - return { - {% for s in schedules -%} - "{{ gen_sch_name(s) }}"{{ ", - " if not loop.last }}{%- endfor %} - }; +static inline std::optional maybe_scalartype( + c10::optional const& t) { + if (!t) { + return std::nullopt; + } else { + return t->scalar_type(); + }; +} + +torch::Tensor mm_dispatch(MMArgs args) { + auto out_type = args.maybe_out_type.value_or(args.A.scalar_type()); + auto a_type = args.A.scalar_type(); + auto maybe_g_scales_type = maybe_scalartype(args.maybe_group_scales); + auto maybe_g_zeros_type = maybe_scalartype(args.maybe_group_zeros); + auto maybe_ch_scales_type = maybe_scalartype(args.maybe_channel_scales); + auto maybe_tok_scales_type = maybe_scalartype(args.maybe_token_scales); + + {% for impl_config in impl_configs %} + {% set t = impl_config.types -%} + {% set type_sig = gen_type_sig(t) -%} + if (args.b_type == {{VLLMScalarTypeTag[t.b]}} + && a_type == {{TorchTypeTag[t.a]}} + && out_type == {{TorchTypeTag[t.out]}} + && {%if t.b_group_scale != void -%} + maybe_g_scales_type == {{TorchTypeTag[t.b_group_scale]}} + {%- else %}!maybe_g_scales_type{%endif%} + && {%if t.b_group_zeropoint != void -%} + maybe_g_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}} + {%- else %}!maybe_g_zeros_type{%endif%} + && {%if t.b_channel_scale != void -%} + maybe_ch_scales_type == {{TorchTypeTag[t.b_channel_scale]}} + {%- else %}!maybe_ch_scales_type{%endif%} + && {%if t.a_token_scale != void -%} + maybe_tok_scales_type == {{TorchTypeTag[t.a_token_scale]}} + {%- else %}!maybe_tok_scales_type{%endif%} + ) { + return mm_dispatch_{{type_sig}}(args); + } + {%- endfor %} + + TORCH_CHECK_NOT_IMPLEMENTED( + false, "machete_mm(..) is not implemented for " + "a_type=", args.A.scalar_type(), + ", b_type=", args.b_type.str(), + ", out_type=", out_type, + ", with_group_scale_type=", maybe_g_scales_type + ? toString(*maybe_g_scales_type) : "None", + ", with_group_zeropoint_type=", maybe_g_zeros_type + ? toString(*maybe_g_zeros_type) : "None", + ", with_channel_scale_type=", maybe_ch_scales_type + ? toString(*maybe_ch_scales_type) : "None", + ", with_token_scale_type=", maybe_tok_scales_type + ? toString(*maybe_tok_scales_type) : "None", + "; implemented types are: \\n", + {%- for impl_config in impl_configs %} + {% set t = impl_config.types -%} + "\\t{{gen_type_option_name(t)}}\\n", + {%- endfor %} + ""); } +std::vector supported_schedules_dispatch( + SupportedSchedulesArgs args) { + auto out_type = args.maybe_out_type.value_or(args.a_type); + + {% for impl_config in impl_configs %} + {% set t = impl_config.types -%} + {% set schs = impl_config.schedules -%} + if (args.b_type == {{VLLMScalarTypeTag[t.b]}} + && args.a_type == {{TorchTypeTag[t.a]}} + && out_type == {{TorchTypeTag[t.out]}} + && {%if t.b_group_scale != void -%} + args.maybe_group_scales_type == {{TorchTypeTag[t.b_group_scale]}} + {%- else %}!args.maybe_group_scales_type{%endif%} + && {%if t.b_group_zeropoint != void-%} + args.maybe_group_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}} + {%- else %}!args.maybe_group_zeros_type{%endif%} + ) { + return { + {%- for s in impl_config.schedules %} + "{{gen_sch_sig(s)}}"{% if not loop.last %},{% endif %} + {%- endfor %} + }; + } + {%- endfor %} + + return {}; +}; + }; // namespace machete """ @@ -77,20 +158,10 @@ #include "../machete_mm_launcher.cuh" namespace machete { -template -using Kernel = MacheteKernelTemplate< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - {{DataTypeTag[type_config.element_b_scale]}}, // Scales - {{DataTypeTag[type_config.element_b_zeropoint]}}, // Zeropoints - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, - Config, with_C, with_scales, with_zeropoints>; - -{% for sch in schedules %} -{% set schedule_name = gen_sch_name(sch) -%} -struct sch_{{schedule_name}} { + +{% for sch in unique_schedules(impl_configs) %} +{% set sch_sig = gen_sch_sig(sch) -%} +struct sch_{{sch_sig}} { using TileShapeNM = Shape<{{ to_cute_constant(sch.tile_shape_mn)|join(', ')}}>; using ClusterShape = Shape<{{ @@ -101,27 +172,34 @@ using TileScheduler = {{TileSchedulerTag[sch.tile_scheduler]}}; using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; }; - +{% endfor %} + +{% for impl_config in impl_configs %} +{% set t = impl_config.types -%} +{% set schs = impl_config.schedules -%} +{% set type_sig = gen_type_sig(t) -%} + +template +using Kernel_{{type_sig}} = MacheteKernelTemplate< + {{DataTypeTag[t.a]}}, // ElementA + {{DataTypeTag[t.b]}}, // ElementB + {{DataTypeTag[t.out]}}, // ElementD + {{DataTypeTag[t.accumulator]}}, // Accumulator + {{DataTypeTag[t.b_group_scale]}}, // GroupScaleT + {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT + {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT + {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT + cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, + Sch>; + +{% for sch in schs %} +{% set sch_sig = gen_sch_sig(sch) -%} torch::Tensor -impl_{{type_name}}_sch_{{schedule_name}}(PyTorchArguments args) { - bool with_C = args.C.has_value(), with_scales = args.scales.has_value(), - with_zeropoints = args.zeros.has_value(); - - {% for s in specializations %} - if (with_C == {{s.with_C|lower}} - && with_zeropoints == {{s.with_zeropoints|lower}} - && with_scales == {{s.with_scales|lower}}) { - return run_impl>(args); - }{% endfor %} - - TORCH_CHECK_NOT_IMPLEMENTED( - false, "for the sake of compile times and binary size machete_mm(..) is " - " not implemented for with_C=", with_C, ", with_scales=", with_scales, - ", with_zeropoints=", with_zeropoints, - " (for {{type_name}}_sch_{{schedule_name}})"); +impl_{{type_sig}}_sch_{{sch_sig}}(MMArgs args) { + return run_impl>(args); } -{% endfor %} +{%- endfor %} +{%- endfor %} }; // namespace machete """ @@ -130,26 +208,34 @@ #include "../machete_prepack_launcher.cuh" namespace machete { -using PrepackBDispatcher_ = PrepackBDispatcher< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - {{DataTypeTag[type_config.element_b_scale]}}, // Scales - {{DataTypeTag[type_config.element_b_zeropoint]}}>; // Zeropoints - -using PrepackedLayoutB = PrepackedLayoutBTemplate< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - cutlass::layout::ColumnMajor, - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput>; - -template <> -torch::Tensor PrepackBDispatcher_::dispatch(torch::Tensor B) { - return prepack_impl(B); + +torch::Tensor prepack_B_dispatch(PrepackBArgs args) { + auto convert_type = args.maybe_group_scales_type.value_or(args.a_type); + {%- for t in types %} + {% set b_type = unsigned_type_with_bitwidth(t.b_num_bits) %} + if (args.a_type == {{TorchTypeTag[t.a]}} + && args.b_type.size_bits() == {{t.b_num_bits}} + && convert_type == {{TorchTypeTag[t.convert]}}) { + return prepack_impl< + PrepackedLayoutBTemplate< + {{DataTypeTag[t.a]}}, // ElementA + {{DataTypeTag[b_type]}}, // ElementB + {{DataTypeTag[t.convert]}}, // ElementConvert + {{DataTypeTag[t.accumulator]}}, // Accumulator + cutlass::layout::ColumnMajor, + cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput> + >(args.B); + } + {%- endfor %} + + TORCH_CHECK_NOT_IMPLEMENTED(false, + "prepack_B_dispatch(..) is not implemented for " + "atype = ", args.a_type, + ", b_type = ", args.b_type.str(), + ", with_group_scales_type= ", args.maybe_group_scales_type ? + toString(*args.maybe_group_scales_type) : "None"); } + }; // namespace machete """ @@ -166,32 +252,34 @@ class ScheduleConfig: tile_scheduler: TileSchedulerType -@dataclass +@dataclass(frozen=True) class TypeConfig: - element_a: DataType - element_b: Union[DataType, VLLMDataType] - element_b_scale: DataType - element_b_zeropoint: DataType - element_d: DataType + a: DataType + b: Union[DataType, VLLMDataType] + b_group_scale: DataType + b_group_zeropoint: DataType + b_channel_scale: DataType + a_token_scale: DataType + out: DataType accumulator: DataType -@dataclass -class Specialization: - with_C: bool - with_zeropoints: bool - with_scales: bool +@dataclass(frozen=True) +class PrepackTypeConfig: + a: DataType + b_num_bits: int + convert: DataType + accumulator: DataType @dataclass class ImplConfig: - type_config: TypeConfig - schedule_configs: List[ScheduleConfig] - specializations: List[Specialization] + types: TypeConfig + schedules: List[ScheduleConfig] heuristic: List[Tuple[Optional[str], ScheduleConfig]] -def generate_schedule_name(schedule_config: ScheduleConfig) -> str: +def generate_sch_sig(schedule_config: ScheduleConfig) -> str: tile_shape = ( f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}" ) @@ -209,40 +297,34 @@ def generate_schedule_name(schedule_config: ScheduleConfig) -> str: f"_{epilogue_schedule}_{tile_scheduler}") -# mostly unique shorter schedule_name -def generate_terse_schedule_name(schedule_config: ScheduleConfig) -> str: +# mostly unique shorter sch_sig +def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str: kernel_terse_names_replace = { "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_", "TmaWarpSpecializedCooperative_": "TmaCoop_", "StreamKScheduler": "streamK", } - schedule_name = generate_schedule_name(schedule_config) + sch_sig = generate_sch_sig(schedule_config) for orig, terse in kernel_terse_names_replace.items(): - schedule_name = schedule_name.replace(orig, terse) - return schedule_name + sch_sig = sch_sig.replace(orig, terse) + return sch_sig # unique type_name -def generate_type_signature(kernel_type_config: TypeConfig): - element_a = VLLMDataTypeNames[kernel_type_config.element_a] - element_b = VLLMDataTypeNames[kernel_type_config.element_b] - element_d = VLLMDataTypeNames[kernel_type_config.element_d] - accumulator = VLLMDataTypeNames[kernel_type_config.accumulator] - element_scale = VLLMDataTypeNames[kernel_type_config.element_b_scale] - element_zeropoint = VLLMDataTypeNames[ - kernel_type_config.element_b_zeropoint] - - return (f"{element_a}{element_b}{element_d}" - f"{accumulator}{element_scale}{element_zeropoint}") - +def generate_type_signature(kernel_types: TypeConfig): + return str("".join([ + VLLMDataTypeNames[getattr(kernel_types, field.name)] + for field in fields(TypeConfig) + ])) -# non-unique shorter type_name -def generate_terse_type_signature(kernel_type_config: TypeConfig): - element_a = VLLMDataTypeNames[kernel_type_config.element_a] - element_b = VLLMDataTypeNames[kernel_type_config.element_b] - return f"{element_a}{element_b}" +def generate_type_option_name(kernel_types: TypeConfig): + return ", ".join([ + f"{field.name.replace('b_', 'with_')+'_type'}=" + + VLLMDataTypeNames[getattr(kernel_types, field.name)] + for field in fields(TypeConfig) + ]) def is_power_of_two(n): @@ -263,13 +345,36 @@ def _to_cute_constant(value: int): return _to_cute_constant(value) +def unique_schedules(impl_configs: List[ImplConfig]): + return list( + set(sch for impl_config in impl_configs + for sch in impl_config.schedules)) + + +def unsigned_type_with_bitwidth(num_bits): + return { + 4: DataType.u4, + 8: DataType.u8, + 16: DataType.u16, + 32: DataType.u32, + 64: DataType.u64, + }[num_bits] + + template_globals = { + "void": DataType.void, "DataTypeTag": VLLMDataTypeTag, + "VLLMScalarTypeTag": VLLMDataTypeVLLMScalarTypeTag, + "TorchTypeTag": VLLMDataTypeTorchDataTypeTag, "KernelScheduleTag": VLLMKernelScheduleTag, "EpilogueScheduleTag": EpilogueScheduleTag, "TileSchedulerTag": TileSchedulerTag, "to_cute_constant": to_cute_constant, - "gen_sch_name": generate_terse_schedule_name, + "gen_sch_sig": generate_terse_sch_sig, + "gen_type_sig": generate_type_signature, + "unique_schedules": unique_schedules, + "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth, + "gen_type_option_name": generate_type_option_name } @@ -284,42 +389,82 @@ def create_template(template_str): prepack_dispatch_template = create_template(PREPACK_TEMPLATE) -def create_sources(impl_config: ImplConfig, num_impl_files=1): +def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): sources = [] - type_name = generate_type_signature(impl_config.type_config) - terse_type_name = generate_terse_type_signature(impl_config.type_config) - sources.append(( - f"machete_mm_{terse_type_name}", - mm_dispatch_template.render(type_name=type_name, - type_config=impl_config.type_config, - schedules=impl_config.schedule_configs, - heuristic=impl_config.heuristic), + "machete_mm_dispatch", + mm_dispatch_template.render(impl_configs=impl_configs), )) + prepack_types = [] + for impl_config in impl_configs: + convert_type = impl_config.types.a \ + if impl_config.types.b_group_scale == DataType.void \ + else impl_config.types.b_group_scale + prepack_types.append( + PrepackTypeConfig( + a=impl_config.types.a, + b_num_bits=VLLMDataTypeSize[impl_config.types.b], + convert=convert_type, + accumulator=impl_config.types.accumulator, + )) + + def prepacked_type_key(prepack_type: PrepackTypeConfig): + # For now we we can just use the first accumulator type seen since + # the tensor core shapes/layouts don't vary based on accumulator + # type so we can generate less code this way + return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert) + + unique_prepack_types = [] + prepack_types_seen = set() + for prepack_type in prepack_types: + key = prepacked_type_key(prepack_type) + if key not in prepack_types_seen: + unique_prepack_types.append(prepack_type) + prepack_types_seen.add(key) + sources.append(( - f"machete_prepack_{terse_type_name}", - prepack_dispatch_template.render( - type_name=type_name, - type_config=impl_config.type_config, - ), + "machete_prepack", + prepack_dispatch_template.render(types=unique_prepack_types, ), )) - num_schedules = len(impl_config.schedule_configs) - schedules_per_file = math.ceil(num_schedules / num_impl_files) - for part, i in enumerate(range(0, num_schedules, schedules_per_file)): - file_schedules = impl_config.schedule_configs[i:i + schedules_per_file] + # Split up impls across files + num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0) + num_impls_per_file = math.ceil(num_impls / num_impl_files) + + files_impls: List[List[ImplConfig]] = [[]] + + curr_num_impls_assigned = 0 + curr_impl_in_file = 0 + curr_impl_configs = deepcopy(list(reversed(impl_configs))) + + while curr_num_impls_assigned < num_impls: + room_left_in_file = num_impls_per_file - curr_impl_in_file + if room_left_in_file == 0: + files_impls.append([]) + room_left_in_file = num_impls_per_file + curr_impl_in_file = 0 + + curr_ic = curr_impl_configs[-1] + if len(curr_ic.schedules) >= room_left_in_file: + # Break apart the current impl config + tmp_ic = deepcopy(curr_ic) + tmp_ic.schedules = curr_ic.schedules[:room_left_in_file] + curr_ic.schedules = curr_ic.schedules[room_left_in_file:] + files_impls[-1].append(tmp_ic) + else: + files_impls[-1].append(curr_ic) + curr_impl_configs.pop() + curr_num_impls_assigned += len(files_impls[-1][-1].schedules) + curr_impl_in_file += len(files_impls[-1][-1].schedules) + for part, file_impls in enumerate(files_impls): sources.append(( - f"machete_mm_{terse_type_name}_impl_part{part}", - mm_impl_template.render( - type_name=type_name, - type_config=impl_config.type_config, - schedules=file_schedules, - specializations=impl_config.specializations, - ), + f"machete_mm_impl_part{part+1}", + mm_impl_template.render(impl_configs=file_impls), )) + return sources @@ -328,187 +473,169 @@ def generate(): # about how this works SCRIPT_DIR = os.path.dirname(__file__) - schedule_common_params = dict( + sch_common_params = dict( kernel_schedule=TmaMI, epilogue_schedule=TmaCoop, tile_scheduler=TileSchedulerType.StreamK, ) - # For now we use the same heuristic for all types - # Heuristic is currently tuned for H100s - default_heuristic = [ + # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + default_tile_heuristic_config = { #### M = 257+ - ( - "M > 256 && K <= 16384 && N <= 4096", - ScheduleConfig( - tile_shape_mn=(128, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 256", - ScheduleConfig( - tile_shape_mn=(128, 256), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + "M > 256": ((128, 256), (2, 1, 1)), #### M = 129-256 - ( - "M > 128 && K <= 4096 && N <= 4096", - ScheduleConfig( - tile_shape_mn=(128, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 128 && K <= 8192 && N <= 8192", - ScheduleConfig( - tile_shape_mn=(128, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 128", - ScheduleConfig( - tile_shape_mn=(128, 256), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + "M > 128": ((128, 256), (2, 1, 1)), #### M = 65-128 - ( - "M > 64 && K <= 4069 && N <= 4069", - ScheduleConfig( - tile_shape_mn=(128, 32), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 64 && K <= 4069 && N <= 8192", - ScheduleConfig( - tile_shape_mn=(128, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 64 && K >= 8192 && N >= 12288", - ScheduleConfig( - tile_shape_mn=(256, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 64", - ScheduleConfig( - tile_shape_mn=(128, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + "M > 64": ((128, 128), (2, 1, 1)), #### M = 33-64 - ( - "M > 32 && K <= 6144 && N <= 6144", - ScheduleConfig( - tile_shape_mn=(128, 16), - cluster_shape_mnk=(1, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 32 && K >= 16384 && N >= 12288", - ScheduleConfig( - tile_shape_mn=(256, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 32", - ScheduleConfig( - tile_shape_mn=(128, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + "M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + "M > 32": ((128, 64), (2, 1, 1)), #### M = 17-32 - ( - "M > 16 && K <= 12288 && N <= 8192", - ScheduleConfig( - tile_shape_mn=(128, 32), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 16", - ScheduleConfig( - tile_shape_mn=(256, 32), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + "M > 16": ((256, 32), (2, 1, 1)), #### M = 1-16 - ( - "N >= 26624", - ScheduleConfig( - tile_shape_mn=(256, 16), - cluster_shape_mnk=(1, 1, 1), - **schedule_common_params # type: ignore - )), - ( - None, - ScheduleConfig( - tile_shape_mn=(128, 16), - cluster_shape_mnk=(1, 1, 1), - **schedule_common_params # type: ignore - )), + "N >= 26624": ((256, 16), (1, 1, 1)), + None: ((128, 16), (1, 1, 1)), + } + + # For now we use the same heuristic for all types + # Heuristic is currently tuned for H100s + default_heuristic = [ + (cond, ScheduleConfig(*tile_config, + **sch_common_params)) # type: ignore + for cond, tile_config in default_tile_heuristic_config.items() ] - # Do not use schedules = list(set(...)) because we need to make sure - # the output list is deterministic; otherwise the generated kernel file - # will be non-deterministic and causes ccache miss. - schedules = [] - for _, schedule_config in default_heuristic: - if schedule_config not in schedules: - schedules.append(schedule_config) + def get_unique_schedules(heuristic: Dict[str, ScheduleConfig]): + # Do not use schedules = list(set(...)) because we need to make sure + # the output list is deterministic; otherwise the generated kernel file + # will be non-deterministic and causes ccache miss. + schedules = [] + for _, schedule_config in heuristic: + if schedule_config not in schedules: + schedules.append(schedule_config) + return schedules impl_configs = [] GPTQ_kernel_type_configs = list( TypeConfig( - element_a=element_a, - element_b=element_b, - element_b_scale=element_a, - element_b_zeropoint=element_a, - element_d=element_a, + a=a, + b=b, + b_group_scale=a, + b_group_zeropoint=DataType.void, + b_channel_scale=DataType.void, + a_token_scale=DataType.void, + out=a, accumulator=DataType.f32, - ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) - for element_a in (DataType.f16, DataType.bf16)) - - GPTQ_kernel_specializations = [ - Specialization(with_C=False, with_zeropoints=False, with_scales=True) - ] + ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128) + for a in (DataType.f16, DataType.bf16)) impl_configs += [ - ImplConfig(x[0], x[1], x[2], x[3]) - for x in zip(GPTQ_kernel_type_configs, itertools.repeat(schedules), - itertools.repeat(GPTQ_kernel_specializations), + ImplConfig(x[0], x[1], x[2]) + for x in zip(GPTQ_kernel_type_configs, + itertools.repeat(get_unique_schedules(default_heuristic)), itertools.repeat(default_heuristic)) ] AWQ_kernel_type_configs = list( TypeConfig( - element_a=element_a, - element_b=element_b, - element_b_scale=element_a, - element_b_zeropoint=element_a, - element_d=element_a, + a=a, + b=b, + b_group_scale=a, + b_group_zeropoint=a, + b_channel_scale=DataType.void, + a_token_scale=DataType.void, + out=a, accumulator=DataType.f32, - ) for element_b in (DataType.u4, DataType.u8) - for element_a in (DataType.f16, DataType.bf16)) + ) for b in (DataType.u4, DataType.u8) + for a in (DataType.f16, DataType.bf16)) + + impl_configs += [ + ImplConfig(x[0], x[1], x[2]) + for x in zip(AWQ_kernel_type_configs, + itertools.repeat(get_unique_schedules(default_heuristic)), + itertools.repeat(default_heuristic)) + ] - AWQ_kernel_specializations = [ - Specialization(with_C=False, with_zeropoints=True, with_scales=True) + # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + # TODO (LucasWilkinson): Further tuning required + qqq_tile_heuristic_config = { + #### M = 257+ + # ((128, 256), (2, 1, 1)) Broken for QQQ types + # TODO (LucasWilkinson): Investigate further + # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + # "M > 256": ((128, 256), (2, 1, 1)), + "M > 256": ((128, 128), (2, 1, 1)), + #### M = 129-256 + "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + # ((128, 256), (2, 1, 1)) Broken for QQQ types + # TODO (LucasWilkinson): Investigate further + # "M > 128": ((128, 256), (2, 1, 1)), + "M > 128": ((128, 128), (2, 1, 1)), + #### M = 65-128 + "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + "M > 64": ((128, 128), (2, 1, 1)), + #### M = 33-64 + "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + # Broken for QQQ types + # TODO (LucasWilkinson): Investigate further + #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + "M > 32": ((128, 64), (2, 1, 1)), + #### M = 17-32 + "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + "M > 16": ((256, 32), (2, 1, 1)), + #### M = 1-16 + "N >= 26624": ((256, 16), (1, 1, 1)), + None: ((128, 16), (1, 1, 1)), + } + + # For now we use the same heuristic for all types + # Heuristic is currently tuned for H100s + qqq_heuristic = [ + (cond, ScheduleConfig(*tile_config, + **sch_common_params)) # type: ignore + for cond, tile_config in qqq_tile_heuristic_config.items() + ] + + QQQ_kernel_types = [ + *(TypeConfig( + a=DataType.s8, + b=VLLMDataType.u4b8, + b_group_scale=b_group_scale, + b_group_zeropoint=DataType.void, + b_channel_scale=DataType.f32, + a_token_scale=DataType.f32, + out=DataType.f16, + accumulator=DataType.s32, + ) for b_group_scale in (DataType.f16, DataType.void)), + *(TypeConfig( + a=DataType.e4m3, + b=VLLMDataType.u4b8, + b_group_scale=b_group_scale, + b_group_zeropoint=DataType.void, + b_channel_scale=DataType.f32, + a_token_scale=DataType.f32, + out=DataType.f16, + accumulator=DataType.f32, + ) for b_group_scale in (DataType.f16, DataType.void)), ] impl_configs += [ - ImplConfig(x[0], x[1], x[2], x[3]) - for x in zip(AWQ_kernel_type_configs, itertools.repeat(schedules), - itertools.repeat(AWQ_kernel_specializations), - itertools.repeat(default_heuristic)) + ImplConfig(x[0], x[1], x[2]) + for x in zip(QQQ_kernel_types, + itertools.repeat(get_unique_schedules(qqq_heuristic)), + itertools.repeat(qqq_heuristic)) ] output_dir = os.path.join(SCRIPT_DIR, "generated") @@ -521,12 +648,11 @@ def generate(): os.makedirs(output_dir) # Render each group of configurations into separate files - for impl_config in impl_configs: - for filename, code in create_sources(impl_config): - filepath = os.path.join(output_dir, f"{filename}.cu") - with open(filepath, "w") as output_file: - output_file.write(code) - print(f"Rendered template to {filepath}") + for filename, code in create_sources(impl_configs): + filepath = os.path.join(output_dir, f"{filename}.cu") + with open(filepath, "w") as output_file: + output_file.write(code) + print(f"Rendered template to {filepath}") if __name__ == "__main__": diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index e8e7b14de0da1..816f33a1078e5 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -171,6 +171,10 @@ struct MacheteCollectiveMma { make_shape(size<0>(TileShape_MNK{}), size<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutACopy = decltype(GmemLayoutA::TVbNbKL_to_offset_copy( + make_shape(size<0>(TileShape_MNK{}), size<2>(TileShape_MNK{}), + Int{}))); + using SmemLayoutAtomARowMajor = decltype(rs_smem_selector(TileShape_MNK{})), @@ -288,14 +292,7 @@ struct MacheteCollectiveMma { static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must evenly divide tile k shape."); - // Tile along modes in a way that maximizes the TMA box size. - using SmemLayoutACopy = decltype(tile_to_shape( - SmemLayoutAtomARowMajor{}, - make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), - Int{}), - conditional_t<::cutlass::gemm::detail::is_major<0, StrideA>(), - Step<_2, _1, _3>, Step<_1, _2, _3>>{})); - + // Tile along modes in a way that maximizes the TMA box size using SmemLayoutB = decltype(tile_to_shape( SmemLayoutAtomB{}, make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), @@ -428,12 +425,12 @@ struct MacheteCollectiveMma { // clang-format on // ((athrid, val), (BlocksM, BlockK), L) -> (storage_idx) - using PrepackedStrideA = decltype(stride(GmemLayoutA::TVbNbKL_to_offset( + using PrepackedStrideA = decltype(stride(GmemLayoutA::TVbNbKL_to_offset_copy( make_shape(int32_t(0), int32_t(0), int32_t(0))))); using ATensor = decltype(make_tensor( get_logical_ptr(static_cast(nullptr)), - shape(GmemLayoutA::TVbNbKL_to_offset( + shape(GmemLayoutA::TVbNbKL_to_offset_copy( make_shape(int32_t(0), int32_t(0), int32_t(0)))), PrepackedStrideA{})); @@ -450,8 +447,8 @@ struct MacheteCollectiveMma { static constexpr auto make_tma_copy_A(ATensor tensor_a = ATensor{}) { return make_tma_copy( - GmemTiledCopyA{}, tensor_a, SmemLayoutA{}(_, _, cute::Int<0>{}), - shape(SmemLayoutA{}(_, _, cute::Int<0>{})), + GmemTiledCopyA{}, tensor_a, SmemLayoutACopy{}(_, _, cute::Int<0>{}), + shape(SmemLayoutACopy{}(_, _, cute::Int<0>{})), size<1>(ClusterShape{})); // mcast along N mode for this M load, if any } @@ -584,7 +581,7 @@ struct MacheteCollectiveMma { typename Params::TMA_Scale tma_load_scale; typename Params::TMA_Zero tma_load_zero; - auto layout = GmemLayoutA::TVbNbKL_to_offset(make_shape(M, K, L)); + auto layout = GmemLayoutA::TVbNbKL_to_offset_copy(make_shape(M, K, L)); tma_load_a = make_tma_copy_A( make_logical_tensor(ptr_A, shape(layout), stride(layout))); @@ -722,7 +719,7 @@ struct MacheteCollectiveMma { // (TILE_V,TILE_B,m,k,l) auto make_gA_mkl = [&]() { // ((athrid, val), (BlocksM, BlockK), L) -> (storage_idx) - auto layout = GmemLayoutA::TVbNbKL_to_offset(make_shape(M, K, L)); + auto layout = GmemLayoutA::TVbNbKL_to_offset_copy(make_shape(M, K, L)); Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(shape(layout)); return local_tile(mA_mkl, make_shape(size<0>(layout), PPBlocksPerTile_MK{}), diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh index 4d41b8d291484..d4d19ae5deec7 100644 --- a/csrc/quantization/machete/machete_mm_kernel.cuh +++ b/csrc/quantization/machete/machete_mm_kernel.cuh @@ -21,6 +21,8 @@ #include "cutlass_extensions/cute_utils.cuh" #include "cutlass_extensions/vllm_numeric_conversion.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "cutlass_extensions/torch_utils.hpp" #include "machete_collective_builder.cuh" #include "machete_prepacked_layout.cuh" #include "machete_interleaving_utils.cuh" @@ -37,27 +39,42 @@ using namespace cute; // W is quantized, in this situation or right-hand operand is quantized so // we compute the transpose to move it to the left-hand side. template + typename AccumulatorT, typename GroupScaleT, typename GroupZeroT, + typename ChannelScaleT, typename TokenScaleT, class KernelSchedule, + typename ScheduleConfig> struct MacheteKernelTemplate { + static constexpr bool with_C = false; // not ever used + static constexpr bool with_group_scales = !std::is_same_v; + static constexpr bool with_group_zeropoints = + !std::is_same_v; + static constexpr bool with_channel_scales = + !std::is_same_v; + static constexpr bool with_token_scales = !std::is_same_v; + using MmaType = ElementA_; using ElementA = ElementA_; using ElementB = ElementB_; using ElementD = ElementD_; using ElementC = cute::conditional_t; - using ElementZ = ZeroT; - using ElementS = ScaleT; - - using ElementAccumulator = - AccumulatorT; // Element type for internal accumulation + using ElementAccumulator = AccumulatorT; using ElementCompute = AccumulatorT; // For Epilogue + // Use dummy values when we don't have scales or zeropoints + using ElementZGroup = + cute::conditional_t; + using ElementSGroup = + cute::conditional_t; + using ElementConvertGroup = + cute::conditional_t; + using ElementSChannel = + cute::conditional_t; + using ElementSToken = + cute::conditional_t; using BTypeTuple = cute::conditional_t< - with_scales, - cute::conditional_t, - cute::tuple>, + with_group_scales, + cute::conditional_t, + cute::tuple>, ElementB>; using LayoutA = cutlass::layout::RowMajor; @@ -71,8 +88,8 @@ struct MacheteKernelTemplate { using StrideA = cutlass::detail::TagToStrideA_t; using StrideC = cutlass::detail::TagToStrideA_t; using StrideD = cutlass::detail::TagToStrideA_t; - using StrideS = cutlass::detail::TagToStrideA_t; - using StrideZ = StrideS; + using StrideSGroup = cutlass::detail::TagToStrideA_t; + using StrideZGroup = StrideSGroup; using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; @@ -85,8 +102,8 @@ struct MacheteKernelTemplate { using OperatorClass = cutlass::arch::OpClassTensorOp; using PrepackedLayoutB = - PrepackedLayoutBTemplate; + PrepackedLayoutBTemplate; static int constexpr TileShapeK = 128 * 8 / cutlass::sizeof_bits::value; @@ -103,12 +120,42 @@ struct MacheteKernelTemplate { using EpilogueTileType = typename ScheduleConfig::EpilogueTileType; using TileScheduler = typename ScheduleConfig::TileScheduler; + static_assert( + (!with_channel_scales && !with_token_scales) || + ((with_channel_scales && with_token_scales) && + std::is_same_v), + "Currently token and channel scales (if present) must be the same type"); + + using EpilogueDescriptor = + cutlass::epilogue::collective::detail::EpilogueDescriptor< + TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, + ElementD, EpilogueSchedule>; + + // Currently only supports float scales + using ChTokScalesEpilogue = + typename vllm::c3x::ScaledEpilogue; + static_assert((with_channel_scales || with_token_scales) || + (std::is_same_v && + std::is_same_v), + "Currently token and channel scales (if present) must be float " + "(and if one is present the other must be too)"); + + using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< + cutlass::epilogue::fusion::Sm90AccFetch>; + + using EVTCompute = + std::conditional_t; + + // EVTCompute using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, - ElementAccumulator, ElementAccumulator, ElementC, LayoutC_Transpose, - AlignmentC, ElementD, LayoutD_Transpose, AlignmentD, - EpilogueSchedule>::CollectiveOp; + ElementAccumulator, ElementSChannel, ElementC, LayoutC_Transpose, + AlignmentC, ElementD, LayoutD_Transpose, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; using CollectiveMainloop = typename cutlass::gemm::collective::VLLMCollectiveBuilder< @@ -131,26 +178,44 @@ struct MacheteKernelTemplate { using MainloopArguments = typename GemmKernel::MainloopArguments; using EpilogueArguments = typename GemmKernel::EpilogueArguments; - template static Arguments create_arguments( cudaStream_t stream, - ElementA const* A_ptr, // A is an MxK matrix - Layout const& layout_A, - ElementB const* B_ptr, // B is an KxN prepacked matrix - ElementD* D_ptr, // D is an MxN matrix - Layout const& layout_D, - ElementC const* C_ptr, // C is an MxN matrix - std::optional> const& layout_C, - ElementS const* S_ptr, // S is an scale_KxN matrix - std::optional> const& layout_S, - ElementZ const* Z_ptr, // Z is an scale_KxN matrix - std::optional> const& layout_Z, - ElementCompute alpha, ElementCompute beta, - std::optional maybe_group_size) { - static_assert(!with_zeropoints || with_scales); - - int M = size<0>(layout_A), N = size<1>(layout_D), K = size<1>(layout_A); + torch::Tensor const& A, // MxK matrix + torch::Tensor const& B, // KxN prepacked matrix + torch::Tensor& D, // MxN matrix + c10::optional const& maybe_g_scales, // scale_KxN matrix + c10::optional const& maybe_g_zeros, // scale_KxN matrix + c10::optional maybe_group_size, + c10::optional const& maybe_ch_scales, // len N vector + c10::optional const& maybe_tok_scales) // len M vector + { + static_assert(!with_group_zeropoints || with_group_scales); + + int M = A.size(0), N = B.size(1), K = A.size(1); + TORCH_CHECK(D.size(0) == M && D.size(1) == N); + + auto layout_A = make_cute_layout(A, "A"); + auto layout_D = make_cute_layout(D, "D"); + auto layout_S_group = + maybe_make_cute_layout(maybe_g_scales, "group_scales"); + auto layout_Z_group = + maybe_make_cute_layout(maybe_g_zeros, "group_zeros"); + int64_t numel_S_channel = maybe_ch_scales ? maybe_ch_scales->numel() : 0; + int64_t numel_S_token = maybe_tok_scales ? maybe_tok_scales->numel() : 0; + + auto unwrap = [](auto const& t) { + return t ? t->const_data_ptr() : nullptr; + }; + auto A_ptr = static_cast(A.const_data_ptr()); + auto B_ptr = static_cast(B.const_data_ptr()); + auto D_ptr = static_cast(D.mutable_data_ptr()); + auto S_group_ptr = + static_cast(unwrap(maybe_g_scales)); + auto Z_group_ptr = static_cast(unwrap(maybe_g_zeros)); + auto S_channel_ptr = + static_cast(unwrap(maybe_ch_scales)); + auto S_token_ptr = + static_cast(unwrap(maybe_tok_scales)); int const group_size = maybe_group_size == -1 ? K : maybe_group_size.value_or(K); @@ -159,26 +224,28 @@ struct MacheteKernelTemplate { TORCH_CHECK(size<0>(layout_A) == M && size<1>(layout_A) == K); TORCH_CHECK(size<0>(layout_D) == M && size<1>(layout_D) == N); - if constexpr (with_C) { - TORCH_CHECK(C_ptr && layout_C); + if constexpr (with_group_scales) { + TORCH_CHECK(S_group_ptr && layout_S_group); + TORCH_CHECK((size<0>(*layout_S_group) == scale_k && + size<1>(*layout_S_group) == N)); } else { - TORCH_CHECK(!C_ptr, "C not supported"); + TORCH_CHECK(!S_group_ptr, "Scales not supported"); } - if constexpr (with_scales) { - TORCH_CHECK(S_ptr && layout_S); - TORCH_CHECK((size<0>(*layout_S) == scale_k && size<1>(*layout_S) == N)); + if constexpr (with_group_zeropoints) { + TORCH_CHECK(Z_group_ptr && layout_Z_group); + TORCH_CHECK((size<0>(*layout_Z_group) == scale_k && + size<1>(*layout_Z_group) == N)); + TORCH_CHECK(layout_S_group && *layout_Z_group == *layout_S_group, + "Scales and zeros must have the same layout"); } else { - TORCH_CHECK(!S_ptr, "Scales not supported"); + TORCH_CHECK(!Z_group_ptr, "Zeropoints not supported"); } - if constexpr (with_zeropoints) { - TORCH_CHECK(Z_ptr && layout_Z); - TORCH_CHECK((size<0>(*layout_Z) == scale_k && size<1>(*layout_Z) == N)); - TORCH_CHECK(layout_S && *layout_Z == *layout_S, - "Scales and zeros must have the same layout"); - } else { - TORCH_CHECK(!Z_ptr, "Zeropoints not supported"); + if constexpr (with_channel_scales || with_token_scales) { + TORCH_CHECK( + (maybe_ch_scales->numel() == N || maybe_ch_scales->numel() == 1) && + (maybe_tok_scales->numel() == M || maybe_tok_scales->numel() == 1)); } // Transpose A and D @@ -186,24 +253,33 @@ struct MacheteKernelTemplate { // for B (which is At) auto stride_At = layout_A.stride(); auto stride_Dt = permute_layout<1, 0, 2>(layout_D).stride(); - auto stride_Ct = stride_Dt; - if (layout_C) { - stride_Ct = permute_layout<1, 0, 2>(*layout_C).stride(); - } MainloopArguments mainloop_arguments{}; - EpilogueArguments epilogue_arguments{ - {alpha, beta}, C_ptr, stride_Ct, D_ptr, stride_Dt}; + // {Accum, C, C_layout, D, D} + EpilogueArguments epilogue_arguments{}; + + if constexpr (with_channel_scales || with_token_scales) { + epilogue_arguments = + EpilogueArguments{ChTokScalesEpilogue::prepare_args( + *maybe_ch_scales, *maybe_tok_scales), + nullptr, + {}, + D_ptr, + stride_Dt}; + } else { + epilogue_arguments = EpilogueArguments{{}, nullptr, {}, D_ptr, stride_Dt}; + } - if constexpr (with_scales && with_zeropoints) { - auto stride_S = permute_layout<1, 0, 2>(*layout_S).stride(); - mainloop_arguments = - MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At, - S_ptr, stride_S, group_size, Z_ptr}; - } else if constexpr (with_scales) { - auto stride_S = permute_layout<1, 0, 2>(*layout_S).stride(); + if constexpr (with_group_scales && with_group_zeropoints) { + auto stride_S_group = permute_layout<1, 0, 2>(*layout_S_group).stride(); mainloop_arguments = MainloopArguments{ - B_ptr, _StrideB{}, A_ptr, stride_At, S_ptr, stride_S, group_size}; + B_ptr, _StrideB{}, A_ptr, stride_At, + S_group_ptr, stride_S_group, group_size, Z_group_ptr}; + } else if constexpr (with_group_scales) { + auto stride_S_group = permute_layout<1, 0, 2>(*layout_S_group).stride(); + mainloop_arguments = + MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At, + S_group_ptr, stride_S_group, group_size}; } else { mainloop_arguments = MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At}; diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh index 60a4ed60535b7..4b0da5b303e0c 100644 --- a/csrc/quantization/machete/machete_mm_launcher.cuh +++ b/csrc/quantization/machete/machete_mm_launcher.cuh @@ -5,73 +5,61 @@ #include "machete_mm_kernel.cuh" #include "cutlass_extensions/torch_utils.hpp" +#include "core/scalar_type.hpp" namespace machete { -struct PyTorchArguments { +struct MMArgs { torch::Tensor const& A; torch::Tensor const& B; - c10::optional const& scales; - c10::optional const& zeros; - c10::optional group_size; - c10::optional const& C; - c10::optional alpha; - c10::optional beta; - c10::optional schedule; + vllm::ScalarType const& b_type; + c10::optional const& maybe_out_type; + c10::optional const& maybe_group_scales; + c10::optional const& maybe_group_zeros; + c10::optional maybe_group_size; + c10::optional const& maybe_channel_scales; + c10::optional const& maybe_token_scales; + c10::optional maybe_schedule; }; +struct SupportedSchedulesArgs { + at::ScalarType a_type; + vllm::ScalarType b_type; + c10::optional maybe_group_scales_type; + c10::optional maybe_group_zeros_type; + c10::optional maybe_channel_scales_type; + c10::optional maybe_token_scales_type; + c10::optional maybe_out_type; +}; + +torch::Tensor mm_dispatch(MMArgs args); + +std::vector supported_schedules_dispatch( + SupportedSchedulesArgs args); + template -torch::Tensor run_impl(PyTorchArguments args) { +torch::Tensor run_impl(MMArgs args) { const at::cuda::OptionalCUDAGuard device_guard(device_of(args.A)); auto device = args.A.device(); auto stream = at::cuda::getCurrentCUDAStream(device.index()); - using EleA = typename MacheteKernel::ElementA; - using EleB = typename MacheteKernel::ElementB; - using EleC = typename MacheteKernel::ElementC; - using EleD = typename MacheteKernel::ElementD; - using EleScale = typename MacheteKernel::ElementS; - using EleZero = typename MacheteKernel::ElementZ; - - using StrideA = typename MacheteKernel::StrideA; - using StrideC = typename MacheteKernel::StrideC; - using StrideD = typename MacheteKernel::StrideD; - using StrideS = typename MacheteKernel::StrideS; - using StrideZ = typename MacheteKernel::StrideZ; - int M = args.A.size(0); int N = args.B.size(1); int K = args.A.size(1); // Allocate output - torch::Tensor D = - torch::empty({M, N}, torch::TensorOptions() - .dtype(equivalent_scalar_type_v) - .device(device)); - - auto const &A = args.A, &B = args.B; - auto const &C = args.C, &scales = args.scales, &zeros = args.zeros; - - auto layout_A = make_cute_layout(A, "A"); - auto layout_D = make_cute_layout(D, "D"); - auto layout_C = maybe_make_cute_layout(C, "C"); - auto layout_S = maybe_make_cute_layout(scales, "scales"); - auto layout_Z = maybe_make_cute_layout(zeros, "zeros"); - - auto A_ptr = static_cast(A.const_data_ptr()); - auto B_ptr = static_cast(B.const_data_ptr()); - auto D_ptr = static_cast(D.mutable_data_ptr()); - auto C_ptr = static_cast(C ? C->const_data_ptr() : nullptr); - auto S_ptr = - static_cast(scales ? scales->const_data_ptr() : nullptr); - auto Z_ptr = - static_cast(zeros ? zeros->const_data_ptr() : nullptr); + torch::Tensor D = torch::empty( + {M, N}, + torch::TensorOptions() + .dtype(equivalent_scalar_type_v) + .device(device)); auto arguments = MacheteKernel::create_arguments( - stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr, - layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0), - args.group_size); + stream, // + args.A, args.B, D, args.maybe_group_scales, args.maybe_group_zeros, + args.maybe_group_size, args.maybe_channel_scales, + args.maybe_token_scales); TORCH_CHECK(MacheteKernel::can_implement(arguments), "Machete kernel cannot be run with these arguments"); @@ -84,12 +72,4 @@ torch::Tensor run_impl(PyTorchArguments args) { return D; }; -template -struct GemmDispatcher { - static torch::Tensor dispatch(PyTorchArguments args); - static std::vector supported_schedules(); -}; - }; // namespace machete \ No newline at end of file diff --git a/csrc/quantization/machete/machete_prepack_kernel.cuh b/csrc/quantization/machete/machete_prepack_kernel.cuh index f23483f928b47..d002355ca49d6 100644 --- a/csrc/quantization/machete/machete_prepack_kernel.cuh +++ b/csrc/quantization/machete/machete_prepack_kernel.cuh @@ -6,31 +6,49 @@ namespace machete { -template -static __global__ void prepack_B_kernel(BInTensor B_in, - BTiledOutTensor B_tiled_out) { - auto tB_in = local_tile(B_in, TileShapeNKL{}, - make_coord(blockIdx.x, blockIdx.y, blockIdx.z)); - auto tB_out = B_tiled_out(make_coord(_, _), - make_coord(blockIdx.x, blockIdx.y), blockIdx.z); +template +static __global__ void prepack_B_kernel(BInTensor B_in, ElementB* B_out_ptr) { + auto constexpr block_size = + Int{}; + auto constexpr eles_per_thread = Int{}; + static_assert(block_size % threads == 0, + "block_size must be divisible by the number of threads"); - auto tiled_copy = make_tiled_copy(Copy_Atom{}, - Layout, Stride<_32, _1>>{}, - Layout>{}); + // Which pre-packed are we responsible for + auto blk_coord = make_coord(blockIdx.x, blockIdx.y, blockIdx.z); + auto tB_in = local_tile( + B_in, append(typename PrepackedLayoutB::PPBlockShape_NK{}, _1{}), + blk_coord); - auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x); + // Find the start offset in the output for this pre-packed block + auto bNbKL_to_offset = PrepackedLayoutB::bNbKL_to_offset(shape(B_in)); - Tensor thr_tile_S = thr_copy.partition_S(tB_in); - Tensor thr_tile_D = thr_copy.partition_D(tB_out); + // Tensor representing a 1:1 mapping to the output space in 1D + auto tB_out_linear = + make_tensor(get_logical_ptr(B_out_ptr) + bNbKL_to_offset(blk_coord), + make_layout(make_shape(block_size))); + // Mapping from output space (1D) to input space + auto tB_in_linear = make_tensor( + tB_in.data(), + tB_in.layout() + .compose(right_inverse(PrepackedLayoutB::ppblock_ilvd_NK_to_offset())) + .with_shape(make_shape(block_size))); + + // Tile for this specific thread (could have used a TiledCopy but these work + // best with 2d layouts, this is a simple 1d layout so local_tile is enough, + // we are also not that concerned with performance for this kernel) + auto thr_tB_in_linear = + local_tile(tB_in_linear, make_shape(eles_per_thread), threadIdx.x); + auto thr_tB_out_linear = + local_tile(tB_out_linear, make_shape(eles_per_thread), threadIdx.x); // Construct a register-backed Tensor with the same shape as each thread's // partition - auto fragment = make_tensor(shape(thr_tile_D)); + auto fragment = make_tensor(shape(thr_tB_in_linear)); - // Copy from GMEM to RMEM and from RMEM to GMEM - copy(tiled_copy, thr_tile_S, fragment); - copy(Copy_Atom{}, fragment, thr_tile_D); + copy(thr_tB_in_linear, fragment); + copy(Copy_Atom{}, fragment, thr_tB_out_linear); } template @@ -44,18 +62,15 @@ static void prepack_B_template( TORCH_CHECK(size<0>(B_layout) % size<0>(TileShapeNKL{}) == 0); TORCH_CHECK(size<1>(B_layout) % size<1>(TileShapeNKL{}) == 0); - TORCH_CHECK(size<2>(B_layout) % size<2>(TileShapeNKL{}) == 0); auto N_tiles = size<0>(B_layout) / size<0>(TileShapeNKL{}); auto K_tiles = size<1>(B_layout) / size<1>(TileShapeNKL{}); - auto L_tiles = size<2>(B_layout) / size<2>(TileShapeNKL{}); + auto L_tiles = size<2>(B_layout); auto B_in = make_tensor(get_logical_ptr(B_in_ptr), B_layout); - auto B_tiled_out = - make_tensor(get_logical_ptr(B_out_ptr), ilvd_NKbNbKL_to_offset); - prepack_B_kernel - <<>>(B_in, B_tiled_out); + prepack_B_kernel<128, PrepackedLayoutB> + <<>>(B_in, B_out_ptr); } }; // namespace machete \ No newline at end of file diff --git a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh index a33d8f9484cfe..3486d28be2126 100644 --- a/csrc/quantization/machete/machete_prepack_launcher.cuh +++ b/csrc/quantization/machete/machete_prepack_launcher.cuh @@ -2,9 +2,17 @@ #include "machete_prepack_kernel.cuh" #include "cutlass_extensions/torch_utils.hpp" +#include "core/scalar_type.hpp" namespace machete { +struct PrepackBArgs { + torch::Tensor const& B; + at::ScalarType a_type; + vllm::ScalarType b_type; + c10::optional maybe_group_scales_type; +}; + template torch::Tensor prepack_impl(torch::Tensor const B) { const at::cuda::OptionalCUDAGuard device_guard(device_of(B)); @@ -61,11 +69,6 @@ torch::Tensor prepack_impl(torch::Tensor const B) { return D; }; -template -struct PrepackBDispatcher { - static torch::Tensor dispatch(torch::Tensor B); -}; +torch::Tensor prepack_B_dispatch(PrepackBArgs args); }; // namespace machete \ No newline at end of file diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh index 78e2cc5eec7d8..680a858a893c1 100644 --- a/csrc/quantization/machete/machete_prepacked_layout.cuh +++ b/csrc/quantization/machete/machete_prepacked_layout.cuh @@ -41,7 +41,7 @@ struct IlvBlkLayoutAuto {}; // The contract here is that the `TiledMma` determined below matches the one // ultimately used in the kernel. (this is also why the other element types are // required along with the kernel schedule) -template // clang-format on @@ -49,20 +49,27 @@ struct PrepackedLayoutBTemplate { using MmaType = ElementA_; using ElementA = ElementA_; using ElementB = ElementB_; - using ElementD = ElementD_; - using ElementAccumulator = - AccumulatorT; // Element type for internal accumulation + using ElementAccumulator = AccumulatorT; using ElementMma = MmaType; - // Only use interleaved layouts for subbyte weights, prmt instructions makes - // non-interleaved layouts for 8bit+ weights efficient enough we don't need - // iterleaved layouts + // Interleave for 4bit bit types when we are not upconverting to fp8 or int8, + // in those cases case we use a LUT using prmt instructions to upconvert and + // is more efficient if the data is not interleaved For 8bit+ prmt + // instructions makes non-interleaved layouts efficient enough we don't need + // iterleaved layouts (and can reuse more of the existing cutlass converts) + static constexpr bool should_interleave = + sizeof_bits_v <= 4 && + !std::is_same_v && + !std::is_same_v; + + // Only use interleaved layouts for subbyte weights, using IlvdBlkLayout = std::conditional_t< std::is_same_v, - std::conditional_t <= 4, - decltype(get_interleaved_blk_layout< - ElementB, sizeof_bits_v, 32>()), - void>, + std::conditional_t< + should_interleave, + decltype(get_interleaved_blk_layout< + ElementB, sizeof_bits_v, 32>()), + void>, IlvBlkLayout_>; // TODO (LucasWilkinson): compare the performance for other sizes @@ -135,7 +142,8 @@ struct PrepackedLayoutBTemplate { // then ((IlvBlk), FrgB) is {A, C, B, D, C, G, D, H} auto frgV = get<1, 0>(layout_no_interleave); auto ilvdBlk = IlvdBlkLayout{}; - static_assert(size(frgV) % 4 == 0, "FrgV must be divisible by 4"); + static_assert(size(frgV) % size(ilvdBlk) == 0, + "FrgV must be divisible by size(ilvdBlk)"); auto ilvd_FrgV = make_layout( make_shape(shape(ilvdBlk), Int{}), make_stride(stride(ilvdBlk), size(ilvdBlk))); @@ -175,6 +183,15 @@ struct PrepackedLayoutBTemplate { return group<1, 3>(result(_, repeat(result)>(_))); } + // ((athrid_val), (BlocksN, BlocksK, L)) -> (N, K, L) + template + CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy( + Shape_NKL shape_mkl) { + auto layout = TVbNbKL_to_offset(shape_mkl); + return make_layout(coalesce(get<0>(layout)), get<1>(layout), + get<2>(layout)); + } + // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx) template CUTE_HOST_DEVICE static constexpr auto ilvd_NKbNbKL_to_offset( @@ -197,6 +214,19 @@ struct PrepackedLayoutBTemplate { return group<1, 3>(result(_, repeat(result)>(_))); } + // (BlocksN, BlocksK, L) -> (storage_idx) + template + CUTE_HOST_DEVICE static constexpr auto bNbKL_to_offset(Shape_NKL shape_mkl) { + // (BlocksN, BlocksK, L) + auto blocks_shape = + cute::transform(shape_mkl, append(PPBlockShape_NK{}, _1{}), + [](auto x, auto y) { return x / y; }); + auto stride = size(PPBlockShape_NK{}); + + // (BlocksN, BlocksK, L) -> (storage_idx) + return make_layout(blocks_shape, compact_col_major(blocks_shape, stride)); + } + // ((athrid, val), (BlocksN, BlocksK, L)) -> (N, K, L) template CUTE_HOST_DEVICE static auto TVbNbK_to_NKL(Shape_NKL shape_mkl) { diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index 9f9073ded6191..da2c2fb0d3e77 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -8,89 +8,61 @@ namespace machete { using namespace vllm; -// -// Utils (type dispatching) -// - -template -static auto scalar_type_dispatch(ScalarType const& type, Fn fn) { - if (type == vllm::kU4) { - return fn(cutlass::uint4b_t{}); - } else if (type == vllm::kU8) { - return fn(cutlass::uint8_t{}); - } else if (type == vllm::kU4B8) { - return fn(cutlass::vllm_uint4b8_t{}); - } else if (type == vllm::kU8B128) { - return fn(cutlass::vllm_uint8b128_t{}); - } else { - TORCH_CHECK(false, "Unsupported type ", type.str()); - } -} - -#define AT_DISPATCH_CASE_SUPPORTED_COMPUTE_TYPES(...) \ - AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(__VA_ARGS__) - -#define AT_DISPATCH_SUPPORTED_COMPUTE_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, \ - AT_DISPATCH_CASE_SUPPORTED_COMPUTE_TYPES(__VA_ARGS__)) - -// -// Interface -// - -std::vector supported_schedules(ScalarTypeId const btype_id) { -#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 - vllm::ScalarType b_type = ScalarType::from_id(btype_id); - return scalar_type_dispatch(b_type, [&](auto BType) { - return GemmDispatcher::supported_schedules(); +std::vector supported_schedules( + at::ScalarType a_type, int64_t b_type_id, + c10::optional maybe_group_scales_type, + c10::optional maybe_group_zeros_type, + c10::optional maybe_channel_scales_type, + c10::optional maybe_token_scales_type, + c10::optional maybe_out_type) { + ScalarType const b_type = ScalarType::from_id(b_type_id); + return supported_schedules_dispatch({ + .a_type = a_type, + .b_type = b_type, + .maybe_group_scales_type = maybe_group_scales_type, + .maybe_group_zeros_type = maybe_group_zeros_type, + .maybe_channel_scales_type = maybe_channel_scales_type, + .maybe_token_scales_type = maybe_token_scales_type, + .maybe_out_type = maybe_out_type, }); -#else - TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); -#endif } -torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, - ScalarTypeId const btype_id, - c10::optional const& scales, - c10::optional const& zeros, - c10::optional group_size, - c10::optional const& C, - c10::optional alpha, c10::optional beta, - c10::optional schedule) { -#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 - ScalarType const btype = ScalarType::from_id(btype_id); - auto args = PyTorchArguments{.A = A, - .B = B, - .scales = scales, - .zeros = zeros, - .group_size = group_size, - .C = C, - .alpha = alpha, - .beta = beta, - .schedule = schedule}; - - return scalar_type_dispatch(btype, [&](auto BType) { - return AT_DISPATCH_SUPPORTED_COMPUTE_TYPES( - A.scalar_type(), "machete_gemm", [&] { - using ComputeType = equivalent_cutlass_type_t; - return GemmDispatcher::dispatch(args); - }); - }); -#else - TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); -#endif +torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, + int64_t b_type_id, + c10::optional const& maybe_out_type, + c10::optional const& maybe_group_scales, + c10::optional const& maybe_group_zeros, + c10::optional maybe_group_size, + c10::optional const& maybe_channel_scales, + c10::optional const& maybe_token_scales, + c10::optional maybe_schedule) { + ScalarType const b_type = ScalarType::from_id(b_type_id); + return mm_dispatch({.A = A, + .B = B, + .b_type = b_type, + .maybe_out_type = maybe_out_type, + .maybe_group_scales = maybe_group_scales, + .maybe_group_zeros = maybe_group_zeros, + .maybe_group_size = maybe_group_size, + .maybe_channel_scales = maybe_channel_scales, + .maybe_token_scales = maybe_token_scales, + .maybe_schedule = maybe_schedule}); } -torch::Tensor prepack_B(torch::Tensor const& B, ScalarTypeId const btype_id) { - ScalarType const btype = ScalarType::from_id(btype_id); - return scalar_type_dispatch(btype, [&](auto BType) { - return PrepackBDispatcher::dispatch(B); - }); +torch::Tensor prepack_B( + torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, + c10::optional const& maybe_group_scales_type) { + ScalarType const b_type = ScalarType::from_id(b_type_id); + return prepack_B_dispatch( + {.B = B, + .a_type = a_type, + .b_type = b_type, + .maybe_group_scales_type = maybe_group_scales_type}); } TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("machete_prepack_B", &prepack_B); - m.impl("machete_gemm", &gemm); + m.impl("machete_mm", &mm); } // use CatchAll since supported_schedules has no tensor arguments diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 229fd554d3eee..e4cc7ec951848 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -203,13 +203,36 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. - ops.def("machete_supported_schedules(int btype) -> str[]"); ops.def( - "machete_gemm(Tensor A, Tensor B, int btype, " - " Tensor? scales, Tensor? zeros, int? group_size, " - " Tensor? C, float? alpha, float? beta, str? schedule)" - "-> Tensor"); - ops.def("machete_prepack_B(Tensor B, int btype) -> Tensor"); + "machete_supported_schedules(" + " ScalarType a_type," + " int b_type," + " ScalarType? maybe_group_scales_type," + " ScalarType? maybe_group_zeros_type," + " ScalarType? maybe_channel_scales_type," + " ScalarType? maybe_token_scales_type," + " ScalarType? maybe_out_type" + ") -> str[]"); + ops.def( + "machete_mm(" + " Tensor A," + " Tensor B," + " int b_type," + " ScalarType? out_type," + " Tensor? group_scales," + " Tensor? group_zeros," + " int? group_size," + " Tensor? channel_scales," + " Tensor? token_scales," + " str? schedule" + ") -> Tensor"); + ops.def( + "machete_prepack_B(" + " Tensor B," + " ScalarType a_type," + " int b_type," + " ScalarType? group_scales_type" + ") -> Tensor"); // conditionally compiled so impl registration is in source file ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor"); diff --git a/tests/kernels/test_machete_gemm.py b/tests/kernels/test_machete_gemm.py deleted file mode 100644 index 59c0a24753c3b..0000000000000 --- a/tests/kernels/test_machete_gemm.py +++ /dev/null @@ -1,284 +0,0 @@ -"""Tests for the machete kernel. - -Run `pytest tests/kernels/test_machete_gemm.py`. -""" - -import math -from typing import Optional, Tuple - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - pack_rows, quantize_weights) -from vllm.platforms import current_platform -from vllm.scalar_type import ScalarType, scalar_types - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -MNK_SHAPES = [ - (1, 128, 128), - (1, 512, 1024), - (1, 4096, 4096), - (1, 8192, 28672), - (13, 8192, 4096), - (26, 4096, 8192), - (64, 4096, 4096), - (64, 8192, 28672), - (257, 128, 4096), - (257, 4224, 4160), - (257, 4096, 4096), - (1024, 4096, 8192), - (1024, 8192, 4096), -] - -ACT_TYPES = [torch.float16, torch.bfloat16] -WTYPE_ZEROPOINTS = [ - # GPTQ style - (scalar_types.uint4b8, False), - (scalar_types.uint8b128, False), - # AWQ style - (scalar_types.uint4, True), - (scalar_types.uint8, True), -] - -# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel -# unit tests to a common utility function. Currently the use of -# `is_quant_method_supported` conflates kernels with quantization methods -# an assumption which is breaking down as quantizations methods can have -# have kernels and some kernels support multiple quantization methods. -IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90) - - -def rand_data(shape, dtype=torch.float16): - return 10 * (torch.rand(shape, dtype=dtype, device="cuda") - 0.3) - - -def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): - return zps if zps is None else -1 * s * (zps.to(s.dtype)) - - -def machete_quantize_and_pack(w: torch.Tensor, - wtype: ScalarType, - group_size: int, - zero_points: bool = False): - assert wtype.is_integer(), "TODO: support floating point weights" - - w_ref, w_q, w_s, w_zp = quantize_weights( - w, - wtype, - group_size, - zero_points=zero_points, - # to match how the kernel applies zps - ref_zero_points_after_scales=True) - - w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) - w_q = w_q.t().contiguous().t() # convert to col major - w_q_machete = ops.machete_prepack_B(w_q, wtype) - - opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype.id)) - - return w_ref, w_q_machete, w_s, w_zp - - -def machete_gemm_test_helper(a: torch.Tensor, b: torch.Tensor, - wtype: ScalarType, group_size: int, - zero_points: bool): - w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( - b, wtype, group_size, zero_points) - - output_ref = torch.matmul(a, w_ref) - - output = ops.machete_gemm( - a=a, - b_q=w_q_packed, - b_type=wtype, - b_scales=w_s, - b_zeros=maybe_convert_zeropoints(w_zp, w_s), - b_group_size=group_size, - ) - - # Relax atol as our reduction dim becomes larger (more rounding error) - # Relax atol when we have zeropoints since the way machete applies - # zeropoints (after scales) causes noise around 0 - atol = 1 if zero_points else min(5e-2 * math.sqrt(a.shape[1]), 1) - torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol) - - -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("shape", - MNK_SHAPES, - ids=lambda x: "x".join(str(v) for v in x)) -@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x)) -@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS) -@pytest.mark.parametrize("group_size", [128, None]) -def test_machete_all_schedules(shape, atype: torch.dtype, - wtype_zeropoints: Tuple[ScalarType, bool], - group_size: Optional[int]): - m, n, k = shape - wtype, zero_points = wtype_zeropoints - - if group_size is not None and k % group_size != 0: - return - - print(f"MNK = {m} {n} {k}") - - # Normalize group_size - if group_size is None: - group_size = k - assert group_size <= k - - a = rand_data((m, k), atype) - w = rand_data((k, n), atype) - - w_ref, w_q_machete, w_s, w_zp = machete_quantize_and_pack( - w, wtype, group_size, zero_points) - - output_ref = torch.matmul(a, w_ref) - - for schedule in ops.machete_supported_schedules(wtype): - print(f"Testing schedule {schedule}") - output = ops.machete_gemm( - a, - b_q=w_q_machete, - b_type=wtype, - b_scales=w_s, - b_zeros=maybe_convert_zeropoints(w_zp, w_s), - b_group_size=group_size, - schedule=schedule, - ) - - opcheck( - torch.ops._C.machete_gemm, - (a, w_q_machete, wtype.id, w_s, maybe_convert_zeropoints( - w_zp, w_s), group_size, None, None, None, schedule)) - - # Relax atol as our reduction dim becomes larger (more rounding error) - # Relax atol when we have zeropoints since the way machete applies - # zeropoints (after scales) causes noise around 0 - atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1) - torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol),\ - f"Schedule failed {schedule}" - - -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("shape", - MNK_SHAPES, - ids=lambda x: "x".join(str(v) for v in x)) -@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x)) -@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS) -@pytest.mark.parametrize("group_size", [128, None]) -def test_machete_heuristic(shape, atype: torch.dtype, - wtype_zeropoints: Tuple[ScalarType, bool], - group_size: Optional[int]): - m, n, k = shape - wtype, zero_points = wtype_zeropoints - - if group_size is not None and k % group_size != 0: - return - - # Normalize group_size - if group_size is None: - group_size = k - assert group_size <= k - - a = rand_data((m, k), atype) - b = rand_data((k, n), atype) - - machete_gemm_test_helper(a, b, wtype, group_size, zero_points) - - -# Test working on other devices -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_machete_devices(device: str): - m, n, k = 512, 4096, 4096 - wtype = scalar_types.uint4b8 - group_size = 128 - zero_points = False - - print(f"MNK = {m} {n} {k}, device = {device}") - - a = rand_data((m, k), torch.float16).to(device) - b = rand_data((k, n), torch.float16).to(device) - - machete_gemm_test_helper(a, b, wtype, group_size, zero_points) - - -# Test working with a subset of A and B -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -def test_machete_subset(): - big_m, big_n, big_k = 1024, 1024, 1024 - m, n, k = 512, 512, 512 - wtype = scalar_types.uint4b8 - group_size = 128 - zero_points = False - - whole_a = rand_data((big_m, big_k), torch.float16) - whole_b = rand_data((big_k, big_n), torch.float16) - - a = whole_a[0:m, 0:k] - b = whole_b[0:k, 0:n] - - machete_gemm_test_helper(a, b, wtype, group_size, zero_points) - - -# Test to make sure cuda graphs work -class MacheteLayer(torch.nn.Module): - - def __init__(self, **kwargs): - super().__init__() - self.kwargs = kwargs - - def forward(self, a): - return ops.machete_gemm(**self.kwargs) - - -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -def test_machete_cuda_graph(): - m, n, k = 512, 4096, 4096 - - a = rand_data((m, k), torch.float16) - b = rand_data((k, n), torch.float16) - wtype = scalar_types.uint4b8 - group_size = 128 - zero_points = False - - w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( - b, wtype, group_size, zero_points) - - # Construct a trivial model with a single layer that calls a machete kernel - model = MacheteLayer( - a=a, - b_q=w_q_packed, - b_type=wtype, - b_scales=w_s, - b_zeros=maybe_convert_zeropoints(w_zp, w_s), - b_group_size=group_size, - ) - - output_ref = torch.matmul(a, w_ref) - - # Run the model with a cuda graph - stream = torch.cuda.Stream() - with torch.cuda.stream(stream): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g): - output = model(a) - output.zero_() - g.replay() - - # Relax atol as our reduction dim becomes larger (more rounding error) - # Relax atol when we have zeropoints since the way machete applies - # zeropoints (after scales) causes noise around 0 - atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1) - torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol) diff --git a/tests/kernels/test_machete_mm.py b/tests/kernels/test_machete_mm.py new file mode 100644 index 0000000000000..1c6eb2dd9a228 --- /dev/null +++ b/tests/kernels/test_machete_mm.py @@ -0,0 +1,406 @@ +"""Tests for the machete kernel. + +Run `pytest tests/kernels/test_machete_mm.py`. +""" + +import math +from dataclasses import dataclass, fields +from typing import List, Optional, Tuple + +import pytest +import torch + +from tests.kernels.utils import opcheck +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + pack_rows, quantize_weights) +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel +# unit tests to a common utility function. Currently the use of +# `is_quant_method_supported` conflates kernels with quantization methods +# an assumption which is breaking down as quantizations methods can have +# have kernels and some kernels support multiple quantization methods. +IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 + +MNK_SHAPES = [ + (1, 128, 128), + (1, 512, 1024), + (1, 4096, 4096), + (1, 8192, 28672), + (13, 8192, 4096), + (26, 4096, 8192), + (64, 4096, 4096), + (64, 8192, 28672), + (257, 128, 4096), + (257, 4224, 4160), + (257, 4096, 4096), + (1024, 4096, 8192), + (1024, 8192, 4096), +] + +GROUP_SIZES_TO_TEST: List[Optional[int]] = [128, -1] + + +@dataclass +class TypeConfig: + act_type: torch.dtype + weight_type: ScalarType + output_type: Optional[torch.dtype] + group_scale_type: Optional[torch.dtype] + group_zero_type: Optional[torch.dtype] + channel_scale_type: Optional[torch.dtype] + token_scale_type: Optional[torch.dtype] + + +@dataclass +class Tensors: + w_ref: torch.Tensor + a_ref: torch.Tensor + a: torch.Tensor + w_q: torch.Tensor + w_g_s: Optional[torch.Tensor] + w_g_zp: Optional[torch.Tensor] + w_ch_s: Optional[torch.Tensor] + w_tok_s: Optional[torch.Tensor] + + +# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints, +# Ch Scales Type, Tok Scales Type) +# NOTE: None "Scale Type" means the act type is floating point +# None "Output Type" means the output type is the same as the act type +TestTypeTuple = Tuple[List[torch.dtype], ScalarType, Optional[torch.dtype], + Optional[torch.dtype], bool] +TEST_TYPES = [ + # GPTQ style + *(TypeConfig(act_type=a_type, + weight_type=w_type, + output_type=None, + group_scale_type=a_type, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None) + for w_type in [scalar_types.uint4b8, scalar_types.uint8b128] + for a_type in [torch.float16, torch.bfloat16]), + # AWQ style + *(TypeConfig(act_type=a_type, + weight_type=w_type, + output_type=None, + group_scale_type=a_type, + group_zero_type=a_type, + channel_scale_type=None, + token_scale_type=None) + for w_type in [scalar_types.uint4, scalar_types.uint8] + for a_type in [torch.float16, torch.bfloat16]), + # QQQ style + *(TypeConfig(act_type=torch.int8, + weight_type=scalar_types.uint4b8, + output_type=torch.float16, + group_scale_type=group_scale_type, + group_zero_type=None, + channel_scale_type=torch.float, + token_scale_type=torch.float) + for group_scale_type in [None, torch.float16]), + *(TypeConfig(act_type=torch.float8_e4m3fn, + weight_type=scalar_types.uint4b8, + output_type=torch.float16, + group_scale_type=group_scale_type, + group_zero_type=None, + channel_scale_type=torch.float, + token_scale_type=torch.float) + for group_scale_type in [None, torch.float16]), +] + +# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel +# unit tests to a common utility function. Currently the use of +# `is_quant_method_supported` conflates kernels with quantization methods +# an assumption which is breaking down as quantizations methods can have +# have kernels and some kernels support multiple quantization methods. +IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90) + + +def rand_data(shape, dtype=torch.float16, scale=1, offset=0): + if dtype.is_floating_point: + return (scale * torch.rand(shape, device="cuda") - offset).to(dtype) + else: + return torch.randint(-8, 7, shape, dtype=dtype, device="cuda") + + +def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): + return zps if zps is None else -1 * s * (zps.to(s.dtype)) + + +def group_size_valid(shape: Tuple[int, int, int], + group_size: Optional[int]) -> bool: + return group_size is None or group_size == -1 or group_size % shape[2] == 0 + + +def machete_quantize_and_pack(atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False): + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, + wtype, + group_size=group_size, + zero_points=zero_points, + # to match how the kernel applies zps + ref_zero_points_after_scales=True) + + w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) + w_q = w_q.t().contiguous().t() # convert to col major + + w_q_machete = ops.machete_prepack_B(w_q, atype, wtype, stype) + opcheck(torch.ops._C.machete_prepack_B, (w_q, atype, wtype.id, stype)) + + return w_ref, w_q_machete, w_s, w_zp + + +def create_test_tensors(shape: Tuple[int, int, int], + types: TypeConfig, + group_size: Optional[int], + subset_stride_factor: Optional[int] = None) -> Tensors: + m, n, k = shape + factor = subset_stride_factor or 1 + + print("create_test_tensors, shape:", shape, "types:", types, "group_size:", + group_size) + + a = rand_data((m * factor, k * factor), types.act_type, scale=3, offset=2) + w = rand_data((k * factor, n * factor), types.act_type, scale=3, offset=1) + + if factor > 1: + a = a[0:m, 0:k] + w = w[0:k, 0:n] + + if types.group_scale_type is not None: + w = w.to(types.group_scale_type) + if w.dtype.itemsize == 1: + w = w.to(torch.float16) + + w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( + a.dtype, w, types.weight_type, types.group_scale_type, group_size, + types.group_zero_type is not None) + + if not a.dtype.is_floating_point: + aiinfo = torch.iinfo(a.dtype) + w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) + + a_ref = a.to(torch.float32) + w_ref = w_ref.to(torch.float32) + + w_ch_s = None if types.channel_scale_type is None else\ + rand_data((n,), types.channel_scale_type) + w_tok_s = None if types.token_scale_type is None else\ + rand_data((m,), types.token_scale_type) + + return Tensors(w_ref=w_ref, + a_ref=a_ref, + a=a, + w_q=w_q_packed, + w_g_s=w_s, + w_g_zp=maybe_convert_zeropoints(w_zp, w_s), + w_ch_s=w_ch_s, + w_tok_s=w_tok_s) + + +# None stype means scales use the same dtype as a +def machete_mm_test_helper(types: TypeConfig, + tensors: Tensors, + group_size: Optional[int] = None, + schedule: Optional[str] = None): + output_ref = torch.matmul(tensors.a_ref, tensors.w_ref) + output_ref_type = output_ref.dtype + + if tensors.w_ch_s is not None: + output_ref = (output_ref.to(tensors.w_ch_s.dtype) * + tensors.w_ch_s.unsqueeze(0)).to(output_ref_type) + if tensors.w_tok_s is not None: + output_ref = (output_ref.to(tensors.w_tok_s.dtype) * + tensors.w_tok_s.unsqueeze(1)).to(output_ref_type) + + output = ops.machete_mm( + a=tensors.a, + b_q=tensors.w_q, + b_type=types.weight_type, + b_group_scales=tensors.w_g_s, + b_group_zeros=tensors.w_g_zp, + b_group_size=group_size, + b_channel_scales=tensors.w_ch_s, + a_token_scales=tensors.w_tok_s, + out_type=types.output_type, + schedule=schedule, + ) + + print(output) + print(output_ref) + + # Relax atol as our reduction dim becomes larger (more rounding error) + # Relax atol when we have zeropoints since the way machete applies + # zeropoints (after scales) causes noise around 0 + atol = 1 if tensors.w_g_zp is not None\ + else min(5e-2 * math.sqrt(tensors.a.shape[1]), 1) + rtol = 1e-1 if tensors.a.element_size() >= 2 else 2e-1 + torch.testing.assert_close(output, + output_ref.to(output.dtype), + rtol=rtol, + atol=atol) + + +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +@pytest.mark.parametrize("shape", + MNK_SHAPES, + ids=lambda x: "x".join(str(v) for v in x)) +@pytest.mark.parametrize("types", TEST_TYPES) +def test_machete_all_schedules(shape, types: TypeConfig): + + group_sizes: List[Optional[int]] = [] + if types.group_scale_type is None: + group_sizes = [None] + else: + group_sizes = GROUP_SIZES_TO_TEST + + for group_size in group_sizes: + if not group_size_valid(shape, group_size): + continue + + tensors = create_test_tensors(shape, types, group_size) + print(f"MNK = {shape}") + for schedule in ops.machete_supported_schedules( + types.act_type, + types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_scale_type, + out_type=types.output_type): + print(f"Testing schedule {schedule}") + machete_mm_test_helper(types, tensors, group_size, schedule) + + +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +@pytest.mark.parametrize("shape", + MNK_SHAPES, + ids=lambda x: "x".join(str(v) for v in x)) +@pytest.mark.parametrize("types", TEST_TYPES) +def test_machete_heuristic(shape, types: TypeConfig): + group_sizes: List[Optional[int]] = [] + if types.group_scale_type is None: + group_sizes = [None] + else: + group_sizes = GROUP_SIZES_TO_TEST + + for group_size in group_sizes: + if not group_size_valid(shape, group_size): + continue + + tensors = create_test_tensors(shape, types, group_size) + machete_mm_test_helper(types, tensors, group_size) + + +# Test working on other devices +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_machete_devices(device: str): + group_size = 128 + + type_config = TypeConfig(act_type=torch.float16, + weight_type=scalar_types.uint4b8, + output_type=None, + group_scale_type=torch.float16, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None) + + tensors = create_test_tensors((512, 4096, 4096), type_config, group_size) + + for field in fields(Tensors): + tensor = getattr(tensors, field.name) + if isinstance(tensor, torch.Tensor): + setattr(tensors, field.name, tensor.to(device)) + + machete_mm_test_helper(type_config, tensors, group_size) + + +# Test working with a subset of A and B +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +def test_machete_subset(): + group_size = 128 + + type_config = TypeConfig(act_type=torch.float16, + weight_type=scalar_types.uint4b8, + output_type=None, + group_scale_type=torch.float16, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None) + + tensors = create_test_tensors((512, 4096, 4096), + type_config, + group_size, + subset_stride_factor=2) + machete_mm_test_helper(type_config, tensors, group_size) + + +# Test to make sure cuda graphs work +class MacheteLayer(torch.nn.Module): + + def __init__(self, **kwargs): + super().__init__() + self.kwargs = kwargs + + def forward(self, a): + return ops.machete_mm(a=a, **self.kwargs) + + +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +def test_machete_cuda_graph(): + m, n, k = 512, 4096, 4096 + + a = rand_data((m, k), torch.float16) + b = rand_data((k, n), torch.float16) + wtype = scalar_types.uint4b8 + stype = torch.float16 + group_size = 128 + zero_points = False + + w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( + a.dtype, b, wtype, stype, group_size, zero_points) + + # Construct a trivial model with a single layer that calls a machete kernel + model = MacheteLayer( + b_q=w_q_packed, + b_type=wtype, + b_group_scales=w_s, + b_group_zeros=maybe_convert_zeropoints(w_zp, w_s), + b_group_size=group_size, + ) + + output_ref = torch.matmul(a, w_ref) + + # Run the model with a cuda graph + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + output = model(a) + output.zero_() + g.replay() + + # Relax atol as our reduction dim becomes larger (more rounding error) + # Relax atol when we have zeropoints since the way machete applies + # zeropoints (after scales) causes noise around 0 + atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1) + torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b276b8fc25473..aa89010ca8ecd 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -444,18 +444,18 @@ def _fp8_marlin_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, size_k: torch.SymInt) -> torch.Tensor: return torch.empty((size_m, size_n), dtype=a.dtype, device=a.device) - @register_fake("_C::machete_gemm") - def machete_gemm_fake( + @register_fake("_C::machete_mm") + def machete_mm_fake( a: torch.Tensor, - # Should be the tensor returned by machete_prepack_B + # b_q Should be the tensor returned by machete_prepack_B b_q: torch.Tensor, b_type: ScalarType, - b_scales: Optional[torch.Tensor] = None, - b_zeros: Optional[torch.Tensor] = None, + out_type: Optional[torch.dtype] = None, + b_group_scales: Optional[torch.Tensor] = None, + b_group_zeros: Optional[torch.Tensor] = None, b_group_size: Optional[int] = None, - c: Optional[torch.Tensor] = None, - alpha: Optional[float] = None, - beta: Optional[float] = None, + b_channel_scales: Optional[torch.Tensor] = None, + a_token_scales: Optional[torch.Tensor] = None, schedule: Optional[str] = None, ) -> torch.Tensor: m = a.size(0) @@ -463,8 +463,9 @@ def machete_gemm_fake( return torch.empty((m, n), device=a.device, dtype=a.dtype) @register_fake("_C::machete_prepack_B") - def machete_prepack_B_fake(b_q_weight: torch.Tensor, - b_type: ScalarType) -> torch.Tensor: + def machete_prepack_B_fake( + b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType, + group_scales_type: Optional[torch.dtype]) -> torch.Tensor: return torch.empty_like(b_q_weight, memory_format=torch.contiguous_format) @@ -617,29 +618,41 @@ def fp8_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, # machete -def machete_supported_schedules(b_type: ScalarType) -> List[str]: - return torch.ops._C.machete_supported_schedules(b_type.id) - - -def machete_gemm( - a: torch.Tensor, - b_q: torch.Tensor, # Should be the tensor returned by machete_prepack_B - b_type: ScalarType, - b_scales: Optional[torch.Tensor] = None, - b_zeros: Optional[torch.Tensor] = None, - b_group_size: Optional[int] = None, - c: Optional[torch.Tensor] = None, - alpha: Optional[float] = None, - beta: Optional[float] = None, - schedule: Optional[str] = None, -) -> torch.Tensor: - return torch.ops._C.machete_gemm(a, b_q, b_type.id, b_scales, b_zeros, - b_group_size, c, alpha, beta, schedule) +def machete_supported_schedules( + a_type: torch.dtype, + b_type: ScalarType, + group_scales_type: Optional[torch.dtype], + group_zeros_type: Optional[torch.dtype] = None, + channel_scales_type: Optional[torch.dtype] = None, + token_scales_type: Optional[torch.dtype] = None, + out_type: Optional[torch.dtype] = None) -> List[str]: + return torch.ops._C.machete_supported_schedules( + a_type, b_type.id, group_scales_type, group_zeros_type, + channel_scales_type, token_scales_type, out_type) -def machete_prepack_B(b_q_weight: torch.Tensor, - b_type: ScalarType) -> torch.Tensor: - return torch.ops._C.machete_prepack_B(b_q_weight, b_type.id) +def machete_mm( + a: torch.Tensor, + # b_q Should be the tensor returned by machete_prepack_B + b_q: torch.Tensor, + b_type: ScalarType, + out_type: Optional[torch.dtype] = None, + b_group_scales: Optional[torch.Tensor] = None, + b_group_zeros: Optional[torch.Tensor] = None, + b_group_size: Optional[int] = None, + b_channel_scales: Optional[torch.Tensor] = None, + a_token_scales: Optional[torch.Tensor] = None, + schedule: Optional[str] = None) -> torch.Tensor: + return torch.ops._C.machete_mm(a, b_q, b_type.id, out_type, b_group_scales, + b_group_zeros, b_group_size, + b_channel_scales, a_token_scales, schedule) + + +def machete_prepack_B( + b_q_weight: torch.Tensor, a_type: torch.dtype, b_type: ScalarType, + group_scales_type: Optional[torch.dtype]) -> torch.Tensor: + return torch.ops._C.machete_prepack_B(b_q_weight, a_type, b_type.id, + group_scales_type) if hasattr(torch.ops._C, "permute_cols"): diff --git a/vllm/model_executor/layers/quantization/kernels/machete.py b/vllm/model_executor/layers/quantization/kernels/machete.py index e5696d08f30f5..15df0200f30b5 100644 --- a/vllm/model_executor/layers/quantization/kernels/machete.py +++ b/vllm/model_executor/layers/quantization/kernels/machete.py @@ -79,7 +79,9 @@ def transform_w_q(x): c.weight_type, packed_dim=0) x.data = ops.machete_prepack_B(x.data.t().contiguous().t(), - self.config.weight_type) + a_type=c.act_type, + b_type=c.weight_type, + group_scales_type=c.act_type) return x def transform_w_s(x): @@ -105,12 +107,12 @@ def apply_weights(self, if c.has_g_idx: x_2d = self.act_perm(x_2d) - output = ops.machete_gemm(a=x_2d, - b_q=w_q, - b_type=c.weight_type, - b_zeros=None, - b_scales=w_s, - b_group_size=c.group_size) + output = ops.machete_mm(a=x_2d, + b_q=w_q, + b_type=c.weight_type, + b_group_zeros=None, + b_group_scales=w_s, + b_group_size=c.group_size) if bias is not None: output.add_(bias) # In-place add diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index c217f5ca620a1..83055d6000d83 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -126,11 +126,14 @@ def permute_rows(q_w: torch.Tensor, def quantize_weights(w: torch.Tensor, quant_type: ScalarType, - group_size: int, + group_size: Optional[int], zero_points: bool = False, ref_zero_points_after_scales: bool = False): assert quant_type.is_integer(), \ "Floating point quantization may work but has not been tested" + assert not zero_points or group_size is not None, \ + "to have group zero points, group_size must be provided "\ + "(-1 group_size is channelwise)" orig_device = w.device orig_type = w.dtype @@ -140,10 +143,9 @@ def quantize_weights(w: torch.Tensor, if group_size == -1: group_size = size_k - assert group_size <= size_k # Reshape to [groupsize, -1] - if group_size < size_k: + if group_size is not None and group_size < size_k: w = w.reshape((-1, group_size, size_n)) w = w.permute(1, 0, 2) w = w.reshape((group_size, -1)) @@ -155,18 +157,20 @@ def quantize_weights(w: torch.Tensor, max_q_val = quant_type.max() min_q_val = quant_type.min() - if zero_points: - assert not quant_type.is_signed() and quant_type.max() > 0 - w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max() - maybe_w_zp = torch.round(torch.abs(min_val / w_s)) \ - .clamp(min_q_val, max_q_val).int() - else: - # If the bias is such that there are no possible negative/positive - # values, set the max value to inf to avoid divide by 0 - w_s = torch.max( - abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)), - abs(min_val / (min_q_val if min_q_val != 0 else torch.inf))) - maybe_w_zp = None + w_s = torch.Tensor([1.0]).to(w.device) # unscaled case + maybe_w_zp = None + if group_size is not None: + if zero_points: + assert not quant_type.is_signed() and quant_type.max() > 0 + w_s = (max_val - min_val).clamp(min=1e-5) / quant_type.max() + maybe_w_zp = torch.round(torch.abs(min_val / w_s)) \ + .clamp(min_q_val, max_q_val).int() + else: + # If the bias is such that there are no possible negative/positive + # values, set the max value to inf to avoid divide by 0 + w_s = torch.max( + abs(max_val / (max_q_val if max_q_val != 0 else torch.inf)), + abs(min_val / (min_q_val if min_q_val != 0 else torch.inf))) # Quantize w_q = torch.round(w / w_s).int() + (maybe_w_zp if zero_points else 0) @@ -176,7 +180,7 @@ def quantize_weights(w: torch.Tensor, # For some kernels (namely Machete) the zero-points are applied after the # scales are applied, for this case computing the reference in similar way # allows us to use tighter error tolerances in our unit tests. - if ref_zero_points_after_scales and zero_points: + if ref_zero_points_after_scales and maybe_w_zp is not None: w_ref = w_q.to(orig_type) * w_s - maybe_w_zp.to(orig_type) * w_s else: w_ref = (w_q - (maybe_w_zp if zero_points else 0)).to(orig_type) * w_s @@ -185,7 +189,7 @@ def quantize_weights(w: torch.Tensor, w_q += quant_type.bias # Restore original shapes - if group_size < size_k: + if group_size is not None and group_size < size_k: def reshape_w(w): w = w.reshape((group_size, -1, size_n)) @@ -195,17 +199,16 @@ def reshape_w(w): w_q = reshape_w(w_q) w_ref = reshape_w(w_ref) + w_s = w_s.reshape((-1, size_n)).contiguous() - w_s = w_s.reshape((-1, size_n)).contiguous() - - if zero_points: + if maybe_w_zp is not None: maybe_w_zp = maybe_w_zp.reshape((-1, size_n)).contiguous() maybe_w_zp = maybe_w_zp.to(device=orig_device) return ( w_ref.to(device=orig_device), w_q.to(device=orig_device), - w_s.to(device=orig_device), + w_s if group_size is not None else None, maybe_w_zp, ) From a03ea40792201ac8ff547d37d9f9255b347b9ccd Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 18 Nov 2024 15:14:59 -0800 Subject: [PATCH 045/255] [3/N][torch.compile] consolidate custom op logging (#10399) Signed-off-by: youkaichao --- vllm/config.py | 12 ++++++++++-- vllm/model_executor/custom_op.py | 9 ++++++--- vllm/plugins/__init__.py | 4 ++++ 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 14017bbdb3cf2..ea9ec43cc5a15 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4,8 +4,9 @@ import warnings from dataclasses import dataclass, field, replace from pathlib import Path -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, - Literal, Mapping, Optional, Set, Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, + Final, List, Literal, Mapping, Optional, Set, Tuple, Type, + Union) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -2169,6 +2170,10 @@ class CompilationConfig(BaseModel): compile_sizes: List[int] = PrivateAttr capture_sizes: List[int] = PrivateAttr + # keep track of enabled and disabled custom ops + enabled_custom_ops: Counter[str] = PrivateAttr + disabled_custom_ops: Counter[str] = PrivateAttr + def model_post_init(self, __context: Any) -> None: self.level = envs.VLLM_TORCH_COMPILE_LEVEL @@ -2190,6 +2195,9 @@ def model_post_init(self, __context: Any) -> None: func = __import__(module).__dict__[func_name] self.inductor_compile_config[k] = func + self.enabled_custom_ops = Counter() + self.disabled_custom_ops = Counter() + def init_backend(self) -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: raise ValueError("No compilation level is set.") diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 6ae7d7cf6964f..b07966f2ab7d0 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -61,10 +61,13 @@ def forward_hpu(self, *args, **kwargs): def dispatch_forward(self): # NOTE(woosuk): Here we assume that vLLM was built for only one # specific backend. Currently, we do not support dynamic dispatching. - + compilation_config = get_current_vllm_config().compilation_config enabled = self.enabled() - logger.debug("custom op %s %s", self.__class__.name, - "enabled" if enabled else "disabled") + if enabled: + compilation_config.enabled_custom_ops.update([self.__class__.name]) + else: + compilation_config.disabled_custom_ops.update( + [self.__class__.name]) if not enabled: return self.forward_native diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index a0c73a752b5e8..c5182139db50b 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -80,6 +80,10 @@ def set_current_vllm_config(vllm_config: VllmConfig): _current_vllm_config = vllm_config yield finally: + logger.debug("enabled custom ops: %s", + vllm_config.compilation_config.enabled_custom_ops) + logger.debug("disabled custom ops: %s", + vllm_config.compilation_config.disabled_custom_ops) _current_vllm_config = old_vllm_config From 2298e69b5f1dc77f00aee687a3843a4dae12cb91 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 18 Nov 2024 15:29:37 -0800 Subject: [PATCH 046/255] [ci][bugfix] fix kernel tests (#10431) Signed-off-by: youkaichao --- vllm/plugins/__init__.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index c5182139db50b..fdc848cedf054 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -6,9 +6,6 @@ if TYPE_CHECKING: from vllm.config import CompilationConfig, VllmConfig -else: - CompilationConfig = None - VllmConfig = None logger = logging.getLogger(__name__) @@ -50,23 +47,23 @@ def load_general_plugins(): logger.exception("Failed to load plugin %s", plugin.name) -_compilation_config: Optional[CompilationConfig] = None +_compilation_config: Optional["CompilationConfig"] = None -def set_compilation_config(config: Optional[CompilationConfig]): +def set_compilation_config(config: Optional["CompilationConfig"]): global _compilation_config _compilation_config = config -def get_compilation_config() -> Optional[CompilationConfig]: +def get_compilation_config() -> Optional["CompilationConfig"]: return _compilation_config -_current_vllm_config: Optional[VllmConfig] = None +_current_vllm_config: Optional["VllmConfig"] = None @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: "VllmConfig"): """ Temporarily set the current VLLM config. Used during model initialization. @@ -87,6 +84,12 @@ def set_current_vllm_config(vllm_config: VllmConfig): _current_vllm_config = old_vllm_config -def get_current_vllm_config() -> VllmConfig: - assert _current_vllm_config is not None, "Current VLLM config is not set." +def get_current_vllm_config() -> "VllmConfig": + if _current_vllm_config is None: + # in ci, usually when we test custom ops/modules directly, + # we don't set the vllm config. In that case, we set a default + # config. + logger.warning("Current VLLM config is not set.") + from vllm.config import VllmConfig + return VllmConfig() return _current_vllm_config From 90a6c759caf84ff7722449a33895e397ccf1a2af Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Mon, 18 Nov 2024 15:39:14 -0800 Subject: [PATCH 047/255] [misc] partial prefix & random input generation benchmark (#9929) Signed-off-by: rickyx --- benchmarks/benchmark_prefix_caching.py | 116 +++++++++++++++++++------ 1 file changed, 91 insertions(+), 25 deletions(-) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 6d33096ca1d11..5e9381f712e10 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -54,13 +54,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None): print(f"cost time {end_time - start_time}") -def sample_requests( +@dataclasses.dataclass +class Request: + prompt: str + prompt_len: int + output_len: int + + +def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str: + vocab = tokenizer.get_vocab() + # Remove the special tokens. + vocab = { + k: v + for k, v in vocab.items() if k not in tokenizer.all_special_ids + } + return random.choices(list(vocab.values()), k=length) + + +def sample_requests_from_dataset( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: Tuple[int, int], fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> List[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -77,31 +94,55 @@ def sample_requests( random.shuffle(dataset) min_len, max_len = input_length_range + assert min_len >= 0 and max_len >= min_len, "input_length_range too small" # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_requests: List[Request] = [] + for i in range(len(dataset)): - if len(filtered_dataset) == num_requests: + if len(filtered_requests) == num_requests: break # Tokenize the prompts and completions. - prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids + prompt_token_ids = tokenizer(dataset[i][0]).input_ids + prompt = tokenizer.decode(prompt_token_ids) completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue + output_len = (len(completion_token_ids) + if fixed_output_len is None else fixed_output_len) if min_len <= prompt_len <= max_len: - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_requests.append(Request(prompt, prompt_len, output_len)) + + return filtered_requests + + +def sample_requests_from_random( + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + input_length_range: Tuple[int, int], + fixed_output_len: Optional[int], + prefix_len: int, +) -> List[Request]: - return filtered_dataset + requests = [] + prefix_token_ids = sample_tokens(tokenizer, prefix_len) + min_len, max_len = input_length_range + + for i in range(num_requests): + unique_part_token_ids = sample_tokens( + tokenizer, + random.randint(min_len - prefix_len, max_len - prefix_len)) + prompt_token_ids = prefix_token_ids + unique_part_token_ids + prompt = tokenizer.decode(prompt_token_ids) + prompt_len = len(prompt_token_ids) + assert (min_len <= prompt_len <= max_len + ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}" + requests.append(Request(prompt, prompt_len, fixed_output_len)) + return requests -def repeat_and_sort_requests(requests: List[Tuple[str, int, int]], +def repeat_and_sort_requests(requests: List[Request], repeat_count: int, sort: bool = False) -> List[str]: repeated_requests = requests * repeat_count @@ -109,7 +150,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]], repeated_requests.sort(key=lambda x: x[1]) else: random.shuffle(repeated_requests) - return [req[0] for req in repeated_requests] + return [req.prompt for req in repeated_requests] def main(args): @@ -117,9 +158,12 @@ def main(args): input_length_range = tuple(map(int, args.input_length_range.split(':'))) random.seed(args.seed) if args.dataset_path is not None: - print(f"Start to sample {args.num_prompts} prompts" + if args.prefix_len > 0: + raise ValueError("prefix-len is not supported when " + "dataset-path is provided.") + print(f"Start to sample {args.num_prompts} prompts " f"from {args.dataset_path}") - filtered_datasets = sample_requests( + filtered_requests = sample_requests_from_dataset( dataset_path=args.dataset_path, num_requests=args.num_prompts, tokenizer=tokenizer, @@ -127,9 +171,22 @@ def main(args): fixed_output_len=args.output_len, ) else: - prompt_len = len(tokenizer(PROMPT).input_ids) - filtered_datasets = [(PROMPT, prompt_len, args.output_len) - ] * args.num_prompts + print(f"Start to sample {args.num_prompts} prompts from random") + filtered_requests = sample_requests_from_random( + num_requests=args.num_prompts, + tokenizer=tokenizer, + input_length_range=input_length_range, + fixed_output_len=args.output_len, + prefix_len=args.prefix_len, + ) + + # Print some helpful stats of the requests. + print(f"Sampled {len(filtered_requests)} requests.") + prompt_lens = [req.prompt_len for req in filtered_requests] + print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") + print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") + print(f"Min Prompt Length: {min(prompt_lens)}") + print(f"Max Prompt Length: {max(prompt_lens)}") engine_args = EngineArgs.from_cli_args(args) @@ -137,8 +194,8 @@ def main(args): sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) - print("Testing filtered datasets") - prompts = repeat_and_sort_requests(filtered_datasets, + print("Testing filtered requests") + prompts = repeat_and_sort_requests(filtered_requests, repeat_count=args.repeat_count, sort=args.sort) @@ -161,20 +218,29 @@ def main(args): parser.add_argument('--output-len', type=int, default=10) parser.add_argument('--num-prompts', type=int, - default=1, + required=True, help="Number of the prompts sampled from dataset") parser.add_argument('--repeat-count', type=int, - default=100, + default=1, help='Number of times to repeat each prompt') parser.add_argument('--sort', action='store_true', help='Sort prompts by input length') parser.add_argument('--input-length-range', type=str, - default='128:256', + required=True, help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Specifies the length of a common prefix to be " + "added to the input prompt. The input-length-range will " + "subtract this length when filtering prompts. Only used " + "when dataset-path is not provided.", + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() From 284203f171d86a9581295436d6175246215437fd Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 18 Nov 2024 15:04:25 -1000 Subject: [PATCH 048/255] [ci/build] Have dependabot ignore all patch update (#10436) We have too many dependencies and all patch updates can be a little noisy. This is to have dependabot ignore all patch version updates. --- .github/dependabot.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4f54eea564ecb..683b70cd89989 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -15,6 +15,8 @@ updates: allow: - dependency-type: "all" ignore: + - dependency-name: "*" + update-types: ["version-update:semver-patch"] - dependency-name: "torch" - dependency-name: "torchvision" - dependency-name: "xformers" @@ -24,9 +26,6 @@ updates: - dependency-name: "ray[adag]" - dependency-name: "lm-eval" groups: - patch-update: - applies-to: version-updates - update-types: ["patch"] minor-update: applies-to: version-updates update-types: ["minor"] From 7eb719df13cf8059485f52648a6a115700158301 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 19 Nov 2024 11:21:42 +0800 Subject: [PATCH 049/255] [Bugfix]Fix Phi-3 BNB online quantization (#10417) Signed-off-by: Jee Jee Li --- vllm/model_executor/layers/linear.py | 12 +++++++++--- vllm/model_executor/models/phi3.py | 10 ++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e1f8a6e36d781..9da38d4857d6d 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -470,7 +470,8 @@ def weight_loader(self, needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) if loaded_shard_id is None: - # Loaded weight is already fused on disk (qkv/mlp). + # Loaded weight is already fused on disk (mlp). + # (e.g., Phi-3's gate_up_proj). if output_dim is None: if needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( @@ -480,6 +481,8 @@ def weight_loader(self, param_data.copy_(loaded_weight) return current_shard_offset = 0 + use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", + False) shard_offsets: List[Tuple[int, int, int]] = [] for i, output_size in enumerate(self.output_sizes): shard_offsets.append((i, current_shard_offset, output_size)) @@ -495,7 +498,9 @@ def weight_loader(self, # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) - + if use_bitsandbytes_4bit: + shard_size = loaded_weight.shape[output_dim] // 2 + shard_offset = shard_size * shard_id loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) @@ -808,7 +813,8 @@ def weight_loader(self, needs_scalar_to_array = getattr(param, "needs_scalar_to_array", False) if loaded_shard_id is None: - # Loaded weight is already fused on disk (qkv/mlp). + # Loaded weight is already fused on disk (qkv). + # (e.g., Phi-3's qkv_proj). if output_dim is None: if needs_scalar_to_array: param_data, loaded_weight = adjust_scalar_to_fused_array( diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 34141511ea791..54158bc141235 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -14,3 +14,13 @@ class Phi3ForCausalLM(LlamaForCausalLM): "gate_up_proj", ], } + + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_up_proj.", + ".down_proj.", + ".qkv_proj.", + ".o_proj.", + ] + # Initialize an empty dict when there is no stacked parameter mapping. + bitsandbytes_stacked_params_mapping = {} From 8c1fb507052d385d94ac49a7388fd6db5d0069e7 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Tue, 19 Nov 2024 11:22:26 +0800 Subject: [PATCH 050/255] [Platform][Refactor] Extract func `get_default_attn_backend` to `Platform` (#10358) Signed-off-by: Mengqing Cao --- tests/kernels/test_attention_selector.py | 19 ++++---- vllm/attention/selector.py | 56 +++--------------------- vllm/model_executor/models/molmo.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 2 +- vllm/model_executor/models/utils.py | 4 +- vllm/platforms/__init__.py | 1 + vllm/platforms/cpu.py | 10 ++++- vllm/platforms/hpu.py | 6 ++- vllm/platforms/interface.py | 19 ++++++++ vllm/platforms/openvino.py | 8 +++- vllm/platforms/rocm.py | 14 +++++- vllm/platforms/tpu.py | 12 ++++- vllm/platforms/xpu.py | 12 ++++- vllm/worker/enc_dec_model_runner.py | 3 +- 14 files changed, 99 insertions(+), 69 deletions(-) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 169ce040d370c..d37f95d48d5b2 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,6 +5,7 @@ from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import which_attn_to_use +from vllm.platforms import cpu, cuda, openvino, rocm from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL @@ -19,26 +20,28 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.current_platform.is_cpu", - return_value=True): + with patch("vllm.attention.selector.current_platform", + cpu.CpuPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.current_platform.is_rocm", - return_value=True): + with patch("vllm.attention.selector.current_platform", + rocm.RocmPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": - with patch("vllm.attention.selector.current_platform.is_openvino", - return_value=True): + with patch("vllm.attention.selector.current_platform", + openvino.OpenVinoPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) + with patch("vllm.attention.selector.current_platform", + cuda.CudaPlatform()): + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == name diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 664707e9dc65d..d263839705690 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,4 +1,3 @@ -import enum import os from contextlib import contextmanager from functools import lru_cache @@ -9,26 +8,12 @@ import vllm.envs as envs from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.platforms import current_platform +from vllm.platforms import _Backend, current_platform from vllm.utils import STR_BACKEND_ENV_VAR logger = init_logger(__name__) -class _Backend(enum.Enum): - FLASH_ATTN = enum.auto() - FLASH_ATTN_VLLM_V1 = enum.auto() - XFORMERS = enum.auto() - ROCM_FLASH = enum.auto() - TORCH_SDPA = enum.auto() - OPENVINO = enum.auto() - FLASHINFER = enum.auto() - HPU_ATTN = enum.auto() - PALLAS = enum.auto() - IPEX = enum.auto() - NO_ATTENTION = enum.auto() - - def backend_name_to_enum(backend_name: str) -> _Backend: assert backend_name is not None @@ -216,40 +201,11 @@ def which_attn_to_use(head_size: int, if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) - if current_platform.is_cpu(): - if selected_backend != _Backend.TORCH_SDPA: - logger.info("Cannot use %s backend on CPU.", selected_backend) - return _Backend.TORCH_SDPA - - if current_platform.is_openvino(): - if selected_backend != _Backend.OPENVINO: - logger.info("Cannot use %s backend on OpenVINO.", selected_backend) - return _Backend.OPENVINO - - if current_platform.is_xpu(): - if selected_backend != _Backend.IPEX: - logger.info("Cannot use %s backend on XPU.", selected_backend) - return _Backend.IPEX - - if current_platform.is_tpu(): - if selected_backend != _Backend.PALLAS: - logger.info("Cannot use %s backend on TPU.", selected_backend) - return _Backend.PALLAS - - if current_platform.is_rocm(): - # AMD GPUs. - selected_backend = (_Backend.ROCM_FLASH if selected_backend - == _Backend.FLASH_ATTN else selected_backend) - if selected_backend == _Backend.ROCM_FLASH: - if not current_platform.has_device_capability(90): - # not Instinct series GPUs. - logger.info("flash_attn is not supported on NAVI GPUs.") - else: - logger.info("%s is not supported in AMD GPUs.", selected_backend) - return _Backend.ROCM_FLASH - - if current_platform.is_hpu(): - return _Backend.HPU_ATTN + # get device-specific default attn_backend + default_backend = current_platform.get_default_attn_backend( + selected_backend) + if default_backend is not None: + return default_backend if use_v1: return _Backend.FLASH_ATTN_VLLM_V1 diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index a7c90a3f5031b..2528f741864b3 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -13,7 +13,6 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.attention.selector import _Backend from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, @@ -38,6 +37,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer +from vllm.platforms import _Backend from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) from vllm.transformers_utils.processor import get_processor diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index a929b9323b245..0ac81387b1bd8 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -39,7 +39,6 @@ make_batched_images, make_batched_videos, smart_resize) from vllm.attention import AttentionMetadata -from vllm.attention.selector import _Backend from vllm.config import VllmConfig from vllm.distributed import get_pp_group, parallel_state from vllm.distributed import utils as dist_utils @@ -65,6 +64,7 @@ from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs) from vllm.multimodal.utils import cached_get_tokenizer +from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 03226f42ee053..2ab9b19e22068 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -9,13 +9,13 @@ from transformers import PretrainedConfig import vllm.envs as envs -from vllm.attention.selector import (_Backend, backend_name_to_enum, +from vllm.attention.selector import (backend_name_to_enum, get_global_forced_attn_backend) from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors -from vllm.platforms import current_platform +from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 9e740837381f8..1f68fc2e25df3 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,3 +1,4 @@ +from .interface import _Backend # noqa: F401 from .interface import Platform, PlatformEnum, UnspecifiedPlatform current_platform: Platform diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 42bee31dfb0e9..f9a34a47959ec 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -5,7 +5,9 @@ from vllm.logger import init_logger -from .interface import Platform, PlatformEnum +from .interface import Platform, PlatformEnum, _Backend + +logger = init_logger(__name__) if TYPE_CHECKING: from vllm.config import VllmConfig @@ -22,6 +24,12 @@ class CpuPlatform(Platform): def get_device_name(cls, device_id: int = 0) -> str: return "cpu" + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + if selected_backend != _Backend.TORCH_SDPA: + logger.info("Cannot use %s backend on CPU.", selected_backend) + return _Backend.TORCH_SDPA + @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: return psutil.virtual_memory().total diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 170cfff94f90d..1e0888a30ba96 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,11 +1,15 @@ import torch -from .interface import Platform, PlatformEnum +from .interface import Platform, PlatformEnum, _Backend class HpuPlatform(Platform): _enum = PlatformEnum.HPU + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + return _Backend.HPU_ATTN + @staticmethod def inference_mode(): return torch.no_grad() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 970c0d1be617e..f4849fa2ccfb0 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -11,6 +11,20 @@ VllmConfig = None +class _Backend(enum.Enum): + FLASH_ATTN = enum.auto() + FLASH_ATTN_VLLM_V1 = enum.auto() + XFORMERS = enum.auto() + ROCM_FLASH = enum.auto() + TORCH_SDPA = enum.auto() + OPENVINO = enum.auto() + FLASHINFER = enum.auto() + HPU_ATTN = enum.auto() + PALLAS = enum.auto() + IPEX = enum.auto() + NO_ATTENTION = enum.auto() + + class PlatformEnum(enum.Enum): CUDA = enum.auto() ROCM = enum.auto() @@ -71,6 +85,11 @@ def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend): + """Get the default attention backend of a device.""" + return None + @classmethod def get_device_capability( cls, diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 31fe3f1fcbfe4..ad69ced5417b3 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -3,7 +3,7 @@ import vllm.envs as envs from vllm.logger import init_logger -from .interface import Platform, PlatformEnum +from .interface import Platform, PlatformEnum, _Backend logger = init_logger(__name__) @@ -11,6 +11,12 @@ class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + if selected_backend != _Backend.OPENVINO: + logger.info("Cannot use %s backend on OpenVINO.", selected_backend) + return _Backend.OPENVINO + @classmethod def get_device_name(self, device_id: int = 0) -> str: return "openvino" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index fd8afc92b0f28..022256996f97b 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -5,7 +5,7 @@ from vllm.logger import init_logger -from .interface import DeviceCapability, Platform, PlatformEnum +from .interface import DeviceCapability, Platform, PlatformEnum, _Backend logger = init_logger(__name__) @@ -19,6 +19,18 @@ class RocmPlatform(Platform): _enum = PlatformEnum.ROCM + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + selected_backend = (_Backend.ROCM_FLASH if selected_backend + == _Backend.FLASH_ATTN else selected_backend) + if selected_backend == _Backend.ROCM_FLASH: + if not cls.has_device_capability(90): + # not Instinct series GPUs. + logger.info("flash_attn is not supported on NAVI GPUs.") + else: + logger.info("%s is not supported in AMD GPUs.", selected_backend) + return _Backend.ROCM_FLASH + @classmethod @lru_cache(maxsize=8) def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 643db835c85ff..9057afb6514e4 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -3,17 +3,27 @@ import torch -from .interface import Platform, PlatformEnum +from vllm.logger import init_logger + +from .interface import Platform, PlatformEnum, _Backend if TYPE_CHECKING: from vllm.config import VllmConfig else: VllmConfig = None +logger = init_logger(__name__) + class TpuPlatform(Platform): _enum = PlatformEnum.TPU + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + if selected_backend != _Backend.PALLAS: + logger.info("Cannot use %s backend on TPU.", selected_backend) + return _Backend.PALLAS + @classmethod def get_device_name(cls, device_id: int = 0) -> str: raise NotImplementedError diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 106e8eddf458f..d0b3dca9a4195 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,11 +1,21 @@ import torch -from .interface import DeviceCapability, Platform, PlatformEnum +from vllm.logger import init_logger + +from .interface import DeviceCapability, Platform, PlatformEnum, _Backend + +logger = init_logger(__name__) class XPUPlatform(Platform): _enum = PlatformEnum.XPU + @classmethod + def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: + if selected_backend != _Backend.IPEX: + logger.info("Cannot use %s backend on XPU.", selected_backend) + return _Backend.IPEX + @staticmethod def get_device_capability(device_id: int = 0) -> DeviceCapability: major, minor, *_ = torch.xpu.get_device_capability( diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 82824faa6629a..687d2cc79360f 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -8,7 +8,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID -from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, +from vllm.attention.selector import (get_env_variable_attn_backend, get_global_forced_attn_backend) from vllm.config import VllmConfig from vllm.forward_context import set_forward_context @@ -18,6 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) +from vllm.platforms import _Backend from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceGroupMetadata) From 74f8c2cf5f6a34fd21cfbe6d72bcc1b2a2a6754a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 18 Nov 2024 23:37:46 -0500 Subject: [PATCH 051/255] Add openai.beta.chat.completions.parse example to structured_outputs.rst (#10433) --- docs/source/models/structured_outputs.rst | 98 ++++++++++++++++++++++- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/docs/source/models/structured_outputs.rst b/docs/source/models/structured_outputs.rst index ff4ff7169fc5f..484e1f17d191e 100644 --- a/docs/source/models/structured_outputs.rst +++ b/docs/source/models/structured_outputs.rst @@ -10,7 +10,7 @@ This document shows you some examples of the different options that are availabl Online Inference (OpenAI API) ----------------------------- -You can generate structured outputs using the OpenAI’s `Completions `_ and `Chat `_ API. +You can generate structured outputs using the OpenAI's `Completions `_ and `Chat `_ API. The following parameters are supported, which must be added as extra parameters: @@ -137,6 +137,100 @@ It works by using a context free EBNF grammar, which for example we can use to d The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py `_. +Experimental Automatic Parsing (OpenAI API) +-------------------------------------------- + +This section covers the OpenAI beta wrapper over the ``client.chat.completions.create()`` method that provides richer integrations with Python specific types. + +At the time of writing (``openai==1.54.4``), this is a "beta" feature in the OpenAI client library. Code reference can be found `here `_. + +For the following examples, vLLM was setup using ``vllm serve meta-llama/Llama-3.1-8B-Instruct`` + +Here is a simple example demonstrating how to get structured output using Pydantic models: + +.. code-block:: python + + from pydantic import BaseModel + from openai import OpenAI + + + class Info(BaseModel): + name: str + age: int + + + client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") + completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, + ], + response_format=Info, + extra_body=dict(guided_decoding_backend="outlines"), + ) + + message = completion.choices[0].message + print(message) + assert message.parsed + print("Name:", message.parsed.name) + print("Age:", message.parsed.age) + +Output: + +.. code-block:: console + + ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) + Name: Cameron + Age: 28 + + +Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: + +.. code-block:: python + + from typing import List + from pydantic import BaseModel + from openai import OpenAI + + + class Step(BaseModel): + explanation: str + output: str + + + class MathResponse(BaseModel): + steps: List[Step] + final_answer: str + + + client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") + completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful expert math tutor."}, + {"role": "user", "content": "Solve 8x + 31 = 2."}, + ], + response_format=MathResponse, + extra_body=dict(guided_decoding_backend="outlines"), + ) + + message = completion.choices[0].message + print(message) + assert message.parsed + for i, step in enumerate(message.parsed.steps): + print(f"Step #{i}:", step) + print("Answer:", message.parsed.final_answer) + +Output: + +.. code-block:: console + + ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) + Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' + Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' + Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' + Answer: x = -29/8 Offline Inference ----------------- @@ -170,4 +264,4 @@ One example for the usage of the ``choices`` parameter is shown below: ) print(outputs[0].outputs[0].text) -A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. \ No newline at end of file +A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. From 272e31c0bd8640c15e85211c74fc9b428ad86902 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Mon, 18 Nov 2024 21:57:10 -0700 Subject: [PATCH 052/255] [Bugfix] Guard for negative counter metrics to prevent crash (#10430) Signed-off-by: Travis Johnson --- vllm/engine/llm_engine.py | 2 +- vllm/engine/metrics.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 9a2d73a020c8f..e72dc81f35b67 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1716,7 +1716,7 @@ def _get_stats(self, # not counted (to avoid double counting) actual_num_batched_tokens = scheduler_outputs.num_batched_tokens # type: ignore - num_generation_tokens_from_prefill_groups = 0. + num_generation_tokens_from_prefill_groups = 0 # NOTE: if scheduler_outputs.num_prefill_groups > 0 and # the len of scheduler_outputs.scheduled_seq_groups is != # scheduler_outputs.num_prefill_groups, this means that diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index e896bcdded2d1..47472c274ccb6 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -512,6 +512,11 @@ def _log_gauge(self, gauge, data: Union[int, float]) -> None: def _log_counter(self, counter, data: Union[int, float]) -> None: # Convenience function for logging to counter. + # Prevent ValueError from negative increment + if data < 0: + logger.warning("Skipping negative increment of %g to %s", data, + counter) + return counter.labels(**self.labels).inc(data) def _log_counter_labels(self, counter, data: CollectionsCounter, From 382b6a4852b9afc9a740b02736688e20f7d58446 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 19 Nov 2024 16:54:58 +0800 Subject: [PATCH 053/255] [Misc] Avoid misleading warning messages (#10438) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/chatglm.py | 5 ++--- vllm/model_executor/models/qwen.py | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 625e31bb0d368..2ea592aaba9f9 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -575,8 +575,7 @@ def forward( return hidden_states -class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP, - SupportsMultiModal): +class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -695,7 +694,7 @@ class ChatGLM(ChatGLMBaseModel): embedding_padding_modules = [] -class ChatGLMV(ChatGLMBaseModel): +class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 3978c176a2144..44ce6eda42943 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -870,7 +870,7 @@ def dummy_data_for_qwen( return DummyData(seq_data, mm_data) -class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): +class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -1024,7 +1024,7 @@ class QWenLLM(QWenBaseModel): embedding_padding_modules = [] -class QWenVL(QWenBaseModel): +class QWenVL(QWenBaseModel, SupportsMultiModal): packed_modules_mapping = { "c_attn": ["c_attn"], "gate_up_proj": [ @@ -1062,7 +1062,7 @@ def get_mm_mapping(self) -> MultiModelKeys: @MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) -class QWenLMHeadModel(QWenBaseModel, SupportsLoRA): +class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ QWenLMHeadModel is not only applicable to LLM but also to VL, which is not conducive to the current integration logic of LoRA in vLLM. Therefore, it @@ -1083,7 +1083,7 @@ def __new__( config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return QWenVL(vllm_config=vllm_config) + return QWenVL(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: - return QWenLLM(vllm_config=vllm_config) + return QWenLLM(vllm_config=vllm_config, prefix=prefix) From 5390d6664f65d84f37a5fb524e967b01baad9100 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 19 Nov 2024 04:52:11 -0500 Subject: [PATCH 054/255] [Doc] Add the start of an arch overview page (#10368) --- .github/workflows/png-lint.yml | 37 +++ .../arch_overview/entrypoints.excalidraw.png | Bin 0 -> 123422 bytes .../arch_overview/llm_engine.excalidraw.png | Bin 0 -> 178116 bytes docs/source/design/arch_overview.rst | 274 ++++++++++++++++++ docs/source/design/class_hierarchy.rst | 74 ----- docs/source/design/plugin_system.rst | 4 +- docs/source/index.rst | 2 +- format.sh | 4 + tools/png-lint.sh | 15 + vllm/engine/arg_utils.py | 2 +- 10 files changed, 334 insertions(+), 78 deletions(-) create mode 100644 .github/workflows/png-lint.yml create mode 100644 docs/source/assets/design/arch_overview/entrypoints.excalidraw.png create mode 100644 docs/source/assets/design/arch_overview/llm_engine.excalidraw.png create mode 100644 docs/source/design/arch_overview.rst delete mode 100644 docs/source/design/class_hierarchy.rst create mode 100755 tools/png-lint.sh diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml new file mode 100644 index 0000000000000..4932af943a07b --- /dev/null +++ b/.github/workflows/png-lint.yml @@ -0,0 +1,37 @@ +name: Lint PNG exports from excalidraw +on: + push: + branches: + - "main" + paths: + - '*.excalidraw.png' + - '.github/workflows/png-lint.yml' + pull_request: + branches: + - "main" + paths: + - '*.excalidraw.png' + - '.github/workflows/png-lint.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run png-lint.sh to check excalidraw exported images" + run: | + tools/png-lint.sh diff --git a/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png b/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png new file mode 100644 index 0000000000000000000000000000000000000000..bbf46286cfe5d0820e4183827f9c2b852b005b4b GIT binary patch literal 123422 zcmeEu^)B1}casAfPfRAfR-IqR2?cD2;T7l(blg0g8-tNOyONib#%tbSm9F z#1QX3eg(as_j&I7AGm)I{it*1oa^js@3q%j`|_5Tl{|8g>fo+jyN*cRlu+EYYk%;r zU3=6H?1R6#^I5DEe%o!UD0y{PS_944u3hxIq$I?Z?`uu=99maV+T5Cx8$(l`@Q%C3 z6eW1A@Fa>k0DaMq5qPD8`TJ}emUC%+XU=oF;XBQQmp8+E23+xLDMT*I(>-Qf#K**T?fwrxX!ag_ zZS3HG?(l#5kH^I5kI^^Qy?U^V=m7n{|Iv>Ui|TmsLgznxHF9@k_1$~^)8*jHw1^mj zOA8L%_>Yf++@kQmzq?=m`rljpw@dloRs6q${J*jIZ_~!8-rT{!a7M-7#8J5|l#$w7+e)VKb*j3IKFx`{6d ze_!SaTt@k#E$Q~X(9<29?98p@vFtYP%CpF}D&J_$v+AQ=cU`FGt=MAID9AYD(0&(_ zroo@AoamQ$!WOs4ntMV+Rajsh%%Uz?5(P8_z?|vjszMUSfbV>Zz;sss!zdIfMlSoG2 z7|LwY@M!zp=?8rnt`B(>Gyn0)Aq)IuM@7w$evZRbXHeN78z$urM^QYEhS+&&++d)} z*D)(mCHY?9(hg+ z!yP#$%@0N2V(&C~OI1rxopv33BWYNmYj}7ot$3OP``N+7pvIpBlhOQ7+$cfZIB4BA ze6oGF^bf>Nz3hQZf>i;yY{E%gsgCXRn3V=eVxU<8dCFob6RX z?=(CGXG%CT94yp&WUu{N9a&hN4kyhZioaGz;lOKSpQ~>~{=5tFm6ZWs3)mwSNv)~s z`Em7=%$O8or8wy;sZp^gxkzXpc%0_-N61q&i(8+cWF4oGqlXLjGe=T}BH^>OG2=1S zrtOjYh*aR~cx$Se7p-%G8RcixMF<6PF;dCxHG-Dm#on7-Mst-0rrSI1f$xK|uMt=D zp2`GsRm0bh*czkt(u=1BCc5&i(72#JS@=udbhm)p=~@=3Tkq79oyza) zoc;0Xkp=lZQ|FV5qm7@?0c>HP6~5&{Ua+S7yC)H|gas|+-FakJ=1BIr+i4mmhbY+& zlhsl4n14Ox?JKhO^*JSb_?3bB98ZW2?)Dd5xvZK+-)r8nPEg6xz#|`162nFNoDuxu z#@3RgQv1c1&I*P07vu7V{NeZPR@?SNAyHER`|WV~gTr*cqh>z+1AFdpA*+@eeUUX|gRGh?oV|_$)D^n(fr*eZxG|WJB zrGJm$;OlEmPR~vY=fOtlptdUcA>_5XFp}+-mWO3gh*2Vp06%8YaWuzoSp}e55dQgzLN%HUGUBlbM?J_M2Q6 zmXpj-KSHd^#$3rtN9A)(RYvX;L2B{rCSEjmS^42r`k*tt+BRM zYFs?B|GW=|+BWbvBnK^v255aS?aVQWiXT~_LEf?YzkA1r$-b3gk5Kc}MIOIkl)w=D zcAzM|Z6d9h?PB$GXAV)OYPMn8kI%?A@EobT&Sb)1fTctU@wtHXHMqcLnrwn23yswa$y@c z^J{q)?T3VHDY~(!WX(QW_Dyc?XXPW8>svC{c6qL&UXRu)kAyjfzoO!Pj`MwHMIBCZ z{87`Hziw8Q0apD58>74STu#; zaG>5~R23We#9{6 zIR*KcGgVQU$fT#p9zRz_*VUP87KR=$eWIJF7@H#DO;)}!>Mlr_%$6ClKXl?^zt9Z` z9{P>F4wLP7A@9o>M?&r>6`uXDhh(XuqMWin=))P$-+}ULLlr`<9q!$!@f z&Dw;hGyPnc&5|Vgyf1{Ga*C2;QODAs5_Yz$ZQ)P7F~Ju#dHCCa_%X(WHFycmUZ&jH_pf#uc#VYSnq|{`_c-hsOMXD zc>3+j?hksskJ9t(Ujg?z4suHK;>XLEcCz#Z&$)pr^>nRL#pgtdt+qX zpznyyhP6ABqLBpPfPz{1BCaPVVkSV_{q!V`5HGq>?SCicI(y{jO``}m!-=-^e2Ek$-ZNM`(rP0S!efD6eeP2A{y0E7k^`d)rsLledG62ZbQRf~jfS6O z(=e8#bF&*A$X*qLGV5v|ME%#ABpwG4LS@vj10W^NWdOv8j4?hjmD4Pn9WLzBdD|(GG~c4@Bm*{AQB{RYeo+NJMMbh1x#8paHK zJ4FsJhanP|PeRVv{=sw4loy@*>@>INJKS(kzgvj)pEZF$Jq9RCL+0~or~Z{3ljm2! z63OHoId%C1Y$iR(J<>TWJ~+K!`K;x(_8I2gZ>z4!jV^l8+`IYOwTbp8u!mpWJ?T8w z8yRIojUieNX~o!BoR{=@Ik}g$&34#;21guD;^eetW4RrUkf$y{4%*VRHQatlr`aNf z{R7ywX-!W;=E^E_BPibB^4D5B!v$%7;@kUcYez0spKmfD3ThUU*B_SqAdik)2cPK){$-Gi)B?u01=h+t)Zp#!r@@)@Ti4$Cq_P{A=Q2e zbrU-GmAlsImK|A@n+p+J#Omsh%x=O9%*qKmLp9h#S_g!N{FNpj9%4JN=|S~Z z{ERWf%4wz_9{7cGk>q#pxcsVo>gZxe^86-YCLdskZZL`{`o^oQ{j~S#Zy7F>H(%5) zD``pB?n6Krq}F_Z6~Q%n{{T&0sLMCY1@!?RIy|a%uB>>RNkyH>WQt8=NL<5R;F6x z#6py~Ktr5Xsk6hE(?)luI&)JE>w_zpKTDg7Zmnm#Z7j9nH$ks`#Rnx)Su z`{c^`2V(o6W4RgeBF|>vh6~gv^XcwF{BVfX4S@{!dEnu;H6u20!yl2Y~MWT0Tu7`i4Ad z4zx0-xxD}&v;!2A-emSzLr@pKl2Usxcvf_}VBn!*A3!qyGq!bCDk9J+BLStloZ~+^ z?htbztzNHIiLdBpyO!IMN{E2HsXUQjkj1x1afcR_+o**Ti?6oVi~hlbe27IW4$_?h z)^siM?x&}5g8{cfxJ(q~EkoX|5~Y`K7WLRQ%a{nhfsR-0Eu)My%e&mR7{AmYH6j7bD*@`Z{=OvN9$g!^@kd+lZ_Kc#UO`796AfMAr7KSFp zS)lB+{w1mqOc0i~Pj9Dh5h8XatKZsOE}wc#E;~CK_RWqWP{=9Eex_G=>S}fi9Jvp$?d4iQiK2l8W z6-%9tf8<72-=<--q%W!M{RF+78mHO*3II7+`6_m`oUhLK$sn^1OzOp%?}=}F?$xp) zp)Wu3Y@B|Iwajj;Nhf8kBU8^Sv2F#Rh4FtUSK)0(#nG>aYfcCn3W0Y-oH==mNXD{O^B3%JMuH*_D0!ll5|?ZpwhwGP`Wp+Y3I zrE4bmt=<3R$w{PT)ogp)@f%=>iIB}oaiwdQ_a3BV3-z^YGD|97pMSxsUDhqKwQk#0 z>f-pizi}%cUUvVX0VO$M?)Gb4C=x!$X$`Z!GB-7u3{0w8oW5inZ|cG6p5pvhbfO}1 zaZRuqk(Y{aGfC6Q5F*_sCAYPl3|@b0coF9&t?fL9A`tqvXcrbWi|j}+lakn#@8fnT zZ{80oq^jlW zY&wsLi>}Qy#oijJWPRH*G8pu_H{f>lkYle?m6pp?ZjN)_SZpZ2?Qje)ZSD-u;((8tO@uj`~prigv@o#1tqyyz7&t? z87udI3>!IVRqcmLRZdg{2E!`|^DX>d4O0d4*K^mO53Y9W9uZ@S+?)?69}Mp9a7^*b z{p58tAD~cGs4tm;{Y0xwGJk~o+8B?`&(AqjZ)bGuj&3fcdmO=c*VV=i_Cjxd?$7a)jWz~+Yg=ln3MWwA0xB3 zu8mThh>2gZGN=ceWOd~yr~U6}`f9$rO(NTm5zvG~%>_b- zQA7})QzKLN3A`5OJB=YPThla)btp`gS?Hr_qOBgLN(XUN988M#6%iP|oHLT=*XGTG z;>^`ms6HR2gnF!kUdijS%tr8+V>*L!2{x%AY)CxP&yQI+OtyMn-Kw-@k=)Ea8}m5= z-mRAI^V?(`DpQ}1iXJA*E0bC7KjYZ_!{v@Vt5`W5`Z7S{PQuEbqez{~HEI5mR=&BG zquG&6X=_*)cz58-s|DvV`olgV(Tg3;0J31=Gy>V?m#zqrH_gmXpmId$|(4?#{YOy6c6-4R<)^{`fQhdUnm2iB)Ft)26i3nhw2>SYYxAA?Cfp<66O|sdbY-dq z>Ss?s0rb%Y3u?h_`u%IJa(-I0pIO>e(=GPgDmssDZlS54b~8fjv!6_O*?QX0fx~%@ z3+t;3snBR>4Yay{V?H029STcdRRIC1$!Ca#Q~rVH=zH7`-CI$D4air+2EP0K|uF7Ur_)P(N^~(k}aO!Q8dQLKfc@ z(Zn(lBEptqV{RiY%&kzPf-ncpwC9-4K+8yif8Ih{Q#KW36rQ=cK_JlL)m@=VJ#4C& zL*t7WLa;I4DZfLA9mKE|bGfZ*WgE(jG!+MFG$<`kAK~rp|LA9QWzT-nx+`;A=^h)k zmJprkHczDbox^m>b*5~SC>HL9XsLPU7kOy+S(U9CW)NwNNE|19m zJZ*@=!+$@uT;^-)?o4u>Me7p}*2?#GxN3D&#iRXQkm@P$TmQQr*$xgIHQ$^hW>5X}Rb+3|% zyM^j|kL*#-*!y!oD|qM$TPK(l2lKZ?Tl9up|7@uP1;7?MD4g53-B8`UhGdtr?;1sR zCfzNWg)$0+52n6Gsc{NFFZP6>N0Lq$mSFNNHKF5DFUg=cNnvl->_mWOeF(23iT!gM z&BjFiuC=9!M`kHaT?!1&Tvn)hO|<|v&8t`v$1y?GE6|`GsCgEj_1PuLwlP;_?7Nj;!QPfj4BLwfbHYZW8da9bzxf2>;9Z` z_0@6~^gdK^eYkKJaGQFN(@nDED2hCKqMgs4Bd;}CkbdUYkv$d^X%v_D`ex-wxzEd6U4C`#F5rW};pDfTwe+I$oDkI4C?&Y2~sK}UTWYxFnPfkv(H zt=j-cjTRqUlr~daiffug`z7eFMqPgq8sERr z^w>833ei$=v`Y2e;M(4KX6kTK11s)UAapoPNL))Nqo40aa~Yu#gbu_y%k=X_yP%`E z$1Atbg@ltNxkim(vVu2DV*}*b3i^p^G%p+qa{r?jG~Iw)+Np7H-%kHpa0ns&9cq~3 z`RF25>Wdww(8Y>v4dk;2zCSO5LjGAu7n;I2iXUt;+Nb0~cvuY+@c7B@!fe|_lYoWk z7Yb3TBqPeasiF3)USTFI!ES!&x$dQ66(IWse>^5nfofe5-djYMqiYFf6{NCvIrh03 z)xKlRhN4_K(Xcv$W$>7-*dK-&N>-YxPob1ONWnbl>LWqqCK^pD6o9ETjacmNgk)*F z+~cTy?%K=6_6_Y_&yw^~+i9=pN?c9;7)^Dn-F;EcjWS4hsr;`ubVlE>^?`-ii z!r8v^S1rE0JbmTD+)#jar%8O+uu!9%$SQ;3L)oDhDqr|c%dq4uJ-H9%{cWRLnwx2C z0_J5=j#+wX>)P_n3|a8WD1CvtRoiG7=GLa|434Nk%Nfaazm%Yyj z&3z_+Gh>iSd3v!>1DHv{@r39}xU#JAJ|Xg-3kJ=c74{QvR+zV_-+Ra{PWK4<^|`g# zl6mbs`vb8eNCdJ0EERm)B*N~ap2gEewT#k%{WcPtB*HT#3)0DF^tiDH`|@$AFY4fq zQUFNvKV|N)*<2kdwj&o9j@Vj{-&nU64H6q z!ocP`GX(=aI+bcsERS*}7YD1~S^)LbknBgE`2t%Y6Vuo2Bf}8nQOlZt2Ny1EJGlnT zG>Lnk2L86(my(}LiP(BhqlO%ajn9Yb=Vq2T9->b`^SSWeN3QKr4v{vyAwy@JY#3F0 zibmmlewuz7jxK_kZokhXqpRyIZ)I?g(OBlnhzvc6Qs*V}^_4ks?n8ffP^$_E*HtoJ zcX}ISXo)4S=KN%(4)!BPuO&@dBYgp#+udXeGf*&ir`*>2wl>@NGZ$h1cU`B{JP{BF z)N>wiK?ODkzljtK)dp-`OYyr}bQ0Kt_e!F7AI7IpyhJuqjACqf@@5%#9@#UHYyR=3%r<7l3(T=4;=h#LumIgof|3 ziV}%Et!m@JtM;L3$=How6TM?tCv5K3SPRnkUY5Yj#d4k(F?yJ z5rRi?of0AHLKc~3Tu-qhZce#KJEzhUcSvHt!)MO3c$GTXHn?0LM=R0aKl25N5ZKmDS>Dk>X(!vZHp_lM_M-}5*#%7 z>(iG}RIoS(KpxnR%Nm%p_}*#`$8{2%w&H~EBXo3#uuI-8N|Gv9pxQAd_vaoHY^M5w zRp3QoG(#|_Xq`vjr~Pm`Nr^`tDK(DtTPjR7m$SPX!fTc1zOlBKq%5IpsZD#Pq|f2R zNb4aQ^x$I?bOAI!vHY}2!s)rrmo;y5&WMigJ3>XwV&#S@W6>g5UPvC}R3xP38#jJ5 z(QvMojBsNwnSBV|n2_JyBeX*8X(jXMnKHZjrS2CkY{iot)FQ40j(r5u{-9$aY>;i$ zcK1Q+^=-LkbN~3VDIm|dKBs>lRBHVQBZ$aCpJ{+WwFH(lFU;o4X>mXL8i^SEcUOcZ z0l-{gItu!j3u?DYjS9=8MkkQ+9=G4ANEu?~F{j&1Bltk@rTT4i56w2 z!Rg`|(oNy3fBHf(dQlJo8su(=6rTGhBa2}UT-2r2@q6MNs@F-X~egz7})_pr4UATN*vfA6`7R3KtB&t8 zU2^gwJyYWHy=Dnm4i`E~{H{{zsK?1<$@|l~k2a6k;)IP}QS*}88Yk#c@t7NT9_q?x zI-c}=KR-8i0HZpl@1NNAT%y$Oo=)`!kO3Vaj$3?qu)9oYm30%r%#1R@d}q?J-Gmw8 zNxRqAUTdX&n;Cf{NjrAzjG%<~GQ3}&i_76R7kaWgxEoU_eV!gZO=fB=M6R1{(8YNd zSgeXerV5N9OIGxyH-D@N(v9C3qTjAVKn^xZLs|;fJoa=)^!`kK9~y{B5CNW1zRN0a zL-S)VwYW#9H70FfIo~JN;aBDc!k!kfH>NOT`y47x0$wNkZdv;~+wU%MUxL>msvdDv zrpXw1P;;9OQiWfzfvDg{X}Se>VP|bB@9U(G=}?KM#d!xE2fp9)8be^2NO=i zx&t*W@lmVq+In~yQ!%fW=D&dQJ~f`4?&Er}s0=+ilCz&s_SOWtMryCsQWZLQ`SzWH zci4|zFZ9zK5rIm4h zgVakRTNV>2F*ina|FbAxG0+@#I2!~}{!ZTdsUL2!mk#F67%3k^`{$Xq zFkfVM`ABj>qQxS7sX0ZJv$-sqO-AYmP^eGPaS0x&SxWO3+F4dmh#ep!1!K?zF06wE zKu+A*_jR^3dj;36DA8?hJ%cP-TRtS6I~ZQ4Z`(a*?<&5CyVWk01kx}A4G!aZNpl4oO6+hb$N zq%`bVt3uFm{gL^48`n#VDkG@^Eo92rW(-ciD zTQ(ug)KA%JWa+8gL#V@sc#@F&wjvcv(i#Hi<6W3^2Q{%2b<2MWxbQp# zZ57|zC^N-b_Qg@WLiW!H_1uE}$ex{yeDCB5-6*e@8TUZTAR^?PFF6$jh(d}Y>3g1Jm|<%H%Q zQ!ptqRN1aF$bAlv-XU;R|mSb`X!coNJHd(^T!eK5(G6t z)MTLIS2Lk6y>hf|{AQa1pMuxH<4YXPHkF0BG#{aX(qbNqtIxU&_^TlszN zg43I^3NyFpD$Ba&+w5men6J|%m*gbTJeDD2sZMc=WKmA&=ZuN@3_ho3E%#M}k+NI? zZ5Qh*Q$fe+oVT(9)rq*v*aP<&KgZYS8C^{|o$QZK&F>xG_$lpL*(cczRJgd*)OjDi zkIV8u|MY{$Zc;<4FtKGibzmFk^)t@4m2Yx4x#?k_De8)`1nOb!?Wr{lC03`o#{^lX zs#8kcj$R4~QxYw6u`J)(Bw&)1nflH;Rk4GoMlH{L%(eUV*=y2-(Ks_gmFVVO==Sm# zW}EmMCl|83wdfU0#`Sqp4O>keQtvOCl)5ZCFkef3y!Gt(dDo%_Zk{CUc(LOQDY_OR ze6GwLF6(qZ`-?pm<)RC`TumnL4m8aN$FTtS8 z?g)yEkBaAO57{bFX#aj(qo*9<6kBKAZF5i5ZU!ER*fXbs~QYmN|1Qxerfx#kf+zqC2b zda~S<3~dXIKVQ==ZiSr71n%Y0#`Cw6GrSE;#To(HU5q=SN~c%llM5Ebw!-rJ1iNIQ z1gScT>?eNW6OCfDbZ1SV;WGnBadz|N7VlM;?g`G@sNT0diyhxV>+u&Fcq>e~_Q$aG zRTSxYak++Z+f6WvC`~)htcZXoZrOaNW|7e1h&E?_|9(Rmg!J`G3UeV}K+!>XO!xi# z63MPp6Y1raAavS3+uS+}eLUj2;6D?6ue)f!Ur#TB`lZy{TEWZ+*=+*ipEka!1SdjB zS#WBGK;R6}4`^r&k6WXe1C}_enNrhEp?e||?11!RKePO}wT#v_zHgQ)l--FYNRaTg zwU#(7XxujPlHrO^s_~qlp%}@IpWnnK-5IfV`$1o)g@=dN?lE^^{2Vbib3P|Z9mikl z|9UcxgTUKk_cqLFIH+2LVq*s%fX6fgTwISy#&(qDn_(Y7xu1kAXD@J`>iL#EB^>m~280G(a7XNytM<=Vb1D8CUKig-*J1tRzj`tTiPQ z3Q^J15N2BA^4UCoN=Xq}7Kx_MTRLVQs<|Td+LiWpvfG z<)5dfJHPnz1@q}qTXqT(BkWo^!ghQ-#QsuUO{C71Z-?dsbJs=Un_QCbZ_;gdbz;wn zUH#b9mXcc1b3_jsa}Pb(z2{u)X#u;0T;s-b$`K;5UMicbfbmG}X(Jw;WY?O^9hnFA z|89PdLkNy3RF4;|g2|ZFgXj-tds8t^Es0812Txua?e@Zd2o-cpvvq~Q$F|$|@Nxxt z4})*&G4r)4rOn^$2Fu9ANfv7``H7>v5<>8dZ8I;-YFSCB7#l;=u*AyZpA0J3M6wCf z#q90TJeY;T8Moz>;hu>WRRU%pS3VN%T~bu{SKn1o?|*Tp?t$yzB-o(C zcKlfSuOHxQZlj`dntbq*yZbn5G7n=5s5lK@+J5BBFH@Gi!{RW&ja_f0tgp80r)oZY9pF_VO8jeaB8*8gpJO3FREQ9Gk%fS8do_xACHz)fit?O*XV;p7ro`{VPEp6aFIjMS8e!~PC==Uiz zn9RJBH%H8qXfBSNp&onD#satxPE=8ICX-Fd=RmU;MJa`T-RO5~d6x~smlUrnONHpW zF{Y{KH{_^zn$^~yg(q)KQN@5UvrQ?U!E#}^zL&&JW#ZW~(Z}EVldSDGqXmS9-+#vL zwGyD>6=}de&qN1ikAso;(FW+gq<#&BQ`NL(%Ud2X;hsqhO`zw-kLa8j`C~(dEJ%vD z;)VYThY!$L7?3kcTT{&?C`2Ph7VGErR^sS`O#J@FwyIA?oJmm;%2YgBu=RrzH7uEf zam+eXZq$scfn7q2SD$F5JZoFL?ha2p@MP#t5?`x$B6_sy%ze_`w*uI5fszuSOLDnE zcvi&Nqieqoc{d1ue%Ydv#Tl4X@NQp8zvgx;ylT9}2p8@El;HiaKGc2Sj4|czZUNuu z8EYhL1>QB_on))88O&ag4L#L114`%f;&G=fSyw#KNYJqW(F0$B&gyze?%N5EJWKXY z>lhl-ys!*LPHwk`razhC*v(wTL;u2dM^B>Py?ctf7}8%n?L8LUxSjR8{bA5OlFY;= zY)Is$8!KCYOymGC1E8nZ0XB&9E7gj!Hr7@WWlK!; z^uxV}oPE!??k}P^-IWv#0V9nlKTS?jIG_HxC?)!1sbRh zg7n9C)Pa~64{)YY`csm@Y$xwlW!jF&3Xa9R{YXno6aN@XCIl|&n+XvcM_YSy+ca!n zDR}^Ol?|##7ZcALdh8&LsC$VLC5Me0$vG`A5P&kqTav2|9op3555&fR+isC9d1n~!ZHzzXG;gOlAR{uAd}wnnSn z9rM!G5jKl&)PAIxo?%N~`4qYV!eugI((!L1eN=U2yD;t|dJl`?r8}_0$Ix35yJD7O z&z(+WA4Mu`zhv?yEd4sTwjPOM83bm_m&&n=h=*e~o@_tj41RP}gzwJ#dHKI4)CLV{p} zEZ+~}<_MVg$>VI`JHKMphKE22uF>+cPzb2&3QL|eTrPQUrR z7wtGzxLGJ>Ao+XNEnBAQ6qgp>%=Qu|+j!w9%`bs31EE-Sf!0HtWH!~Y4&_6 zYjB>v-l6QHrE&`rz2Jn>xSK;z(};7|eB^q>z0f_iU6^rdh2HC1v@=2`em(Dd*v&O%Pi~DuwOT+N zM{ZiOqUKsH(=c?9SuirePbXqKEG-CXQ`ms*A!U=hs0*6GwOWIBF_)OHgrzjau`S&5 z{v`xv$Vkbg9=Po^4#dHF>nmo`lG%@kw36-gUfl(;5(665%-tj*w<@1ArM|ZOV&1AM zjPgywZ}2}~usa9b@wLY>U+F~VLe4U+u6-t#lPFuZD-lP=C9X_8N|_k!8Wc5Zpn=E? zZF^rt=i1PgZ`G%x{o~FnqlX04*LU#qr4nVL#{koh6r5K=u*fav#nIy`j*C0&DGcHG zKR|Zce_bH%0A&}%- zkI9TaR(aV%#DzH7D!{GUiLpTYiMbQw=a9^r_L;4DnUa-@TO#9+7XVa}C;cQF>K;9` zL#fgWBGWu0_6C^Wk@xxuQMM6|XR27TR#OLN958XxL4gvTN0p`2`r_uh)iSBA%H{gp zmwj)ObeFFWvCC=}qzj)&tq^nf0jW+ZY_-FdG1>uRN+tRg5GWuuiQ|;>kQMy?H z@l%o^x?bwX+<8GrGla}NNiA~1Kg|MBQz-K923rO7ULKLpTBX}s@%mLWt$UQzG?PW$ zLbQ(Yfyefquypd{PbsF#vHLa{Sj6L4DHh2d;saZmmoj85gY^Q3LryrXh7JbxyQ8v( zf{s~zNy0~Nl(K`GQw*V*3ehb~q4aZUVqqE0=X>hPxgQoBi>KqKj{2#1R~pHF{w+&@ zVyYhxC}FZBt!~?eo+VFDAr&BPg) zcNIf*k_ccu(iT;xVEJmlC0BAE?ZZ}~_btx!iLzBdAD;r$tIYuE;Xn?9=L<`-(?U^u za;m0EmwTxdtQs8`fC_xvE6OLWI-bEP|5Iss=C0S7li7zhWG^bDjPEN;_G!sP+avtQ zFu(ujBO#$yO)*j#TMi|Zmad__9$Oocbp_$#cZW;13Ujox&`p%AYTu0xlvj@1wj1?z zfN^?%^~YbEC>AymkVoZnU#MGQB(3ks=&}@r!6G3xARTGyeBSBAgum!SUu~iM$m141 z8cX-L+*J*yGK3yhS(DgqtNrLBh51y7-n=V*Q0rcL*YBXW|JG8e7Wx_U6{_^H7c;T9 z0vIipq8idmoxz(;W_7WA!|mKqT&~;-)bFX>ky)fy#%OqeA}IdwLq6haPb(az>ieWf zl@9)k*G1D)VKo(r#6_6FCl<`-(_>7SLoVf_*MSLKv)J2@bk5rDtFfhkAV|{LlsnwR z&TEZvBTNNnJn7Ox{^VdgsyOwTDmBgkgneF#GZKYGgX+?bCY@rWko0Gw|~T7+41 zja4e2gGm|T`SgSm$^I!+$TaKZx}PvR>N$4ua90l15(rc_0_#44(-}}6dYr7R;$_d1 zDn2y*o}hpjW($M~jLLfU2`w)HF1lu@5x{Vj4~{6VQ=bs;1_EE*ka1F9t*?j}MW&S- z+I&aZt>1G$t#zpK9|44N#PJoG^+V;?xbo4lE*!g98#a8iY;9hNAH4K@v?pfG{K^O09Q(}M zTI#5(z$N$doifg%1R31%vmNX8f32d+6Q&mBqCu!L`TF{-L&aXoJK%U~=IGEUb6urB zPMglPTs9FOzz*|wI*9Q|K(k3+uMom3)vPV;y^rv0BzQkR%8G8-jg*~aRnyP%ZMo0L zXSb1wIzvwo zyuGEj`NnGh1 zLW*M;|JwuNFW;8cRu;u(vGq8P1}tm=JW|UNTA0zc^ax=Xlw%_{p@{C=0-=Nvwc46- z_gWYH>RuhcPBXD;)azp4_F>UQp0cGO+KLE&|%ODL)Zl@qdl>Dl1~%A{~-8b zl=Wiib~-lCN#Jopd)fjzejD4wPLd>qW?2P&whJx8Y|+?%R}d#t^_V6w0cJw1?&6C4 zK`eI5L5Ot-hMxL0yp=6g2uCQ{H28*ILNbfyHfj1icF1a~rzcadD(009bh_du-|S4( zY~qOD$*C9FCFYuUG>yFO(^c_NBY7-s0&bXmXix^7R!1dj4Qq?#B1G^&J@Pfu1-7?p zWkG^*`^e7C7bX-yk5bP)M|jEEaG<9iv|s&BkIwuY*HQ8t){)H8qnu2tLywo7VsNqd zWpKfW^Xw2|Y6gIdf&sF3f&fai218ypH<2MdL#5*z!UKu`=7^&Kp0Lok?XayL^oiPG4)B9R8u6|z2*ZH zXqQoyO7>Ol?mJlvz$f_8Xy0KzhN;y~DAYpK(*U!{{df7aDQ77q-+9~eak4rnB{QM# zvWqnHSb}~8K6lZ=xg!$P#WJrUlM=t9&b!a1`@OxZweKs5G-_Ij-0l}5G^n^%o5&LutTSGPDDzp28eRz5OTmZi$FWmUS=8hP+KM@Zi5Zy2&U zwZN7T=4?8WF>F&Nr}u?Uk_(~7)??Jqbr|%jwWMIp3B0J996#>#+Zzdsh`CEoR`2I$ZwF9uilw>j z>VDHn2i#nh_nV^hz&?A`K7PIuPN&;=8FZ_#$ch3_67WOznuST# zmCR?ikQo^;R??1Ne{|UzWV8cXo3k<(D+Sx27(Oh;HUBuBv2_QmJ6D^TWrg=BHu~RJ zTAjR)^06wO%G2R4uo?(DrhN$}Y5D7$jI>u)(EO64m8jCk@vLIcPcV_KPxchI*=bM2 z`nh?G_qhTN_jZXO_zZyeC|@xay-IuIZ_Tt26vN@+ouMC&n_}J^s5-@t9JH!;9qjq8 z>I)I(DoM&DcyvP?kM8GZI&`|In%sl3D942;eY|I!TIC6L?-J9NBps|WM;9_C3BMdT z52{QpG{Op37crO?e*#3 zelQqjk^iYck`HGKFj};(ol_(nGEM&onU~XqaG248NVPSyW(Ysg#$Kh$^dm-Jj=2-~ z5MyS!P)ZqtXSwXQQ7m;>zE-WWGz-KAlavkSn`IX$2UnI+eY|q11<1Ib3A;RMl-Iny zUpkT_Qz6xTnJH{4D@&VjX2Rgf#kVes>NEbtOTYfCdASgIU6eE9pxqI~m*#APq7KR1)0jv9 z_jzhj$cSYuquWm2U^*u6R|wKR|GQ76Kb}&XgUKOI_>x;N*{!voFy{8fVLUOOG4$I4 zKW}E{-h1eT9PCNnW$TGWwSa;taroxZqyxJ{&44LjZhUD(eOUCrfr z@W!5+?4dF2vvkwavaZXGk?JjbhH}xgx_hvJfr{nJb z3qFIL;G_7qYzLp6Ai2mNMdl0Vc%;GLJ5&3N+)#L~VE`I} zGn^eMO|uBOa$}mGHWs_42E}Nsa$j5AptdrL3jxgFh<&d)5qqM^C|Irw=DZ+ps#iR> z*gz~dIf^TRdOjpI;w+7789Eh_T%}}lYy9AtosUt@!IhjK0O=Eq*VQb%7i-a1CfrU? zaVeW7D7kzbFKlcj&WoEUYb<2 zOb*di0bH2T)%kQE5o+Z!(O#|oYC0({qXM{>DWGUx#&J6Dy~`V_x9lR1AQx?{Xv*O7yd9>?YQ^ zI3ofMDO3CaPn!43`gje(%hR0aJebaAd4D@jq{{r775BKC&6inB2uM2d>J4d2Ak%Cz z@?Ch!UEfrf*g?q>5HB_8M+E}(0oZDO`P`bgrPe6>>W${bolOQ4et%@5T+YN}M=Mr|Ol zJ$^cey%Zk^!vJFAd$Y!ZUWbI&)F-2*fd;)cH5ERd9EH;_y?NcN>?y0f_-~=Wvv1k-nCy4M}TqQ#H!ch;sV-jlI@f2Sb@&G~a zBh(jjO(mvgO(28U)Sm0}((Q$|@%5qn&2a2UL>^11RnZvZ1bD|_YMIf>a$m#q9k=%D zbReWhl56Yr|1xkqlhpx#KIHz}NS8L?&NyvIHHGcGFE zeW0HqrnrhY9;%`b?D%EiW!gXhds8><**1`Xv1nDTA97L#w^?gDcwK*=2Sf(EokoUH z8d)t&=jMPF<(xLPLFb@~VhMO*WKp`%hFKjtc_{-#j~fE@pq#c3fTH_-SzSOoNLVDni-r2?i#ppxt{e1JizACAK~ z1ej^02|O4^$iW6s_MQsFlGP-&B1fkPB9q0G1yI%v;5>pXpg$jvzkRfw8i87SrU=Zf zfAHd8=K&#i;`sr9jU}_!RMT)>5IKzTTB12vIwqUmv1?m;(YY7gx`+r8V&MmnDHMD? zZ(lops_T&`a_%RhFhHNC_U7DmcS7S9=Ao*n!tLi@-+J3B{wdliNPo~05Kz5S!G2{L zccR~NjNq9VtUcnmR7$b>2FLD9{h|iU!P+r-Yn9R^FQgPO}I@gir>i$AJX(mk;4L*?0~HWND5w^z2y7e|<>@ z{q_+b&B`DZ8p6R!HqV!iQ2iEW$1aoA%S!)qiql^oJeCOW`X>L_zaDY=Rl^hRMh%3i zFh=X&Lpc*Cs2N9}P-!I19^I;z9ldq?Ei=C00Bz@0H&2>aJ{3u3vf^I?|@pxQ9T z+3!kn9WYluiChipQ?9yYVda=S-q=C++<_f3^`Gl`-ULcjvPR12*Yj=v5HmbD3HaZ% zEzJU9h(F#>wV+^bbbROs9KS^;3UOA2g_g_o+Kr9B*?fIb? z&YcKC9)zBAZ)d)h6=LV5TL0`#K84IYJwHOd2=bFU$RnYoa7Iqy8TXYR+lOoY%QgR( z6b{$a+Y9Fv=)%DZj#M72^FtPh@SC9{9>0vfo0g!Q^%C9klf$A3Ik|wcPHM&)1iLwu>sE>O1{>uYH zZgEFwTe>t3+a2Ea*F)bswgSrLl)sX!O`dA5DHhA7*~u@`_yK@JBM5CG@rW#`zxUt? z>*uKd8ScA@$&hBD;H>c9fv}6{G}C362%2b&_G)fND0pO`LdjfN91a)Ahu$9K^mZ8V z;rl1){_97*fen=Heogy763+?amLx1f)R;{EKla`_D(kF!9~PubR0O0I5RsB@xD^p; zX=#w|?v@c$y1Ppnq#H%)uA2rWZn`_)bI%B)GtX~5&sy)l-@j(oLOz^x_CEXUxc0R{ zLuWC7^FwJla*}~@RslY9d^NDte_5l*f8B807i!=TfB5WPis`SLpc~)lgH4 z8Ratl88m9)i<-kDSNM}C!Fs-3YLfOIKOo4CrIoC}0I37||b-i~WUE@uya{0ka#grs@2P_X(7~UrYwX7B}k;CxR zX)&k8)Wmat!tZ!-ofK;|@UHSKxrem%mfxS!saKZ*lC}T&n=`3?jGt}@p7t880(5FR zJIGVfa>P85=HCnF-|ZFZ&H^1Iz5p@V#hnQG&B(&<_9nPBs}{{SfBS2Y=NN#>q?yht z9;#PzaIa9MtevrH?v{Z~zOhEBVF++H+K&j~#BEo~^XLCHN_##lNh0X`{aei#yKCdn zM3;5$tSwHH;m+rQBujX4tH~g@`OSDZ&uFZ0w5&L1RN({HKNt4}y!<$@W|`l=_ZdB4=Z^MaUHyj zllkv{HNE-gVWZ-w z{4ShK^gn31-Qc8dfRdx%ItOsG6b6v;4jY+%SOXEpCAa6Vzcd3Lz|kH*!{4M&k%k1= zB}_n{)IO072;@xxU{j+d{m~)F4_06@)%Jpy-?uW01mFb|03z0E7VV$Pcn!e=2<2R# z!DE0U32^FGDKiHABeQvcOq7eHTeLkQTNFF0Tt|7{O=Y7nK3> zjq?KkeM!zqU3eD{=(U*p7R&{x&VIFwW3SaZFR=h1NP_ob*H;Zn;Qgu`+qs^KY$kdo zCV+k@XQ)7ZoSjd47GxMi;Mvia_f^sNfX(BRZfR=!?aF{pv@!t=-RX$XPyF?d23!2; zyQ!S5i~;T#J-8!|O@s66U;Ohhw^QIdcI=B``E|*2OkS697~2w@Ul(+UUH^}--UUbo zbvHm0V}1#aJs|=&Up2Vbse_c1REa{&8mK7z45)rR6>t)85axN^ekaMV>)H;1fJ<&g3;}-Eyl|kT4aord^x*7pC|w^?e$}}@tp!8_ zo*|%f>b~T|{7Vk$ue3m7q1ObWvtu6{rIEP4L5o>pCOTVh=I z2zh^%#_xyrQ^8~2`pH_<7dL5jV zlV>;WKkvgoS9;A$0v-&vDFywV?fQSNz=;B!)s0?m>i;f)HXs1jqh)6M|9)0C;0YWA zo~+*=k^i(U7G)rS2b%oNf&UWYKi53x1{duKPFg|j6O^sZ&9Nx=*5gAii`F@D7OoK&0PZzIMcV|o@ z>&`TEPYrs)bFzB4#%jY_(ycMxjffPxR{Oum>fM`bUJDPC2mbr1Y8fCPcG+wEFZ+M? z#1mqJ*Squ!HSPaHX^Kj&dFk{1PZuDS0+P&)v1t3>Rdqz!6H=1!KPiM(T*E*_9`*lz|LyT9dQD;f(@n)gLSqv_NMQW0e;$Ds6W6eO z=YP_E44@}zo>Ec&RZl$=kkQ!2o&Fc|hl>pKLhKFw7ekzI>TG_HjjA_j8F1tD1{rzf$cjtlmWc?|8yZXW0=4Hk(x-uXE2Vt{G% z~JBF0dRF1{w4`~D|xt=-u z^k2XK0uQW2&z_E^o%A!`&nJ_D1W}Gg`j$ma!st8+j@{TZ3QO?ky~eHn?fX!9HUcaV z35D95P7yyUB<_3kJ3q5@x`;9z`8@z`q)&(yiBfj450{_#O5r-gge#ct`7gvRIX z3YY2$nL+f$A=#`vocVmlc(IoJgjAFN0cz6G=f)_!LP}5121L+HY|Lm*OJj^pTv{ovZJZ^L({1y%7)v&!&hA4NB5`X-m9n2=VARgS9@b+K9h}`O z-}Dc0h+HByY8E?%88uqIAd%s{MUn!A6m_IqB8~j5os0`#m0k2N>&RQwGi0X2z7YgW z8YG%wx*TXk;>3VnUeaQ7$8O5^#B|BEfDN* zL>cq>YdHi9-e~m4@BTK^}@GE>2a^K4wG4oPf%ye3K zmJdcqN; zgOJG_8|f4mm?)n`bA8wsMt#9+*_X>hd-lq6vg;;F7BAB4SDxXc%(YxtSIJJa5=h5_ z8V}*{2HP*uI2QQ1jgE9iUHlH7`TQ~t$GIi=V*&K`q(VI1oq4)lrPk;3=9X- z#P#meB3$B?FAs6iD!^u&7$i#d>49+wlob%y26SM2+=$$^8MSVjr{8S z>DdF+7E`V%hOr!1ih-0$)!4o8@bdrzB8UX_?0=0}-)SQp;Ek;s2Y&zd+OQnNiKc#f zuY>z=DOyRe?f&klxK9aGnx%X94Wtw~{(2XQ7v9=Xvd1r59vLDbUTJ*zLkMihHIK3c zh}*#NEn)b=v9vH=`DG?>SLsqE%a_2Q!Y{s>q18XKY8I55CY3lHrO z-0V?J=H*K-gbUfd&RQ6sF`lWV;pq9cpd3Ww;a+MwNan_?_uB)Dubj^%Kzj|sC~62d zc6F!k`JG_o6!LFMN8h^%K6bf0=|*Kwo9aq(+@dT9n3?fxz`!Ji&8~W`E3PN;w)Wu9 zJ>wD}&Nll;j+W@FC3Tbt`?VL{Oq&Nawhw+s-#Vz`8!oI|Woz>u?Ys*IPuvU90qIWk z&2$&|Szi@9UyFI}8ZnoVV4)z0gT>`KMvvUO8}ai%DEtv6i8+IB1d&)RX>Y1FrO#nz zd>NsWJfUm!Za)7YL+*8Po1(XKGPnrw^jb)fc+b}>@ronz4Vk}xFNXHj-*a7en6yh} zd!d6SEK?#VrkEM`=PqAJAi#W;!jPT!mT2xgQWTR2CQP(D&)QKVp9HurVFwO5Tu5UD zcw?rY555yZuxD-gp2|Xfc|;Ugh^HgCDDtt-C?pVFdoXX)cl}Ir0cX6h+=v>qZCF(a zWXl<3t7mC6nptZ*BDEUcAyZNb@BnAO)zIA83u$!XwAsPWPgnMf&kqR}2Pen_u|TBi}6KG~>* z-K{)QR4z+=s`lvCEAQ+}t)fRGM2=k<%%>scu@8Lvof(A&AyDdvJ%r*h*oaq^RYU-? zWBX*YUrX<$0K6We1wUng=O`gS>0)vnlevJ>{e+f!1!EFsbPw;u?8WPgT^Zkfuu{{c zve)VNZdtmwCzOufdzx|ga-9@`z_hk({a`7pyahI-)X}Ecs?PJ`F+!rLbP=*ab@`o{nnbgxK(D_J1eXKMWEmElo+?zg zxHv~83+CRz7XuKPrr|7`rmi@)P{8hAwX0D2%IYq&vBb0EtlJkLLOFp6n<2@2LRb%; zd_muw75c)!efw(i#2fgo@W=@$ac?5<$0?U68MVbRB}v4x@ZUe{2mY=j!69e?oU3z~ zjsSJw7tL6A=RxyI#4@`{yW4@D7rbVMR@0a*N0VXwR*CQUy>4U&%ARotjYv?ne~ROK znC{3s8SNxN(nBV&TMyO(2z(uor|4WsJ)J=}(93ou@Gq?knwTd!$(tXuuaUkmg4cER ztF0#lf=KCmBMf;5R;?Ddyg#nj_d_p~)@H$TZ0;C@x@85rHcm~Cwf<&5raF2HETUWTHEJYe2=xbVTOt%^;eL&4AybLXZA%FjQS6?30MqwkJMLwbbPCW^hkH>#GKQ%c5iR7|JChcG(A@rQa< zj$5geCSC$BZi(v+G-LPY8aM>q6W(V1_=8SwzB6WkQRksj{hQr2PKwBngdG=1hf^*2 zo}L%ra-DI`Vi02*K4L#Zc#7$zPs~ImbQSj8r*FK0^QQ-Sibv7AN<%nw^POj7bRVVO z`l_-I|4@OPbSRHqVB^C@u=X$1o0AR(8JbBN)|-t8A_SnMozxl4F4CVNOhmh?VE=w|#zoUn&4|4o zuJ{MXEWDfu-3AMhe;1C>o_X5DUYD6ALb^`_+0WTjS>diRXgE^@!#Y^G9oA8+T$etj zi^e?%y|MJlpY*XUV5R1&Ue0AE?ZrAtM1c)PJNW&ndM4xD0YI%iTSxlRbh}q$81dfv z@zJHV1P0wrbh3f9E`~JmH?tbHzH~|%^t*Z2U@P_QJd{pJ?f8T4^7=aomO!3+y(A)j zZSwaY06D!Lw8g`O+biumgjbkbfFeii+E-EP5xWT#y^ZhUc!eoO&?1mtqx2=2>&%mE^@@7F(^ksBawat{rX=_6_kgw6WBy>; zJ#5^fVKQWlQz2(OLqC7ax`ou?4_z#_H<0@ngbAMuaN;V~!a>(gGg8}}=xyDx@HR4? z7#6oo081aZc{gn*rlF0L!=$f2#4PNV3w)Cnl7up=@L=YQ{q-NfYMp<-XHgHB(A#%$ z%OqI;b`0JFw~{ghMe89z`|!e#K-CW*y+u)WS16q`&5MK`jD}2lUx2wK1g3Io7CK&Q zb!tZc7NEG|+ZL((e;?d{8i< z5|7+8m^%cY$bi&m@f};77`qtt7U44~&Rt>wtRx7T(UgD$CvKu$T92aaYl3_uw%-Y_ z{7zg-#M4DwO3crXaUbK(oM_Bcd;{Q#IW)BM&9eh9&exWDC+SkMbO2qK> zG3TbIQ!RlhU-4bROBA@yU?(Y@8Q%!+%=r2esuTKLVV*~xfm)d){s-*`>DQ(Y5LNff za+O3c+}#jsM!qOi>)19na?ZWylUDgC8R2-dx8#L*gIYV=WOcXA{si*2;rfJ+-1$~Z zM}M+8Dd-QhefDC3;T3(*T<6`8`b8u!$6JqGlGAOV>#}M3`N5KVxl9e=MshSAiToSf z0!kQo^i-41yQQU+Rd#;*pFA#58Fl)0k812U&_@qz*>?fmWUI}$503In;})Lbbei|> z>m>i{O+f?gNkRaC4U69R8Q`I_Iy|_Fg(+}rffqJ|ZG_-uiN5L9<1xw*$0^#gu71|= z*pn_VT@)u-nen}hSB5x#o2oae6Y1o5E)@Q_Fz2PgvX`rJLLN~*53-AAm4XL4NGLdq zG`WiPYgRITie~9XB@5^5`m(A6v(Q30aYl{WMOjLrW`7oksi&e*0(DUL9Qj8~G?Rg~ zPKznNdY@nK_+#F?dw4-qWxEtw`|8slB4I*4I8iY1dffH`L%L_+jtC)%vZ7N1ladn2 zTTIh&8YD^ecD zW|Ox9?egD#A0hf?+Vulu#9}(IsTC8O!`>%GQ>Ddwn9t_GT@M~M(~O9(vTlRVx2fDu z@;DFK=uV6PG^>S_8B6eownWmN0eRs*>Le&*!k%N++J%n(J27byc=>+Rl z2>`Zjn2U_F2vL>G3=r-1;%$7$!*%2G{4>}&~ zg7wk9#dOp!&66#QwcQw{yKiE%zp6~Dxvv*gpV<}7;FLJ`>is+VH%SSb2|dz}b0{Ip zrt4nMZh?#WOvP;BN^}3v0K$H1A?AnnhnVMv%P`g6>iDw4Bd2tVJ$t(g=yY(Dc+?h= z{|Q(l{N-mc{`xq{OgfrP7=%BKW1iW1bY6hpPW@3h-P#UWujrcKZpzXGz(hT8E`ZPF zLyT=e30)b?rMba>`fkDwst(?T3#-*HP|r{28$QJ5Xdl%XLA;{mHC4D596&rw{AtmA zwiHYmAbWcJ!>@;-55ShBG9}_@M?V=fXj}w%UDpNgyhb4>@5UiOuF5V@_v{Y-jCNIs zroiEf`@a!-Rbij-j@5nen1P--u0AB!B zD5D!s75le*>(fKd{!9hTCJr9^_4{SU&^>OPM*x#Og9v(MLL$3jTY)2iGWG6r#E{8g zDC1VCP*)4jWR(Iirs@wLd!aMO@-)QdG;^5tqIe^>6sCFZQQ1l@eXw*F- z=uV0Pl#5dJC+oc2R;R!XnJ3s@%1S!q^W}fZdjKgZ-)ZYhW3hw?7V@T<4CYZM)SPR2 zsuYe_I24we%vRYb_ZSS@#8S$1B6g{kZ-lZL^*?)!w);H`1eb!JKY|pnK3wxDdgEW@ zcBx$jQN6&qz^S8UxR3h?9LyATSHEwyW`+@8?$F13Z!ri$T-MReTeg0BqnX!pm2=B& zm-`vyE3U0QQ%<&N!TDs8b)eqfCS)tvz}}kz^WX{ko-uA;OPLLZcK7O6;duRl4|l0- zH^$mb^gmc4#_09;QWBM0j6br0by_1OPKw8TntvALY{^JGTo@*RV$j?Z18QXavAppE zl~enGd25a9)0ev1ijC`{Az$|RmfvB$lX3;g&wbHn$Y7}KNMEO9$YWs6Z|nbpSH#5! zyV2IpkU@HSAJ{_P+1Bq(x}RNlNSkBZU&+14r_=bdqTG2-ABloIMZyJ9%x9afYVVfw zc&1aaUY@U-?gH(OM158J)rQ6N>%)${u17@3vH9uqIMq5pU{c84fC8k{xK9EA&`FE$ z8vA$yg!22ue2v;?=uSKq0!eOwv&BPiry1LNdrq_pu*p#AwT}aLoQMo+tRDbeDW}p3 z_gVZZULlmEvp-VbWa1Yp)ntmuk*pA6Om3DwN#^D&C?%Q(yP9=zi%AHBC!znY+j(h$ zSPwO=r3~#4CE!q06&6l*9?zbCZIh#xTj;!}NhO&!S)@Taz{(e3pZwMpmyJ*nOgOEj zpYMp+a&phvpsJO%wBvUx9_1U&!%e|nu}X{=160$~bmwL;l3}_&e!{NYV4i<(r?qdED%(vp2%5;CBEMUv)egp+X1FT%u%?(mvuQvKxlw?l zj}=n6518wtdIWhN95=QCwzT_zg(ebk({?lk@!bbRuc}l&5OQ>?WCR&(f1ez_^TXSr zur```;HtIj1PR^)q%aO2W6KAnfGelFwyZ#AM_Co;*yFMxc4BX))|`C9F<-5uyPxDV zdl%3Nw`TYeka~d8P}OsX3yntPQ$Rh;@Z7}FiE|kH=J_+iZ^TnVguniFG>+WO*_|!PswIre_EOVOT z*{e>1aXtNdN3%VGBu`%O>zn`_DcX>{TX~7g0dIe?8*h^{gGyOQC{wMLBZtvL#I7WF zYx!kp{#u_y1zDl!knxaPApbDk;^{(XA~kqCI7p4X-n>nhoqr^|{Z{r~LZG`jSddD0 z!pRc=o_0CfYcC9aAW0;W^BwChORKtrK``hm^`-A}e}PKg+{~y^IfM4aeP<5}SR4d9 z0GR^A=!Qq*){!36S8t+ftlXS-V`H4SG}RraK&?bz`G%08r_%MhZtDJ}y*>VkG*yBq zX1-Y7iq6Cu3PS@kNKp1jg+maC=6ZDs+DLV~A|(@@GCkQ02VVg`W4RZ{0kG#FBHmeM zsj!3}c@+IxGd#Z-t%8Gcl}jE`1XmB9uxY6Rw{sR*qf#r;pO9m4^PAE*jc6STHl?*e zo^qWm2*FH7pt~_8q+ZG;se-8+?gmj3iCj#z+|JGQqy61Hil|YXnh@y$Fvn6&toV1NXhYAn%H|-VpK|#h)N1; zep=0ePaXTn?B1Q+;TMtg%HGDjDg@+bhdM@+IaQi4b($cBf!jeutOCiTYrZ!~!=G5- zu$|@=y-qRmTp7+~ipY@p{54cMlkjAFaOPA1A2u zX2{991fBoPy>pWIt`GK;<`NyEr7v>V0gp0**u3&ya0e+A0ME&{n^(@x#?*}mV`k}B zkNlopXMaB5cz)KOBPqab{LXtgs_lnhUdPbZ=G(sOgzJ{aiOYkuje0a{}4|! zaE|lRtaBbJ#Yow$)Q$zg+X_#|B{?!5ZPlBRY8*l2DMwgF|EcN zfR|u&XWiUs^D~KTx*@YuCIms`+W6Uyy!X^|Wz5SrUVr(CrpZ3%9lW>kdDZki4=l;Z`Vu33% zJju}Vxc+FtqzR1ACM{flv^rqnIY9;y)8^l)G|8gs^K^LlL3!rYI2n&_5xsJ4Z(7`> z%ajB6iQUh{t9Vl%#0A97^h0}D{p}S>TyHj>3vXmQVd85}(|Y_F1P-%SDy>Is_g7r3 zoZGI*Ko?gPu$S}IZpS7Yz4YGM`9V5fAqwp#J*#c6Jbu@wlVRBRT4Vee^z`U$eL-e% z9()Jhps5!b?Ftz}EmEO?i7t8kVs23eB$VEpPFBw!3(=_890VH=nZyF%#FHn}H;(Iz zWNY~)wCA1k(|7}>9c|t%#>{!)c}#6QUvpTQlFLyny!iqeS$G88`gC3No+GD|+L8&! z;5!v&llLcUoC_S%9oUHZ@n(DK7CV!9M^rg1UOVbOp-@}nwGFS{BSx||QqvSl<}LvJ z>~BSZ0HU>wILd*$&@$}&nCP;O(O3ZP&|0;%KkpQLAV*G08#(E>UZ7r~ZS%|m zy##OSxnTK9!8d0+<72yp^oq7c<><6#{~eR4T$CT7V-;Zs2Wxvv>AVDhL5PFfamKT! z8aQ>#5(+uA$?SJZU96<4%*uI3^5n3s%|L=fb7!9&K@ys4kQjRh01=X)Crt;_;-mdc z&hmwS1?C+}4jP(J`!YA!tq&zh3F8B7-^e{qL*jW4ycfMuv(3e+LAWqbEG|EQmnH2H zb-D*-KRU13m_I$T*@lsOyofDbn%n0!ci^K)KKKTn|F!KR&y#dpPOBshe(z6~^&?82 z8zaTidXQ;>CmTMy>{^n*wc{BoS4$J{;W#_ZrO(BB zc-3heG_JOSF*&@M%4~br5MOI-e zFocWU*&r}kjU$FfFI40jTUA2wERE>rr*HXU)qz9w{cbPOwO)uP%Cst*89fozZkC~n zwWKuJhaW2lo*n4#L()Q^gQL9iLG1pKJ^q5Wn9 zuakjM5FIm?7)V7K$z>hd#28KX>$UUr&t<*{`H~ShWIK9n`JyYfj2ckyfUJVmXtiDm zB}0`=A4RM{ua-R9OND}6arZ7!hxaQ^<3zeAD`V{3&bvC?Am-25iSOzT7(NhdgPmJI z%QBr31xx}b95nHxievPi_KF#Qa$ZA;Ds2GnwgVrU74WtOxMTHXUerT+$e9$jt32iQ zIZa~A1P%& z0&$5wo7?d&?P!sCf<%mIVzVo3ogYE(tcb^+cbKx*v_(Ouvk$HUYj=__-oaZV&{z|$ zaywS=Lf3j!H2N4Q5|+TeEa50m(ynN{;U;DY9&b{eaUAcSgnHmY%h_9H>(z_~Ao(&E z)pQa)(edmiKG2z$K~QI2ch3Wb&NuaKP@|W5)1F>FHMC`cnW6a#X*u6XI_-;>*1OHs zd7Q8$uRkO*Tpy`dhtSskRn}Dzzq8trVX*;~=*byojoRZ=IoMI{LhRZjW(&1n<2_vN z-?b|VpCIZSI^m5@JYCvdz^7%8Rk3m>m#w`xwnsq+tiHL(i4Z$QgpT)Xqk^$k$S?)ox4e@c~ zv%0Z@?83=`?6@@_2Qpt9A5XwuDY}B+H3!1j`pBo{>m$?-Q!&j&Q)5H9=^hON0^=_V zl0WY%wJU`u@RqmL)HuXxWFee5VG?w}CxNJlkSF){cuj7sZjUwW$6Ul92KuUjQVWw& zf2P%H0catwn{}2XUDx>Hr2k3A9${_zY-f@sC+dTlYQ=6V8?)XRmecZ5Et%3=y59JS zkl6io&PJH|meP1>Xq}5X`T64TEPJ*}ezjg`AgRWlv*sdyUyZa!TO@ytc3k!KTKCB@ zpF-qMvs2rRW-1||zCaMDEdmGk$R927$%40ll3G&mvQh-OLdNX_$jxe%y*{lId&XM$ zQX-7nS7*fKO2{sa+^ zYdXDZ^;Z*&jl9uMF`0Lv>PaS03l=P%{waKID%(Yyo zuMVE!@MO8OeXRm{K8(bWjQw?VXOaN}vSpI{8R=*3wEkobv!;UrnaS$CW$~JQQK&=N zF^234|K@PfO`i=B$tRNy78V1(+@&68or4nB)_EHeS`;&6od$ZYOX}q#w7rb_P#DfW zq1|BAVGoKHZZB$BG1!RpK_WHtHE`P9|5JT`Ywza8R8CD>KkXif)0@G9!^tU^NIexB$N^HhTT}o*h|6qESAqgy;@-@8XT^3o!RqBCFuZLgHjidqDy( z)=I6wrleDqgmMrR#QwCg4@Rc@YlZ3h)cez(V?`nF1bCbJMHK|2lDYOyXBI*a>=;Wb z_Pgx+mKbeG9g_80_RadsY(#=Ogl>+g9PMK`&F-Ch;dJftz-+%yyY7_97rw6FQSmOg zO(Si$D!ZqcDIIq|?1BHm$5aCw;sn(iP}mAoZdiAA}e$4_5# zfTOb?#-#q~#S{@=>n;zSmxo&s?C3{NYtOO$k1O zups87I2B^oSXLsoAjlyOumNMDxc$N@{|1TSvYDpvMdqTxTjiWnB6btemTvXB%*|m6 z8<2*P9Dh}#9j4pq7*v=-HMj!un#NTzrtg54=ZhAq|N0WCd4<+qh3Cr{>ArE@mW12% z4@o>uAkMVKo_hdQDYg#KZHr~kt9ZX|o2T}<(gRyRf4cIU4uNTul&Fu9<|L|dwB=#1Y7Tk*z{E1b=Mx_XP5g!t%se;^E zXB=aU;iMlVc|()Ib|yo(*cdfYTK2h1waY{KdeuVKe_^Yd zVHPV~o%SDd^#=62s4ePGL$?xL=o4&~`Iyp@)joFrF_f@XmZP-g++8TRl5V%Rx*V8L zqc&~Dq8BiAomRgs{mVK-%&n*U-KE5SI(BERB zw(Bvwr8U2#T+WhMfY_gP$}m{fH-JF;WLs}y9mf+`vwrN*veb5)##PqH((YRhyU!B^ z3m7>uyJ;!q2z})|gJeBCen+7D)((4z@__xV{X@Sg zx#zs2msO~6_4j=wdv#nc>xe<_c}TH)J6S4aM5(Y4gZB7Q%RC~>;nE#SA8hsnKTf5z z29xR%HHT;yIzXfm7EUiS$Rv1XT0QHb%$awp-tXkFG1^fma4UtyYvI`|y#+#J;L)ex zXVgW5;F~0@<4f`--Vo+@-+xhRhS;J~Ms_l>J*0a~_1ISP%&0d<%YD{qSvlA3 z1bcJnS!1CKCfkZrv9q~pk#HBroJ)IA4F3kXCDJ^&*ysECPo9>@PacnrSlv#SE`(x{ zJ2HzA@L3-|9>Ck})?pQwTvvmw6xN3l4Lu^3GWs%?omyj1=GeifT`mXxC}VM#YGMU7 zY`*cM1w44YLS#2y9b&Lm7!N1{q8J9}!dJ9Y-8s;e zE|#~1$Um~Wqp76@j7|ka$^6KClMi$&Kb-SIf&}1`TD}auH5;eyH672;nUR4DNZ7SH zFdn?jt;z$puU_UJ_{;2M7ZV3>{v|G)I(4$q8$_EW&)Jg@=O1SY7xpGi2pw~Y7g?I{ zRc_n^+<>9a_g0Qcy63V7=9g-x9NrjF>es{u$i7t>MPIbh(s)#AJ|IQJZe*;IW*=be zV^lcF)JtkPOC!ZRQS1!Tmb6D|%t4KDqfVt$3p?LKDEmvTytOh{56Up{nIe42oJacY zp4aq%xc}Zz4z$$Pjg1aYvM+l&`qW3Lm!bqEu1TR5Id+=lG&0=^c>Hgk%7&!{Sg2Q14<=D;0>i*{9rS>x|`AKR@|? z6Q1eXUI==5Y=VtR!g|HKfRK1V0KvU?<%zM3aE8WATYDy(k{$2 z`sRms>cE-n0?%=X#IC)>X;n=gZO0L((}`hr%aI@v>`u$c+#Q~YjkH9GWpCv(TL3g8 z^V6)00o^;p7v^}2xrbuPg$8*mcIWAdiF{VI-wi|(s`t``T0900Zjki$SfLcT^s%mP zbO(RNsEW>@JFqX_i5ZRL*^+HreR7v>Da8X+ud`*h?nJRB)**Xw7HHDcVV`#|TA?}9 zzp>fEH#8ru4luOG^6;y$DMH0^l@p7iUFl2jgAFxuqIQTgMRBaTgG6M1zBo-2v}Vsy za?0g^)XCw%+{4$Su3n$1mmURA2iUxcyMF`5Qoz#&_kOPcub^86VuQtqA@mgT3|y&c zl=^Pnq1@)cBr0UgyBkHN);0;e4pPstT*S-GM_0?L6IsV{r3I@1tufY04D!bbI^qOr z)8S&}YYsc*385TjDlK~Y2-euE1!{HOG8^6p2W_RaVP^tO+_c$Y@_B0Ca{bnyG44ue zBn>Jeu0HqSxskSG7{D_)<2vvhn|`fepIeLk4qJd@?(o`a{Xu6pL)ooC4ExTM>M8o6 zqS2%8Da+7t*pr9$obE0Q{%poCq11-aeD?eUuX;F!e@G`QUw4>xJu90f(iGm<8?}Ln z1BjkxVb15u^nEjaM!Opr=SQ$4*-Fn13G}hv?n#$9d(5TCC zdh{fP2$%_zrQ>16IH#qgtNPl2GQ z!j~wQ!Np3eN5Z=FTX5fD0wG%!`%B!%0oO8)DQBH!SSzQZe}8B{a8m0$U3RFkGxVFE zgq#iUK~~LO-yjkO_^7U>$0!ZlFC&>$jhEKrE3}GxNQ;~c%(Gzu-Xy$JP8-Kn;rg&o zLev-jrJghds+qBc|E zsK7Jiz=-tN>4M7EiBShJ59KRqj?tV=2lkQIUud*Py7%2GTt`|Pzpx9kS(s$vI5D-F zDn9AQ?754wkRNYx+Fg(kn1+GK5Sv-&kmN1^`<{DtiPX$7g2<6S-d5}sIH=fghHlc^ z@?Y0*cu9Qcz$Ug!;6|8WuAAOF%MBs&ui5hTdEcFl)VoMh!29SYlROITy92@)7QQ!M z_~&a-;_VK0YO6}-!_q{s)pym*&hC(9qEAf8W+J}fF&E+ z3QZKwt3jOft6=rsYjK;b%YKsQN5I(1AHyOJW=zf}O2nMvj2UiLi&F0F1ly_OGpTFS za%th0a)>X*^1%SsPqNx>{UK<#*eZ($ne0jXAihv>Yrb$HYrfqgH7<`SHanMOqtq^Y zTB&Hhp__Bkj6_@wj$3a|9gjqgmS9P9%Nt7bBpvghvPp1VD~NxbzGStEs^2iYYxr> zNM%XI`m*L@ivo$H1^$eN#HL#3Au;909tBGbJJw4q+bSFNkv(rXj12+!oJD{4MNm{c z%!uP!T8}2Szxmj5m5|Cf7;0ZztCo|lZL@xZ!YMZw+fecC`>-rg&=lcXpHZMzbziiw zd@pwBX-l4t1JiRa{Vv-rl|liZvt^i7@%=;t{uLM7mI4RKi)OVtX9cN5E=Ca82dLE@ z*GxoF=jwLEN&>{khK;+tB~X@5Qm?-e^Z@l!{ymjh_p;PMfdZ^T%S`HHDx?<)aOB-? z?Cs{DJ!NUSxy~5z9B3Xbw>2LXNW+CCxu42N#dEErHLp)pP0CtwSEV^)6MaDOgySod z+)gm+t2mPdtEFRjnk)@=@?uzZPe=MP-FP-!S;u86U(XY@h3-w;Xf!Ezc9Tf14~Vs! z!nW?NMdtr3Z&(sl*OO|P(UH6DYBpple|)L!biXHTdy_JVYdr$g9f(mTHOvOuAA6n| zpHTIkG$p0NL^$)5#veOI)yADpRFThtQ3e8X=q8Yl+OrR2bM4hVd~idnr|eZ8I3&Zo zO_ICEVe4w5K$ zY7Zg@UG;UPgo!`fv|9TTVXh|p3DTcWrHMJXb@tVqPPb^(G%lEVyfgj|-C*gQs|W9=MWuj!JcIJG!<8W@;m&=CmW z&ps|XyesowWQ^#2Pe?pE37;hdF!V7j0Z9{RxXM`1JF|7`x7vcfkwEZ!Yn8_S+&2MI zY}GAduf<6+d)16U@8ia854me`$lk62;@2^qc3@GU98L!~V>3v&T}M{d0YH_Hy;nQj zdN8RZ*)5=Ne5HDJbMM>@J2xEU8SnB;*kmPF?DN<=4pDwguStz+s2iJQ;n%E@%w6!I zRSBarDOv|dDADFxiFjQ4!r`zlYDMBb6q2CgDI|91p?JUF{(u`kjx26bv zy6`^mp0YAy8@|lKm=N7_I|EgZNMnUZ?eGY@JeSu%obRF*%oh=nO+R!XaDOZet?d+N zisnf948~#+wSp=F9`-GhO+|;NCY!sf6$T`{N79Krv~e8fo80c$3(|=kO_ajbg^-d& znY_Emp$@UmAgJozed`w`x;&UI31f5WM{2pdvi^2`4r26JW7#!a?xIW5~d@oZcqaONA>aGFi@u8gi23)p{o zmPETBz}HC@-&>|!pB`{mtyV@s-zxqU-ob554{tZ+H<^A&6L_5Y~&+v1zEk#@G z68*MEcQ>j;z#s=Q(Xa>kYG<=KuKP<_kpecZdelSF!FAyZN*I8FlnN4pfPi#|bR!`( zgh+RHtAL8MfYi_+H8cYZDbfwn-96M$L)|mq_rBk{Yu)?v<#oZ#Is2Tm_p_hp*>gpw zuyn){%0`($7Omd%QLeT{_ojL}6Vg1@ZEL90u`Szs@cBe=@6?5I(9SMjJz$)|bQmaZ z9OPPvDc!V2N)6i!=2D0PPEUo!=$|b@S7A5;w32f+0*k&||NUab;SV+6G$5$hp4>^u zhulnGoRDNb}3kf>Wh6kJ=*$vXTm&r!rg3!K)+a7skqrne=EI3+Cx~R*`lwuyk6CJ zspp1tpsH^K(fXVWs@drz!&tlAE^`bIuZNOwvceRx0JE`2wB6K^II+ehAZKFx!67;~ zwOwaw-_A+4^KsPAcjqBdtB-}mXE9E9EbJ>lIX#~H&TR7%AeX_RZIKVeo~PpwQVG25 zP2zE5e?9dzYT&IRFf*jVh{G0_UA_ch>=ojEO1i5llksYr^(PwbV2fJ2u8-?jjpo}j zzN_*cmKJTZj&$w22pWPZbw}Rvc5yF3K6GPk2_cMw>x7nV*&vP`yhEbi9%40BJlp z(?Tm9&MqYXyaon+GE=SUn@U7zQ5869JEh~Nz5Zu^Nv+ZeN1bve_8QpfcLNJhYV_Ii zPBV3Rc~#02f57UN5wJd4^r8EBXzhRD_zpmV#LlH#t%n>f z;}Q5@otb+Ud21F&qhh9p^s~b#{u1zY z*mBVmxNS~MWgGZ(TGI#ICGG3(&eki}dI2{lmK>frakUz8xE;AkJ}T^eRHc|nH1_=o zz{1YEv%C33NGrC-8Tx#uyQsep;BBI#^8PqyFtN+d?_)IL}-PLHg`4@#qPG`r?)Z z5qj^F>YxqQa87?&#jFUP42}fm^(FiN72Sa6eTs_7RBN=eCHe$R6%;K<>4De!ASp=r zFS!9@nG7WQ$JY%5GrbgggDFPRd6~YIznWJT`-Ry6<38GEqWnA`dFYxcR_XN|cm)u{K%>2VUyRO1Il@6WRpgK5jw-K+9fIJuu_zi%U<*Rf3{wtLAeG(=N%-wOUw>?SG zC2bc$3im$s0_8_aPXc%M-aT7?+zc#R-_+RRw{mv%Ba{3?MG!9MWYcZEx}CRI+vDsL z6_%eswO|0)YP;(y{Sj5VK-}Vs(N;HW}J+M)SuJBe;QF1y`Xkeu~?pw|; z!kq6V1GQ&Uv+=pH}*wt@>t7UhjUquPCFLoDkcmK zxd01mrdZ~N4`g3<0Yz(F0w3855pXIh*yUGJfck{jWCyR8V|n()CapeQ*4ZCSzj^}I3C}+|3Au@5;2IK=AsH*Bf{-DHkk-Ny0!k) z_`@~7$iub1P$XT$780?HL~zOm1LZOD9yt&WzB+K3^*KnrzF??Uo`Laz&RxVt`C4sG z<<)M83{axWVM?dqOY6NCCz&-Xr3}!>Z;zQ3R=szR_?3kvwEG|OFjMG}@j8}SG;#dQ zSTjlGch*%-5n^2HONn9D@{2mB@_74=goQ-0Ja`Od2tW3!HJqt0U^K9=Ss{_}dS9O6 zlt{1bo*A5fI=M04AR&_nX>2}1a^2S`l5Y)A7udQy7&lgt1#_tAmV{4akGCgYK#tFC z4=>-iqpeCjn=d@E@P2BA|&Jp>xR<1*5&DhA4Lpk+-IYey@5BGPS9j)n>KHtqBdUYOe{4S4p84tG0-(1%px<4(|NCkYtB3$FrP!K|hLgJ-+@*s12>!bG_ zcw9*8%>zIo`;Bn_vtEe%cC_HG;-mfj0crr1M*>Hy1aGYWkPM$DZ7ufy`rc>ZHtV=P z8+Y~EZ}IdtzyDP-usxO@%2Hw`NA}J0*`EM?)n2ak;0=eE;G_DSm{DBr%L?dL{-q8} z;<5W2RQLcuB{_MG?sbPyU={vMGpRyvcN;z@Kk2fe=Qg{s&qcB9%nJPLD^MGvVn_sA z=t@0Tb!u?~S`DSQ(s+HPpqAU9j;!AmY5ydWjut6gngwuwp>B=1GU$Mk zbDFr^8ex8y$K*ca8b9|3s4iJ<;-L<=YB3b_ok@56^>Uf8CQpiBnhjEG8P z*}Z^Sv*`Bbg-fGg#R3lcxomFMF(qv(hi$OXk&D5TzLy6QqfNG6~R zpd%%4N)>hQ+YcC=t@mC|NfGpc_QJ4^W38{)ULu`uIBEupIVZEc9RB(Rv%s&1rJvU* z#hd9^ka#BOQfaX{2b=Mh?V=1xlagrR%G)rNSi>S^|=z9nk=> zyr3kr6Qp~V@wpo%cW!?&i^{U+koa8BqV&b4+ zQC?qBN$vzB%j8P491wj}*k+8!t38nCHI%*2sA5dOsFL>z;DM{sO&E&8&~_} zx7U<{fFW4b!o=Q|2KF0IN%kftF_*+SMtZ6k*iKDm_5(h3H!Z)jw96|O%&-zZuoA0& zV$XTNKW|^HJ>|^**8)UO?Y~|m-$i>rXTgwUzQ*tU0drVPc9PRt(*5)ZIm|}=vBxMV z%kkDn;@9U_ouB&N{C4u}xP_wrb#;mMNLHmPBfJ`W`{C?v+rL}@K>p0OMI0LmD^ve9 zCUo@%l5G}w@2?nZ>Ia$${C7b1fsOG7F~G)nv-CX^9)5&D64uFWiNTMb>w;a#e4)|^ zoJ|0-d@Y#e_{l4rlP*P}0=riLS4p+bc8er|(>kYJdFy=z_Y2GP-$B`8&Ejf6_2<+= zz=@>wl@NXUO%rNgvR-h~_L3tU?JeEnl9!P&5-(VhNkFuf=-vJ3`zc5Z@pY_1W>X5U%9~cRCi14bJkf}3}Oyf7uit`Lo2cLq^)l$$5!!hqs zdmTykpP@+0V9slX%8(3VT}N#W%1;ZXR3WkyQbi_P(yz%LY9qX2X<2@F>ohusPNz^y z+zb1W4x%apT+YfBkIft0Fhqp-SPrV(t0|Rhf_1$QF9wo@ascAj6%sew; z0XJ}UuX@Ppf85Cbcs>*Z_EDG>WUMvb>a!4J{obJiW z0*qV9(@VI;h;$;4HRXOOnJdODLN1C1$8tNPnu0lwE#{4V9g_W{t>A3bU-wz^)~G>l zFjWsj)01qx`P07;0Nddsv85ug{iQa(23o%Oe`DXLP83Dr?04GplOU@dj@E3FbMVC3& z2f8EKGH_PPo8u*(Nf=rF;1e=o+5j zVFoX~dG#N#)i+$Fzby|kb@>F#WusrAP!5uEFy*QN)O?V`vTg;(2Na6X)Tuwu!W z_!tf{)vdzZWalqjlG!Z*Wqi=MT>-dJP5OWA0RYPf-T%t5*T5=cSW+0VV@`YTJh%z} z{qA2ew-5U$|AM((8vlZQ;AdP9&{LPQnj39N!|0Acm*xSST+r*sQk})g0lJBIWrf3( zR)lhm5a7qpsVMnB@CB0^dg;uPas634{l}lfR$Jv$uNMh-E4-1`IWMUYFjd}6x{}zA!fi8Nl##^V}E)%rM7^(05 zq{GH9$xQIR5h#YqGcww^tAH5H24g8kO?>(vG0oQuN*1d~olm_{tY(3`FS_UA&UgKv z54F_H&Vllr1?NC`rSiH{J-2(<^U$+s7-*IGL8I0(g=NZJE|JSJgGUA7f*}iGI$_-o z<$!+DJ!fDW&-hvynB=O1?RKm!QS3ejF=X5LIsEnymdlI_U^~MaGw7TM-|>(EnZ@eq zPr7n&R(}$YgVY+exsgB*Q8%}~V|xc2^h+Yc{Q zfD2QMm`)zfB8D1Lb}L2`YjxE5p2UI@K?ll)yu9F3N-1KjV4TI7B?EGNm^M-^54^=3 zb|2SjdbKeo4N(~8l$LqX z#8a|cB)=NVWwz0fN(bTfPH`HJbuU!`ME}3Wa@-J*K2;qZ=u=%_4CT1<-|-~h3xu9( z8=D1+y}FB9AC|n(i+5{LbUI9(`1xsYf4R9D~ zfOFyAPydQOb11iQ?RP5PHe;bdkyIT#E})?;v+4YhdLlpgq1E)y%YOd#AasyM<4-SU z%^~~qr;^wjukUba{nC~LzJ&2N%FTh@_Wf?K62_jcWDpwgDZ?X1#_&uHS&!;K^*(4U zK6~|ylB(PsWV!ck1Gx*dx&r)s2}0P-XR$kvERYsNQCHxK{Wr;RWCU@>8XqfHgGm>m zvKm;K?^AT^>b;6CM30d-<@1@WQKvjcw>0ic;@!Tp%6{X{OYD#MO#Hm6YP@L=A4_V` zWr{tbTT_goqm>gQj{N@ZLyi>gv)C-YJDClY)&_^~v}U^C`F5wNm3n>X%~|K;vz;w( z{mNbbHLB~EKi|fI>{%gSROj8*Wed?n+lcKiSHVpaxVb=9C!Szc$m6!SqA5u@)rsvV zyyJMF&B5JSf_m@9cB-3y<_=F{l12e_`V#4?ek;>{Ibi{+vJ#B>ry?Spmc!zNB# zx~F&&PMkKd-BH28Xoz*ESLl%)F+K_>Pj)b)?EN6*Hp4`zFG6UFz1M!ajZrDv`eeQ! z;L(HarC4SPSn_L%^P}{Z?8VI3GGF3tA%@+%6Hd%^HVqd;p$Er_BNME4@skFVIcQOd zg`eNEhyUagp#G+l~}`)MJK#GZ|X3bbbnMWN8Zc?o-*{1C;J-S2zUTBj;)= zW?=bi*~~h~GN2=4+^+INjY#O{xseaMZHBQ|}ICnZOU&zbPcpa?(H-L!HuMLIK37*;uGf{Cgz z;!@vak2blNQ%|O!#P5T70|j^9T2xt?ed*o-i<5G6tB&6)tLLW(_f*!Y;G_<}84tep zyLm7}a_H9^{j;5(1RLsbFXzG^j8Cq1aWzw@9KlQMfY~`}d`VWYDo&ipl60^tPw2xU zjY-xg&-f?MNRmcBv;JN{Izh0kUeRudg_>n6%S9V$R}A z@Ug??bJt4+Yg>A2EA74M-Il_xBOM>n%LP8Ma~PFOAi&1gRul859ayR?`_^muOhA5% zXXh0;^BZ7dUxSIIqby!`{I?cvGEjX{g*&}4|66)^X`($EudYiB)8+lsbZS4m`z_pX zdci4Y`M1elPd{v;Tx<7p`;y>1fGJJ>?4>_wWjC9?f=&+s_W zA7J2T7v&NeQ%^SY_w<1MROQ%e)FOPA;*e>Iq*rY@<`Z7|ipfAEZ`OhFJDRST5w_85 z%_EyVf~~R6V1lg{Cf4&l%gLSzr!jqZ8R2)mb3lq|dWo*|TU~I`JqC7Q%j@h5dR^T3 zeBuG(<+MW`bWdN3rB8UujWos_Z_mB}9{@h>J$E@_003v3ga=?Pl;-C|FP;O$|0^+z zS2=Or27N)0j1ai~7{F3Ge$0l*+y#r_;+3Em2SNaf1HTO(A(;c0!yLb1yJ6OMYQq9_ z$;QV$Z>McOL2jibzrVq&ZI+ z;2b~)$bqgJ(mK3ShjXf|}@umZ}3yqWwG;Qw$Qbi5J=(Y*S=F(v>Sa`9&=b_FnSfU!W9 zg{n@;%c|JWK|Vh~z6%Z-|3Tl2DFFsuEH{O{2uq-NPk&hka&76+&spaH);E}P@#Cb* zjzBQSWQBVB#!Wqsy`_k3iE}sA#|%eL7anA`b-dHkf9ys!8Cy3i+XbB8p~E0#@LEng z2bDhm)#~`Zk!z{-8r~3f?}c+Ky>bHm$;_HRuDA9Z#|PLv;F+;$$$C)63?427O|Rnr z%)|$hy1J=AY90iEasB(bD zc~%+gDjIJbcg5r=6ItURU>Cl>P&fUSSRkdkcGdfBzZOr&fSHwRSJoH!*HNEdD7;DC zBRpFvRY0n(j-M%?^{;a10_F~%t|^Y|W^L4CMWEC=9XGPo%1jvhIVti}044?fh1S+! zyR$xlgcCE;=K(Hs(MvIK?+!oDMrYj|1r9? z)j=q5ABt;?V&2G|i~A{Z;0!2z%z6UBzehQ4NWyk7m+0BL312~P-KFy(ls4b7YThhb zVP5%omsR#Zp&!>dlCN<9xlD2G?E&4$Z{5X_KvbVdu?DNhQ`}*Lrr3I!XVwjXrjo=x zDt_IrSZ_KLLk1P-k9JCyUH6&Vu%wklYuIhi#$6oiIRn~y@5^1G9ShwEXg$VGaBzCF zM#i1oqe8sq%eOjow8RQi1V(ad*%db%N@CIx*?hAF`iTU-HEREyNLWu9)_~qD*##3? zrk{iBjX=jN+zk^8?S&6G|2gf?-?M=GYqSe*a2#_EgbyTtu!DmU{vdM7mNspTK=f)n z(+{UeEt@j3^b!3FA4)Gun*x&Fi#FT2eR9hYfjzg8uXkM5@B{(os?oAgLDt~X%G7Vp zl9=7WfJc-_5(l#wjV;r(2U^Z(Ew2*}mYy;q;a9XW5w|DiO8qYttc2e6?$#Dfi9~j@ zB3>GABn4@%I?o?-EoEBaX=0xJ0-ke*j}E^b85IYgV?6^L)|) z-rW~3+Q?<~V^PP_=XB2~Vr?*UjyyQi{JH%^EsM!7Fl{)93y>jpYh^V*ZZR$L2H)e* zo1x+exIt=hF>Ky*Y-9XPPS6)!p^^WVwGg6Nq{Cokt4ejmjnBlQYKYs_YkCPIX0YH|9<=DBH-IEEy)-6f$-kmzTiMyvx%iA$TYKHGZJiJ z*dIvV8KvC@HZ6VEcjDyX*B9mHu9fz~hk6VuDNzI3-*N9&cDWo+->W&Qni7HoJ8SWL zjr{v=J-RRT%3Okg^Qo$fRFVDGz`l-Wt!q-rxdJMvHltd##p?~+YcykvvtNXP+j=DK zDL-p3$s)!=4qcHKY_vOFF1j4IA=M05?rOo0J0UrOzRi{NLvfq{^<~GKBE* zN}B0ojz%v*R01YBkW{QKD{KnEJ671rg`Bo(t?cJ>ydQ}C+%9a4*6?)xE8FPkedGs} zg!_g?iR(A&uN_d&ro5-N3;PZHHl#egdpApd|E1Ai`jJ!SGoiL|GFjkvqEU4G8TI7# z%fg0eN7a(pS&Dw8pgWz_`!U$Cx)GLXYms-od)r&?b92=3hJKy{mgKgUFo?ib+){cX z6j8*yp;fF)zBd%o9UEiUe}bLb=a(c-iR|6MdbOw7fUb?QBQ0gjXE(DxV(uyk@O=6knp zuj|x$<-qI}sLpaMj98+S;4FTN`(437Z3IbGvdC#I29BqVrG4Omr!3H5pZ})MQc`Id zf4RLX5h4<6#u)bd+j$Dht{gDdGac(A%92RD=J`{5bnsEiK4?TMgAKi!RGj^0W)dr8 zWV~JJ=d#Y(lfn%vx=!vnOYNsbm&y*+`(M%3_LUF?rQhe zr<SjkPdYtn@JkhPx+cd+GL==ESGA-OH9<;NHqYut`p83*Vil1&7~p zh{fk7zFS^=~e{x1IjnsaB?u7wLOuum>FVx-n_^KpNH? zIQ>bz3cP%@JZ86U3-+ZQRsY7B(`S7PtDk|FLd?0?JZ=o8t_7=J{Pxp0?dB~o{WE={ zZ-Um10h41pxg|iju+hSjjS*y(Cx8#|l;4TWugYYfhMX(Su&N)m(n8oTw5CJgJ19je z^@#9f{Ox?>^<}a75#mY$Ba@z{0l$tV!3=5!UOFE2n1)x(;0I-S)URW1p`OkIbEfm7 z(KI7t&jy*}k128zxI5DDwGmq{->Lbr5c%w`h!-@U(~K)D;%HKz3R!rB!JhjXYu7HA zH>U8@9&L@p`CsozG+mLt@>BRm%(!`W0i5g9@;>fc9Y`s}$$_R)jNQT$G4O~=_RI2E zfu}I4j0neIW6QfAl(aOM*W+ot0Tb`E!>6@b2fv@Xr9OG6ofzNl{c|M35Tte_f0@VQ z=5*&r#ohUKfS~zK?NQb7gDssU;G27J!He)1{Ouqv;2#EkFjHq*@ax-j!%bW7Yhm(% zM*D%ai4#f|KDf<<{N`u@m6NHLut@T*6X6FVCIGgnfw*Q>5*iK%H-@vK&$JCG_<>s|v7GFT4D=YhGe>ESC~IPa{3J$Wa(Fkubp$K%eDIZL+c!+B>O6q zgf)`-6RydV0*yR8lkLxPSnZ-9{Z`A3+mHBwd8THIQMZ|o--^OK-9NNdp}eZlU~toN zD4Pfob3Y>xg%=Hg| zj3^we68Kdp^Z7a{eZax^BKDKdq7SS~;`6wfE=72}wmrV}epPyq34n-);Eyn1rBDN_ z=k$qVMicA>P|gs_e6`+Y+S_2j%k~^5ZrQte8F0V9rZ0t4VJL*)=m|~o`zqDRkc)}lC?5iv?#pt z1%15(#vxp^v`M+Ka9Ee}Kj>XgUU>^KH}6ShkT9g$CWS&7g`JP`#D%jGS!E8qnS)=^?6a{gBq= z;#v;%F1jWdxbN@@JW9MK3H+0oa_(8VbkF`nGIpO|Jqg0h?@tiNl&>!ouSs@JbdOVl zeut?x;tB7)w^Vs^kRC!+^(UbZO|z3c!70_qJgtW^le3)E`!?rQU@i}E-WVTrCj)dQ z+3&jFf4Pgfn0Tgs?He=fhjf_wa()5qj{9a+t+G;;t+!#^E8XX4adA4^(djd%DO#rq`fouyrWkL?+W{sYp}EJYdku2l`0{X0?a-yxAr_4=vT-z4K~#fgxI593Jm&QE$9uyy>abt}#EoA!^q z>igkEe%|$ZfLEaHh$jFM8t<*;uD4UQ*Bt)13v3-s20woPqRDUFd^WKr5NdZxX+2Ce zg_3Cl%s`pHu>uuV!_o)KM%)UL4th3%;CodmkYgbv!5%0)?B!~C`0Jm%!^5_uv0!ed zOsCd$gLsrIopopi%JdHQiYT6$4b-suzq)DB7vLd?UyzuKEN;9~yFtxIwB@emaWrM*dU1jjNj{WBe30MDe0hZb`6GwkKJ`!Sk zG=Qn;|MxW1frHSl5a-7NQW216JtvF7!G1g*{m#O#O!sk%Ej(J^Vzj6U? z-A7V`-YjV88vx+GILz!4m;$bRZD)RRT;q7C*$GH%<4Z3V#dop4op>-RCCbIy%{+SW zu?Na8AstSJOK{J#(HEQLP4o|At-fvt=|h5roLJxk70#L z0`OVmv*;*=O%JfN8v12NFt+Mvik^xjZ^$7kYf0HGKkj z9G&4iQznDIamtQnSaMp^nA>E+$k)bMT9rO!fI&tj^Pwz;+JomN+Lj}ygZPuaw%dxSy%%MxqJ60SgSJe1hT3KYKKaJPHCvb5#(TMk}w2o>T+k2mJ<+1kTqjEz`BY< zysauJLMvw{oiPl6Ve-i7;LrZ`<@vIlO4P>H!x!0ZGqn?a#v|}ts#l(d0}~rhP{4Ek zfcV~_6{taPT^PN&JJxt{5SlFDdEoQvI&`f%S;+DC;ms~USXl;lTimw@%>!aFwtBAf zZ)0NSbl!O6yB~{57|-jE^HW3pRQ=5lHEY&0LKS^zK5SV{o$t!!nhWCYWVxky4e@aI zqRL0ha8rPm@7@!1t>L{|*AzP=^S`)6+ianvEWqBO6 zqBd#fvR3pZaXkrmUx4lMw1n zb$oQLk?HE*dysh#F`99P0cE67J;C=XUv*IB{(nlA&_?v z_NJoyXQQ1+HAsxU#?g@;>;{b>2$D@!v z0t%kEI}svGC%}lPiKQs>t8eF`6o96#<##FB8t9HTd0%fk{TC$ro7OU;@hkEI-OXks zy7q`(X}|7m)JR3sWX?MUOke9}c;>Z!7^R@{9g=%>0#_ivj|Kv3=7=kxE|~%i${hjwbQ?5<$<+dyHxo zzt$5vseC=e0H2*@w*m2Dao0g7aNRvrS-%$7hY?I1DEEf1ha2o$q2U)G;g)Tck6ng( zBMH2-UF#Naj}|x7<P(3fBREQcL~Fu7I$!o?+Uat?uiQ-+8LTBd5UDh-X?vFj0(N8M-+^(+9UC zOac<93hknxDPAx=29NU9j{7wLSys}AvhPW3j)6-$%mLz*0>8o^2ejK35!Z7Bz)elh zUk!{d*G`G;YM~{Q1uq>P2b!+1w7gZ4Ih2ryeh-C93#S%gd^!k->oU%!Q+zTI@~fQ= z2(fB&W_SPeTrQixe01>*WO;0|6qR6vWtVfx#kIlMhm$Mcb0L3HyYZ5~y>pe4z$3)x zt9t*-qqg=Hi|g9Vu?Amni8gzeK$7UA7aJd1?PKda54+xJt^8mvt{YN+JL>@*bKof9kV6n7?gw&(Q;QqdpF4(zO2{ny)78vwV9-;aW?ezQxqOkhCk!7 zvMLm%7|qvb$avR*8A3toEo47#o<>DuEKkf!8z~8r4!J+(7PmfPI!5~r*>4wq z2eU&(T;)fL4H>|`<*p&RdU@v-`Dr#Mg~qzF`B{&W8J}9@X2NDt;{gpoqHiXj^v{8> zOk@0pqxcH5#`pE{rv!jGO&Jv+@LlJeg}zi&SdxO{QYSRVOx3Hz2JCTy{3^q=1c zhO)hmV%Z8)`1h#L!N9&wu+U?UEROXtlL*DZWS%;GXEnP$T0fU9a&z!yR2A^&en;qw z&Rer@*Wt{?WBIn#ojPpZcUz~2kXYi($&}z6K3OUiMIr5F6GyU>>T$s6xtT}dw5IPa zCGd0%GuZ-$$?Ll+I5)_^S!&1c*g3|0mI$E-rZ(6LbGkG)(e)${2KLz^2x?QazBW6- zZtbycGh5s1zPYQuxDz__zGNpL8Oz%@8L^q!tg>99SB}HR`3^VbWDYL$s}R#ZDwQG7 z!r!Vgtu-xuoCFIe<29=Q&5MRPw`-DdP%Z;QJO%+_0dm+y}euBPhEzQ)O z4a_vkVkp~ZPxtNkjR;8b(Ube$GN^z6DUgmq=pK9y42ZzIGPmJ(d;nqzr8W6&8yNm& zzy3O#)Y~K6V&ArQr^>GJnZod4PzaIl%%16dq2E!E63e}<(b1?XF*Y~5X%t8wVxSJ$Y8z)PsJzpHm8u!fL7C+zpo6l5sd(3$kgsZt&0JubS4I|K7O zmKk~aQ#hyonlld^yXwBSiyM*!vm6d>^mK`5%UmHH6}EH74SLQv3w0Vg`VgdKNGaeFf`xN zHsSB&!c6n!WOCOFdIbf4sET!2k`-!ju^`|ks**~?9|{5~lKxMZoNdo$tFf)f$$guUoV14cB^N)t%P5c03P-v|!UvGXL`(QK!W}`-l5J*O%=3 z7tY&1b3JmIP1o48w>Mhkch(1YCAG?dgpSi|^819-LJ+Agqe-JSYMNKL*cQ<6a|*jg zY)f1PZ8CG%nCb>D{n_wLDy#A=p(E`JRhUKt=*%h%I>#P8<@F7=yJ#29df)tUqK}0T zE}FT8A~>nEFOD)*i1(Ov>Q4o}EsW=pe_;@ngVkQ^@k){6o99QdJ5v&*y~bZ{e`hf| z`G1WL)?d%xN1=>z8u;b~BVp^KRJ3fh>lxZP&Gq=&^59Hl550MfpA2YRJuUOnftKp@ z-Q(<;w25d;oBDWa7vC$wPt^OD3uBA*sO|KtF+FD`Nn>2ZX}Ls5mZf}4K)lJb5MpEi zdgd@HME+bK*1r6mf|GUH?vryr=>PUjxmox_Byn6f|H=UlPyh0NDy!w& zgGE!t)faaF754;aT%2V?GmwyWZ32?@F6e*~ue(;%VsDB^|Z#H<_vAK=d4fmMc zZrI2XkMr!Kfm#GSaS{A&WPgv{*R>orPIl4ux}UteIdRniu$F?SXxap1W*%MzLF~I%ciV&JJywImEDlqp=Y8WWM z;VDlf!HTc1lx(X@xco)Fd&K~t{R4a5-L0^2rJ??RD2GXpz0}ZOms_!gNq*TPxVmj0 z!y2D~*|Uwptq|YaS3t}7!=Lz-#pFpH;GD#QSsTGe;`UJ)QrXhIlLy+dOwpp(M}GV> z$5-A%8x!HdHFiT42`)&hzC)C5s+P&hf$2os;I1>+^6zMO0zrO;6;XOzspnrXS2xiu z8W1f{YslTl--7rdh>`BV=hG1D7eb!-@V1m!ZfjZ`E2)l7yhFGA1^%J*RQ{(%O`LyR z6xINI!F)1f_&Ys;&|lZQ|B8CoZi2?E4~=14`e!!B)^ar1)#Iww92jTV{|=dQ5!{Vs zj_>~X{fxcX!ET_WRHIl&9ZdT>b_iJ$xMhRE@iaGRv`I0|3DvTMe2WKp@4C%`3m`}K zO)Jb=-2-DGWF(2^J&9ZV<~-WNGhSXN2cjW55zjXV%eDTJ5By%#GW4Uy%oo@muVN|u z@Scw?#|sbj4=UbB=3RFhZGko0r)1bLVw>DD^H?qwN4P+zk?Pi@CF29i+Ah7+Jg~Aj z(`ZPeppb4}8JA7+!=tLf0!>(?NHN)N++!_cnT2vkSK%oq%T^p^<`k-59Q!Luvpihpm9GrWhYcl%5 zxg^J5h4Or?ak(SPURZ{~)QpnEH%~R9{5HznH0>&1(_0)(Kt}waKdv_kGl9+TWc)IF zw($1Gp!y8L<<6S}&+(z60b3jh{&!3T+k+S||D<+X8ohPd@gzf>NDv-7qW z*_XcBzJ^by3u>~AR`UMa;L1sflv=&727Vw5tVnJI(&4XHwF-)kQ`cqyM#Qe{di)ig zVmx&w@c`8pO6c$|YxP^!hSLxIkYn_&B#&^?`Lut3rxdCFw&xqM>(b3*)K?dySMHz* z>1IBgP>Y(S3=}y^!vv9H2UPWZBG|Rz=+%n z>ZSGu#&YxC?;iNR{FSP}U<$Og9WRv~m#6GkDn;13fg3MvxT^htv4>aV5JRE2JHTi9 z-4yR`O**h{{>@G@e^Cu?1}k>&^Z!CP^k@PBEC&+mtWpE&%e#tr;~!F%fX zY3U8-lZqSTfTHjTVIj_ZQ0Q#5&)K23$k<3qZPzf|^q*q#{a&p=*@v@`7K1q@iR2lF@%biXE z>582?@p|jS^%aorg&moWXzS(z#0pHTeuEegw;rYs`#{<49tltb`7O7dtlnXko8gT{ z!z7(4=bOKB$t>1x?393F=K$dGZ_eGmIX4D=`r!0k0BiaS04qTmM)Ci0g#Vr6hIJQ& zK-GIS{M%R$K*nS|?}EiLnaySO9C#v_^NnDA3YLv+zqWT>4!H6)-H2}W8w=K*D7YNU zzcLx(bDdfS!$73;iAS;5s!ry*NgC}$3#aZ2R&F@*g62Xzpu|dXw}Q-LsN(;y_uY?B z$N%G8C$mCE3Y`_%*)qB`Wn^TJG@O~}WJJ2Flp-0~v$D#FE_){#*~%7?l^q#b-{LM48d%|U)_7KA^FisW^q>+?+>Orp`9 z{q9~(5pwpy@h_+PYd{cFnDww{ZT98Ibx;Tu6?XjZNyHIw zxMTA!I1NC(#WM3wF>-{m8Ym}*FAkXpabnKHMt9C248fy0bhpaCfoyj_7_%oQ9&YH9 z@s#1<@3VPF3H5cRRXZ61kbEc<<(2i(w^7Sy3;#|*$#Nk z-i2;mSsAI9pX<2M(@A&o{)Vb9!&4VPy9{NCs&T_ua(YpPT*+)YK>sNBFHPH5@vOoJ1ZuZpksm~p4 z*PNm1!Owa04>2SS`B@m>`p5el-114PTt6dsW8m zZ#Kib|G>0wgUstIn}ch4NZ-+CG5LR4lS^msf_gmNo0S3plQ4pNnP=|+0^qbm1_4hy|#8T#>`yT^2z4<_&h(SUP5(Lph!Lgt|9*{J7G%LO+BsmPIn zQW|hw%**qKX(FLg2ygv6nq?JW_9cJ1QKk$-(%DAS-@nt}Q8B!C7#q7jS!}khdhcFM z7KW+-XR{Ub_`(URu2i;ght5oqeJ9raK%IX|+Eb5tw*JMYvz{-#jLNU5-C*aQEy(z~ z1>Fk@6{I_+Z&tG>JZ88tKF6e2c7I;3`3@S?bL+j+Zp3UbfX+z$v^^d7<8nF2r@mW_ zu{`HgIa{x~=v-TP=2m5Ix>4PiQm@s1DYuXOt<#^Y?0^~%hBRJ0ZD#d+;R5?^^FOm} zW6qp~yRjYs(SiAr8%m^F_H}U51s(EjJ~WX~l^DaBzsL%20Ho2FwrjVj!pm|`?_QJ^ z=btp}4oz*ODY{c|2FJ4cbMR!j`d2VRHa^4h)Kj)nldO{8^^OULfyDdzPXCMZ(Bwcj zuhHeZw{PjH1(Z2gdVAiTOLR1v=%{h5b6}8nZxU5MEGKha`vL1OkjOmyy|+l#>agiU zu}cNnpr|nk7>v^)d^KB{9$Sl{52S9+aGK!m7*wpKtIQ`YPM>I*MXNlxQ=3TXv*j+< zg#wD4-Q4+Pt>>9r*my6CKRFkC@{>|-GpbZ6#nS4zQW_-T3MP; zW*D)IAp=q^0LWYWP7FMO475mt(Vrmj6ZIJOf8rEDd@JLH*SjB4+Wv6eykD)Ch@`A) zE?=ARbL`8ubdBlP(V~YYbA>C-7JWg#VPnWm9O>m2wK)5oQ%UubyZV-QL(J>zf4?5> z9`YzKS5!Z|ifX%tv!^IZc6&Zn%#roJ@u^}<$MVYNQU_eOsd(p^Z>{_pR{o9+kF z3L6!<3twM^^FrG630o)c+zEhUW!fwdL!5dlBdt!Nf5Z#`!?1P(13w1a zq{%^ZAcKeYEpQbH=>D#+17@K3okt5$cYV#hEyTdx<+!Q-V!@1|xaAMGtKT?nrE4|M zakZ4cOSlk|H_Z>yLiG05`|51d=ff}NhXgjez4xA~W0(VzC)-LL1QoOIm0vl4wUnXx zpi*ptexoz5>j`S&uDBOPn;6Ni<$170=VoV8ISsJ=VweMrqjpd}%|_vQ3?1w5SV69- zKG%H3m_O?+`(Uj=GXxq2uJu7Gv$5N~weP4*u~4@rD+!%`AlEpmFw`asI|g({Amwv2 zF}^vFuvcHsirdp{7-$p}b~v5)!K0EN6RJ8q)7{!}$)5hH!>sr1Hu4-cK1}uI%bVSk zSoGHYC2xDZ=YaO~`Q|aF-oJKL(;c^)fSp+9=Hr|u{ovZ|qNN@OforX1l=%+#h}GWZ zSB6Y%H^*P^OE&OF`CR+O@wMo8PufIOR}u4H5D;QG(aF1zncvSA1SWars4De6M`l3! zjrFIGJf!fhbyz6afbuuZk|y*UuYqd{FCU%-8zA6)iUNRYLib^ z*RKoadu-DHO!7K24Yt*11IcFY>UKkytu%}iuk5`9YTfXyX-4~e?xtOOyKWq*V!R-w zQsn_I&bbAyJuF-uHT(HQFd^iSQ%UQhRld_*qawFE9oYlKLho;@&5z1djipALrAypa zYFnkI{_s%xr@w@Ov((DhJawApPE+a3DRmquYn&bSa0=T5zJC2{*7{YIYX?7tSc`zb zRm}V9*PW9W=8-nW?x4n6M%(V-i>hZ$FXQ*Ao6O5N23SS&THM6gyG|#@49qzR~?QSH`ZV?`Ezo$d1ClCGao*7K5=Y_7=hB_(z`6x60s+sr{~XXZLa6<*sbw8djp1iyPJeotlBsF zg3z=KDfcGFs*jF|6<=N)6_F_Y{q|?`y}K=ixpv<^I;YRxdj2Bf2jAJKUqk)fUnEL} zZoV^_?%e8k-_kz<62a^iuC1%h_e&LCiM(q+7Z?szH`r!4-3~N^O&q{V0@?f+`$>Ri z1srcQ*x46oDWfE4J}uEMc7cK`?pJps5yF6zb#q)o&%l>vy2akyFcajDhRQTR1q?^v zH*m$&>gePSt=o$S&Au(6kmiZd8|Bjbxii7ej z>1G(IZ3lzl0DRS`le7=s00_yk?5N7;%ZTXd_qmLwS$%x*AAgI1dHqdOrNB5{zrQh8 z;bXVqN?)b!I8a!lKfZ9|`_{eKWt$>B^iSI@UZ-{6kB5Fhjjr!ao?Zzyr_U+6uvBw+ zOYHH!!+Q&#;o|pYnE5|Gt|VVld$nA+>nUpByX;c0Q~9Z<>=HBGqo(;TWpnrGM?U(C zhtP{Ng8GZgKNsJ$gRAaoSEpKd6kW&Lvth9@IXYk%pU-9wsn4c3A2e@5O`IQpNq6|e zSfg_X_PNc{7Yf`K9iHu3#k!t%U^ig@>o{>?(K)&|uIkv(LHH0p!Hz}@9_CGB{%gzPQ{NL$7eci zGfesW%iZ7Ew%!DVQz_3C-8W&)bo1Bq6>&Coem_2&Nec4jL3de#x|8llZRnCe$~Rx< zc~0h~848^{9)YxBcQ~P8l)-Ehm${s=v3H_P8{cE#8o?=JO< zu%#6Lexv>a$M0sZnPk}ufXt~fPi{iCC={9_m9Cc@>!&Kj? zgTW|6-1gaG)lf9`Yx)2p>)1Fw0!%rjDO6Jxq3M5uY^L7_?sju;QK*sibqz!!A^U~% z|2-;V#f}9L`QX)sfadcZinhU?zz5ZvONqG}-Ikib`C?Q{pJAA>?S&P=+NTrt z^{cMK`q90q6N=wW$7SvF``wB)V-%I942pw4huyxLt%e3|=J??xZd^-uL0DT9%y^;m zES=GoM0fOMyYtolVjK(l0H!sW1NmR$8>Y<#I>Wnv(4y{afdb)%P?Gcq)4?7AtV**? z?4n!%EFN?7!t+sj$L}wW(6g=x)_32^>U4VaNIKPZHKy&-dgq45mRIo;a z+i?5~bWkZ)F@|D4LA?da&8{mVUV8nfzU9k4f%J0YjZDy$e2`Jq`Z-9vO`HU z+tY;ER_Sqjr~aan->*k@sft`wUVTyW*R;GKK9qAR0A=BunVz%1HC)t^|H1A>dW9EG zKUD5C6r@>lRw^=aTxWoXo{$&chEkZf%gZ+J%KK zef;(Ht@zgF59%u0o5GjQf4&$g3c6Zt>K#E*Gy+Ie*(nYG%Xb1n>)5=vyeIJT%eU5m zy)UspRa06Z%l3FH@8ko4S^v?F#AV38-}G!&fjYatWabZupYi`L3N&%>#BgN)571Rt zw{)ZTXMbairecDrQJ?Bu#RI)!ZHuG&OFUHu4Q1#^74#!0STXn&x=#M6aZNgv(y%vB zA-6!G$!7=Ka-Urr`Gz2-)Rubl9#sDOHD{1hmP?+OF&CUTZK9yH8psukSZ{f)I5k|% z`$lt86c4)s9MZr9F%>>We+rT4YpuiIPt$n2jR(nS+1V;wijuNUCKN+BwX2zccB(~B zfw`qhrRAw*5uq7kx2^w`wyzVnW=lue?^MNQutfv+QeXgudIWBynfx&B(P^>u0!Br5 zesl6AFxjh7$K@Sh9h)cs?V90MAp_|}$Y!hl_W5?pM+BX}{EMdic_aIt|Lu3LC% z>DFx>B5fpI`0H&tzLembfuf4Y*eu8aT*PjCq?ZV=H3b)M#a!|#xz!^5G$!A}6a>#- zfG`mMtFpTis+$YVpm>t06;>RtvN?8d>qWf*cSOfU`DHJcQ_W1B3!nD)=KWf>#qj)8y($WUo+Wr(r@-D2p$g&c#NR|ZJDtDA+1DoVKG#Iy38bxK70rO=(*bw zkxghFj=r3Px7a%z7||!|^RJ0DskW;cxJ~y=$fsmk^}oLz^RlN+z~36VRTkg8STaue zz%n8v`GSCj!ou8@H%6(b3I*3*mw|6X2I(q1dk)I7iK(_Vuy++)DP8pDSu+_Zdp-VZ zbw*+_z*gjz#m2kpg&VWwf`{a@%Ceo+2@;s@{`Ll@Et1en|M48AM`|GJ>3Qj-U)=OMOIHp|J*^y&OxGGFsPQ-hw{ z{rdA}H=lXRzKsfPUv&i4UAQ9Idu8k37I)p&>38oWbSm`II>X+bIK0&@{ZqEnT*Px> zR2J>=SBkMEf_ZV|q0Tf^F6iw`hmWtYe*NC7z6GuW`rRMQ2L?h|dTy0wzV~cbdRY_- z-CWviSvmq*%3s+e+yk>H^}waeNAD~&^33KAQz`TwbNJ+F1Z1^aBd3cXXplxr{ozo$ z0+?X`*+wMEfWwf?l%~%x$G98AKMREx^Nrp?K5bY@=SV0hU1Gk)o$b`!KUvM*bWu~r;iBhw!HVSkrAm)|t~ zZs4moBBWq>7$7?7`LT>cAi|@#`8j|rRSQ5pHtx{-2F__n3Fg(=Ju-%?ueaO!Wc5kP)MPk&==e!3l)Cc|RE`T57t01IN_b+S^s69q^M} zQWTxTHEDsBlbu(QbFJq3%v6-rz3=AwyGq~a@VwE*g+r~Rz_0LDfk3vt&Un!f#!rO%8#-oSszCT+Sx7QoaZ)c%OQG-cUBH|QbrC#`C%gPHaEzkIl2-#+DC zG+?U1tT&Wf;31*ijd^e_9%>QacH)&*9?%9)PRA9T#T@1~hrWCYtL94=_xw)h_g!+a zlsshGq2UMXdewckaV& zie(k0>rW13LPoAV2TyzffN>zbNF3Qg&e}itn7T7LmbUji0PVAoGijt{fu2|?v#CsC z3>krtsd@%(OE#v6pmWZzh*v2V$h_eIE=g@m7U1Kn>PIwiQ~J|-ut(QijOG=%@Zy8< z!-o$@@I+V{P?pV_$Vz{*z739H*vvQ|b)=$qIz_h)jDI1VXYc|zjTq)6vm2BFt4hF3!p~sro{GKv0lJgl_faM&S`TNN{FAnH( zs_m2Xjb0iO&(G&++>p4T-|o0y2}@p?h96Y6p>J()^p^o(PRA(mJ_ltP=!aJ3WTYX2 zfQXUa2ZwdA4c78Ij6EKQIwkNB5ux`;XwInNpKwu^fiu#G0SWYh$w&ce;X1`!CqsG; z%TGJ^$SY)!gAFT5RYKo*=L?uy@v$WI3+VySJu5zFVcXQu;HpHua1P-JV0A^`f_i||PVG$bf=vU!~5A>ZJ}{bOS?kNG~!k}yaCo3gBy#vm8LL$C9PwPPUp zWAc%Po#e-5M*`vHFYwTrQIc`c>+o|b5)LrPB$#)n|D}LQrWJf7X-^i@AT@PoQuGrt zs9cx^YaSP5w;hG0kmcxIB1|=>6Po@VCRVV*U1eNiR9tJ|A|1@n0+DIq0NTOwQ~7+n{rW?>L9Pem{?cS^Aa@z5{OILH5(FtIsUc#O zmX_v{kxsdfT!*C6KqQkWZv~q@n>zdonS|s32|2cZV#=F@)3FM>`ZNs-Y6UY3TqtV7 z(Ej7shk}Ep=&-Eo9T89MK(#PPu#L`T7PO7R9ABaq;LKn=EMdW*>y+PbUAzT5zzxxW zJMQY*lw@lG1L*+HZz#_37s||Q_R0X<1NI311~H+^sZY?LTKp2mMpn)7{UIHBe^x2F zwHu2Hd8Oc8sxd$alFo)Or?eE>YxH!IQ4%QGhR`3vl(S-A71F;Lys>#tA`+w~tYn0H z|6x#a?Myjq(CC;D=j^i^s9M4*jyx61%4SJV5Mq#vHAKs+jHX)2sWiWMhb(3Kcq{fX zI@N&;>r)CPQ$>N$hFzpB_$6SL?MKBeiq3U*CwSwSF7u_|6}8m7G+V>idNmTd(J_kqDia~l#l-vgle%=n#j zsY5$)%Y3gt>i5ka(067y)6kP*?H54SEgIS4egIexhW6#I?K`>$C z`82M7LsZg>yeEx{HKllN|0G`&sJ1-~71+WV*j z@F4_P>D}qN5GkLb7YYGt5&ypC#yn@)*q5xqKG9c*i~A^O2p1(bZbqJ_an?5CeZfJ- znR6f+0S~+M8WlaQkIXR6eOEQJ=@ZW_{h+0VuK5`yK>9oe15oV$=`YVYzWE>oBbfIw z+O=NL`v?}>ZVjd(Ns*od_58E>{i{s8Po7xW!e6l*ICk}6Qfg{Kc?{o+R}X*6kxpJQ z3iDc;3jdIo+1z~pTZ7z1p&w?@_%MMsB;ymd!PuD{t z|0lSn5EbFwZnezE$eqx~=vOvtD?03sg|-2bB(U(75* zl2H=SwU-qbc>{H+efmB@M_LJ4@f7xbVERIYxESCLLx#{u#%zdg%~GBvHZl^{MEQY) zHEDCAl(X{Rfq09PRYX0&Xi#;j^(H^D=^_2ulyV^M?T6ZoYKYJxp{W|2=kDaZ!ISN| zrnjh=Bf;4u3k|D4GxynxQ)hyi(x1lToqb2pinwj?C&t~Rk6FDZ+5?jdrEU{#XO6qi zIQ|xLeT*pv!mC(mD=R^zH>GfsiGs}Tk}RN;(-cgDn_dk#%;#~Q(K#NotWk5k*ura{ z!p)F|e+-<1xfvWKhO=0gl>Acw<*!5};>;{C)40rdo*}X%xQ@I#hR zdFQu`DkOXOx1S*GG02#5_H~f!^|DTZj)c}e=)@|8s1!=L@wbN8|IEoju?l-sJT<(J z2bt5&<%tR>0Zg27AgtlISCci!A5}Q+2SY6sffzuYq#QWRu8{EMzB(^IiIx#3VHqfG zMwu#v6%&?;@9g&V9FhhKqV_@KY5&C17>QkuZERrQ3n`G}&G)Fs@imzLIPufb&00lM*ipiH6rG zgb4jgmi_2CDiIt1E)sQPS7v7@9LEud%T4X$WYlg}iMF^ebi6O>TBByHm-ct3nM{cC&op^+Vr~Q$jTS z=PS~>G%FQPt}u{|z|%ZdAD%`p_9E0D`jK`N$v?QpM|HCz2CgAbqj0(+ZP5JHb@ijVuUu~szdA0ynQG&}~C z8IEfM6wEXz;r8+jmip#d7CFpPHjq#N+`uEq;O;B5*2if+sUzLXE?m6GzL9^kowaG% zi>_H9Jh#-YFy~(C%-UweLBibwJR| zZEN=4n%u_BXSr@#4cc8ijF%vTR|`AQK&l0x0f~PhxdmIYoiC;n840-RywU}&MdI^T@ffzS<(cnd zKy&JZEEAC5r0_RF+}0xY0y_2%o4Xt>!N?FN#^M%dUBT# zq#IS8A0RuP{>&)cZ5G8@f2#a-*irq@xcgBH5vT0t6=5Ba!yISM_@tW3yVe+_wZi=F6K&P| z>3m17+dQ0S@&o=fdeNl8+8swX#-{8QI=5#u~Y$hZ5=#E6>2!*XUO_P5+2ZW z6_q|EIamOvgo^@~=l3dwqwEH1(Q6i_Pf`7YQoI=ESXhDE-XmcI+!iR^GN(i~xQjByl` zS};^;-bYdnB;Oq*4xrMw1+9tJWEhsNslGD2NBpi(B)EPBPx7C%5IQc34oJgeHP7M$ zN#`T$XbMVvTNWIy$By^gR^l23XOaG2wliGQRkz0BiqrqK z8iM34xo07>S(}beX1rS15YV^)%#}F$O0qP;3Q@hM`{Lz38xD+BUGu=U;NE|NhV%>T z0R;da3*Y4(6kxOu!^yK%+i-J{xFH=`aK`%PVMB@HKXzBXlem#2{t#AFfM@Z+yR(R| z!Zv(<%<-(v$?K|ZVJ+XE8MJ1321E)G4wuJNE6i;}l_3&Z8k42h!Df5N@%0lu^E|D; zv@F}8MEEWzDS6Yu0g z&bMfwaWbUWxBv13N*bMX)g0XZh+nZ4V)V1mya*3FM`GgptD`LTY6!QVBoP<%r%p&> z-}n`!KzNH5GJbTNb`uQhiaRjAvjIUseqFSRii+x>5AyquZ}Wmc8z&PxJCPZYF92n~ zemG2Y=VGDAfOZ}K!p;qE3WAtIJe;`u^A7={P76sBjVCs?L24R$!N>S{L*xTuh2Sgk z>{JKyiNHdB0w3Z#GQ6AcM63Y#P*c4iZpYccD>c$2knRM1#YfmW#g!jF8G+Zc4gQS2 z6ax#dki%@m5BnJdpnkgEM_2@dgRfA2G+DhaxBVRqc*SEk5iSTOu&@0u8x9bjFgC^< zMu^68s*xQO{By5Bw9S&mw(mX>S4>dNX=K15!XnP$6eo}hA_)zKwA0hi?Rw}NddhakzK|?1?`D74Gc(fOt^)DR6zZ~zut*a#0JP0WX!o}BgL8T} z4|5RFg*^>uNc*AGHNq2CS15fj+5bg?erlLmPu)Zpdg9(rgQ7ykt5QycMJh}Hpnqg| zOC2KP5;_%lg-rS6Vq?M+qM`toY}$KYlC?elRJt^CDiUCR1rftoKftEvG(Ybl99cmF z06I48B8u!~;P*VI@|+wGBPE0gJ|4!G3gGf%9S?AQwoxa74j+gw7Xd!QQ||9W>j%c$U!lNu z0i@J8o!>`T7r_X%b=mS|5lNvHfPgE`@(9W0z&|ApHSB*(lC~z`Ap|Pbq#b7DaGxWb%qMq*^k5>B}l3u zH~_Rd(&jwjnG9m`j>Pwbg%NJxp`1f)M24Yx2o~n-@e?7O+4%!l@my*|-F7;~&_j_9 z$q7vkJnw?$HNZizJLf>`-yg(Hge3wjJlb;BoJgpor0?GC z(!d#dGz}N8)_DYrypWjR2l&YgLVCN-KR#3RlJ6w!w5cTK zJd;bXWs)EA^pCCe-8Nfjo3Zal-J1&a%f>uCUUf(5tJ=sFyMIT2KSgAIQ27Y1KKV{U zOxgzu?C6Dy5)Za>C-xv6qixoYBx$14bsXGhSGa3jhCpY?U^tBoyPe$lPPS<&11l}< zc|&5GAu1Y~8Zw=Qu_0Q_Cnl_&w;x{zx~o|>*7XsQZK%P*?i6uH2y7!10v6GL{bddZ7y zDzsL2H2|+c%g87qH}8PP9n;1qurav@V|4429Jo@7iV@>Bjnv|6wLv_F zuxoF-J#)_}{kgh-uyVy>dtFMb#5Rm9vJa zPIEu~`Y$#Jor~ohrrEPPivA8kG9kmLVyUAal>h8{WFVPXT-<9a9UyQ>Y$2rYnT=?n z!Cbwt%C$W92DtlsRJ*p1QP}5K31V`)nzGoWR_RfToGZzxse-FXM+(|Th7R}oYp&<; zA8<2^C@@_Qwf=EA`pgN{`_rDfN(N@@?H#*b45DJTe0*c_ODSqcjG0?*(UL%=a{v&Z zZxr2e;--?Rgt=@x8b&rgDzC^CFu-~^{VgeWKVT4#_b^-dXxtN1D%|LwiL?rfKW0?f zFsN@g$$?f07ZyXc7~jVxmW=?LtUeZ!JiZ3(|Snu7+> z^}^BeEsKneXx*rCoq^}>=%@((p~iPtDm6>jvoPzrB;USfP#vJ&A&CMZ}7=P&=6=9JH_mt^nZ=VxG zY|^>7K|uw>7&P-{zus0;muOzitFm58(bsb3;*mKj+!26I9ChXK1UnC5J|_6*M%A~G zeeES{+;X>M+3v1s` z=FbEsW}-f5|Csc|hMTwi8A=$G4ssu?`(x}_HkhWTQ#E4jcyG3t(|?-c*_WIy1{ote zN{Vahn>dkQ^(FUG;JYZYq@}t|*Z1#*e-MnHa{3(7Q1_?2u7Q3bL=&-QkhiI2_}p9^ zmA$bR30_~no3a1I<+r~Bx;NLaqJy7IE=$@sf3w+?YSQ?sbl1rLo_WdSBn4~6R>zzJ#wF;_vkl(^VR(c(--c&RL?689T>S<8E_}mZT@1<>@g#HlzEb@ zn=Bi<1YBl0@AB6IZ*>PR(JagO!btWAEDq4;G!pa0CLo7@%fvj{(`B{v<{BVUHZyvP z<2gZf9mXTlY zG-?fP9riiId-p2{qMcsI+UVxL!82SWA?x`^(9eBiMrTV zS?^Z^tY;0@+?j2i27(R}ECQiiAl&sX9*!eLul!*qyVlf;e_qWA7+~+crmH0Wd04{s zynXBlxUU-Zeo>-MWpjwt=#AFohZVu+vjXe@ue)K6yHS5iiW%Ti;rv@4yUAAE4RYqva^_4%sO+C>xgt<%=&SLQGNO)ctHOHtlZan4yuy%2TJTwE%LEAloF z981P+_uNO^sj;K$qHhSX7OWoKrFp8;M+NU|?+oxL(#|PL2%dk7*PMPVwbEUOzhTm= zrR;l`jK4nmGGE|xomQrpEmehJ&d?pHp!d74MbrNunwDh{Z5e~JTsC zrMDvBrC&{Ad^;?MJ9Kl3@8TDl(g>>78QkaBovScSP?I{6l$J;qN40TMn46%aT2d$y z7JA@6-3SXap63S^3e$de;!76brAzKk930!B`BfjNls~cVXP@EiLW~0tky+Jt`v{ig zecCzLE35r#?mI1RIjWV|&LfC?k^{~QUV0K)K**b6f6;-LiqKzbx4)Da5A$wHA1jfz zCv47~p3&A#OxufKE3jx{*cqu}7>jm#QXh_1jFrey7LrOVm}wbpwLg72LwM|U#Hz2HPAxRNo>}l}Q{$m6qgaY=B6})tNEIfFJF{A>3ZPe^(^Odl<0$>16 znZ_;=xipeO0zoIo<4V2VP-nD-uzCY_ONoeTeqfr#`aSU{yp)y&UV5vPA3(1BfKK!a z0`{FIrPqXl?+U>GsLr7e1iuMc3HPAX&8B^+LLpMq5)ll9#%WD+ zLOu<7n+m+tUub=O`%5|(0kFTJzjMeEHkSpAz5mZb|DT0GSn&VOLdayQ&AG=XA}b$I zn{n|aEM)xFQzX*(`b7*<`Z=+UN|qh4QsZHYN2JIko&uZT+Dg+h(k?@%-<8J4b305I zt;D8c3O~O(QCwV{N!^@zMVD0Jcpb1=Y*gSO0%4~;!uSJO6LfW3b0b(0=>@Aq;u;4t z{F^Pw@tSoUe^RJ{tKNaM94%WSNAaHX5@XDstqU5%10|cWbfJt-zM(4X@bBZCUgm?rc z9O$8_F=Dhjnttwn;^;n#Dw17+KgSqMuHB}0uM94xdv#dt0pSZQ(4DAPQXd5pjxu-_ zi1I@=raP6ZH)YHO?gns!5_r^O8l?$p3|R_!kY1Y{Dg@$4QYOb2iE!hjhd-#aF9IG= zJ~yy$d#Bu;C@zu80Gfb;R(hSnLjdv9qZNUpSDWZdx94B5_VjRfS$B8$7(j|+QdTio zR2ync>dYmg7i0~2lqg{i8o~y;!9Q)#!{XbKhD_wbTD2ISL{Y^o2+8egOloYp_9%@| z5{3Om10n}a=2GD1Z%8;x0nOL+>9ZCAFhr$-G=t@}>GaacQ12Q(=n+3r)_eaaiui@` zNFa#=Tic1*r)q#K^8n98_U#MkoS-~3fwO=|*93Ao6crWylD`sgfC!(1V8aVY!iRLw zfCgpnQ5+x_5<)cNs#$SXr|WDSYzTL)hI{`(g4#&T2INMsBLFJh>70XBfuVtIpn@wy;8tDX?}7Y zXew5H7To;Ivq9!I@ysgV%->@4&k|LcFs0>zq@kg!MDP;q2826Sx!q=r%XV)Hz+KJl z8IMaO4(HkrViw2lkf3q^K*j&qzQupD3m}av!l>-TemQJp+1ekVdq7`?vJsulZ!L(XE%i3ue>7L(%v3g7^l3ev9iBmdP!sfQ$QNG zLMH+i$<{wrLs-NaK&1PbbS9C4h-OJ9{C;tC?rRgJ7#xck7Ud>w6SZfCz-Ea(VCU&7 zdzWO*AmU$^I)5b&%i7M{C9Ltf{(JsUV230PyES~S!((2<~>77 z+u<};6YV8^12uBjhKWo@_kDdA6rTb%w*DZ?5o zRT*u?e}$@*6L2;TMdq5CDOHF>r-wKY`%T=6D&_`DCl+O?*A;o@7rAs6yn=m8^Wl(C z(}GXb(&FMJ-*6X_xM%_hK<`dIPJq;JW|~w@+@Bxc&m<~#J!Sa&-y%$Eb3E;0AZT))R&z#y{_!aI@S%th4g_!E3J zNsO~{QCi;3PWNTN5S4knS?n4?i6mZth*Tyf03Q-eyaf|)gOo{cR{5>aCJX_J1mICn zUeoMY?DnLJHW6yLMq*YM@cQt>6=HcS7h86mYA*^Ai7P$6KG&kWeGe}T5f0!iK)Ig) zV_bR1i5W~98CC?CzDbDIAuFkRJk^7)+sa+$cg`~k;TSx)Luw?xLv}D2nVdKU4w9pOEg{$+Wi_YeRkrI{@Ud+2JPVT#ub4 zd}U``*UAF%<%@4p+v&3S-9%90oj zE?&Gy{mGVmPbnTPLgmOF01;TDD2wrtzc0ZK0=bSw#l_NJ z+m_jSv8bk{dh6&kP}cJg$M(WOa?Y_!ci8AU7b~xJ;Y9__PY`%lq(1$YKm--O$nY0- z>VK{IS{z?D@n8G#ANY9+vy(E<-EWI}>CBqI{h1(jFpxr6a(qL~vGh^3rD$rjVM`u- zto%0TIa8u7#r7YNc`Fn&inHoLqN5RRtT#CJ6Y1zW2!O}-2PzR@V~M$}yVgomxOjd||K5n8oE{RNoGJ zm*&z*#6%N>UN$2h;^zXrh@=9TQh`K{RjSMBOC|zW<44cIP+g94x#05 zbPhYx5<50ldHwYu(UN3LgYpW(jYqEG2_nrG86B}(9r@ckhcUw+iT>ZLvM3D*CA1Jt zDoBz6&}e8UGEN=*qXauZZ{uL)1L^0y?~u|qUe@t2r|r@W4A7+IuX7HQY2nQ9_x<=} zrI1vvs|L8(lb>)Xp_j;z2zvF$0L2X&Pm}(d?@0gA@Ux!}QB_h;RGgR@|EV&3fkgPP z!HJGH9lwURWrRwRA?(!Z)KNx>ipCdR~dkq;2avk@y4ToxH=*o>?7k= zAJO-IOcjy@Xsxj?B6spW4hXe1isEs=oF$0xc0`62>k}RY|<1Z^j0>LKFfNlprHRo@*h%2DG(Xu>^-gZ~CWw zj27Fie-M8AKzAhnuc`CPIe__RZm0A~V8&(uSmNqk4ilJ%iV^6n$%zT|en) zB9=*zOc8`tv~__!tLAE9zF=XTcb zj0Yfb&FIF>c;An4QG%uf`!=y294TG7H~1 zztG(DzF(wM7YVg+SjeLgZ+C1Z-XR-JbA8VNo6?K@gN_p1=XSs$5eL)d$QpvA^UPFi zD^W`tK-s?J?(2j+Y%ZJMg(x=a5ZM+^^F1d#=B(QL`?)$-PMI1rRG`saOjZ~@XD z>)dhl9yLzSW)hunxvMcYI-FQvgG}@V90?{b<0)I1pV7N8=;zc=Q6TswV~}WHZth1r z$_?{di&Uqz(5e6`r2}guu2>K!^`5IG7T>tveh=b_m|0;+<-wa z&yj4{!ByK1#OYfe+b)VF-H`no7kZ3Y|yw6xfacysrwgI`?p)A*d99g z3Hq}Rizt=iqjk|)hw@!cEaQj+%&lv*dBjI zW0(jEOa~4yb0(7NFL9fJFuOLhR9YEocj@%vj7QBbZE62nPaspf_GF^TUq@J@;-DU> zTX2yH=#5iEy+K740vy1Dwxa@Ps16_e4>y6`He5Bv_hWK`>2BipctCJ~{g&xC3t{`y zwETMzf~DG}v{?ZFt6mFL{X0&8*@I16>=tFoZRFWM~HSE~Y zz7)_|N_-GngaX?~bw6Q%#mX=-icBV5z^8XS6-JB{fecaz;#P=*3`-k|-AZlZ-#j5m zGzK9ND~iv3Khjn}MbP-ArGTf@q*?Qfa0-^o2=TjaA&@uaf1-JDryTpYlbv~pK^=s+ zwJA_vlE#hp9*;xM>F)p>StetUSP0}Ex5kN;Z_{+2s6vLzxceG2$`%(FKlBRB{D-9Y z_sKL2&_Z;Yyr&5Nl^BL}HbT3*=W{4~IM0sQ!5XJfql(*UmUun_{N})i7eonKO2-9j zIDs36HM-a+8^s2aYT5s7lk`glBCgj&;;N>)UwM6xY~*H>p`OJjYcd)RIxvL~#7v1HLiny54vBV_m&T&<7fuu7eA_-pO-H0pU<3#pXq&ZC$HoU3b7$T=c z?M<|?i5@cHM0ivPgXl}))&2F(1jN`1F!sI4SnL+=naFr`spqd!8)|`JyPLk_xSzmQ z1NRS25&(DUJR(xn3~QZSy_Xt-?Mqr4q5l9W>7N(@z^dU^Z`cTI#7p;3-}>>c##4_N zj_nuJo|--m=v~2okz8~AA5i)a-@D=s(xc}6VdIXyV^)bKC&xj6H1+OlqBtg;Yx@G& zTOf)jUZdQqL4d=p{UA`a*YGV8=cc^o+uu+atWWMhC%~3`XdOkc@30uX-Ng8l1J>&A z6v{|=ysy8154qLoKVAa=3djj|Y< zKgE9=X@HNB4<|Ho${uM$8Cj=r{D)Q{Z_(-;@I$za?1=_R3oNtE22fm!{<*RL1_c59 zY7jg=$+XZC;L%4fqmiJM*u7!i!b?=R|4D5+asUq2F~|NAYc{lDjc&4`a!P_$grw|$ zo(9mJFl3*74E;hZ=JngZ(ve^G&6&11YWNe$$0v~o`7k^A_8Mdn&&)VUd&~_U^p|wq zM#ejihUEv~4u#sq@C&lC(xgJF?u=mc6*4Icu(Zs+{lUCmY}cL==6uN&Z%QQk!2k)` zjT<^MsnY-USGWdI;8R|(aUtoGCkWpd52~ge$#%=f4j%89J3ENwSmC{}|7j~C`ljw0Jzi* zh~Hk9WW)af6o#-8op;|{rH!G#Vd1cKM@5DENJRjP|3pEV-h{iE3SG;eQm{W z4**pv`CV-P@4X_s;U32<^Eh6Y8{kvyP#)q~GsJ;T1v&EeR zL*x%z*OkXyB^Vst1df3EdXNMy)Wn^+ywi`xlf+({AUUPOi-(*FDqU~imc5h*ZfA}0 znklUDp3t~lpUPy9-=9!2IGmpLp}4b&O8|`yTQn5^t^a90K#A+-{Zgog1L!Ye`HOF+ zemp)b2QDnJQhlc5e;a#mWAgH7HTz|mUDSJJcI{@!{vLnOV{JOWaq?ZcQ@E-9QR%fQ zoRVGWSm6vQq(1EU;lN4jI)Mn8 zP{E6G=qAO~T)B;>;5GyF!uadw(O_JBOmc)}w8e5qwO!&2IQQfxT+#<#4<*N97tX%> zZ*WrDz+#*GRR1JwE(+qhQ88x>ldO2(;CkirEUJ>!LmO7+hYv*;f*Z&hz)d?hGbdD* zx~eoXxkc}Uv#jS@+>c@~1jEtM>+>}+v)*G$IgOhm68DbXNnyA2xPt4t>K|1&8ZELE z?-YI% zZes_V&6@1;nzTTAD(C^gCMV;Y9T`CaN2R?*N_L$VZl1z%vL;=+LHAHAK~=@8x^Ln% zQmO6nUM_n}uj$-JOO7cj_a>4f8nUlc9qci4Z9UWY^^Jb!^VRaLjdvzjd|CfaRg}+` zt!YX!vdz>M&z3pJ_1N+)G%3w`T>e{cRCcvVb#1Ee!TbfjESW|5e%H{bW7p1Jea`n> zxos-9)AGr?F*Ue3ebC8I__8x(F#vrpn0F0lOnZpYYfoFbryV1g>2AI`U~n~&0Xjum z4#qNI%^Flamu{KeJG^fC?8yOQuqYV9$mWBWET5_#?~}i`xsb4SDX&55OC-3~pmgJG zTIJgc@SlfqV5F@a-13G5vtJL@iL0Mzpi+(WYX&2YZi5cz=5BXd*r1}U!)sMz!R-{N zj}K|@mW9_e%>tt9R-UWaHTwC`O_~42)K`Z^xp&bDG6o3hkvK|NM=1f7lEwfOq`NU- zXi#A2R8c8CN=OVKNHZWXbc=%0H4F?5!caqZ-u?Q0-@W$_pYuHW9GH3E-*4}=)?Rz> zlhT`J7B3{XKa@VB6W!k#J`h32Jqv^xjX~-ju2PqG=F0{2|GK7Ep8g@Blt0`G%I(XH zaAVT0x?J>=wW8S_$&-#deV@Mt-=Pk7vnI0!jNX;o}R-L(b|&+Ic!y_=h43ZY9tz>Ws>* z+Bmqr|F+;suW}FJJKriqpySKR{y4x_?zS1egE&^W9 zV4TNH20fvA@f)2*k3PDP@YJEwb1v_7DLLLVrpW1FZzIJks%tad7TgAf{NR61frk%o?%4j&^cbLKQ{$BQlp^vK4 zeXH4H7=`3RNdajK6)P$Z+WfB7vr@loaw~hB|Az6%aeZ*5SgT+e%z`IbPn0*x7!$4U zI_Lr)QaRBb&Y-BFU15YiY_DyZD`*?~U>K=*8TX1RMl?ktw^L;}`JFUrcbixpCe$uA zCQkIoDV?<5`1iMu0)z_o+g;F1@f$e5n5ZHTz?G!qG+pJWo%&XMQVZiAlF))UYuetA zwH%lF1K9r*gXh*^F-4)+YWrP7ox~OG4G0f&p^j-T{xG$Nt#KWA_WG=_wN}>-zBa7!^QpDfUEwZ@Vy3Dcuw<44{ z0N<)f%6FB6iY5jWk7G&FG*tifar648CvaZV8;kx=B`mh4xM~eIba7*KwcBnpx?yKB z*1@Bk?_e$0w=nxIdUPJ&r4rrW5qjQh(cgXP2TO~RK;>`BS7S4ORE5yDz3nxdEED_v zJ7%3<9$4&dF442m&e#?8S@gU~($JMz{&{(5fW)uZHpp{fu2{x<+c}xNz()+iUyHKn zU^m_q8JXK>nH(D9vj-bCg%vp5pZ+EmzVLvTn^*K=peQ4Fd(EL$Ti4heQb9$aud98< zl6*lUPgcnkwcYQ3YT5?lE@Xg5BMuNB=G()|$1aR$Twr5d)O`ApqkrOBJ2Q|UH9o-k z%mFQg7*qK_Mc#G;KbE1s`1d!S)PFLxFTJvEK?$;R33SpzDO!1L!0xF7Zf~dSmHEDv z8Z5)dOzJfw>~q@b_#S4dyFK~;#Nrw=lJ27Q{zUdj)G-mmBdPzv z>ECDf8%?*xovQOLm2KK)luY9Ug)Bs zK2~z(*#4z@dqLNEJR&Z2%Uxiz?Rk9lrbV~;`dLoo?V3`VxTVEtjw7-HOl~m+FAvs>tD_+!HmCE4x_?v_xe?4)r1#SM)Z;g=?C#2(an>oIO%S7#P)00o`JQ(XY(Vi|FNIN zK_z8%Wu=~5D)f&wB13wlbaVPHlbW@3D(5hgyv^>1;F%hAw(urDuk-45Z4=^5S^j{Vrwpzd-Y5F}iNA0ykLg;7=)Vihn~rm_ZI5i9)410 z(uUP&7Ud<4`oLL_0g6#E`r%-Iqgo}tuVkrr*MqY@U2Dk0!Ie@-yn2oVTY;TclF#pP zbrfn>ypeTcLPy-VsAde=@Q6K7VR>x0sF{G(UP(s_Ap?kKF&K z{qXlxiw461Bq0%CPoIm&F|h8&E?upbg)&ALIlMb5I+-T3XSEy{dc$;wL~OnkeC;vb zP;)wo9F68!2{zsR)-2L*%QxF+_JY2Gv8(ojkT5wCr{7`(+On(4hO2$-uu~?$SP<-9;aq?ODkl94JcYv+(X3qo#c8`BX4BqH=Rm zC1eS}j%c#NJC)ALUSC#CU5!pw(CMAWrbMWP5gOk07TtSHOZh@cZ1e>uf<@F40;hhQ zxezSBH2IR_&zbB6c`B}1XByWKa<+^77@L)~;-WoYGR`1q-JKq2lhk?L=BQ29;HOMk zs$OH)K?0rGtkJ}^48MrwTRd`w8tB}DR`I;UzB3tVF{$r+RM5TyK612)HpMwXt>{6w z@leB5!y&-C&A&Jk2Jq)ewb=zB5gcL=0(DySvy#F`MzzTMN&a}smCF4$m&2|jY_e|R z92v>e_@JVkEa6`F4!W&fRWzs(>F$eVOIvCd`MlS@*7r2PlVzym5t@LrGZ zC~bLgcmTJJAi;H?VV8gomN@@KZ6#}l~d9J1!-w%8skr{|H&Oc7>`vsqyN>s zL4ITM=UM(nJd8u?=;)l{ArEioL*;`q{d4pt=8#sGaIp!(;MX!J&RmEzj zQn9U8wWlR|74p{U;ZkPZdnpf8KBjH~roKV?!Ol}hV@^GFVv(J%E_xgM^Dx8rM#)2 z=xhZNnO8|CS+HU`BgL)9i3L%Gmb$AQ3g-ux*Wt8_bba&B_SR@zdvQ{_x7@gP6(UH{ z=S<0jKC>SV6-$BYBrjCHArsamhn-T;Cha+Qs?hDO3(ukIiAY2FBWuQCH~3+^-%UW8 zMNuAu!*U!PPmMJ#&sc_e50nD*qea9J*^jq%0RYF!E=CEMtfB>f|8YlyBv;EwvNHLZ z>hMn*6z970)i1vt;t3%1l!_lt8-mVEw3Ie6S|0>wup4}MA9E_ngC??@D zy6b6$c;zq>a%fWKLT!>qMAfYaeb!mHOKoyGj`oEF9YGshq0U`l(r7k&jLc47+x8sE zuE#`+G^K0e{=+u^*Hz`iAH-dCYwu#0vTX_lw*W;^y1v75*QY?BlYgd~x{ab7irv2c zb3FX&GaF0`LD$p{UnimmxkFL_TM(Qi7#?9SW#p_=GE{jl+%;xm9Cn>+^Bhy!lMolb>@zlkmW|lGB??|6x zezO@DcG$8M2e_UiLy|`87`qSx=egFFL3v=ECS(tvGws!_wZ<;6X)}gv`q=>4vfIqp zFykFzyM}9()`=lI`tklMZME8cLF5T74ItNIv83Bl-rK81ER6q5f5b%d;!0bOGqCX8 z{{$($vbv{l^jW#1?`raSce2yb1Ips8)*%^5(S4%xP*JBGnw!>SX4jTdWdcgbW7k8x zsN&Y!45=wSy>?Os=RStKlU%;<1j#V{a`i;JS6j~idm0KZctE_+S^bXc9d zDT(2eymhd<(jVWSuU$iK&Tsj9PcCr?VC<~aK?sL`CmjvaGr_;-eDe^ZLOWA`Y2zOZ z|AEnS${fi*Z@KPZ*5GlCodu5fD=+~lsM;5>SJEEhwmj$J^IXeJ7fr)^eGj|f)zGhe zRx)@EnB)qEk2jHBm(hzr9$TEB(%G11Gf%N#i2SDUmrchQwo+A4J`NF#eW`mDYYTbBZ^IsJ&6$ zFlUC0eVBo_RNT5Av21tm^54C*>nr(9{DQMN<@R@pv6L~QT&-Zriy{+*Xxbf{(D z&ARhNPrLaSfDa6E+3jd3@evr@6FlJ{Ti>tkJbvcty>;CuojK+xgQF+@HpBNDaOV)t zQv!5k_38NV*T;0Lx3%xG*WyW|#yjeLhit>t*+E+-S75i)J-AJZq~R?#LsDP%T?S>g6#2ua5F;Krc zcXmn!Llh4@Vdu-*1HNqIAX=}?oRTEjuz*OCvSib>0(^xL`UbCy>sPKy{EVe`4CB{2u6)g@PW7BXbY6F_DpzXwh z4$PPDnNOGcwMR}f_2G?}^Q;qqbfjnIJJud>q%eiO+~DD7 zv+U5vv(a{i!EO=SI!iYXdyTUWTZ(;2FNkot>)0jd#+es?F~#I0v*DA~{x^#1!>vYx zz6X02BQ^dME%ioE_hJ1;tK_a@-AMD3U0S^VWi@)BV_Hu*&LZ9?H47>e>Tsv^v4Ey? z5!B5d?rAdINCd~CHU|i<>nFv2pK>U+(o%Os1Q7e-FKz80a#1L&bW!i5w|{cs3NbJD zL5B#%sZfby-|p@$_*PpTS|53FQbdu=evLFyj!3Y$17Ny~GbdQ~RKDP=eBP3n*s2uD zCF{`3Ja4kio`2mW?g3v}$J_}F4v1DHO4vs?WSIPA7ujC{5xTOlBP z;&kmQvz08co(mCdy`C2LTrmN|^i(HSeRyFsxbXd*@~AqwVW;gOP4yb_xO|y8v`vWh z0VLwJu()4Yc*R!b%iUmpQP7&pO_?; z>wlH#6-C7{{r!%Xfp;_Ig5FmliDEsXrEZ1>b2Y#@@4(!_V!CTzWfz7Me72`2yERqq z8o%HhNlWMW6`9zi9f&f?>j?+DY5MS_b_SBMW1s>#^J_kb#3)SDtUV2)kv$^O8_m4S zd1!W=H+ZDeQPl8 zMy3E~yxZf1g)>qLji=uK9?l#oDHIg1C?8zkxbyD}=l>?WoC2aNrt&8DvY+fD)UAa7 zwisP_;b)T0UHIr0pK)Mj$&b84GVj=4pe_0BH}Vgt!v~wrI+j9t*BX@Qt%EeNUuW~J z&eNje7)ypcW^qq6WnRzvEL3h(u(ysHC#rk*0_Lo!vA}YnmzawEYZ}4Z9m|#pXslVZon(* zSJfV2UG4MtC1YJZ&>vn0GvAQh-yE;>WslevE4piEWHuhH?fR{)VE9L@ec7*U*tN^& zxr+#X!^!)7W3|C+l^*b&)>Czf2R$3^U#AXiJ_nlGhN`bAq{M9;9-OIWE_~TF^!jM7 z?@0i21JdMx;v2Y>iU?UJ?zB*q%W8%Yc?+=~N5W*cP;Pmq`(jUNj$K%M<(F_oIcq1v zKt-UFYvS~UI{6!99{!SSlaJ@hKRGGn!s&K{GhhAb<7*C&pSH06DVp(hoe(c<0~h^xRe5Ib`Sp>%nVLIa)iQpgm{||d9y$`- zs@h|(E_20MVohPNd@jFfJz5ksPhc^WD}P#;WmImR97G)eH7SSw7Tzl*8ZV9px3azy zuQsfXMYUZdR?LQ&q$KJ3MueI8`anTW-l`iAi}6A;MRdrmZl`LyL1ur{TVSfqY}-T1 zMqJdGyQFZ|#CPUk)@Wu#Y_cf6Z*gVLkSjeCTg99G_0D9H2m(crEYyUqk#xC{+{p2_`}3 zsdnkL3dmy0FL=cT^=}UVsNxoBM=r7(6`FF)79)?xv{`>n4vogl4f%(=7H7Gym%Cr zll??0s=6V7ezy5ZN@|Z_ceU?9WL>nFlZAt)PwciPmF${vo9b6QsgHCG=gzE3G166> zGgBYgFPqJ=*_4n9M)bePt2c+wNDGj91hb4u2KbKbso~>%w>mb3(c+w5+pE)jETL1L znskzr@R5eBlQkk{gKJmEi|W&VaM2DN4Q76YNW#(Nv{uo-EOS=o1Ti#?5Ti=HoaEn~ zmL2g$G5{l73m)pxh#iB{DhP}^=57(^VGc;EO5X0;FpOxbkV&SMh?0;{z(arO1WUSS zq8N|bWLqXRH|?gS?ib7ocVe21A^U3>(4*=y2+;x&dS+L?P7ORkv1)gR1?197)>E>z z{WPiK7q^;x4vR~;1MFVf|D%ICb6EFu&RrXY_dps=XDoASzM;@G$!OU%KWo;Z=YuGn zh=hCM?9DIJd(S(Q{6a-YKb6|p&`de^CEn&%u9Nf_D2(j491}$aNXA*R%OKB9tmW#z zAI{a9NarknitAO`=FFE?)}EZr3dQaXY<%67 zD5Lz($E8F`Qgd)uykH+_CHt?@TtBWfBZ)kaqgRZ#Q|9e9=N+gOTxO1y_Ijs`Obv$T_5&62y}9Ad50z%kY3r?ftsOy#p z-F;4W-N;#diEgmh-s6>M!nZ?wkAK?4$D$nTo*YYSEfSWeq!TxEOJ0)2nj{HJ97FCs z&w9=ZQcoaq+)!yU`B=A!pJ|yj;^dGuhvp{}pQT{aPgvKX(wpO1b+R+N)I$@y^{6NU z3r3(k?lcMNXl2K`*T`|L5rnOLb-C$?lM-`ZBPF7e=WU8?G`XX-O-tgumVe0ypGpo2 z(Am!!Oczv+klz6~cU|GUxARl$IRth0pUq0b=8?qqh}HYTgeU9~)P1?K+|^97Yr#J9 zvy!AQa#mkqvG{gi(+h6D!dV@vWxQOu9>D>Xz~rcV$OFj3JZ=)mn$R%lyPAeyD6f6@ zK)GL_XfPz;VhgMO*^mNNYDlc~^?4t+geIo%y5g&n!OTi8U5O6B1t+r$T4(I!+lG#? z+P+5@424;xV2SR{+YX@AZ?s}LuABL7B*goh%~>%FW z@&{>EK**(tEk%;r#hYs3ao{ojas`2p4iEh4e%*OjO!ur#}B!khLEZ|@{aijU6X46IxZv1_Xy z`Sc|01}UOHd(*0+a9m$P!G+nbGil#aG(RCwJdRp#?Wfb~REPWuK}EgnQRolVy)38a zYdf3pSO-)J-_8@l+_7)71?^q?%vl7c9LgWYOZLUmp*#sWrr#vIR%R5sw<^m= zye?||DRFf`v5Ue|DWPdt``9LgGtTkO(^JND;yGnAJjqLHAC51!3PzvN1!Kew@ba-d z;g|vn^7Pb5RL+S%b7WsJ9!vPuLm-&rU%b%Z@E5Q0D;AURan!eSoX=ObnnBEaAC7k~ zk>fB!9M6AV^!5wS_kgnV!y_!u2u^KMXc0(N(xwOizN|UJGg8`bU+zvtkPN2oLaG+7 zMb0H$TX7%xvws$&vSMJ6B&kNvZ4RsQe zMzW)Nj2-qFZp84qWlWC}RTFq`*XA{JdCVx^39aiTf>B*X6Q;S(xXnv_b@EzWd!V*Z zr0pEG2@4m;Lst+1qP3vM{xKz!cqs^KkWw3|5%J!GE<}n`Bp%aoRI~22Yrnk9y|$dz zg&wg%k~_O3Su8iK-^XIlXXfDqf=(pRhE(gSGJD;06|?$AmY-i$)Yx|OKcd`6iWMAU zMk!Mu|HE*yfIwfg;wau)kaHx&>-~^Pg&G8x$B9 zP_D^A8e+4SOeo0}9?*~rdko%YX&!n&m20N{Uezknjs-Qw_%Ng&o9P!GP(4{fr--S-TTfim_Zj^!N=dtjRCy|!Bv;F~!pMvm*6jSYD2 z6Pu|(se;ooG~Xi`KaOAVfv*t*9;W;@yHTydnJnty?f9+)kQcD(+va@r?P2CJgPx>y zW0Qo{=+N1ZlS<|OFXWCOYI2j|TIC`D$_`JJ2&Ni&D-n6V?RIisdJa1k|ZYt%YwS1lZqRNSC&}c)L5yie(RXmsMQQh^)qIcK({8x3v90iTJ56 z9OM79$_Wpot&AqO_NW+_{$LHKtu&=!r)uF*^gjdI zDH|JI1BPt!3R#m_w8EAZ45iUP8qAK@gHsCJv_8{3tw!xsZPzHIEq22k4NThRL#K$!PUxDYtwbPtBtkT@Hr2G!k}WqxxWw>&I8> zbVVi{zM&E}uNCu*oXZ=QoL}dDr(D1X7;GQJ zn4%==Borh%e+7wVVh2f-bCTy?9r9gy0F0==5-q3~z9oD1RFDxq*fg3o#@x!Xcwx@c zUibYD;-B{d*^oT@0CLBOTH-N(iAN3rOb{pawzyA2^FR+1As zE~{DltL%5Cm@OdIOQbB{rgnSy*GEoI^u<0NsO`QhzSM1stAa_V(TJr>$BBLopwZrQrA5%qiF=2&_TH*IDp4YcH$mAHxHHRj5z3cJp!Y$WQ z$u>zTF!XvHUv$bbSz+g(z*l?90Cv5=YMuA`wH6?x_nz5=qW#cU-d;lKlzZ6y=)*`U zB{|WlCn@|u+`udgSCrWzk)cwx(k!luG)1o=_-VKl{rgzBN#e(9XG&fJhrIu=Sx2AP z-p)qxSf*?@6nregOgWGOG15FmWl|%mLX(k4w`|sl0E!Eo@UACl^gFEs5-NphD5fJb}&qmE6;Vm|mYdy0+(H)P2+IeXPj&uFohIhLvq8HJuOM1A3*bvZP zY(@*iUd4h-utMvg@rx)FL`v`STPk5a$ogswWgy?7U`T+`;_)Py7T9i-s9Ei7MP7za z-qW6kY*P>*+C>BlRQ0mCbgR=75}Z3945>WnzmjQJD2AAzq4+Ce=L3?XAYx~%BUY~$ z=0<}@=?5ar{v5EwqIU+I+T0}?5r#FSHKxzwHJ8`Wyt~qtFwt{oXk{iVH%74aRvp*R z;apQ0fyzN}nHv49>dZJ`X+LU<>t=`qtIv1CT4nBcvc{%0XN>sU1Qk`0_<^CB*>=id zFUTCoYAa2{VhrjWbE@|}1jd;`1e-hVwOD)9M83$Z6rfql`-^wygcVy}ou2BJX!~{u z>Dl3mK>XgyELgZciLz-v)7udKTmDkI)^YtT68Y852s-ZY?Y3zm5KX*b;CWM@b$;;> z1oJfIwob(Is_(V*>Q`R1}W*D{V#4h=wg zULMV+{h3Q{i`mfjku$96H3G+ocfcp@za1j#6P<{ny=E^0FLGmh4Qz1QDUWn~uDN(n zwUstS_{N>R0P(F|AyGw!{AvGyWfR;TWYdt*%vP+}q$~+2?>cBT`F(g8v9hjgQ(rTkkPU zku9KZRxgi=xN0QPv~}Hj?^)x3Y&RWE|3T{0`D>ygOzCxWnsJ0KYqx>}k}u72rbZ64IF~pK=zfKa;(BA1cm^Px9CDbk|C5KO@+pZ6n0`An zgyqp85hk_nFw=fHtISx6o=(4_zmy#bFP+slkq6R+JVy0=*s6affp!yDL~2AN$&A1> zkBB?gp~5}ahxgZuD_6Y1v0|TGt!oHWu1pLOP&Mpy!H`P6+x9@KBZq>7n}S3}RoU{@ z>W#_;VdsazhgaD&8!xL7SQekvnzx)H^GA`zQqqt|q2r&qAJ;Ua9<}4l*oI`M>qg%D zJ5bHhXQ8ftE1N^@A#l3YDCKg!J;Ye);5t>Y3C0D>Yt<*%#k-AM2->CVuN#AS+cR9_ ze=vPULoR#@>X%$R`!+O)@u9?*lytRbuBY%KcBkwXoo*EZl(jyAZ`Z)4D_1WgbG-GS z<7g6MZTKu8F5~`AYTc3~gZpO3wEFOzr4Bm~2V5WMF)d&pd|I_TAy6#j4m}N(X4x7{ zyc4mlZYGYkD4ax@rEK|67mJOMvx1jmS*5*75Hpjp^D7Yz+I!{?E?hBC{7!9TlESh` z6GgWu+krxep&7OIwwUg9(l-pba${&k`U$EDei53#a<%i@dUV)DHD@aJa^^xS*|W>e za|A!C!yc~_$NTVy`UFT@Uf|+m^nSA{!0E42vC^C|s|99BVMGV4dw~9VOiM2DOtmwx z3S$e1J=PcEZaqsn3o9yxdzQu2RZTGt z{G8Aa_vzu}~L#$uaUSQ2r5Vubo18Dq?Y)VKCU zr_A`>CE;mA$TIn~-l&_V;ji9(?gV8pp@A>MO!%Ks&=B26;{(O`XsGu~EL-r+sa?Hk zZV~LklDu&ZE~HIS;Gb*Y2G`)9Ki2>(&_a*(1R%*w>_#GQ@6( zbgEP;^F*wLb8B`!H!s_V%Pj(Owq=uNl+p8FfhQFei*c(2#M#`n6Aw~QU_{v{oeRUi zt(R$8Qf73v(;<&abh*tu|EO^T5tQhy3h_IJ(?7e8OwK;;`SlE`UJG8;!CXsmPb}$UPB}Jhe{=#pGQa$dZ5$RWpSY2LMho`^vr>foMCX@H7645LMmY;u%I;9Bug9ULI1bhBU%Jf-c z*kpbL(ZsXG&^hTKyzqi`-rM?A)U{DxrLGWywuII87n`PiYUxcuVM=c)7^F|hj?ht| zHejkI?=SWB1gD5N#^4u0jU~x))wuk4y*UQ$%W7w-%rEwX?QWaq-;XEWuXKFm(k7oH zLMpi2a34@S^6 zt{Fu%R8(EzU1>M|hzNP7Of7WLY{dd#k$Nvsqb%T$sE10|`T?MpKi*S^+{V?vRLSwf z0I!BNQZdb(+{1hGt~+c;Pq`*!VaB#QW@feBs1ly)I^A+9GI3MSyePYvcy8q*mO{bGXn!xj7hX1{Ds?>PU43AaQ<{1fd=O&`=TX)# zGDM>EEzUYYq()B^`Xt0Vp;(xV5WQNnIBNCE+boR6$;4mu+8$KrU3Ci1+vYqE>aB>U zo4K9%k@^~kW3K6h^8^HE^8PO_(_r`^3DJ`7&+j(;7geiO)iS0xY)g{n$!*m1uXFGh z7l9yqHW*I}KKt-(YwAIGrc3CoYdoRzi4ZWn|2rqze}Kja2-3-yv>Yhc-mCPHPuUmY zOp#H1FDAfCnjr=)7Wq2SGt#Is#yKSeXPrWm`;w4d^NP6~x1h9MIpH!YyK)lreChJ? zWEkTx{bL4PbW;=|g7xTPpM333;K}pz*c~>NUpO=Tq-?onVV%%S#Xvgaa3&eFLm}SW zis#vP(*)XI8=~G<1FaJod;}07Eml)is_=6b?MstqKbZ1@QAzb_FgRAc4WUoThf~3=&h&E`B%0Rz~&SPgq zsI^ZyZYf1!7AVkeMQ{73K3(0^CCpQk#vTrww6P#M)rB1d#(&Yx+`{XxSWJ4!(v%{d z_L~pX&n-jyPEmyrn`Mue0;+`Zl+V~(&gKQ*15bz78JO5qK5-jE^7IzfExv1;LmO4Z z@_wE3bNo=L3^cBQ=K5-5)apD`sUG_GBDLhHfvY&@cXah>?C3?076Rbp4%U~*k`c@L zcjt%I%(Z4P2LX@j0^P5wQ*EaEa39M&G!7V1!__cSiLX#=W@owLw!mxfaOK{k;|`!) zJ)Rlwz1}B(5Wjgv$=XWl9VgaVqltwv)0FO_iuBe~qZsXiRy)pps;MU7+`9%BzE=cfPjsV88xu+_jO{X~OI674w(L z?rE3Yc>NSfB>Gpb6cxjZiy=!4F4y~dxZKZc;4I89XAo1sU4 zx{*shBf+o9HGZwj{5#~qV&H=nav3n2Kkj(Rc8XH%caY|iFQ$JoZ+LX>?-V8-Yn++b zJ15*o_YhnT>z!v!TNFPG;-A^g&KkJq`ekfilt@X^IW>pK zKq=Z12z$r87;C*`B&A@F{&I z?qehZ3%OccjnvzFhV5|dsWAe+w)>Tc4Y9HyF+fhDWv%nBM1vvrG*bm9Ki1Q&*nZK3 z;t`QW(o#QV>Xhqe?p9n9kf+!`pU?Fw`dSQ@6Gv8(5Gl00v!}gnjg^5aaTN!MXUgR{ zOV$dNreiro7jYRQZyV-M8wtLB)_E`XyY<^g|1vV`Xw=a^(gWx!@EYCdtoi4v*9nC} zPLr8fA#X(wyFt?$r+BAM&R57hMgQlt-FsvsF?6U2i=1r2IK!30au6&yHXfutRAY zw0y>>Ss!)z%4PT5u_3tyEh{0)6i3g3ve{d$5NC0RCKM4MT?S1#h|D^)l|8y*HeR1R z5=C96AJl0G1-0P~%o0ENF!Ff)meVGh(0!$cg-CGcagx!H%(JISbE>Ya1GC0u^F zjGpS`bH$Cv9HBH64Z>fRs<#J!Ve)`^M?i3TcpvSU%BSnK;P|=bU6yRzPNF{_S6;GG zrk#Mn?#K=6svp@dGjcJ(W8MWe{rvZGCF-0y`BIjtf!k?qq2dRc1l3rPmi`Cqx~ozd z%_sItC!Y~tSI=i+@q5-TQUW8%E{x7SUh+}}`|Kn_n|rIfbZ9rIoX36()U=iEDre+L zQ#YD?8FmejYmh7acsyO&BZLupPvAZ1tLh_*T0}tn?N>`tp*iJvX=E%a2WlB@bQg{& z!Bn(Dc0Jll+}F=AT<5_>VtAlYlpDt$rfv>pB@5_DrtE2C$tLkHw}toQEdF3N%vOyM zkG)mF+FgAgeRTH3sE(WjCsPPWBX0BDKR(W-ukNHq9RdA4<}GP2$0j^r;Cyu_HSbFUcF1Zqx*y25U<2}K?Pq*>v0LCMaCPv15kOY|;aapOHA2ii zk2Y zfU;e+3lawlj3-X|sNSQ@*rDx(HpDdzYF%}&K->h=elZ}y0_7# zE%9tTbVpoZWc=m+!E78kZXk#k(ZZ9Cfs%^Mwrb=$wI*g*eXuc0%nhmg+y>N3wq{*I z_2}Ki<1j-4Ss-&xS>&!FnJ(ofd6zVqJ99aw*PL$ZU(n z2@S^(#qz8I>lbol5>Mrq`)v;Y^8)0WhTqX8YoL@0%wlqmCtS($BZHDe`mF4-z@%$5 zifT7t!kSlryFJ)mY*GNeOouGY`!nu$+`xaEnNw7Wzb|t#VUK5HyL~!x zMl&?iLtSiND^gRSCMd-{LY3Y$oW@`zpL{2##<(8mi7`MXo zpk`L0I{-yzy)9|<&&hgF|7xip46mG+r!qvY%qWnB*Ts)v?A^~#vYqG4@Ix6Aw?wVG zxp?11-P25T1+wn&+|r!LlPoxmfmD3X{C8RG&BzFf9@uJ{vk4+)R#fxro?r4$rwq z=9h~tJ1#kTt&rP_8_}M^l@{n-5A&B(JN3K2UfJ&0k+T)6M9p(>~rFXtXzziB2ThDN0w4w;HDZPfA}y_Vt%SUUoG>~Z`)vIKAyVvvzMul-DAAJ9gqHQ zpNF&;jlkfWXgV-`Jm-DC`kEgFzq=Cp9&XUea&|~>{XS#kZQ-gZMuA~u7vE}%aCGsy zSyumzugb2(24wr34FWHEMjr@+y!D9}l6q2}M|Egl&sU1#XX^Ke@KO2ep3S*oFhj(r zsa@X#&@T)PX;tLe`9nFz+g@ukg!q;8hi2xN`_1e3>V-YmZ3UnHJk=KBiTuB!DhvZj!0y02DGW!BOpRMzux}4k!B=-!t&gb2w)U zRPS~+Ix=W3`$F4k;^@>l)8EV$QqD>ERI43H%irKjL;i>V*@PCZ8xXsy4$IW13> z{!ff)QD**OrV+(VKBJO$9&Oy3=^)N?V_x2W@>YzlSr_k{%BR2n>MD0#2-4&La@1!ECmslH!Q_3K0k{PK2g>p((lG_yKjoeo5FH`+-J^d>Mb`{v^{#^;(aia;%75jB-|W9aIAP#pCyn?2({d z_o;b~f^jlGZiU9*@v!uLF4F8HC7anPuwuT${I&jP1jsSQ5pVwYZ|F}gGPQSVY(KOW zCA;41g;z)=fV`q4Wa-OyIR0WP%3*+)GHCzV``qN9JO;B@3@4lI?Y=e^|)ILqu=0IWF3w3 znfRFe5Py?*uvPfckP@Ox@iEYBGaBkd0fYkOIYQmxhmrD&a7mZdXY;&u&GbiQ? zMWUP}P~AE)OYNImBY>l_y5QcUeb&D09eK>eVOBxBfKp-z~|<4H9Gb z?c9KK$Nr5hB{2|%Q1m_^pcKQLzrQbhl;_0)%K#y!(V)%He5vwg(^^c}s7XSb*9$#(&JX z1I{R;qJjw!5DC)cBmxFdBu9ZJ(+vnTIU^E`2#69SgOWo75+#EIN=9-91<6q|h~8Dz zbG|wAeRI$Gf1k?_x@y<1T~#}*z4qGgde@~X2nO592I43SaSG=fYweTA)z931tUXtvB&o!AGs&KNhB&5I1@O9sZqcg+-r1+A zWis?Y^L}~vu|lzU+q6Xcke7vezDlEV7W>2o5WA4cnLz+u4oBNHT=YI3F z@`ReUPAI$E+RTkG`^$r;Vbh5QQ`Tzx2Lp|6)oxqC6=R95@!-#gKgB}ZxS(xwHi(?b zOEMiKnd-L*h`mRXg3Q4o#gW~wB!)1W3`$zbdfzO^96H{HJaq*;X83&3$S_;)Y?m5^ zc6GwL{`REX#HgPOc+;K!$_?t9AG`Kdj-|?dkC*bL*Ix(cYRP0T1Em$}VUCk&mI?wm z_D1#U;R(S?vMf!8J}uD!?zSHQ9n|UTl(GU-ipHbzya2FzL?c*j@<{|SEPv?r%YHXu zCPjJmy@3)xHBG&r?+>+0Hx@Wa^sia+WbJ#SMQ_Z!mk}CqU8_Qn^`Skw1&4&39mHq1!SE!ReiV* zH#WdUT&Z!Cx7%~&Qhk45!aj>t)pE z*mi|U`+2giUq@6|=6kJqYzqD`>E?p#lE^8PM2o=(tyQ|z_FVG<01e%t%VWPCEuEM5 z6wK(EexrdcM1NV_(V0qHL8=Qw)htBKB!k$Ybexs+{4*bY>L1gU+kKXK&2j^UR>X5S zlXR1UyfiE*J<8UU@2r`pgtf2r6WCcY_t8b_T$1;u#W9vfQM_DWNXo4j=(!8EmJoVt zk#j_&aeIvYKz4j7AD@oaAp7q>!F=>FH9x=f3}6VPr5bapds6MsYkhPhPpcwQY}5L z5-Mp0h;&))gL?8kFLKul){8X-D0O$hk{DPFn}q(n=|l72o@#|XCrVuDe4vFhCRBZ7 zq1zQkg#1L7rx{=f5Vg=H?x1yT*#37^Ey82J@!=Xm>huRjVuBFy|AX>n#pF{Y+73+$ zECPfDM1W~1kv*vHah5hOeUzBXz2!k*Cdl*(bPpJrI6Gi{pGu;fH(-da!Zv2aVpor9 z^jL2}=R-qQ$k#~kb#Hw#5PgKVljTVyf7`yd+t(eGMOB>MbYo%IL&J(6a=fc zYbS*uyUuyArT+Nr7?2#`5gv-Vc}L5bF@U`1v?LT14$0HC(&v>jJ4^+4n_p9>Yx5;v z7wuFoeLbrzWn{Wu2J>`=n2w%()=FAUd8xVl0<1P3%si4co;3`NUmGuPcrB&~_*riL zz3IN)^hF!0UfRZlzTBkGYsm#)%Y7PKR@n9vE42#pTaHrm)<_R}9rh0S%DTOAllFM} z?$FF@hTLe?=wk{nPZl4CX|<lF~XU!B;SxcLu$r4QJPNg0;T& zJMKEp_X{{^7N~i3dpSvT=742M>o8Mq&yyeO3j?!;K5Z9PdX-q&840H4DUPudt9`}X zLZf3Z{eKCA(ph;?24u$f14~ylD@06dmJht!)7*j!_$qc#25gs12NE4y^KRmiNL4e4 zdy?mRG;py-UJP1)oTZ+jp(T}tYYdC9uGp%Z`USjeJ<_-bp|xqeZ>iZ!?jIww>PipQ zDt9Z`0vujbN}T6is-uplcQ+cjEBV27$T3*u<5y+YBwjMwrTo^EUs+mtbqzFXWNIHZ z3*W4iZO8b8&cv;EJ*V)aHUVQxP=aN3H&I8({`lRIpMhnx<^!&U!;%}c8 zhDxLE{}?-m@6OZ;d%+uD)Q3OS$UiH;k}+DYdEkroc2?CyT1u?;+eUNtSrtn1T7wfy9P?c+L+2=o`xTBcDMPD^ zUKDfS`xw3EjJ0qF`kbf`i=L5^h6#>$nU7B$yM8#hAoITJF$ay~L_aw0#!O{s`bj`ZI2b$>2>Floe#pP!UKZ~xlgmt?{c38XXgFJd2KF4#X zRYXfa5H!hJ7!_H)D~(exY1=*&!sedk*sXlI;Im7K2z!esh55*e2`jREm;fF$Lg3jQ zm{N7AHrv!Se}1~4c|5Uu)=>y2mmKq&-j(L!q6_#b=~%FfMK!o;9_&mWtYuA6t+IoT zS#UON$)X{#o4$_P$9bx)+Rf%GY|0ik_|HnqDc!Qg3HR)H?-7@aw{TmZ>+A0SKs~29 zTl90N-IwKtIrM^L#DO<+AH^F%;)|@8GhJsN-JHFw{4sv_9r?)V^3@HT_~dG{2&@su zb-Xr;JzCF|XJ>nb(b#$Fb?-^mR=LnBf4Dv4dstXS`<@oJcdkHyg+X{AGC!!`IK(l( zfT~dY@bs+Von80D$F1i3k7;?i$KF8E?8~6qoj}p-Nq>x9)AUTymMR9KLp4gq6yWp1 z8MQ}JW1aeN-J+jck}dVK*WB~QQ!Gu1R?%t)TyW(*+CE~xpQDO9aS3Y{VoN+XbWFLU zQtR<)yPJu;-^^D_MJDg$nmjG&E|GQ3uq+kFXB(&cc%Q|zDUMX!3+UE7JOK8G-k-un zpr0Nqu$=qg*3uWg&kZ6n^dZ7*VCC9kE>}ED0KO^wb8kIRN9c?^Ug=chnh({~d)plu zD`KClRp#>6bOY8evi5+Nx^Xs=qH+6Y>dF+O-o2uwy8sgw()QtKIHL4I>|@~mM7ApN z)?;r)ASx9p!a8n}hOui-c)ZQ0A3)gbIWk1e(li@OptOmOC ztw*jDp1KN}o<%xMoN}S zYwb*;{bUOsK|~jifLnoA0e{Gl+8;qY_C zB(XeTRgATa;YiI5y=S?`^_1n!-D$A_rl1hAjmyDcbb!r8B-k3Wng|42OJu5goTU~n za%(T8nooZOqZ@dnM*spsXpi{8k%!1~Zx9f0u@+p9Mf?bJ2t;V2?b>01%AKJ_`ks|T zdE5gGL669UMmv-g7EkLX6&&DCnQV=KCAB+6Su4(3VM>{oqVW~CknzVr0R)?5T7~eN z3T$7|X!LD&Q?lyI#RZ%>E8$tDyd?5W{;*w3q(Gx~1DL7Krk5OhssHN)=N$uaOZrZ= z-CN|AF(2uk9PV%RjA_LiCiPx=PBFOBG?w|?9uk#_+(a zV#*mXbw;;`HkY&~MQgsyR6K1+&iL(J?3KfAD&>lCSMWd_#RLAhDbt_@TWaMv%Ea$^ zcx-g92V+VXeTYvBtXH`g$j}(-G*@BB(cH9r>Fy2s=m&p#0rpPeK#_@)3A~>TDbfLe zs#@zDmrAa}^*JHQWHsT}P%iGaMYEB3NJha6F^4IG{sM~=>A&<&rSY!={FkCKMO~~K zSflX-ppQ+zw+SExi@k?VpS<|a^4uL`&$TX1`3wyr)ALtn{`iy&4VQOWY!zZiA-t5u zAgRD&;W5a08u2Z4Nw}WW2|FITv(irqL6(63 zc}?gyu{T*w_oo>2;vUQ7L*mEA1DL-x)HJJ6xV^xl2iZM7Nfl{f?zX9k%hAa<`%)Se zNKASYJr5*WndWp1xSg45q0rDncVR**5dj2}ub{>Ek``r)u(ejtbv_GAz^(Ei`vL{n z7U1dh&#l<)2Xjahpw<>k>+*ov)2b+jRtectm!It>%7d=eAD@e!pap- ztwP`MN>`>}#F zdDF}`CKcZc@B9pjp6Y$|;L_`iwvNV8&m-@Ee3@)@xF<6+yUB5DEXn9G3WzW8{uo*Ep!g4W_kK0I3wiReMs)G1_#X$QKu;W+OjCc!{ei|rms_8RWm9H0~Yxq5^F`$l@ez7*y2n5H{*as zadz6yw4-?+RY(tPWc!U|V0)V)_)gt-1_xM;T_ZhyjyhjSJ<50FajzoY#Gxx3Ww13q zdsBBs>v)?p=e?fqcd1A3mQMiDOv%@8R0ofr1g$rU*9KvN-8$q5GyVi@-{t zEaUdqa8&JJ%VJLxe^LMv_bJMi>~Ni~p%IcZk?exDX^9XVehlcp+CS&}Z{`Gt0xY+w15*Le%;3fyW4@01t!UHvukC3}e>B z9p-x9dJk|!>5!Y^9u>i-+qh5eMOOB%pal5KI+adV-(v{SNvipV;2Sy}&I(F1WFr&Y zY}04!|6;&zN&SQE0gO{>o`9s&H9!xiTfiI<`)08(&$PB5Ja7oIn?nX{EUdoL4Cj%L z{dBzz@lPW2zK)p1AB6gL->N$Eqo8ydP*^9a9DNb%55RIfjE{wPD;BK7VCz6AvIC;j z8ojOn%&{3NMb}n1bQuBC$$`ASr+74f;q6)cS>Rc2%VF1x6LZd@rPPpiQtAkQ^!q7% z=y29r>5LyOM-#$1Y;rF7;%H|3mouUc`}0IVQ_z7%oN$_fNV-V~FuuXuv~B;fYtV?b z;U9SezXJDv`gUh&+^x24eO$gt(|(g1UZ<9{A`Wb$;kiI3o7+@4~l`=zCb$@hTwXH|S|vP`hxL>E6~Ubs{3{ zMMRaaUEQyp01O4ceHhX>FS=aGD03$M~ z_CsO!G5}E1*Qb%a43G-O29-Oj(__V~j-!DRR`HHW=iH_k`CyJixO6p#q< zp(|azSM|=oJbB@QB-3r1O;1Gc!_N@VVDuo2H!3Q$MUw;1(%6c)Pd|4 zLaE1`h(lmDtOQFdu-ggJJ}vK)1-^$#kz>^Xr z3TSwiz=dc;%$_5NP5`tQ@3yfpFIIufMjp^Yco1{Hao!_dxfq_sOjhZYM==N~*kDUr zD+z>(9VYR|)C-jWct!LN0mKo>dZgUaIO66ZA4wz*s_NcJ2SAi^BE#UTu_*umQpo5` z);cE#ZyjK3^MJk@g}6P4KCBpgeoCk%-)cZGP1Ozy9>oY04^NRx(laJtzuDB&Z^|Hm z0;c`K2GN~@k)I!^?BB+iY<qkolbcupVi{EQ4MRU>)m7QM z22d^QZvFea^x69bWkB^STf|#jnSvN%YDh3-TG&i4M*8AFTEZAP1#I*QXg|LT^9Fy6 zsN;KpT=ODsJN=w4oMcyGi4?!>dcW znTuwIdmH7#I6md^^`Xo$MWv@es6GUGLN5JJqJ&1~T?T|hL;sVeDdF!fr=}38mGB^uO_ILIZz_syiZh+5cWKj+NTMAwF zyAT>>0t9Ei&*(z(TI2?F+p3COatbY2y$dzwT^oTI6wtpTEFf-(9aq)9Y^>tB7$k}l zHRGrU_ZzG}V=@GeM^ue>l$*n~EXvotkv;in087WRB>UE7d<$?}oibe2r-}zP!=mHO zjc2T-_aIn9GOUi}JyCCs`Vi33pEpCS6k3NFzn)NY51@-jILxc;D~s(XwY>>cPP_f! zh1h=bdiVBc1cEcT+H3s0zy~!~f|^LkZAR89(IWPla?2N0llfhD!MBhXzzcC}J7BD% zBR-w)8c%N@`Ex&()eWOVH=4dTIli#;cf{`voX!M9a(O7Uj>2bK@LjzRE0RcXam)fp zn1ndn(UMm<+qxc8mt={P382RzFl7Ai%H;Z?1Kry# zv&163+PBA=5U*?wPa%ks!trUIK)uS|*qiYq-(3f%{rrqrw0|AS{u}xjDD7u0aPG3L(iIKY-BbByM7+h{aMP5r% zrbUcQL=$za=D}zV)txubE>WM}bBZAdM_yr!Co!JBgTZGn53ij?75z=N!09D#J;~g^ zKaoYUZ!SMkrsufPhrdfWiUa~%zjVHwqJ-hdtA?yOwrSDac#xK(ia`Apr(?k*~;O0xxE zuN+y0Y}kar!!o?U=j$o!GTYiO(3Ub_Vc7_W?NR;)^UE8`OKu>2;${#BVw2mWC_mzDb7tiOf~yEg|tDAUJA}I2(ephk(z2?TCR@lUu#L9{M%1k+u*_ z1;$Z`6im!?_z0d7DsZr|Sb=qPWAZJ>h}zFV(1pJ5cdTb5o@%qS*sfukbG)9$#=34I~9)l^mkH!~TY_ z8hz5c=|HA#X*Z{S#UR_57OoK1=J#f0Sl$&utf)JU8u!7~MS9j0dd-GLvK<(3GQD!U zLoUfqxJv~Z$xyDItKbI6i-qC)TPJrSet<9@8)?RFH z8r}dYWQL;U)SReDUsIYQ`31x}N|TyEMw>W8`ciUAS;ngErlfHE-lvjZ5Zg6DmW~GI zkG$sndu>2E!JF{QNFL@9;O9TN+e4s?h&ta;fqsYUF@{F==Xug`P+<@ain)qc-ZheI z8zL(e?65V^ORO+1P+Uqs0DUg z!Gn2{=Rn?TKtC(+dn*o01nDY{UO6uYp$-`tq-{#5m(HqadMY5W7KR{Y@^15%shTz} zba6I#@Li?vCGC8+4PzrRXwySb%C~S+>8FxuR6l&E8Y~yOGG5{MK&MqX#e;-ijGqI{ zaCa-%PB>&u1}?d^DzWmZ^E7xSL98^mI!KY(fnC@#T}9uo{y@$Zd%MEZu=Dg1(KIr- zWn{>_e><>A6&)i*DYi8CL-)}mNg!v3`Y@ug$tsvTamT6RFpWVVdCWDmDvw>QDpttb z%WvM!eD*ziO%v~a^tIg%DhZYj^HptufK3MlE!A!g^w;8JN;z6YKv%rlO(BIjoZcT) z9UaJLimn+x#d+m<6b^IKS|*dK*GtlYJU1d?;4K6r>C}gQwM*jiilOv@os=}4Xf&8F zg!6Q|WtsbG)~od*+H!U*k{b3|$+m_xR0B+ zst#QilTuR&Mf)evvbaZ3PP#Q_mSZzT@Zbyxou!^SsEbrD$Pj!EL-1RKg|wffGW5?A zj(EBgq!Ebb{qtXt7nxS`fq1){XjP~nBagJ2m4#1uog7?^*4*F!)gOe%l<}A%HBG}- z=(SR=c3wB%O!@7;rYUH)AQ;sLMTo!sJsdik`KCfuOjlFn%X%$)G_tSv*8{@0@fHq1 z#L3G!tyboWzy6od|A&};JO|%dzdULgNEiMAt#vS*Y_wig;PAZ)5N68A+hS|K0i6#? z2BHr{|Cz;-(P=fW7+a&?Xm>#Ak3{TBv)LbPzR=pdO}xhF!2Byea&+W~O?FRa6Taa- z>w{)PQd4onGuXo{^Z9<)s$Sa5K7J^Up%osxa@l-fM`Wb2pRkSkA(l;Ajp_Us{0qWWKIMJ((+KEHTeeb(SE;u zx$m^0g4y7i&v=6oXvcqh9QxqY1g}zNHlg-eVY+B~%V2S@5`pHdoIa?^dWmaI@oRcdvAb&E(FY4 zd7{_S+DhaPTtK6r-jwp%s5}-Vf>5s^Jt;Zax8>h&gO9AGS<)`*J4;7CY1G7%fPMj{ zj>Elgnis}shKV5T6wt7}fmUK_VK+wMc4(e|{o`dCeKnM{FkR;k>zmBjL+7qsJ};hp ziu8t?2ibBmyW83fc6)yQY7xKlEP?|j&ITl2F(+TMFxsYns{(IPYPQk2GM>dATAWx$ zU!T5-cb9EDf-zuFvLiR8Db_aV!!uQ3zaM<`<-F2WkZvr|4v&Bxly{DN0wZe)sLdOQ<)}srC-?7n!?baTaxOlvXC1Kiw74GMpFaz`yCA-|t@^NrP;MFbg{U!-)A?hlkPduMNk4c(uRf z^}Y#pfBx-Ti23)wJY7i-nfGJdrBNN>e`@&s*N-r?oqie+^S3Acr&I%(^&Rn_TO8zr zfY$%ZoBsbU`u|!NUAo^xOB<+iV7-lu?9g$C8+RNGZEP&fjSU^ltqI&t1d~fR7h^+9 za}zs5XO6q})&w$lCMTSoJzQlHW#VDtWWo`Qtxe1crf_Ox?_h@= zchDm~&qary^1mKl82(g2x>R0_H4@a@m|QgKmOxX4qs10Vr(X%*6PS0fkQ+zRFgVi9 z7f2a)2tTK=gnsmjz~?6xEQ!gTY@E4|?JZbwPj0fJ>HLD|kDzLx`TA-*dY=)CsX3a6 z_ZJQeiHdsOP!e4{yvgZ(MV7&e-;^EAg#XbWq=_DxL>JWDLXY@89rYi3qKCp-7WdC* z?FJ9HeZ}G{0~2h}R6+mmSF&s$FPINccCPG7@hZwZs(tuy?vnqn={n2Yw&7J$E{`vp zoHk?cOe};QAL$vPX+lp%%eu6M=AXCU+ZHRjXvazzA*f+ib~F2s%NQf+EZv`3=Wmbv z`4~dQyi@QBdSYTC#5H0+pxc9H#K{SxsY2*~UXBL2b>)YxVtRWpgE>mGS~Di*Ihw`j!4F%9rc1~% zMsV9;{Nv6K17g(L&d*%W-q}|Y)@UY^X!JV&ms0ul=>U-*p9+`J1l+*0#u2-nT?I53 z|HD@gZLK{Uig2VSnuzqQF#DmGc+vvUU)~(_$8Zb3t+(W8+xo+1Q$RlwO+!BrBjEej z2@gg~q7h5LGTnaWfVGoOdV#-#UgWvRKYX`O0sqpKh_roa0V2yr!y^rR>T~|kc@ii)oXdygd(+2J2vR z)vSTp@T&}(j47&d_I~ZHzxbuJm_r-APv}AY;{3exBT}zN@8%J57~{v{UwUse_siR+ zIW|VP{4C3Nb&3`sHMh|KTA08T=~2}tLNjPlf|`uWl0CVkR5XRpnxB294L!u5$%2k$ zRRP->SR>NIgCYhzwx393)Dd3fHm zSdh(Koa-!Q*U>`zGmi*qlnD=6Szwt=e|95fU<4*V@;y?VDv{Wpocew( zhA3{qnM^9&A17-pM>?!i`R-SMX6HX&7mB9wZxjgX4hY=VGtE*(ukfJBJWWZPmM^|x zvA>whu-J=e8fl30j!66V{92W>77O2)nAAkC7_D}=XycS`H<1J-?>7~OTC9U6X*}|U zVT8U7K9Kr~WqQK!VENJDb(Me+l9Uv1yWALomVn1%i+26T--g?d*O)GJs7%&PNm7V8p*I}8o1Y{cj9fIactfmFnXNGwS{~^^q;EC)9D=3^ zEw5crjN={U3!M6eb=Xow3&8VPNtB^SxYy_pf9L}XT~dCJ7Bt1Mb2Ydzi5Nu&;pkEw6z`b79`V zGVxeuv@w#(P-h-ND-J{I3j(4un%Shh89Qi>r7Iv4)h0)WkRp= zG(KAPNho0cy0JDqh+Yv{v8%>o9+}zSs`mY)GK)46kI>}>rRGQcjqQniwioFyH>fmk1Q^s_hTM5YswnweYPX$9T z_+Q21xt@Hzix$T7W&6?tRe!7zqqNr#G^Y_3&s+_HaIzI^WG%TlJkw4LtqPJ?ztP_y z+8vMw7efb&*`goXK7r{2F>@ z@{&{lmML~+2|W}{TqN~u42CqJ2z0EGbRQusR&W?gV30)T1|DNxj;-0z%Atg3qUeo6 z@@R>8wR@yi!)IWbT8J^aoRfhH({n!PMV|DxXkH#QpAuOt_LG}fpP`p@5dUpU+2Eav zrf41^blCCmHF#f3#4@cnrB6h&m`;bvdAvby<$G4JJJkGheQ9Y-Pw|BfdJF%CvqB-a z##8@9=1*pvAz>k3FK6DuI(Xks#M*sqX2IIYVja+oyrWLqw+wBp`h7|FC4Jv2D~dqh z7lYS)9{A)vDl9R)_4OrKrX;qhek_wXsd)v&J~<9N#M+@RA@C;MK6}%9%8c*kn=P!J zdUx%0v_RwYhUj%XdWOsGEEz(NFc+~7De}w8s1C>@_^euq*f(Dzx$mC-LMoE15Nt=nRTMr(zz%f=!Zi5@yR=^dz++&t$Glh5wIwW$T_%7h@Z@WvB1ANi|NI zT=`92Y={bzL>X`WwpJEHR;2j@hhC%C1&huU&xF|G)X1E{CZ4Mtpr@^*KTEwW@mkS@byL= zmO!c^VXU1)vvwsrF|8^ZeUiZLR?1ME<#*+si*BY@G;XBpa9rh_e4&trHIkYWd9EaD z#`zGe?XB^K-X$;}=>w#*@P~n&}?pokB-4_`7vO;d<6p1E2o` DhPovE literal 0 HcmV?d00001 diff --git a/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png b/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png new file mode 100644 index 0000000000000000000000000000000000000000..ade1d602a918726f5a407f9422eadc0b9c25639f GIT binary patch literal 178116 zcmeFZWmwc}*ETL8f{1|%f`up`0;04Cf`W8|AZ<`XDk%+u4GMyENq5cAZ4kmR^hl#L zL#GV&u2J0LzMtcH-~Xrom-~b4eT*{mo9nvPTIV{?b6qc#6eOwk((m21YZsNYl(@>S zU3=gUX&5;fyfQj?u?7A^WTPT^V^?P5iSb>#7(O_9 zFI6sxsc_wl^*pcoKJd!kGcrt!suWhw##}S zF*(Ek`eR+&L-c|o>HqDYG5E>BzkBn4|97Gc?~MPiPj`^~qNv0LMe_fC>W9ex{&P+K8#(`tod198>!*au_hQcs8Y8acns;)oj)Yqc4b=sx zWa?Z=(JL!H!m4ua;);0*tB?8Tu{Zw%O!1QyJHIy`+@Bi?K2j9V)$Ia(w zpWjAf*+&Jehn@$rs!A+RbsO{8cIR7A(C+$JJJwA874gBFq9xy=+kK;qY*@R%l0)&@ zYCK_eP?VSGJui`;5PGEKwmv6$<-vf*Lc*5w*5=0iXtev6<~YgW*jE?drfVW)UZ1#f zPBBX8?NtZMx+vk2c-i2K`_k^p{2L42ll2faHSgtAf3^M4M5Ge0Pq0P`+HuXZ>J&a` zYN?P2_;iQGr?ekcWHTx=SnW%q5JS`+CG0Ho?!v87{ZIGe$v&@4cAnMDHF)^=PS_b) zv4OS})%lH;{;PB=Y3do)n>|G~Nh~4MOfmwa(zNH5D(lJ{L%EZZ9FKYbYoSEhDGy5d z4wuw)9r^`ZD-SgdRP{RabCfvD@KLAeSCrdMb+sp3mD+wdkGiS}=cBXSwFF1l1ONRe z1-*H*9p{InSxD?x=ZB54ah!X4To%xc!A3^7Xw(ndCgZ#HK^(c(!!q!!FBTH6FF!z? z5_YOpRTMt@Iwkb*zksr5@=*pq`n-atPtN>;#U3e@-#@>KtbB{WwWkHQW$JdLoOKpI z)~;E!aWxIs2lGb_Bn0cdrQy&zwo)+lwZy@SPT;|vrn_O0y~)aP*<`jBSb2AMQWVcj zLnyalN4h4L`=*OKGjGIAcS}>zrN@*PFP*r~{~tR9sN2-MUqt=)$!DBjU%=GEep{Mf zwwq8Zn#%9?S*sC~gf05494Ap{mQ`lN>lvh6zS8HMQ5|BtIbAX{^zGSczVV4gTx-%B zgD-(~$s&=l8UsJ(26IiC8AEvxZ{@=IIBwBu<(q5LaTz=}Xb8D{3tRPhdmH`%>Ce4) zdA3QNx1rj8Nb)p~pz!sHv@D;j_r~PRS_gNX9jqBFeT@$14<2t#(w2Yl`KX3~XBK>4 zZC}a`&3bN)OdU-=i!K8pgJSh${OF?}3k}k5>v8pB>Tk=g(IObRqj{oNFNxWNVcTSWW9)XkVxRbWN zgw;^q&#-|sD4g1So{dGM)x06xP50Ne- z$RpTdI9+^B32Wh3W}?3@q|u=xuQ~*kIxY}D`|F5*BCKCXp*%bF_E~#OG1*(9FtwdNq5l2stuARvi4d=6nkTVTP9L6lI zF~JvL?j85~YZc?e;IY->Mn7yP@qu6u`8ri9dmV>fJ2zcHqmioG?hJZ?2QiD&z1`gd zTfX#{?|BDu>R*e$Vms@1Ml9ZWWjbn)LYOep~O- zF@eYdjL24?pxs1mqI~#w+h^0=jD8cAfl|UKmmiX+T@A{H@k_X?BHbo`6){oNUO4w1 zS3eeSKS<^67^odwwqr`z&0DEz{zN2Xg})y~E6f4ck%PgJ$;x~fmLZ#R| zO~iJ@TqhZCNqB|*=5tk26se@|G@jticg8QQsbH%V-T$!WWK_M{n-f=5rps3JME}YV zp2u2ZNqKU8}Jl|{nqmk)dCH*#mzeRrSRn#Fj3g}d5=u`2=epDrlHc#OH@ zG|m&Tx=uscV8Pp*y?xe?{%C=SDb1mYSEukouK4jJ>*kxZH_=hI|At{PaIzft|2|iV zyPcLNS!c>tqHw3IM;ch`gE<|3F6O+w1}&!^I{_pVRI&PWW#9Dzu+Z7+a@Bs)FW0c% ziQajdK}2TXv(o1RAu*~+3bDgds%ePOdFSb(8j3|6hOeUAlVp#$?DZ^7B=bNUsmg}> zo2k_>9t0!O7_%R7SgTJ1S;ng!_VlkX6n_O_Xi{`Ion$+Er0;%xTIAx*mq+~1O1>_9 z4FDc5^Y5LtmLGityI*b?=L(ITiMi-8BV**UM~!;TEEN~ z!+BvzIc{=3V+vY+S)O1VhETT#x_SK>}i?AgQ< zS;X=}@th;XKiPp(ldg{q&ZcQ(U4&!108i-OZk_u+|IIagZJj~<6XvGao0Oq^7Fs3& z$9y`t%AE2u$p*5DM(Di`ZQ6HMxL5F)f1NFDP@@Vd+r;An?=A&j9Wh?Ot=)*`6mE_9 z*9v+ORCiAF=5sRIU z1!_e-z^aTIrqh@1eIjSoFL$Y3IEN?w#y;gtD7m4oTIDMp+E(&*rfAdtT*Fr;Pg65F z9Hx6nl~gw}#Yc$&60_ooUj~g*<-tBFTr8Q)RZ6-m^jC9uFc%)rk0SJU`BxFU168z| z=nTh;D|bJ5x7{{M)jJ&5^rQYA8%uYA)d7pQ261Hu-kS>;k=W1|Zah>Egk~#A+~nRh z*(^?VV{PmoG6+yTz-H<9Us(yvO{Ca(^m%<`X?IdznR9*Wtu&z-!%7dL#(Gkozw>L} zZu0oV<&y*W-|1dD#uyV7Ypf84dVTV$OaQaITygcS)3AdI==3k){8=j$#6Ia7Sy8sQ z4zy`sR>fu)>&%3t^VY^n0-Uj*&$I9JHJxJH@$auZjiRruO(F@A{DbRs0K-8J;)PL+ zgUq637pl%lI&dYA`Tk$0-dh?DQ^=iAn(Zz7z!H+e&&V`OpwZS zavOd__`8QguDKTTSoQUoxS{7;L`~s%-zXV;MZ@v7*beuUPuimy7R_3Z^pZu`^$Dgp zwuIF^RP^WGv1#y?@oW&2k2$`7viIoxWNW%6H=?&Vj{q1KPV5tB2qJ|#f0mN=V&LWb z)wPqINZsH}PCp_1SX}1D*bWE=3+7Egt%XlQPj4@i3oV2$ivbiCiG_UgP8B_uxtnq2 zb8QrHP17UKPP?54z_j1u?XI{bCXZl!w@#C3z>u==7(VzgTSiFc1|3OIBDfsq4Ummn zxCG5)rvo=AK4|IsHIj?JuY@CXQ(wK=rqrF#7PAhm$hkf*NYXrnQ(y^fSZ2w8qHf(&W2N6j;v_s6gq?PkT+ruKB<;Qm3RhdhleL%VogDF)gY!xKlanuJ<_)pO2R-97f= zucJ#KkW((eS-G`ZB(NRnU-y)AX+a&_%vrYh z76pBRNfzeXS_7-qxba!PiKIjUW9$T7exgNhv7L3NVQ{E@(&@j_2jM`(6H3{3sCxHax@A%L>b!qyG;*EmHqhs$`A_E=0%TA zLYzhKhMt-cI#C*7E8PnFsV=69_4!xptE15h5@nlf0828h-Q16|s)*+rW2P7OhV0+| z&hX2gW7L3*bXUCEw^@AAgPz;%w671L67|=|((4MmyL$Hso~p@Q4J|`O0BGTDX&N)7 z^(O=#h{Ew2#&sr3(>Cw2$TMyKP?qN%+RyCg1c~cs(y4yx%jh9GNj~{CM@-_)DUQP| z3aCu)7?JKT!JM2<`H#MRcyiykHT}q+XCe|KB2Ia#WGnV7nRvF*KAQVXYuHoxP=Ye_ z^8Kd|ms6X)FSkI1TDAc@6R-6C#(VjD0$N@P(GUxcWoye*Y)~S(BR*$_8+AP`G0*9J zd9S2Eo+0mc&b?}pihJ5jP8wOE_G}uNwp_SHb^{B^H>ZwLG3q)FziXu22Vh-e(bD5x zLUH9vzx&Jy&KL|PMKx&%T04afqn4T1Ycm!4xkl))2^^1RC3KU9bKgHV7A-(}=BNAK zB`F)pZ_%X&z3nyW=xb$W?t(k?k%N?gobG?Sm?NmRCpr94N_dVvg^ELZ!3#4@{Z%4ASSh2&*MXi^iPetE)kjXANcy-fF zY3MwBvcTvQ_3$KGo+OFTSi+yDhN1YGylbv_sdAUq4_qPvJ7VCVG;FMY+lbchf~hHr zWR7x~myVm`tXR(_6ozCgA-fOf7fVt+X{ZXtQxVH4WR`EWk`y8hhHCxqmbhS|goj_S zM3Hx|zu?P}XIM%}ih2uFiFlxtS8UkzB+U>woB>u98|wL%y%THnJY!$WU7b?^@Xgpf z6MS#fo3hsQUd%V?f@-1lixP zQ)VJD#g`O2&(>&9Kv-q zMj=u#s{Bpt)N@bh>~sWe#|99b?|TH%)mV02Jvs85{yBe2L<%cpXx^nFzU^dsjKUNsEE z6G~bw{eB}4vhHvm#652B^6RAL#3^AptCL6!etH#RjsUtMV#a|j{GguNYWoe4`@_af zbRcU$v!BlSAD#i$s^8fkcRS8 zEis!Y_BZP#>~9nobYA%eYJJe_z%>(LH^7s*ZW~UH0R1v*y7o8S`3Tt5>Z;Es5n(Od z$a0okTA|6P1FbwlR361krAKu_py~YaYfo7yZ^YlPpbK0URld8}Z(px-S3 z=vZyV_|8cDbvKpcJSKjnusIzI;-;cgU3m!_S$h7(Iz9R|i6>A|D=9=u=+w3aQQox0 zTK}`wi(|^8iX6?(BMqVMp^?{IwNQ%-z()AUgi9VSG^(IrI-U^~?qi zlZuk>^Ut7vd_w6E}@kY z<>=^yooqvl6iX2nUPqlqlJqq#n=>=6M*w$9raNLB?aHi`kBuK+&2`c<%}yN6P)}C6 zZa>p!!Ig09v@quapxb=Mp8;d zR6eGTf&uxC3r2O2ccig}6JD^ns`VEM{g)}Du!9wlhoio8DvVzzK8)Hbe3`h6R%sQD zG9qbSKQsmiY2Qh~w&=R=<}~OD=cRTHuW3`*(pVyu03(&KyzP!5U_56IPtRvQhT~ma@d?V|-zW&_{u~he$pMGy-*KY- zx&Vx*uI+!|y1l>T=VMNHg&KuO;Kim*iY$&jabw{#|2hy@jN6fY0$1u!$ywUex^!&O$`ce7Z>F`&ry}$nhmRrv!8B!?a&5*ljOt_q`Yk&B8m&$iC#% zDV){Uc7m>F7^*TAA{;775CBwq^ktVHvmTm7PL1iCr!h zp|8_4-{a8ly3$G8MnFKL!)IJItJ;K`?0Am%%3g6Tuv=L9Q`^YB#h~R9Q2R$~aXHlV zwp=T}Bi{GqRdt?Ze-o;>rkTQ^(@WO*mh-vg(rj03$NY|5P&>c)q^KdLsw;>?TX58v z52)xLPWP*QUQEn?tUfM1{F?sJm4bAe3!E%N^=2KRM(oCo5v!BIQM8T&l|-Ymn0p;9 zMa-QTr~R=;;UkTaSI3HN%eyU0&?Zw=xhcH$m#GcS_Ut=0u(p~e9WDah&ACzwVXk&5bp~iMEuMEPau8L?^Hd`+(3De?E(bK3QaWk#A(jROM5pICZ08} zE{qmL)nL>rHXk0MOkQSJR9y;LBsi;h|9vQ!wjLjxYarauW-@cC{41UV0aZ=d~DDk;v-0=kfeM#R@X^Lm=pI~86nC2zsIM+8y6 z--iF_4U#u_A(jIp^!hzw zHAmUhrIsfDoL_q8C7iT)wZCsY;t*s4wp&Dzf<_E?z#oqw`hJ@s3?@gI@a@w5lY6;&EX0Eim!O zu@wse%+S94TELq9thx9?m;=Up zLKnRroW$8TMh0=TQ75_ab5z_euY@tfxpjSVJ_ix?+W%`8?vy-k@ zl03`t_|ccw7MsM}{>w` zHqo$?KpY1!?U_P4%;*)3MmY;bzDn)T>g&u}D7E7tbfN80s7-^`4RG@%%`>u9g*=Om9BjU65r$)(cdpg|K2WC!@>iaInHb-n?)G!xFTB zu?gBpgSzjCxKa8#8_sBOTBfoVQUF&J(S# zi6(GvNfX3)#ww_NuCCv{i)^jFDP3s1DzS3?YEdM&rO=_1JJhG=9>6q_GuTjU7uid+ z*;Ln79rYqtFQ^uB_5Tb)HB-M+s|NmaiJcz2^&!Wj@Sj)*-j}abZXvlh|rA zEfS7_FjY&Z!6yX89ge|?CAqewC?}c4 zh=zD1121wZr{)wgN>SEK-+3ypgNEM2c|0LFz2?b%4xg83pe-iO0%T=GV!JzXHp71|ziK)2_8R8sa9h)A#!yS@T>> zD18`*LCYdZFPxW9Ti>*6fI+T0gFrV^zLvQpPXDOszi%7qLo@gJvNAR=M zb5;mAuWqi*7=C<8#Tpb1;%Ty6$fYx%Wle&zjT|RtzJbUT|5Y*z=tr|rAFTrF(L~QS z;H9OYJoe_h_Vwe0^HxL6T|Kv`nuLKCu3J)ntJ zkGihjI)&Ma^tO`lxltf&KN)gZeBY6CPc#2Br*>D6do|#iewi z)XkVujO&RYYf5LnZu#>W3qP}7($IJ1HK~rUkkExFjTDVV ztSxt2C78CSCiZ$Qw=yILF<~d0;v2f|kStCD-=HDb*@{OhVoqnNrMxBBI6mLSKYmee za<7GlZ4Iaa@~&H6Wgyl#L@HoTbQt&MBSH;0f=@Ii5=l{YkU71$PV&+T7@tHGSusg3 zNkQ4y2ukN^9s0J8U@|V_#z|6D-d!`pI$DvcvSt+koMSmx9wG3~%IKY1^ry!Vinj}8 z(+~a3TArkbn)cCccyyr<`Mz!pAyNd4mmg$g4z2Z)INlRUCQjgoc$Y!KRVjCIO!FxS zym$#521Q$&YpSKo1XmMCS-g6|WT#OLOz091ve-OSVZS^#53iDXu7Nax1@3Q^YY%^lEFznVc{vFb0D zuroX%n?{{OYPY@})++AFzmLn3ZxKJ_x7=(OWUtSu+4(w=9OC^5Xc8kTQ0p%7%y$^ zIRFRbbp{}~wmT;0nbb?{a3c}IV+n|CX%eogu&B;L`zfu#aWPg7gp0qxR5Fnx$16nF z+b_msdu^Rr=`5V4Os0*U^W)KaJZ5TM3W=I!`PX%Y``f2nngA=yLO!twa@FgXdiSkE zB?BM(+QOzjy{*9ko#_53-}ye>R(fptW4L66*d{su{XSc@CQaKVMxh z^NHc0MD-iVN`g~rCGbTGlb!ryqAK;|(e7*Ovmez*$C%Jb!$$~J=g0sE3C(T7kTfFmAKUb6%UX0x?~(JisBgXkVB6{Tj6!wk{qbV*`vgH14drXIWC zy|mUow68h-Rd*D3IwuF%o0tZ=c(W7fo-vPlk%fuN7MUelpX<`LDr2H=DzE_Qn+6?n zpv_hxvOC_QGxzgq)Fq2=``F#Ur=Z|y5uV6dY9iVf&x>dghXQWWvSfa`qOm)_I2jXZ zfuUVqotN9;GV)*nj2iXzAFH1mn3l$Y9^{Ks!Gs^@H4S{Y^=Rz5@|eRMR^L5u^N!GX zD&+MeRE^$Ui@QAt*wr*FDcVg-&v}X$dl_2RhhN;o@_}_1G#+f8+|Bbmk*<4 z#v!0^2A5AePxllNtZ-8xttbF;&Qa|`62rdtvR9HuoPuc(FoM+5kZ7KDmW&? zNZYEX@bZ|U#h2q|s|(=Cp|iRVWl$wUi+6C7`Q(zZWtLu*$z_*{Is=rJ1=bV$@rZ^~-_dS+-JE*6bcy{``Y4-nthbr;rX8-M?(7?(dprd} z*W6cn9enYoNsbF6(s2^LK5Hd*U<1ll4dVx%!BUrG)AhHLY0#qnvsvYBk{UjFec~?2 zy=cGeB{&?pJ>YA*8y@{)2P!2`f>QWkf{TB9X$tp_?Lk8la+q?kdd31a(h4|Q> z1<*C3#cq`5I`Wyzj2ENRj7h_TZc*`ILGWQrka=fa61cEJ@3L=Ecw*@@ z6-QnMQ^DEJ8>_27zOZE#8cBB(W|wUY{8U-S!-zSs9ip-obkv6b9ZaZd5Ou`WGap zR;f&!)TdU8-OUWGskRvRU9CLRVFY#e{HuK0)$ycgaoLKAXXxE~k349++b&X;5A0oC z+?#8rLOR@Rr=7;mfbn5H%Y*E^9Ld7By`O6EWlia5TvkP2u*DHJb-8OBi>h(dU#l)Y znrLG!`u6m=Psb5B*r-=APr^7UQjo|qUpXB#<}7t}>aV_TgE zGzL1e8Xlrw@?BX|i$?CIq&e1Wy0MW^Q6m@LcNiU_a6vNbs0b?cW_*W{c3zeOoGbM1Yuk%9JujY| z7IQBe`hz^HO(N>cr&H`qv+Gd?i9vj2lezVbd8m-0Mo`Mcxm7kyWP{lK%`%Is-=v7k zW?J?Zx7%fc4O9wbNY%+C;|n6ILjjpK=VlI$FH~5_(6Q*J1y%8K)Medbk*h!90G{c> z(%hhHrv+`3^K9mcoFvx3Buq0eBv=I|j+@a`0r^l-p)ANYWj;G_I^V6x2>Sm~Lyrl% zE+>V;OipU~iZ=vfo7?hD;&8J+fY_Vhtor(*=(j9KdblaqM(c57B7)`_qQehe+T2A8 zZdM8IK6)M#2Kq-UEtbXO%%K1{2nL7xS+(PFbSYt>rY(#6%D_M!ZkvGFB2|bGIK$uf zC@!Z4#I$oZBMsFXhZ6>Y`5OX5@IJ>9u#ml@=QZ^fQfwiGuHZrLYL8a7Xe?RAsYe4u z1+0y0&K#U+A~uVd2mHd}>h9_xWC8(Ml3v!JnsaF-mN(>raUiao&-htbU)mko4tH4- z>3#q&bQze80JM#;^^jE3-s{;$;l1e##CSr9WUek7CZG}I~ z23!dWn$ts5;B9&fjw5=sf2k^8FS=lIkT=kJqHdJXxAa)ob4Xm&%3b-{F024P3qEvK z;utz95>DJX5D6)|^rJn2@)9sEa@$-Tt+v^NOU9-{ty)kKQ<1)_&up|E@jt1%SW!45 zM|wzuerY_SjFcvH6=0#%JakTr;po}4nvdfrWa?y2W_dm~tPhf|_Mw~NjQVnR1Rc%- z+#r|U6C;-KJp(-4o=~|JPPzTKi)>`f1-+m@x0hH~jl^Nk{u4jdN_~7qfIGn?h>}Zo zL(s$;br6hb*6TkWUU8g{KfGQz^j3DLMV8Z!xI)_9lrMvfGOHYKS~OjZ8|pI4dI!Yl zlZcdzVM$Oi?@xx5#<{(jKt}PJsbg&7d6Y~#r=IVgg0T;cS>k0+n>LIB1b7U4IDfDe^>3qU7>Ru7}1R_N$7uee9c}R8w`XU>P z0h3*Mvi8$Gh*BDns}q<3(8>f2J3#iC3Ot`jVY>g=B~f4)2OXu|f*0krYC*$~hwLta z+iRbQNM2*p&QITWO6Y5_xzR@GETsA`B{Su?qChL^=N5dauUjWKp)JPLs$CuLL|Tse zbOn7?aC_*=Dq26H_n>NbHI8d4`MOc-+j*x`Rxy)8O1o$G(L9t3avG_-K6=fKKEM2|MVLL2&aQrLZhpQpc4| z&^2p0Jp$5LfkLAqEO_5Ch^&H`rwHe<_s=32tg{n25B0dN^d_oCVKGB4Dp>&)poLa5 zC3zY4Pj@0EdCb~TtGbC_7mNQ|jy}S2terG+|FsE~KA>7%{-a-3gJ5*(5LM#SwDHa~P(Jy?u6>my*Gr{{w^gQ6F}s4G>X~q? zCV-iRcY*RLo{k(->)`*XCurbM;%S7v)VJJ4!m;cM-N6SQ?@kU_jx$C%34hd*W$8Y} zdS@^>G!r}zJoGKUAo9bP0IK8ch|xjV$p`d1`cLv7kf_`B6^(wM30Y5Cy+_xp-<-C( zGST15i<1;lYCq5cMTc&_)7A1-a|4&KnL@?lM>CFs(8UBgLl_;fD_<0|K%c*|^3hya zaS{Xwhw{bA>j6*wU%agaBs3K0y)OY#d4nX^Ny31gKiuY&T{MB2E5%Qw<&+LT&*;sU zLwus4VWMrBloqb(${5B(tGzgr@!Jc~uNtH1-hmS)H+Cin2Ozlc7mH_emlS|{7k2U# zr##zrwG6Fxr>kM#tZeopr6p3!&&P85JVI#M^8>xF_mtkZ*_}3o4DU5%_q(SsGi7l# zVpC5y>4|>51Gh8Wfp)yT;f3DzYX_N$(A7=U&Tf1hPm)fL^g|&~SZ*Zf+N2!3Qi=Ld zVyqu$fVTFq@njn}d0tN4Feb)zx=~A*RQlS3b%6w0uiL_voUG z9@psN=m^xWVOC_nQ8(;}KlFiJCeO7~x*yfu=`e2tlqO)9YS9hq0_OptS&z9(RK=-g z*@?!PP87+&^J{hGTbPy7WS!)VDMGX|W540$a7BH1tG7dFzO%P5mL2_Ge>sh3HGtf! zh-&yPSO*b98h;DgCe~VQ-4?qmh)jmh0svql9;MqG8s9DODR;jznalkULO z>#Nh*f;QYe!zzPSYAZ2x{#xpRG`B_TzfVxISA_+OrbZz(p}ER##ZhwOH}1Ej{<4@; zeqDl5g2o~={jS%;9WXSOaw{z;iDOJw1{@A0bGKdVlA*i5zUB<03e%=A@1J3!1rwza*5Uz{sH+JU zdHU>py@T*|*Z={|qFgdI@2JXbGbvs%*m-KHlb7yzsHLcUZarnb2ii^9MHia@nlT;A z?k~hn8Q@Zp_Gi2)cTx8AHo5#7{>H-yM@WIW5Tmm?3{Q>P%|i(W zIS7KSJX4>|=(B$Q%S{bN=)R>E9wh?rRASWZ7I6L}}q z2v@DO5O=~7V#NRy83}EEibq{!D!hfs#*#^DH?StkL6M769s~=k?_S%?*}4E0CTP|r zY|h!W(7s;3cCpHo$aY)XA#j8Y`w1!fo4ldycEt+wa3jy4JCEAh8*i(?BC+KrsKV9q z_9<9|mlux_JgY50^(o$uv-6AE(k^^(H;vo10(?uOE_b!=5Owdm!>AcXrraL?%i+Xx z_LOUp!KdekBwRNpc(%t~n{BrdXVG#ycVV7TrgUN&&fw5w_JeOX(tKJ1gA$oS?M?+G zhA!eC-^xzxEaU4;NKr2CVRja7!szwJSuk^JrZ6kgx$cl|FVVtgS>9CK-azsI52J}d zcFmRx1BVk6#L=ia5%SMt>)HW5QY6plA6!N|$%H8*mBCxX9IR`v+1BG;K{F5FtqwXl z-z2l{{0x(2I*x2n#@#AQ4WFTzQD4LWf|CXlMCAM+p^gB{brTqCc0$f-Jle69e?|W` za*9VOg-NNm3((6DLDj{L;QCKrwU;~PICN+*W)uu9K5kD56M3iH*B=f#y7mS>t3$=- zMt!3*ZADhfb2G{7k2PH&W**Hij=X<-qL0`&THs+U_^_%?oHq0x0U?9Ibwf1-{|&17 zgmfg&ng(~_*>|pHc>;nvJ7X^bc$-bkG5EC{-<25x%|AR^Y-;#2WN3OBzY?iXSIhj0 z9g+WN|EY_bxt~~BOuN?ihK1;H(yVTcS^ z1ID$xWjD9h7v9)hoP$o4L@x=nn-(Y_*_mgXh6pB-J*Qoo#<~)#L?S20n&ZIUZ~(U} zwE2e1bbRPX{0)g@dF~tX$W4t5tzpt>`7T21KyZV0-6E;NBRP1rz^rWm15%m69o&x3 zsE*s~iF8v}OCKf`bZECbW4rUp8P5e$^7whBZR9H2M*h{nN*o>g5#=X3dh6ce)&MIV z!F>g?x6!S=%Js~q_PRoRXgt)akjd@>%p+$);&)k{r(01(y@qzj=~I0}7rj}Nlq?U< zwIP7zjSj(0%$*q;AEq33`MyLq>&K2Q5DZ(O!SpcH2Zy$75%^L*+PfCK=>IjERiVvq zykTn5y67y=T?vD1_qf?(T}9R=xmoo}AJt;pQul%|7v3xcdP@KR^W#hH3RR6>nVQH% zo8?EQylBx5q-*bVPS9*S@@@bmwKX(y3rrI}7=%2>>CE3l7!a+Z>;${hlaf#L+jb`g zq78=2ydsM}%oEXWc-GIucNVwUfw1mENne2CY|GZ~gyTqRFcRVyR#dMS3d@HBDYG)& zD__E*Q?We{5qqyc>}G0J%KGZU)$6Jp?rGO+C@&T=T<9<^Cg+IjPF6nKJo&*UJSwsm zSn5{9hZaR_qGpOE(?_twO_8QpKnvg;4+W)q^jTNqY11u&;i+I$dL_lP^UhDeNhI3M z2Xo1Yui4)!o{m%QhF;S~@0=7mak#xV*U2lgwKqeXpAMLWC7MOYHaP_PQqm6g)5F67 z2;aV`W4}ia?ou~V=i@!oy7yr37Q3j)t-;t!9r$KQGIyW#zazn1Wi%D1DLU7uviDNq zlqPrS_xIiz-Pg)CH-P!MpykH%R-Ri9aJd2uqh}MQxfPn)M&w$gGTZC(P(2wzXWOld zoi3UI#T9Qo{6#Nu-qk0nIn}dI6m2%kiMWQ2@6Wma%*Jwp021(AwNTAzTV_P5V#krO z82Oe3>mRds)V z1F-EdxCh<=YsS)`@v?E8*Gs9Mh|KKRZoZaE?4%D3dTrTsiU4)TDC%x3*M3Jl4R)-F=#3=km z)<095Gko)9BcJVz;7JbjXEh}aJ7EmA4${@xN5gF;fl(VNqr{1_!Hp(|jwLkMe*Z+l zlBOXmZEBAsQ-CqcvtXaPtrmF*VH+Se*_J4Oym`USq6<{0c*qNUV$`WKARSBvQp=A| zegu~23&gA;aLnq=Vb_xly$B8l+(1G5lQDA+KcR3HFIC`ytjiaNeQukO?Aad(lS9Wj zZ$;0YxMF>5g@619c!8uKy*088rr%#$#m3me7^Ga{%c5&A1X;h%oX2JOMF8_gUE={; zO~$AWPWGr?xKWln$nlM#oy}eQPY5u;gh&u@0#`XHO@fMp3@^esYkO!>bFoOr?*l;2 zaIl0>0ap5QtbbJiY6Ve|+pX4NTJuWvBBp*fWc$YV_tQd93DCl)qS~z8MjC-=^>degBZ|Ifvt3b(TohfEKLhMwOFED zXnHl1im`CN3TCOzBfxEDxGtcEBGafo2wit-65!`cB6C;GSSf!-1)BTsZ#;<*?J*42 zmB?oJa{AOpXl8{o4@kSWZL%ZGvBg_IgkQ@=EM26ewhiO%CsYrlZ zbR-bjkz2!B78r{~G{-){(IMZbKilDqCocq@!S3+)nQeldftmE@7~`1)qe~`Asuhc3 zSE!7}o}ut7_yps3mzt|l$?ERC&{xox$^N)Es@%X|YjrWU^94XrkL z{@!zNG#uP$Zl;b`pcl;@gptmWMmj29@yw=f(Zub@q<-{pL{!Bh&HkC}s3YYePrt55 z)z_jzQ+Xc6^3jYhz(@(f$45m(dQY3deRI9~7B%Wzak4QmDY)a#CNrrZQ8Ri@R40_Bmz=xC2i|s&;&uCt8POyIm4_7k+AkU;ckoh^MDw9j`KGrb4IF2-l_FF(^BT_(9fwCD^9@* z6CpZa%CY|GU6*FETQYydp`0m?@!oT{J&#&VPV~Yl%wEy_6D#f8I zd_lbv%RWEs2>@9J0?gMdCG4}{y-&{tI)%GM87}WOYK&loA-VMC5nd8q?q&qsA>;Dn z0XR}*#c3U3F!zUw`d9mAQWcAx7ugJeNTeph8%p`lhwkz`cJK$xro{u-EkP%azq>(b zF-S`8y@1pqRmtr25X3OTMEa%sqLsK#?B(n{cMfHwE*Kz=o7EeY{ps-^G6FHJV0o^4 z2PmtQARJ4D)kfH*-6s+aUBL`}0nrkl6Bx<~AVpLQ2 z%0&oV&g6a$_W>}$r8rkAo~bOc4u1drQKI+Y?iTsUHO%y%gF~$&iYYoS;#S%KEgg`D zj`>3~z-qGKiCm$|QtLdQI(u-9SgUaj=u_KXDHu^R{1V6tv!n-gi6PlN)O*0;YTq8G#D@rup|%{1zg_^F#@JI4x68SX5l9VFeNRXSiPUVtpH!J@voyrjbppxfS-@>vS z<`|cMr3-Q43a)cQ@KjzdprDE|UABeAOMZIL#^KHGe-Qs1K^<_L;kf!&Z6rM?^}ti# zed88{2O>VEssIY&@udV+TS^nwQ9x;5z`z&l{GL%IPJ${0(B~0OJsmhmv`aWc&p+Sy z1!4F%XPUoc$KVVhoTM0ts9u};eaS%Z4FHSQTRgXN`P>8iB7!ixb6AO&b?U^648pJ? zn1)ygw^8U6SxXZx4FZkbzmNy}&zA<;AOOy%qWQi*VfyPY8SuLalpiPwcDl(yi|sAt zs2_)$<{Lq;q;{o&r>AMAGWF1l&H#6WOo~dvIoQ)fW!GqTtOA1?0EG8@xqgp_{*HUk z^GCGI+OiR*&T#Jt$DcqT5kBEfB}|POKpf>-_6ia% zqA~zFLY>J2+K_QlnWZ$y_13tIf-k=Oedob5{pR#J`^Ucz9fRL2oSeg(hj#sw&$bsz zZ#Urk^Kh|=b#EW2Y(rsYnGdHVpTU(xDwSbHPGe1H2*x?MqiK3F5vFZbK#F^%<1+Vo z?m*T4KjC!n;31fYBHQuxezsMg-g}UJnjb;qin*y?T71p+q)yLN`MU`dKMbC76NiY6 ze`?9EH#z5B33Zo%qcWZ0BA0X@dfqR2r%(Rgg`fGz^MCneA3E-Mn3KqKhFVfYz$yPR z2Y!xlSBrSb;N+i-`g3s@&X8XFF+eij9QO>Tw*~_?yk;G?Q<8iB3OR;mXdaonP8wJI z>pN4NnT1L1dcc4>D}rw8j|kXXXRZ17H449;Uy29L)(iWZPR_qKU-Sk!4|rFMFi}&* z*lp*w?*E5hR!hIF3>O0p6=PuxNDAQL|NAGWQl7Zxa@z=1f6$KML)y-=+CQ)z;jHn z*ydXGmmk4C0DDLs+Y-n+OEzLJj6K%v^duJfW-yu7?R^gX;Tl1r@@0q_qkxrvxowqz^=`qyO*E ztW1Qyk)d0Xzr+i${sN${VA%cJzEf=}0X`?LwAeL(?a)_dSBZl7zYgEOh_`>%znQ5i z=f33XSlk;Sj#T+@KEice+P)kx>N?p|)N0oNIhqNsx%$sgh$VzVDQ@zQnfq5L#1cby zT44^N{sJ$Ua1A4xQ;F25;=@qTn_nkfZxfKY`1_WcU-3b$0BP+*uQlJVn+yJI5P~oi zR6E^UB33-P-63V#5NhW<&|QVgCEs7N8NdkMe?D6(;n~U-CXfHYAcX&8xJL|bm0+-B zdAkTxy@lb!%L78tT-oF*%V*w#YwtoY-Mvw3c8~b)xYZ+Ymbgem{eSJU@>xH)kFLLi zFz*bvxw*YuC|m9_<1%eKA7?gxLXZ}K?9SZ7UZA@|lC04L8ouo2+x-@##! z2A*{|NZaz^$U2NG#smG!1^9i?46c9D2BWOuzeEoY(O5tY@~EqgZaZ=-AyYG6%!by5 zP-|zf?enEtUmP~qS98G{k+zy7AI=Q!qFj2hf8GA%Pe@TMy+&F8ddd`Lu(=b6K-*MZ zk<}*!=ADe7G~I1WfA-{`ll1df{^KH|?In0F>v_NCwKPmJW^hY>^n9(Y)~yPRwuv|`_H{wzaZ1US*~9t4<6*Z!X23TC2Tdg zxs1hE)VGg|cvju#ox8lt^ETnys8nQlcOrf`p19uxO-fQA&3RA}J`{7L9%_(V_vA%hr4iAZ z(5>;vIrT5Ah?@wQPt1sL(#qnvM9vmM;l)6eBNXrq&y53G{;>*tZUO)51i9N&v42hz z3A_x@P1B%qkvx^@pMnt96vA04|C?}NpeQU_M>oYmihntFI1fUCcLgpoOaQiBb3OUH zDOmI74`}Guh+afw5T`2J#{Mx(aLLnn4}qQc#(T-BdC-uFSV2%{0P=qOWG3pj2-D9$ zB1Zx*LFyPg$oQ`>aUcieexNhP0mlQr>iffUH3P!O1V9<++0<3~fBZNk5Xok~<`w>P z#qaTnqP*bsy$o_QRYd>JssmO^xW{63f4A=be_35{l@xfoDDLqui!+eM2fDRCRu?df znrs#zk?P6O>n@Y{uZ#|4A?=tTlH+^-@m?@UJZEk|hZafivMOwll~F3~|E4ynhHS7FC>H-H#Q`Wf;R|4)RgnH0l_a9xr2fw*QIJT% zqeyNY`j=E2K5`#Yh~Tqm4*}k!0wBCT=T$C*A_@_@KP`bYC(b0a{h#7@|FSBPe|05v#J?;awcSZy-3T>O_<_22&W zKQH-b7XE~@e-7vWeKn6JtX>_n$=xo`&u$Z5^y*t4>6|j&@?0|>pQ>4G^a~A^+(Srl zcMlQ$9s-j8@gGeoCFKr3WL*9qfAC+P5X5jdBqmC&fbxHS+@D{AfGA0p>CbL)-WM8^2}r-y>~pCfy?YzYf9ACxcoAeX9TE(A)pc zZyPBj<&>3J2TA_-!Ts@MilYR6JGOY>Hsr}K&)f|;aa4Zt z-}x=-5{w#8(zUa{hw*=zZ%JL4j)pTW7yq5#n!%_sGl$y}{hyZ_1<6C04qe7*;(z0} za))5l{;7jE{|+JlS;W6X$bTa7uMpz%pGf@c9F+V|B>r_~0&nI2OFDsjRJXgc zxSRfEU~4z~5b^MR|ssWIkdZ)d#vx_>{x$=*rQIZ-ZDVO^)u@Lh+QqfcRNh%ya z1X#?*SKi(_;^*%xM+b^`F$8{4OFv`M@@Lmd7+jCA`n zn_o|&+u=`t5;Vzn(ray!Ork84^{+gCd|tBxeQajyxTHO#orx5L3rg!|icgLeh(jar z#|yT~*Ky@k7>J@9l4SgPoW`tj_WemC*=6b??#Hx>PlTJ^zxT4>$4IE%B=ow+Ze~4r zss0{CAT@!*%#8vib87LppNoQ*;)afp;*c}5Uk~qyBqzb&XUW$wCj$lS>APy|p#wIT zkiT8>73tF2i;oO-1SB#sk|oyM8yB7x1+;9;$G`aTcTt!SzM?y!pmFlBU(fp3>sw@h zo>qW*F7}<_AU1aY&rx3TCbBhSAIutIpvbf*l-k!5E{Xf*Z~WrO*P@C=hMVN@2kC=| zMqXOadS@4TQq&79B;-?Sgx$JQ$ns}_UVXHYu)C4KPv*G$UP0~G%1NWY8>N2$*G}rQ zPT9qm+qCXF$*C(P-PoXBS`jO%KtW(^0n1xbfOgDqUbRu0B``o+Nc$r^g$`D^rFg<@&nS=PxqA5{v0&e8!|zA&RB;h`<(mnTeyC=DuqSG@HY08U8eA6 zpoqIGbTEFt`~CcJ(w{@$9C`Pd&Y(N4De1r=Y3rJ|78n0)h`{p%ZYDitScRbT+J?SA zA_V)H_tcsj9{QK-Ewj;5bz5}(>B9W#)Jc|MKmR%c!jeP1ncicva=~@Mz6bw)mZzGNWI$HLJx|8o(2yj*8sA%Tm| zBFXw%k-v2G&mifNmYqIy8Sl%I^y9M#NE(Co1cZM)!gc4qq&?xn(c|IYIyxRw{(OBO zCyKyJOvhm>jgW%=`|s7Ipj5YgH88Y>$-m=gq*N0iYF5aNQ7e4wN3~>8wRBtf1A(Q- z&vD@(9INeC&pz%Gaw2<})bei-V(mr4aq z>oKB1EOkB&jR8@IrD2x`iur(aRBUm-vFPd+iTPgj+%xc9vM$kk{|vBJ;2u3*YQoN6 z5e7{(;_Z;qSD1w(thZnYAtoTBog{de^kZxw{6Ljm2!O5vpjXZh7zF_+Q}ct8h2!9M z;Lttw?_0rO?o&zR{Wm7k5shTb&kRQpU+_m>2o}^gdbqX|bNWlupCM|N4VI(7@$+SJ zh@f?F4O(;eV){jZ2xVXPVgGd+^AYcclpBGR4A^1RpLcEyzgy}}ByFv_Yp9K3=|S+jHyHK?h`IXr1+W zP3O@L@RtmWIS{|gy$$+AILga?on!7V-s!<=9*Ow#X!>+W1k6ud81etHL3L4q`av&r zr5mkC!hQA(W@7HPqkaxEzYhQ=N8+5{2n#HKpE&2u1Mh9ADcYe$)eGQRz zzOmgi&sDubml=)j5<3IUTCeRcAwJ;krRoqHfFIug`GzlY>%LSJ znKubRa42AAw6Zi>#1-c&&E&>}+u2_9Ca3N|glQ3%Ai*i1K-vfnT~AT#4_dyazVRK_ z<5VTd`0rhusZV&35jWIT==dWxWK#HKUTn_lTGcyB_=3L9^0TfVDa)<|!M0I3a6USq zU*4f@U^hV`goxR}r~4eBx&=a&?I7nEKly6kF~g9)PX0s35cvz>2!|A99p%)$3Q~kr zO)EiBXW0{KDK)4ZMPHEq4n(-%J>)?+u8;G_ek^5R@fK{V2NH@))+OBC5)%Qn*A(87 z<3!1Fh}9ZkeFSOp04bBr$Rv}}yeEM^B)ASZph|!NXxF#NMc-6I#A$_>+xfcts)2~B z^n^}K8V~j@JQkoNVC6jP)&L~jQ?nwD37iLzclqRX?1RMb@Qjw4|tEm6yCOZ z!S;I!LsjSxuzU-B6Se&-oveaQUCu%e=sNj}*Du%&@h<}hEEc5tx>*%xa!bc}O+4{f z2Z|Mn8fc77LM##jxTUv?FM8rxu5tAc;3gW8l}^J*=eeTZU1X)n!9bdXNQ$2gMeOxx z%#oh_U9+EXjB^)pvLX`l_S7+Veqh(`rf2a5rC&?d*} ze0mR?3wPeZ$U4!*s!xHVg0X;(8-y3&H4dG|ZEb>lucL8TV#=Y<;hrENHVLG$&NlT^ z;?n@~Fz!EpoG^{~{LOh?XJGzhaA`g$s2E&Ls!5wQnH8+AEde6ET4{bw_QLq`?|3V1 zbe1g)y?GuET#W_B)!*m7B%1QQls)=>Y{!p-M^Xm)gfl*Mhe>ovv^E)Rbfq>x=)N5| zIMd+^ha2)hHPCtG?Uh()Cg0~d|kz^B9GN9y*$|6+BJ9AkJsdd5+Qcy(BqI+qhFaGoH=H4iSJIIF%LWx`73r^rh32V z@RlTc43t#l`_B0ERp%}gdM$)l}jBC!Qn8YKWabQ?bd ztP^0ut~}rOjTR4TdC`Xj2k1MqkCXYlqbviAM>~L8-W^qInS}!FOtZH+s^C{4omZ0gaIQh5P?G9B>v z^CP}YUDp))^YHUYMn+sk{f#pZ5lV%AA>j$S0!_Y%<-xXv%d{?xMs698J07QTJb%? zI%^HBR36N1Rlvt=?$0mo)7Og(ziZtl-n;b$u>I!^Y+qb%!h?{(c_2z;Zj^5p;#6%0 z`m%M-@c}wE#tKmuSJ${OFYrmk&pk0>+erDW_5t3=AKQ6(;o9L;!@v;Z`pPO-SjMFogL~3;(+Jei`I!ov=~J?VYS&KavB*$ zOxj+EMk?RdLS_D0ui^H5iI0FJQ3jQy!UGc}8)O%kBntt37tWNMP+1g{Sog69XwQ8kmWzhMu zFaoM>`XtPJ$GYbb_|K&9&Vfbd9O8OXxAomW&asI`!1Iag#M3KuRzPaQjx(ZpE8Zp4 zQ)*;?mNk$4oxm_YM23Nh#7x%g!JWnKf=@ zo#17TZgK)E23ZV;T_508GIZ$S%Lm<4T4QauDY341AWb-lcem|I^cl2>1&6u{aEo|F z8D9ltOVKa<$eH@hA(WayjmQT$vu{D|;6hhYsBK%EYoDIQ`#PUD6x%9V3D@zbO9mMN zXw87s-xm`JbVc)<1Ev8*N$k0vU=eZT1>aVhF+=aUWxnf>1Ps&=TUMty*O%CR&c)H2jo$bYYtI0 zmC6N%u8W5b1k?cm@G2P~Fmu8HX6|JHXUy$$(_G{@QY|_>1pZDBZtlC1S){Dpn$>r) zNx4p+6TElIohc-xGei#-9hU&wj}?GFwD?{DoHcGd0VElURkp@=clvG1KKQ?}`6fk^ zWnVd+{IDG&sNN&RJ{wq9F3UhG>Z?k(-U}!^a?f~1rS(Pc-AZNyooWm=$x!8z92bjS z-@QLulL0A)kTP~lCO#m^pdJo1-K&r3fqx!D&=`k*}eXzl}I| z3b4P$h30I;w6239MVQ$}X*+9eIXEb)-TJagT>N)d=6 zAdL`dq>Q*~Jsi8a%iiGf`jlJyEj=cm#fmv@mP%0!J8_P9BVc}?EuS?|MPpBrRjR(| z(=eF_{8`_6GN(SK+bkM6SULXOB;ew}3>OC>UKm3(Rae$1kbFg%K4~2dAIc?@X@jePe;!lc=N{tjTN*co}Q|O7h86uS|^@NYz?j7fYH$5*u@~?w&Ql z&H($i+p|YKlBN81q7v-XwZv5b z!*TZH3EFI@A797nRcqUGzU?TGI9kDp=?z=jjDVYPoAesRsw+6BRBJ9Q!_nkQLYX7NFVL(Kf?&r2+_0dT>>DS{gB!Au<__ zg*>e0ti?>12|M5OuG{6@F+>HYXh~*}^9M0R!=4-iZ@uYlQv&4$@EpPg6Lyy|Hjs?gT-T;n_aI~$~6-3#SCUP%j55=Otf z?)k-FR%|B(A}Cxfz5#yp<|iqZm|%Z)5cph21*U{-0x>1VmrR6p?{$EFw*yB{-Gv$F z@ey`oedcRizt(2|7EQgF>;!jjt?ITI20kj@`GO6cVge9R?XL?pz^1c$=Dl+piOgw1 z2eS6LkSJ}FqW2eupk5na!P0;>*`Z~SXo*5X&UG)$^wHSuIio3HT%Cti9CsV23EjpT zEu%2JVqX$t_q)uzdo^wRrhn(_6Yo^a#thvFA$!A`?r=<%8CTM5$*{e%yV;P^y#jZQ zibA4g#{~!td=*n5Xnw6`XVV6`KV1pW2NzzxjD=5NbK9t|%iiz7cY2|`X-HvmvVs|3 zA=8fBKP=O&x;6@eD?!n?9a7>r!O!sj{0o-nO>edvS-JpcK#vX7bvCA1y}K8%uY#`6V{wpw6yEvSsJ`?8d-l9t+Q z>W#v@aBC&WriBS)^fRl(O|MUF-_e${G70jB{Ly&`)p`?uKCbHjUM z;+STsy;0#zdTb1wej^)^s_7|X9soHnvUvTlwsl$XdRLvEVz2WEv!nVMb77@LJ%>SO zypVRw`{x}VB+|7f8S^I15#^o+3-I*u;qG3fDR$ShjCUmre$q)fyUVc?l>P13$ctST>Chvl@5S61b@a|YXeo9D!{Be%hWqXH7hp(Je;K-U5f2X9~us#)Qjt& z8h|Ncap zGuta(tpFm&vi=YkcPh6Qb4Knt8)K?~3%WF!6}wKDprMGKY9_~)&N0t5BfeWpSI2ql zH*7zFu$YUx`B9CJdfWaiN>T4eB8M^xb`Yzqryp{jVa<{by$-i}(wdC+&q%<|RkN9@ zNGIoJ6eKu+dUK(L@m6rZ&wk8fjs}D{NYl`$8XaS=Ir&^jkw0NzlCLk831@>)<8AJ^ z(9388jQ>u=ji#a1tk6%Q^Up6OmLH&2*9_zE1qqBipS}k#VR5s{5S;$IxZ9D%&3U!# zpw9sma%S)`mr{RPZ?=e>l;nxh~e;fCZ6;V;&0_I zbKjNE#!r=8A{XmvVN{G(xWz7-r=}^G(D)2;Ue2i__f0Z7pw8G9Qu zpvAMa3Y5giR4;t+i>skNV^P6^E=iIp0(nVH4j$g&WKb3_-f|Qs=Ur3W%^@obpezb| zx{#|S41-Z`s*$FW#FZev);n64sYu06+7&%LOP|H$w^IoK^^A;^)+&3jK~l-;2bbUr z`B3LERyn6pDu13+JIB#kv7;JPnpn)=d6-P?>*Upqu0j=7o8ib#0bQqw?ez#maDMfU zRXqdEx#xGco`p@{1jQbaMXG#Y2DHgF7weUe#Xi6;>`D_koYGg*&M$3uN&339dbYRT z4{BDGT{G5sVaGFg%PO8wUvZ!@!k)H=ytzrFwMyr^7PC=pSh!yHZn`d;0lL;fYu2y^ zx9ggBNZUZWLg55rRwKNYDH}R@rvB#jV*D` zv77hCfSzn6VNiAJ1>zc|9La4&NGgMpU}U8Z#b(||8{Y&xJk4=LI?lPB=XKBbJ78m0kZpG6j8!E3yisfxrvrH8*%*JbkV_wvKsCCOyu9fx-xDL@UBmYp@rN53hqt{3)WUcE z;h(ClILPCjq|HjroSGQ|Qt^%*5#=*$JDpQr8tIutlBM#JoPAYy4eNcaIZ3UdnUGe9d6@3kV;Eh|Hts3`0u1NvIMTcvtaovS z<{nR2Jq#TScRK`kicN|H6m$?Xdc_W{z^tj$p}cq`o#Ar zqDMTAR2scD`C!RF^YOm&T~6k@*_ll7VB_Tn#3)`Y<*X%YFYs||4r$znuFA-B@MV@J76eq$CCUAY9GlOL`0gTN4S}wPVO!z>|PiQ5j1@nAr-?# z=H;o*l!~|-(VU5!EC^yn=`+p>*pCeG+w`^D7PK%6H`Ict4Gl&IbeJ1}EcoVjyjvaA zJ5TKlx^qv)Xf0mC9>H#49>K+ALq*V8P_t!HOFMTVfaTe(Grsq0b~BHqb;v%Zjn*;I z#>~wuR_Z9Z9;|^A@nc@=$WTGGra0RUVguCB$lT-Zsq@6l!QYl`Ja$to)x+rCi~f=5 zSAtFMyMcur;Z}E`Lp51pR8W(*HSTe45=G<7Io%O76O;>%r%$X^tON39!UQS0KqW5KS+j$$G8~Fo(~% zU3oDv;(MOl2Ab6WKv!x}LT<)>;ATXwIOr(JGPP^y?RUSG`pz1&;=zFjDy#|KbsI%S^A(LGOC5;ni7ksFYF9Izgz;VIv5@th!>I71=CfHv$Q*Zi0| z%y?e@`hkY;yoaz%nYJQpA1cB3A(~AyeVyB8PCfdB?OgT@H4)S?CrHv>1-jTztU*lJ+my?QxRA826^|>6DuW zHP2q2u|&OuYh7hg8*K}= z>OLD#-?IPyGKlGeQnGCTKAIXvI;LBfy5fHSo- zhxs z_0U>1=OMPQzLzoCw>`695WBdY`*qCgElP$dDuXjN1h<~)Qd*G0q&1{8k&NFSu^8LQ z5aE2kux$IV)`Je?k^R|4H~m>P`ylPHOLD=AsY_IUx~?*TkWBd9Q)kdw>AF&_Jwkq! z@w~gof%`MhxaJbO0;o4O(}-0!lOx^VC-Ulp#=G?!Vjhv8>~mQtV|Ks$MMNq%ZUIw_vRdsN4JV( z_f;W2q8Xu@@YXg5U+xTieBphXwbhfeR=aS$zqaCpCRHbWhAq-cqs=FO6<9S^{vATz z$HLRcFn9$?w^%3Z<~Orb4jVly4qsHp2BKw4hVRj~_j*QW zljB-<#%?qm-e+EbT}9XGoqD%a2)A|$OioH^=EYRxs%#~qLYOULdSm4@pr zbI^nKAzNl&?&xONw~nA}i`hnT{lxyAjiChE(lP@t?q* zAD8zifOrL4y_a?wUhR@KIkc@(=l@~_QL(w>;}ToSZ+LueVDh=AsSX<#{&lOxw?#-v z@@SI7CPiGU>i3@vZ|!aFnA;BJCT^HEI#M3|k%FjI5@PM45IsV|eH2h;VgH~)_^h9C zOzxT<+~Ag(zd}>vLPD^{$0u%tLD|qASKN9*7XRgW16hp?^posqq6^3BjUA=)D=+Ug zxS1@&5%j#IAld7U+;z2*xXl{(4%I5f;O=77Q>M6E9TAt)A2l0t@iC5sC-XkM(>rC4 zm3tFWZPk|CQ-7t_w!F$AlknU+?496+llk&RHjoMX6sDU@vZNl>-|5ws))CXzyl1K_ zjg!&Boa1Q1F{Tz|+dF&9p2K@|8;I-FWcApTEXuPJR0BtsScFzhiG$Li#hOQ>5{1<) zi7r56pFZQQ6@NW@>7vy;Mw3Y|X>0L`n(bB{N{4Lw9^;k6M-zIaO6gIw<`= z#eXo+tr_U+8yQq7$(|2aczRozB@U{KV{wByrW?jHw{4RvOJNauUh@P#Bs|aX%>9I3 zS^CG)INSqP-**<9ImrPF8%4ZoeVObx%eF8TQ{rIp8UM`R*io%|TkED!EJFB2jBaW! z*?#m>IH(wfc>Lsg4x-0yLRZMG_0ZgVgFN0WRMbSRiR)9tt>(UEM<|m|MwvgBA2!MB zEc>RDX41f}6Q^%TV-3}eh{5YA!A>>7cG616CsdueyvgWi0!&v26^hx@Pq4JPs@fdsPT&z4NHsb(5l*#r+L)0M6ci*kiD>>;y#%zb-X!EZ1lw+eKGag?DPcjK6- zlz_}oUyreQ!Ax<50bOTDc#WN6Hqs5GX|8=S&7g~RU*cwX4{p!jf|Cj=!zZlg1gzSu zS5z3^Ue+@NyMzwiPsp(>WhD z6)$>k^MgWJ;i+44*x_RQcXZR|y99f`?bKY)_Dfyl_X-fNnke7i3?5Rzd3Z~(+L+Mb zjSp%B$){D}PmMAQDqeLRzaq%cE6O}^sXxz%uVBq}Pot;A%-UPH)>~*1>a9J#gBW&% zof-qNk(8YQ$2Mu#*ATo0#$&)c;P+(XLSQFR?;kD17|2>@z2b4uz79orz($xW&^_vj zl&rnKa$i@>yvm}DTQ2zRv1_$srcC2%(Z|$+xG|Y!TS$RkZ&kIhOwq#6{A^y}aHt{X1@^klmoc@D5C{axLZN5O?h)dg``5lMK9(y#v*LktK zWd+@oPDVXT7p4|>g5|^Imk)TuT|KkfnZH#?1Y}ECLY@DCzA*-iHXgJy%iiKKD7Owa zDY1yYx#0s8g<|=bjZr^Oj~_b^-F<8l z#yuxF&5oT-mF1)_`^&ES2c@a(MtB&q?xTPIl31m_T4X9 zdX1AnTF=(_;Lv6=pi!*uJtsQzrPHpJSJNU&d0o3NPe!vky`ThDhy7}q9TOU(X%_r( zQIM3UHKZ_FMjaoHO-y{(GDx&ePmATsX>ZhsGMO3e(XWV95yNXuolljimS!JeRn*rT z6%>W~ZM?0$rvvy6mz?FyUdbbpK(9VcoV4`ZT;TzU2CtzuFKOJC=>3Ec>pA7P7(-&V zk?0Q8Ft(hX1J1>B%NmOY)vsT*=WCUH57W(EG|0*^Iz>p~7CW<@_-#n4FxL+16YDQM zST7N#k5!_WMkn2L8@QU%nmAI7%^X{MZ1fi`CBnM0_L6S0Xm1*2t_ls3>0}7ZK)qD~fTSPSU_2 z*8mCR8W8QckEm6O{*5$lUG4(u@{O3CK8_Scz0~sd)$=9^VYY-;dF2}Um2Dq4F}?S> zJ`Vb6VMb<_pZOIT-h{>w4ZhLEfE{Ef!|t-F4>21nFly{K52;CC*uZaSCpbejxCSnc zz2C2cOU+}GS@LM*p4MWC;kk-M#u0350jbS`0A8;KB6vDf6;xtT_{u_lRXgaB*gKK&wbo|{E~imzD$XX zO;EpmA$F$(^YG|P`TlkMGqhlF0d6#X-pGMmr+Pt5CO2UN({XIpvL_#sdUwC2zZr}S z5+{j#HDn{To^=>#=&($>I@M@RdY(@__tO15QWw_X(&Ni-6`3);Dp-h>YSlJYB=4(Y ziBzn4dVuRa*Ipy#GfwwI3T^L^0`u-dnJrfz(MRso12yhcWGDEWXCzD>K{4+3WKQ9m z))>;6XXDJrS=*GPag|T3)Di$LF<q|M zR#D!k(|H9?*3r)xBcB*ihU?5jb=r!#`>s$PE-2~Kynk}hw6~|2yJ< zbdGz;Cgz%@&4*-PQqW#da#q&sU{_x#KHsQ?x2HBYXexHU1h(FfY361fd`gd$d*u(+ zR4Uxio>{U!@7H4;$m{VJ9LxZw%3f z`8CZut;H%MpVYZ_&ZcTBrVd(;kT6QgQ7C5N7f#=KT5?DSa|;)4r%SvH$~7MXv-A~dY8D>Q zN=4l&oMFsiqf*)pe>yAR*{8LtD_(tC(>e1BPGQe*`)J#kHZ^mzag{maY}U$i8P;u` zGy_e~^9Q5PCDJsB~L|G(%P05;Yv~)I?qn^Wnwi*+~ zbc=)PU5DCu_llQ#1dfOoV?lgd!hK4+FKx3wVzZiUzq?DIW5wuBRh-3onjWM_ONmE>tt`xj#i)x$_X5 zh=_$=_pQ7d85WW^*}-FqwJbPUA{W z?m3@SlB|WpxT*}gXB^A3kj)7oc1-Gm*w#S6ICai+W;D342Z0u*bqOkly(NadmQ7tX zI!hH!nI@EbVE$gO)*P)xH3LRHh{goyN<*yF~B>3VVHR4=d@ufjmyhWXuJ8 z_uIW1V(szZJo%n##;ZEFK#HSZG%%*LA+ zjWX3zHF8ihXtMGU&d;`YUQIxRa+A^bT!@4QXY@Bky>~w#Qx~KH4m8RyD)Y2O_{7+m z;OJ5~JxKjgtLQTZ6EVI~jgF_@sSSQHO!s}Yd5oWXXdJ)`lk+^&YuC5!)uJ6XQapqD zAynE!-9mJVK7j1pG+kSN_$d#-Bb^v6{B}9bA7pA5A6Xqa=^$9*5kh7Fy^em{l0l0a z?rAy?fThO!W)0N}eR2ccG+B&a3P3B=9~}_?%P$K=n$5TFDoj5sU9`+<+nf!@t+nN~ zK9`T<2tXhUnYIoTieeXfkG(Y%tJd0)W9dt=a1T6V`W@b=FrqH3wl%yAa9&Q&N>sjd z7}O|`&uhKpz@@j7i|K~l!Ud#l#F!+UUA#D<&tkGmbJ`BdXKx>qkD<>&)Ol&1uhb1f zm+8uhp*8of31OC{13rbPVF9VqD2XXkP*U@cYUB+fX5Y*HmM3yL-B+EB5_jXwjLvM~ z^VS?|JJM>vZ|9zU=7&MB!pr67FzjbVZoIF1G{cmqaQ*3XCc*m$3`{ppHC;p{7oEd2 z&LnKs4=J!#_P1*L3mi&F=Dz}sPW7R}{$r$(voCD6s*_N?uW6khIcFwO5b(8btuEXB z0;ua5^)npAQ}?YI;q`kLxvM)FWh%(83^PBa$X@JG=ZAi??7oD*{pLoNH(+AzO*NkQwW*e05Z@xJS+S}Gm!6%PhxAxT2+Te@t>A)xE_7J+z z2Z;HuB&xAwIeXUkVnlND22xuWFb1j_4_uSQFeNpjQs1}X>KotP&%VuQ@*60^GABiT z1g}cIb}#%&VQxm)hs}=*s{(;0V?6B2ea}rVjVrblt1#3!E3Y6gCn0-FHy$bs2cJ~# z*t@scrYk%=DxSTqfMxBQ3WXcK*nC#}MmSr9L(qHr$EgMTurt*Bdxe^-PMqaYr!hnM zUlaQnLMiQ9XK*#AGd~v> zNV_)c1U$r7r<~v#4}HCH{vE?(Y4?|tg|bQJlo|(_Uc3IrZUWp7UQyd}sdq|an|m~U zOz-dsepZ?gaJ@?to7dPli>1g0UQbJ(l5$RfHpo`CPdhSFs$-HEW89Sqv*@Zm^I|Se zx9MsrG2Q94;V_8s@6d50=c}!z1yY6zX>s`BQzAWa?z34wrQ8JQ9JIDbq&KRtH$l6) z?3(~-6}i}onsMX|!2|bKGe;^EZX>OcmDyg7Ia*l5`N8Ke2*S9TQ?b+L^1@=iPduW% z4FENo%Xp)X$>7!$_7rRA5Z@R5Qm&Dpim)v*_7U#)Nu^+2^Wb9}!I_@6E_7~X%lQ1= zK7#xpcE2|1Y9`%hNx2S|m`6XjFF&gSH=w(%JswQ2u8w&%;;FRqJab^a+Lp0bM@~9@ z*abBbo1~Ase#=86t?zAQ6M{@|fo6JU+A&a(?7^EF<#^J$jW`MyUbmauS58a@r(zLr zJ-fUcH06k$Op;xIeL)!1okhx2V*-Q-pGnoj8WD3fTMMzvC$Kc(??=FQ;9h|VESNXJ z=jlt3?>#9(t9k+1vwL$c-QKjF5X25ZljzIA1pYd#{v{?myK0rahHw`55a0kv9({>S zxG^i#M~Ysah#fzsglb@1yu7XAN{<2i+kKu-j-2$21hl4D2Lj^-1x=KdAxYL#GV(Wn z{yaA~L&==74 zc~v0qmb6;N-biU+xE#--bD9qu=xjOjm`-fy_H6z@XZ_ zcn}VVcdh%dh54B^6ws+X^T{WsfP6*GG`F~qcOf8s@OWlX?xre&Z?S4v>B@r`o>q74 zx(enBZYp>D(xtBp+48)Vlg6_q3c|+qu7IlFHFaC%$eO=O{b0!akGzu4dvnND0OO+~B-!=@cwSx0+Im9>c!4(d@ONVcs-fA*_D|S-@#q8ZkEyhY|ZBb0(T?9 z*;j&#R|n5)+D~ou+gsAUba$4d3gHCadhF&ck0%Q)<2bD)ttuw3S>iOIz{1yP1D;hC zZ;kH_Krt`wnbn{Y4J-KBM5~|clg>4qJ2HE^|8gvhQqd=0w_Zgk3@#1i>%Mr&_H?cU zGUL%AkZ)a*F9$LqYs{X-4yf%(0j)iinKjCG-=9`jI3f*GE_cN|9>B9aDN~;zsksWI zL_=O;FiseF^bCyvC3y)I>}H9xl{%nsYr&aTaG8BBmN%#x<+;t-o9_(PeJc5?rPS}* z_;RYx0XVK@y*PUFVXIJV)^{VMYGT~lx2%;B3Jt>$!5yHXEPe4gTdTFu62X+s+xbm# zu+G>6TI`;SS4an2^$(k|k9z=Ob%GYRRcP7yOy#BmgLa?f$;wbw8TwjZISW8p zeZ%?cfV};Oj$cJC*B_ZDtbx;bov=#ip=>h|DFf_+5XpkWzpJG5YAQ(YE!AkS*p}($ z#9O!^K{qq)Il1_H9VPE*Ivsl}iYs6PBcmXR4Vf(gS^AM8>)zJ>TrJgp*bM!&P>!+s z1ho_)KG)eECtEi2!`NwMOkah`MQHH`w_dR1BYlbT$ILpPHPyl_+NPsgpfzVRq*&1x z@s^EXD>LP>lLCtemySg%IG>;91BuaW^41)5?Qo)?s^U)VqW|Senw)`=ycl} z_~LK)hhN{qZa(p4{k;($0vZ?hD&P{!ZcoL#Su+oU{%<=N$2^D)&t}-;lT5la#(9E) ziqQ||Q<|_v@NYgL4Ci4)*iB5oJDjs{##H1&cCX^yr7;MdT>GZSr(y-G3n-cB!y6bM z4HBtH@Tf99uEuywWEk8k#$M@*dLq8rJX4FdUlu>|c9Y$}18$F>=Xde2#Dp=y{RR$V zg%C1hfvE7he;Zo5(GWbuzPRGK@Xze~_h4zX%T4p;8N~o!b*DdM;M@lz&h2?%Yq{Pz zY#pO1X6uaIRPN{pS#p$_91q#!TkMo?#Rz!I_#U{Ssr-3!NUK)3IR9g>L(FO+?H5F7PlQb}M^ z51c;6g7l;B-D}bQJE`R`ED;Y~F#YP2AKr*A#(ii6ozr;rJONHsbRPm`wCc|Cd#*ql zIfSr89*b{Yx3SX0v^=}@F{p-HPO19+y@TywUi~qAskcGW;pqKF@gOhzZeJbDdPKL^ z_pet{3SJs(=J$hHg8yvmw~02s%_R4&_Vl9=;-e8ZnRTB7m~5OivwvQ{zeb$PsG?^c z@S#33Dz&b@wzoi+Q# z;r-%?m?_qko#!UN9ZL$- zYqc7;jT7KhPz}CQz(FdCWUONSBJD9mE4|RfDt>y9) z@9$^7SVHNk3gH6Y8C5U87Bmc0E0WIztqTfDMnJwG3|>2#nll0M^sUmfi^5YE!4RDH z3f=8%ZyZYqCiFAtr^+(T;A1%apWSB&!NO2D3I&>64uzw+Px!MxBr#ZqI*&r$5j_ES z_*u#^s*8VFEl&Z|cw&R%{>S$a3a*ohOea1n?`dEkf0rh8ssg&e9d$l?`TTc5^g{sP z(WYHi-{^z@W#v3%m7$kE2TG~JuyU85b$j?IYxP7yIFA8FY3cOFa&#h4_uf<(AFMo< zYu5>E_x5HHZDt&=#was^LB@N#mn%T}c^{aGWJyhQF7Zkqb0DAqkLS>FwW288Z3KPJ z2f(OhsApJQ&R3cMOf|wVoS;gMdGYJ3=izhO9>?utpgFO>5IMDr7|YSa^%yxSr!(^um!ML)bk>*zNKu*D7py6Ad}R3D+4 zAq5wKT@%Z=?mE5MYl5QW~Kudp+uRBel8+k~Ri zrp3LRvpqi$<26ja2n&(L*%kWaB^C?r{o9|zRNKMn zf(_!u(*wkBBoh&Zy{|&PWFVJ5z+;yT=y(zDFdjGporrEZXg?O?xDtSDGBY2RKQ|-# zpLY*jGA&lprH+93bKIqB$=#i`TG8EbJvJUC!Wf18A>RGfRIAf(8&;qaKq<^5EAP^Z zL1~u`TAWPTZDnjv?{irjEb2&kbQx0(z1vaBX~uAc=g{Lhh_Uj=48`i)&}7b0zDQ=H zJbck>@I}^(2Q$zQo+TM&|73Vw*w7$7%S70mw5-Ib{DBUb{{-KOkesyy9JVjgFte6% z5W9uvKQV3a-(8Bj~o!Koe7(u1nk&SH$~je@j?L;=Eyn#LnVcE~eR zWg|TQ-WgEH5z8<_qWlB@%yBOAJdJ-FN(RfoW?Bdm38qTX&SaQ?F<226z-*m_6TRTE zH^BXr5O%;DYL_~&%zpMXI89|$* z0k%Cgxc3g+@ba?YHagT@JqN17o`*^x7_B`vEBN2t9qm1P}VT4`Pgs_+D9^-0_3 zQrtSUraQ1k-ZbZ-Z;wNVzTeO!&(%q!{jM$$-h$C(H%cDB+2Bt#59Fv(#0Eme!QGRt zhk(C!Z23b@Gbb=!C?HGnRWnlxh0K6ixMKeB-*gb!m_c>2H}q?trKR0K4|@OLa;$G+ z@b|0p3Z#11u0)rP&FgI|rTI7TV~=UyOzZDSK@(!i5d&j>mbe?zV{Po_XuWup4x~+P zSFYxQL9{C!o)_wE7DG0zv>?NTXpWE~m=if_M?H(cVQ zpqMCb!}|ZQ_vP_WzVE-8C`7ibg(+*KWG#x(BH8ynDp|86TZl}H7E4o6)=ZJ?`&wBl zAraZLW=SHEExU6)GiLaFf4}eVyw3UKoYz_Y8)Kg5zVGY4*7tSY*X^zWN~8`>_Wnvs z=gAz&j}GaB_Rk|-wtK->2rqhRoa+vJtyc9oa#Z%;aM7^l zz2TxrtxRmrix(0zrw!(t!%he2@v6-{-50)BFB{ShpQphrUG^~-annAQQ8;+fSfRaU z(ZTtySC2%|x%9}3JavOck?6raM&FbH}!v05q);(Gz8o!Tkwl+ojZepJ3lrk`=6&VL;A7+hL8>* zrDZLIqQgr@er{pmkY^Z$t{Ec{2skTz@xY9hq8Zfr#Qb?>xT?Qp205|5Qn+B9@+KvH z>vt#*9-2M~&~))UsO4SQi;({O90i4`FWo{~v(xwD!KS#i&LPPrq`){$Y1v4?6YAO; zb0k;c;OnjjhKrXvv0Hf}yi7W?V%h7TSonNQn*I%8qBbsK4zPGACFq0+iMheuQBiO( zs7ct(;OGgM?OfXLQU3*~lnfbSkql=sJlzeqX;{}`>STU`Z{f(_a19KF4ZyTAgd0 z>)z@|#9O5kV=-dFf+JLs6ysWRIsA19iYvU@ATX;&OzlE=dD3bbRL08r8G_#UD_q3+ ziUH*RpolEuddt7KpF49b{A|bbliT7IvXCg~E8OO|c~K8g1Nn;bYDD#!t6ZW+Z3hl= zru^@5kc4YElkqF}5AmMbd2?gpF#3y1mx!Nx9O&~YFr5|YK+Ds5bYz>mF+IP)xy-7eelC1cY z)h9*Rnu-`8PwnPAi+3T2jcRXr>}Lf9Vfn?-CZP^4){4#dogO~7W&5t-T^%`*3)R=4 z{6sliqt(K9ENAi4b;OvbVBrnYCnAro^%;WmEI5#rK)SG>Yb2)tdT$A+V5HVZP!uY> z(4~(c@{w>n>af_3E*a#;jHja(5TK6M!r2Jpsez#Vud0!z4mNNMR}0NuKnYv}p2Qkw zt(_PTz&OG%l``&!h%M3%BEjjvw?3ceh@{HBvF?mn$nt{gh^$JW(&vl0|NM7*#W*OG zLLeTA$`m8t-cmoLkRH)?<}$q|)e%U%2tm&WxZ>**m@4+6+5gXH z!u&*}Ak;~^?rJ5cZG`PpZ$Yb1B42&MwkIwx!S%$_;1GSx~ynKNga%KDlqAIk!qAFgWy5P{Y5cxsYhJFkZ6oVfwDcU_{5Ug z1iAP$7tpHjxmQ0_`on47&`T*C!`HSR{7?XZs8XN=bR@gO0rS5@Md`IJth->EmAh5v zs0o_mG#_wj-kqGd1mJiLResSzhP$Bvsb|UIEt25;otp0$go840@7La%b2QPncU``7 z_*=l{TEs2?YmS@{M2^Hb_`Gs>3M#Ipb_(|X!nAK)!5zvLzSLZJarMd$U zI|Rr;UFQcVXO94j{KJo%9nPbKL(F43&+%=&Eh1bgc(b|v7Ts-PV;u@$2NuQ?1vnKj z?|11gn#f8+*PbTiUg9Vuw6=%Kcsnj4&2m7S)~Y2UbjZPvkedBkQBuj@zC(PvbqZRIGz#UPy>3qZKQ8A^RyBwG zK)%vP_ionp(<~8WngS^L;6T-VhB__la3~@Xwl%cF@uc56)B<`=NK~43wnj8F0`Ag7 zY=>l)yog4cM6F2Nn+qSAr{G*y1cV9AyF8k8IFFv+tW|>nQRqhmg9o*1O%((2{=oim zl|}R&Bq|@I`GIiZkQ1pEV9Ba_`$K4uBpvB0)|LTnU#vn8^;7qLhj_vUDefN~1w&g~ zZKy%-*jrjMKUJ?7LizTp0jWolBS=f9XP=|T zFYx+JKPVnaNsW;xRZ2&ibAa^7Y~^;xA#QKOCyuSq=MI6a)KRy{?Kd!JRai#+4*IKv z5D@EtNRZeAn>bfwEroYZezN>K?ip>1I3!S^StwU>_ARRE+40-KZOyNzL6AKBSHOx! zAlM$H;NByS+Yq>GG?bX{b^;L{C%TZ!*O}<24u&(X{7_uA_v~^#mHS0`XC!1Uq9A5H z%o)wRXLX^iY)JA(n(dUtkSFTDL#+n1_%$3JRg*tumlU_sbFJgJlCX{zJU0bGMU?GN1AV!u$${ z0Jr$Wap0M*Nsdk~lxa4en*;4eby?$jR?S}?IBsYIXXWYK>7g3~_6m}=^;(+XA-h)k z$R?c3$`vNCiT+`bFOf2ecK-{AN|lj%;OPjyQ6lWwVM(-sLu;n_FK7j)0hyr2*qAo+TGisXwl40g`=${f>$;&S6<6DBOaQY<;}$7|{e^x!}((I>tMKFipEk zff0){kcb)r%~Cs(c%l|O?8Q*EQ|t2;rg=4o>B?PtqeSl$ibuFi{bVn*)5s2 zDSuI!xD>x7z)L(-=auVrZ2s`BUh~MG6Y;tqMLf*M5`71s`u=uRJy{4~@SQrOiO867 z`!!bB$n@*z+>_)PGRnYIvhGT(b$dV~om{(uIViBI$WJL}u4L}jfj$MHhh=wDxXu3h zS~>v5#XqKJ+&d9$KkehOUjR8vf&ICG^t!41j`Upl36)ED`|e(L4<)!1F($25xC7Lq zlk$C@vC*g#Kh3{Ci7(1@RPcjilA3$1mW>YfI_B2ZvY9!alj*r)Wtr{!r~d<7jJ)4o z0EgVA`?A*c*(@B7h@xQ;(p@Q4TMSdGke?x+crRT}bqeLYMg z6!Y!Q)oABe_8*sB4@^LPg#6zdE6dPp&juYuFs6XB76Iju|4XN&8^m zY9?(x2pgM(8C|W}$JPYZFmEffD-4@P3 zFxkCE`h?B*2OX$|>`;GLw;$3*Pwha-psy2nKCzB1=1AHNW*D7!4x~Bz@bYw%+ZogB zwiZ5v5?!~i&|~3ySZ+F8Z;?!)6mnbp@vK=h{XGuQDM2#l zN$%mj+1KFQhz)Z7A#g9BH=MQMd~yWd>OE7hp@H}P2^7=SLGMms3ZqkrrYRWy@nOiI8u% zfr}`I02Gm7KHSnf2`7=%&4k#6>*N8CL{bJnxE%B|W&p?0*eRqm0Xpupx6_L)aMf|8 zx#w4F8_0uW%^AIBG`b;J*&Ms; zFbS7D68??AtFog)6oLMLpd@#P5+^-2(@DI&#-OMZeN5g_uU31?3W1MH}t;%716 zq$`ZJZs`(8^T^K0hRzUYV}-U~owneVvwNAcSG4#JgLQkX5Rg81ouy!4X+5llOC)YF1r2v zhz>%iXt^yR4hIJT@@!LSF!dRJZ;1H{^*pWg`v&x)G

S5(v2;gPd?neFX0vbC0O7- ztYz|iP3XGY{*h(^T{w8hgtWU#0cB-fJIFRt#Nu?s2poX$IU}j zVQapKcuN3ilztOa?FzHYhwibQ)+HhX`m2`j#$JtPm`%KSHF1Y8R2(Am^g7Xn0?5 zevmZ;IslGMyUv()&AG!tB6FnhSHK zPnCH0Vqb1*+Q~xgR){6ygS+_K0SBrDN9{uMfZlT(nNG;r)WQ{TlIMSwD!&MD`kKMq zAkc+$+q1e0oqD2?j+RSHQ1UJd6%f{S^#V3(&9jh-iM4^01heoi$l(?sH!6F>Me4y7 zElAW1C+8V7i){-}o5N+i=RO~9Ii>%rOHjx1dwR=_mW!@~e9Fd?uoA%oiNC1H-dkAA zCWgyup%GMT?7&Gmtn*m|X(R8R?1bN$-udY?xlIGL*a7aVGVqkB{$`Sw` zh7ulW*35as)tq(DZu5*>c-)$4*^BgIO9aTk`uMYzCq0vCq~v7H(DB1;rhxK!1FBVv zB>BSLLBFG9*pep({9$ufAiBL^o}ybU(v;fnt+9Ms;ATgKQFyt>lXISG0CmmB zLc5C?9EiYWJka{vCN&$ml_&L5c%4aEX_EiDFW`%&^`|WCPoYcAVGP$0c@0aJ%%K%X zvjFR2-5KY!+uBj}hIigvD~))+Lyvb=DSQl{XU?l4_(hd3^RW_r-S5035&%cVD-lk zG5H`_3Wc3j^8PO-{GsJ855~Hg$G=ZQe;4Ib zXLl-GEdpJs-m%1-z^`u5==k*>G(eMtEJ}KUnSiC)ch7qH@gvaTb~-p;U(*}v(CihB z1V`~oH;zq>wwk$ksH(iyK>Y@Xtlk!C{t0zP;Qu~pvTAWsTGSuPwJ5cMTU3X_o@zu^e|1F;W3o_F{ZT3h86p3n_#2_*J=L#bUz%}mYGLZ`?@ zz5err!?}FV8e6qmJnuv85x>UpXIrtq#@#=XmlOOme6`N6AZl8&*(DR&W2JFIS(V2; z|HR%f(YBLMFfSBuw=E!^Pw;-8BkQ>W#w@0!>DwyL*P65^$)J|T(hKh7_8vk!0lhrc zAE^?|_vL>ditlYZXNu^Clk8{oAALzG{tFlA2Cqaxm%TKDifv6#%-!grVWk*fQ|oKV zwq4(SO2B)y0W`<{6RR1IHzdoH`x6Nb&nnbC5eo@{P$K=|KNT#oaR-dE%8`<5&aag+ zf;Icc;6`u6)TUJfyzs5|2LeX>>?<>_y20upu>Nj3i zp^ei3kboa)^$aLiG}5dcDK`2AjhI^Z=@*tUXP1Cez}8{*P&V%9tEz<=3I%}F z)In_P0LF;iJv-_nyREb1bf2EEQAUs;^zqybr=5oUutw`zQi#GUPUbI7XAHZ z948Z!bEN*zKyXxCx_NPqNzjBY@`W9 z9bzjX;M?V@owyuyTa{fFikuvOcjowX2dOcJUE13!GH>?U(@8JYQ0zxpO^!L_VRD~< zixQ2X9EIcQM(Uo|?rWyM{H0OHo%nV3N1IkY&LbR~Zc#oz?PE}od}74$@eYMs;@_i& z5j*GXII*58IuS*<03#~L04WV+C68dV zv{fBi&GtGl@i>HPwLDI^_+;{Q+fEjjp9{)1gTa$*0$#f#bssLG4_67ef2m7k zA>YhipQkg4vL$=f;}nkWgB#q%K82)eL0+ffq(4vLY^RN#{JnLCT^ZSxTz$`j_N%5pH5`|8GE|DG5{h-#?JO@+S4dV8wfly;v2xVq ze4U{tDbETHF)Ka z7v#fsCzWYHUT3s0hx>q3U=q9eC8etIeIcnj11Zf5e1Wd6Edz6|0Q}tYdNaVy(h=vW z(n>3wfeG%V%6v7FLNi|Wi%L4`L%QRlM;Aay=Yqrv?26&!wr9o4)gSV`-kxf`ej%In=?-{*RMSlz4cNRrX~r#wFXNp8;NKn2r*?f3 zj@EK$pyf&mKzrbqM1+VdquyauP9NVN8vB?9){4od+~PNtXl!b#hF?V~u>bA7+8|w- zenz(E$^gwcK6vcjTU)viXo{B~8tq-dbgJ2KOq$2Fvy1)BH>!NaP>OG$e&zdO{Hc;5 z;<Ydn3v$kXDRB53H<*AXsFtex|rQVPH7+Ns+cGUIe^Y<-f(^n^-lGb9QC!h0i3R@O zZ&=(!#fR(ruQzvT($n*;?O7iC#M9p!rwm-=8Bmm=4VE)HBbbS}{_!X%Crf{Jc4>=HcMpHe&L+z#ON z;Wz!PnbetnV;A4UdK7-XKL6-(><4ex(f+y5Z2&%%6^ywo32nJ`ZF$r(E`9%&8w0Bq zPM4vvK%J@ZuHIYGgLlSrm)|n@i}y9#vEG;~Vs(C51%ZOT87r63J_O#9{(F(b(A{VIfWgKa~=+rp~%Dp(Xp&(e{E@Nz%8HJLA^KIq#guMh=oM%sB z&@B;?ULAOGixR%nK7w7?DVsghbKmsM!RCGk>h|OpTm!p&=W8YF)!VtGONB0j$4^)9 z_uI|)`kmeFaI*2xIki@#hPeXoxpZYD`Q)5#xk(4#V|#n7w; zmHUTpOY!QA)oHLvmfGw%QQ^+ao7X})zK z;OJH2M~>_=G(LCh@eia4Sbq86+lou)il9ZjP}-KGP$lkwovBio&F;8rYX2hq9e6uz z7Sqq!zdwDX_4{_xW%^yydr+dfqO#xJBM|*V%_#f`v)uGI+wxDh(4QcvbRc}Ga_Ve@ zR{6`OZ7&#w%L35(OIGH+@Bc)PxXO0`-c-uAC(*+kqLXu=Zu|<0K!>)J+DdP;x59T^ zywa)hdUd}3T43DLpwRACUizq#ZF$j`Q{6Qldev2C-ybgx>0GX4%{s`qJd=5iyZ`UR zpYm7XCUoadDJwhl%{mo!8;aUOv+aF!2jkw+83=HRsH9hjkMa}dg~p)eDN=QFa}6Yr zzxB=Vf3O0nI;lr_rxkiZ1h3xLVoMJam3(6AkgRTKwgL@7GfN8H0IS{xS!Dk!ecLP} zPO^s`K8pp+OLo_lZ`jDZelP=PsnID zVZ0ON4$b%I@|+(ZD}NoVb}(2@2|tcPN~zi*=5!c8uQ7VZY%_w2Z8k}dG32fM)c2?$ zY4Q~l$Mk#Mg4r>fW^>Yet0RHz-mZ)Iq7d8~(T6q8Hpn{pTr5eY(N{a$kZO?fBE68Z zMu4%`%Ll3zZtH0twtlzh`e;_F^bt~JxgEKg;!0C=PuO(Mv6!ltjPLbFl(#QP7Nhf< zo78>~`5K!-Z5(bI{-9M0+3evb0WC?(4-FZx?vx{|pSRP~xAfTmM0A-D3d+%qiy&5g z5B>Q#{r?16s{8yDNlnx3oSk=a@in`%T6eb-^JMv(-YnB++tta5NON3Neep8`L{>n3 zU_+_ih~`^KuoM4EM9s)L_S1O!itA7hTH;sU9#e8ysZ!fUGk6ARU_uKH{k0;%)86r} zOx!kgdtzb}InmEvb|?*O)wx@&hSk385>C8v>fWptLTD9fZjzM3o%eR;;UkGI4tcbD zkYQB#BtE``^SFx$ znl5Wr>hY8qT0U_`?3=Dj+$ui|R0({iDu}!NH(ps+$>F2mRPksQSliO>%8o{JlbX{cWT6Qr;8QE~CQVrU{Y{HPs}%YUJ8WhRLa7bc;FvnvxIG z$3&PmVq3%zj!@AkAFpW^W<7zHMa{_PUa5C=4m^3Bux89Dl(*0KKy-fmGdF^`!93tn zqYz$&0s0qn{BeI1*LiRgRFMHfB&94{QCuDHfZe&Ser7IqZjlCnOZziEL4;3 zQ!R6A>yM=ww3l{p1oqDT<0eCTNfxTkc~{lviWb>Fj7e;scG58jWjm;)d>KaO1PWH9 z{HVjU%2S#rLy~!z5Ki}?92vL#ey)l`$-!gj=~HGRDJhS=NtW`LEat45dt6qqENz02 z06&cS^Y_dV=&Ee6*xh!;_?GE3Q>#Df(YEGHRc>zX8PjbnfdqxQ28y7m3l-}q-b0LCY3d1V5VYcEmZntE?cG=G_@M>AhS;p6JN<{=qh%L-tuy$toM!R z5huUzOx1CiffW674l9)^LBwr{$M6*X07iOAVx)W9^0w)ZdUF}2gl9cFNH-jh zp;H~bZQ52|?OdJeaR=!SVmSH+gny8;NQW>gs?|oYvM`Irnr{qn^b-ek=@@KY$ z$F@&@(yZ+T+5nqi#-C`K{EqmmC#V4>`!_%L92#T(Voj+kjQg=OS75MJfUwgI*_a;Q ztIP?WM-uk0Z2`5u5#DvNBB67yp^*oI;7RX1Al$ zD>;z=i!b9Efo1ryZ(KecP!Vf%Y&0ain#kw0d$q4^_HuMg!}&GX5xxiJT8Pfe=JoE7 z$g0lhAj-!p);kP8>Z7DZn-vq{s>VNEZ+1S_?m?yvd{7bAzBbh=5^>W`ljtYyrIm8t zDXgx+WTZp*9sW3W&$0zug6E+kO_KME@GRyf>iT*A)7GR~fM(4PF)s3DfZVYb@GkgO|&%cl(Hy+fJ7D&&ISWqaU=C2VX=p>HnNw-ICP56HJ?SE z6^_8j`qz=sxg-_y^HU6Vw@Bb?i)btShL9wBh7iVpq^-t?N@ zp_Ortj8}1(-W)7=|2!!ML3UDnKUhU}B30%oE8jO) zYI|J$^nP;2_&)L_J_R+?*?#}_3u`)!=#WfweoC%I*@owbqk$`2nnj2LkGzmDw`3Y> zUA=gHq6J-(%BXGXkrLm(@&<>5ocoubQ_`Y=HOkTUd)71nvQ;107!%lhm-3!&q&sEG zUSNTtqEV9aM}FzIgE3*2$&BPdkhF*ZIOeLjd&E5^P7NbUp_Ed*f%3?^?}1|=))QfW zaKqqKEHJoZxzv+?3{HKMkuo`vV#Dt!C192|bZJ*Ma48T0hlN0Av;IkvQ6P}g0)ul< zSR>FHrLg@#s32JzQ}qqZ`4*UyYNvRe0BM_v1T2`9-b$h`@;I&p#_+Y0{+BoAQKhM9 znCW|y_X0lHF)}7ZF>m~aV1chU_lHEXAz}5}Dz(LdWN){qyKUHE_-{bEU){zvQ(pi{ zuIn+T|K}UgP86cLO>71PqePk&7c3@`@*ht#3O@y=xnh<0VZ-w~)G;{IEQ%)ce-iTS zMN2o*>%?}p;Ad4)K&+Bva<5Ki3xE;^U>*JaB|xI;Tt!}6E^AV zDdSZ#Gc++m@3 z_E-DKOK}(Ymg{EZxPK1@eH@k)SaeTt{apb8(Mm(>!iT+pQjiyNyUa#`S_aXatvA(J zUlFywc6lBC|G)eH5}JI2QWJWp8BCeEdf@^;9Rq`rw$c-_pi$?6Fc!#O-U6J!3Y^ep z=CqL$5H*rb(RPNUMz9%RTF*2U<2GS~K^1Yn6?weQRTL;4(8)X`T?H=MesQ?k!B&fB z=T0^MXOb}Jf2PULn!{83H$H_{(Qy8*j-t$gx!Jg+Z6S>gT1sReNYuo)Q8|*v!c>7E zgnl#!zwW;;QNAa{oJ5$tps$Dn;d_+~2|*3AG0{|=E=hSH36S%p=MMbuJJQeOZKQO9WX zg*+&H00pyo{qKh`i$`XMA8o?^0@{DwnTlPfIF=UvDSBk?713#i)ifY(DIwg)J%hy_ zH$O=p6etXY-u{0)p(SlaJ6mfC?1qT!{>G!>D^B@TcAU;?9fB#$Ny2Mw# zYn?Z7=H)+r{1<%dOlfds!h`IqyS|6U}zs3kV|RQ&7{^N0TnY1^s9%5(J$IWikI zhN=8}L6*H|??6#5j0wN!lq%tK*qE?Yeer>5V^2U4W*pls) zD~BHW3x|;>fl-9{Eo&RX3{QhjKK&wQgQtv1rW|o;s9=;1Og7oIOsiWp`cY!X=Xa+f z*A>$UJTwG8_kXs$QksgEXi#xi3QDuHv(-m_pTdpU=m{QwsZADe3hL%-n)Q>|DhhXv zZ~*bN!AO@YH4&jY;>sJD?kG9D(P6otW_Wp)PbEE*FV#PRFJlMk9Tb3;0Ua3s4INk^ z=aTEwadZ@&?=S68@yWs3$#Sc*HF*W$;3^n;4tvC4_QO(&cd>Hw~UqQoBR!ImY~X{jHSLVaL&X&`+IhiwT7HU)ErzlB+DEi!Q&kG5_w>Ge$2(QqZL6DvVYt@}y*-U0@H z1dQ-x@QbJG_7B|vC`ad_4QE2@6ET&s^Al%|k$FlUtoZN16E~>!0TeL(+1KVxdL{77 zbIfCSj5VP(Q#r(sW9LrE2=YWJX-TjQRu^GZ=@>auZUOiYBa1TWN&9&U=OFDM8~>Y& z^iJ#%+6(hvkKA8wf#_jU3@G$>&W?K{XEIh__U%o(-n?-6)TvX)ua>2*f!)YUzrY@x zCXDNM`UEFS|$Z*40dnq=2FZitUtI^i^=Kx*Z)U(SqwKjTWZopnbOTHSvsvdwN+izlzw1dXqEhz1WX1ZX_hnZ!nj@r>N_fXPqp>M))^#5i!RdmRhUlXnFs0%-huA8)m6cNV<;-EJJ4YwrS z+ujn$FdxW0=Da{9xWhw80mnb$l zY%jk-xFTtZi356oD7agmcp}nyYDtqEJ_|z$MLvG>%dY^I!_$5E5#|j~|D>LNRpP>> zloGA-Gx&urS@iDleUrT8TB)i=6Ltnv$ShN`54wlww?Cl2^>b0(4{FI~H1!viq#8tj^^3o?K_IdxEgMjyv`Tu#+I2 zpX3XCHVliQMU{YxHAJXGyE&Pfv>5C=L;*nJ2ha#yBK_ym++WmD-UMISxp!5cZ zwFmj#gRs1)Cl4AxyZj>*1LAhTBZMt+^3xzBSdrCZQ$Qc5%JeU&q03;NOQ9i;*F^`7 zz~*v$f_7G4tFVF(G9;psrlm!Q7pvsnse>Kuhx$;cSO@bKR1p;~rjG1vgVDT(igh0ER719NwbEc6|CwhVw#9KN4it9U3tGyV9uL4M z!qdymhU{yLLRNQ+efWZJuO_f>PP($#lKvOc|1kDcIoo*`SrbC9S=?80xax5m(YGO} z>-uijNmM~zz)juU+idUk{>3A7{z{kUch3g%m*lNwR53Wa)K|98OB+mO$4_U;>_tcU(0M$1dW&}Bl*I5oP{Jg=?9Sital05dtGUK6{?>Ga+rC(`Fta| zWVrY-CLfswmrKb8bQ~>!<%el>ZBo6gsj6W}N7~v=SbfTz9xmBW_0>EV)U9cSZ(^+B z>2OX5?u}1ttUaA_jjP9@S?ZL^dXB|}wpr2m5YfNZP2KnySfOF@&@*;Cj}j~=(-YA8=8D;K;iwvyYl(LB84x0`!mNMX85;pGfI z?K2>99#t0gB+3D&4C~;#XG~i9)J=}z&Ekwdes|hGSfnB)0xv8s1HnqN{U`9F! zEI1(59mx?>8zzC)783tj7yt3PzO1V{ADKBMs4x5}mI*@iqjiK6IJ&E#b5^2LACOqCC3`w);o8wm`T}+{J$M|Q|$wOv;p0)X*!U2CWJRjllaKrQa zs7$F6PhZy`x2d^&A&d-JCW0#!l7`i1?JT^GNjpjc|Qm825zt%miL`y%FguH_7 z#oTlk+KGFlRWMdV6k^+3B(K)>;zcK49U*ec92+T6%;3Y|i(pSkMVD)I#Yti-Urt!r zak-#A@b##qFzBW$*M0qF4@^v?97YQjlKFc0`?I%c*Y)Gx`f{CTXGux`;y!$z(X@fM z7j`!kq4E!^gedYVR`$k6os@^BH>Axw%ZRG8aQdPl-$i#Xr-vo!jiOMNyq4)RIY1n zG|mgVAk<@)-}3mpH}AA6c59~ty`|iLe_EgCQGa7zZSXJm#uR0L6K@}O&a5|2S^qFp!GLJa&W#c(U|@x`amq`iS^#8FPHOdnl1xZA zURqB$4yxi;rY^~X1I?6>m@Oe=l*DyBpd170Z!_G%M%poW*qSZE(buwb;j#DNObBTT zXxip$JUm=nv}`acU`@ZkaImbIdxwLPyvr5F6o=aROz`#|~JIo{UOf+5hs6GLvvH@Phd97Z6I5_i4vz{MVFbXJ<5Pa(xG}XbJh}SrBx> z^kTptp_oG}BQHh{v@oFOxn_9g5BCb!8TxjXP#!Hn2k!k4S4!;oqka>Ph&0K_2h z1m7Fz;azEN;gOHgc{}fOJ#<)AYfHF)vJRwtz z5h|<)_KU?HF>iOmb&Pp@FlYs9ngfY#o8(l-jgONl2SrZjunVluV;#pkHP8-by|h|V zXM}JMz{>I&f7rm}&-hDS^p(px;}07 z9d@AZf2K5p4`ornBm>QR80`cgG?HEa&4#%lW7SfQ6#mtCrJL7;==wy*)t#HdOJH?itDySW$ZGl~J(`60-3f!>=(f zQmF^p>P(UYY>2>|atzNAeoP^1nsKyOJx za(Yf}qhtf5qJ{3M4mCq?#YIO~7Gu0k~c(8vvB-A)H*VDZr%!Oh_4M z-Xh6O2W`d9FFlzZk3Z5N^KH0+d5Dwxc=&W7&F^Q)`$72ofgbm?Hgyvz>AfSV_f?@~ zs@6-*=*OpMok$p=HG)CY461@7$k#Yb7BWJEY#Igwxywi)S{fY z59^u+4?)>yDsVHL^6Ipzr)k%FNvm0hA<8L!7YfcfaHaK3X5u~i-p%h$Z!o}{h890Z z(81~;%)l%XSLSy!j@;M0u3z?YQX;2A34ax4N~$dcjEYuD;dF<+5Z7@%CHt4xKd*5@ zjR@n7*1UHHvMHbB)TV<#T`xnmxoY&Xzdd21t5%81CnP|RAB47gFrODH|7pXkBQcPo`Ldy_PLE9Sp(J)+)UUyj` zWs+VeE2?w&OqL;FlA@^GUv{G5LbLKYUZ4M*B^ALsK-*o zKEUSn1|Se4~%yClTxQ~BT7%t2WJ~H3+I5^sg8d+MwkH} zjSA60P){ahT77p!T(T{K~T!L!}fazeO1-5vYY&gPRa3 z!Kza|3`}}Go^9LibU!k(;AG$0(kM_RH2n1X1Mn;(?NL2P{^M;15ZntMc6zAPU zH?Tf>K%>OO9HK75o}_$uGs*B-loW|R=V_?*f}|}>)nKGO)XgsA@}(mAXK-Lj_5`QN zFbkK$YUZqxwi14iE=0{}g~?n@Fi0xqmRmpR!D`2z?jraR8MjXj5U7IrLwPgl!%j&tQdB!F(y(7WL^P;h!<9(a z2z9*wXwRH4UtsE=J8yr^IW*6$4z!jb5b^YJSQ&%02f?yH0lE6Dg}$hcs1%d?TBMW| zgLy69qA^~hLBkCLK<0nUf<=laWjh0dmw5pKf;Avjl{=-|){K(|m7rOO$O&nC;+9dM zzOzT{Q_rWjny<2CfF%JqwLDvzUNgCvU}W9Kf^N7AkF$vVSX;dq)y*V9;&54s_a*`QvY z33D(Z*n8Uuo@##oqvL;R=m7nd+vILQlsE|*b++f4u`*({Pu(0MU!@)(vpNlx5#x(-ICV-72kAsj?t74(HuT*9G5<%xhY4Rq(MXwv!@P@+ z1(c&fuiel#5+cqUe&weofqO;rfT~rX+B^6xDBq=9o@(v0WBILirV9-X+8o^6by+rqvIfipDz3u6;A}iNrPoQb z!w4e2uIwfvZX$jOYuLH)yR_=*xx%A1+u!(ck^LO;E7*?(a~^tECkEmX6CMeKMts#_HoN@v@n-v`cfSt594h+0 z4j79I!_l0r0+|-1w_%;($?lqBB6y{542ajpM7;Z6#Q40Ac~q*<=K!7~&4>i~NvK#! zj$}zChGt=E_!fue_D}PLT^1RiM9UlWOp3M-r{>qNGbRF{`M$yFo-H_S&bh7sGLyq%EhO zK1CGmFOXrn#)W^}xQ$v{O|e3b;By~l=jdH29DbI9mj1z_0TCdgb1f*5IgRAmpPP-* zn`%@aTjL&po&o~?Ca|5w(RQ4ma8TN+nTya3pNBfhpih18bwZDJJLd!(+#FOnddkZ zolPdjS4%GSEoEi>UiIpD_1t-0RAtOPAy$jPyCNbDBZ)WB`OjI76_RwvQR5xS>v@nIo#ws_^J++OnCp>4(U8LVBSg1Vd+9wYk+M?uVdg z?wz<9klR{jqIJX4f;v$g0S)cZ^z8c7dP!dib;Zt0l3tzG$CysODQ*qAanYCEF5gWv z_2E(Kj@QM^L?XsxQg08Mo@TX&`|etmP3vaY|ruR(_I}lbBW^%Ug9o4xZ=hu0^&eQ2wMMPD>@r^2-FOb{^ z>fR6pRiFJ;5&sUdtbFoL3>=9 zI$1}2SR9q^YQW__ zgyxMBzoY_=W97JThU6|xn}AKVow#n;2A&BcWNuT4;Wu8BplU`QyfjF@uCKS22e5XbXV8T6BNA%F}D*~Jl&K%(w)CB!&Rrk6!5sh#|a`}aEIQ5mHujKHOZ{#Xr(%MSE15{hjJRwO7C$PBWmi<@$7{;?* z-6S1Hpfmptvg=vT!_7_q$1nND5%AcWjR0N%b`ZV;8;9QfB9g!wz;Gy-+cqiV&2}SU zd-bAP9i&G#Z8;A0M!>abD>$&EOUS<{@mz0K8c8L>tveu$G{s>Xk$OymuExUFP;sSE zNaY*gM!J(WbI`5%r@>TUx>@0@N9NjD4>UI{LM~L53`ouu0yyeuC(tmmSheQYTwhby zs=`^brWd8gTJXJ(98NmW2t^Z)J3+-AY@L0O2suxL)VsPP&{taw@x9*(rW+m;Q4t%D zX_6$&6Bh+UmsVY`q*zje2SxIjp+}{1gse-?P2O#~b(}<6TVM@Ig-txhRaD8F0_$3Ni$t?<$WRF}p7Ff^yc$Dhx{zaVuU)`PezY4{1C2Gb#R^_H$ zV~EMUxT^Zw1+#TO%)bzoISwfV$p}1dMVj&p#_EBP9NluZH!RniuQT9>h&~_><>cJF zHm>7+z$;<})4vu*U{)z7uf(TaD>2gBaG)tg6dCY(^T_8@M~U|FE>QSxaty%>AW%aa zeP6bQS*F{blNJtRy|Di|0qw^0+v`OY0$}k@8^!B4D}&j;?XfG1?K7#eWE;DXG;F!=lJV1^vU> z!QVVcVYB9fVb>HKE2FV*x^&d+WdJSFC4WPkv014^g6{x%GEMu_Xi8EGOfb8X94}T1 zbZ{M(E<_CpHE`6rQ-?%4);tGwIGrf1MA}=*MwoM=nCa3aW%38!<41{nRRWLrvbwKj zavHo~oxL9oNE;-K>u2~-JYpgz&Fh2egJoAn|l4QNt=Lt&P z3D(u|{KEk)+`=`hxo^BHjQfk{7W-@!7wxa~Og&{!%v@~EnrAZ${J>_GHg_Y9h-Bdb zb7|SJCZ)8<;lD1NJp#ueTZjG&<8?9U^zIZ^p+EQQ_v`n!z1kIy%nru~&AF_;<$hPa zvUS=|U7twj1<1^e6&4W{+9vE%Sin&h5&Rz1wfU*I_q@C-R>s!hd(T0`eBLXWQ>{<2 zH;CLi2<2{3seRQattgXsI~I<$_GR=gCH}m!WY@ElRl%$_cs^|{;JN?_i8@uoV&Z0C zY+!b^cC6;iSYC5i??YbGW_`1r&Z#qYAD)!#yE-tP>j3R%9u&?t??gJw_9eeuVR~%b zKYs(SLqItI40cd*prZkBjTM4C9yq`2CdE8&ui65TJ7br8+v@n^&677$Gq)Y^`4TiK zdDisdPAt+fwXf87oc`0kTy^8V4+nUzyq+Yjg&SOmN;@Y-7~eOY zfty#3O)2!kNMHS9U$^3=e50lbI{6D)rm1N`%XGY_Yz`pjQ z$=W`If<8FRMW^vGwwz*-{`P3*YtLh^OJ#gDjTgm6r%Ot_FKQhvFJ1h_?Y!_MBq+jh zvGl+3^`7x;x9=P9-Jv>eU0Qo}sG_8;O^FUQLrX;wR24N-dxo~!>N0BYv}%-yEq1hK z?Gmv`ts;>G5eboazV6?B|39DSdGWmHTP3-!^E}SuIIrU%H^1N(Jq?&Xm)(H^w&u^s zhvsHK$caI19TTloYw01?Jb*p1do~HDzC55S&I4w5nlyV(l`As*d zzH;KRs|tLNdV1)G$JfcXyaxXF%MsdRK#iGk`P+ET;K4pu(wH?}g5GEix16Z`I(EuV z(5urCe7VIAfzZ>JIOMG14))^CP#T6QWO$DYO5@`iQW~oAu?t}KvPpBFhWs|-0w;5O zFgfN3JaA5!H(=d*F;oGzIz?+SkZruN=P@#aCb0uN@-xH4`ft#TYFNef&47InAD?`R zcvvW{;iR~-N5GZFp6m9=7@kO`1S&#%8YmJ$;^+QbHn;>dL&(wnyu5<7jRYVXZd#;8IxZZ9;iESXw2YCY!QP6{<{dqYdIA{&a&EC`}d$s6FIa0b^#Xc z1Z?YO-xnB<06taJDtz9EO^9F3O1U9&>NBwJb3Y$GL z16X1m2?s1}N{Ozh*X+&RK!G%^#$W6~0O!UXNs!b#`}vYE^`f)6njYf!3qoYC%Gfta z9Eq^{=2^St;zfI3W1WN1oWc9{8xw@IUlS+7o@vx1ncrib6Y>(X!~^-z=hlYq10Di`HD-wyY^-QZ($?&XUf+sut5xq)>qp$;skG;hNvsX%Affs z3vgGZeS{l5W$_=^IPc!D?1Z42?cK;yQlAE92ewOzQa&CjzSV}YjYM!NWBRw-TiZRlWTS%Q};&(aM-FG=ZJl)KJi&p1W za)g@#9`vo5jrvFf)m(0vmjvlw&*(d+{rU5Loxe@FzKt1U3J(dBSSb`2ZCPTnI2a-PTRnlMEd1HIM1i`0jNEk zrtUGgMnEKIHUn8JgXgSDHH;G=u3LCQRVIBE7L_-#_uZq>Ixz%B-sy@3R)li z$#j)}uo0EqZynOUk5bQ=Ud@tI&K$ZxiB(W&q*~2 zzYYuTYtSn;GRqpqG(l-$%HfZ#{m7JcQ_`gm*Y5wx23DpN{#~N|KU9f0 z{d808^Udi7pggokp6fLGy*7Ck`tTeqY(Yt`nxXlqY33Q*Z^T%=j-bWZSw)rvn@YYkzT2w%bWtINOt13Ywr`SF)W}_A4Y0!n9UT*hL`Hq7h=y>O&y(kTeT2^ zIf~rba6aU%P)fe&4NMlA$L2MGVzzmsRh0i;n}m{lU-SxCYvUfyywd!HDKh9!>S|Jfn=w&5 zf-X?1@jp(k-s*p7{)HG&h50GEA=Bu&_O*lS0>Dw?)(9vR z$jJ(zJY~+_*R6aF{yoPXDM;Pbo}gvi<9Pu^mYh&$^4d?3oi;?z!N~>O%waXc-yI@T zTCiwq=MK5bbkU}Ux#-m|WT^o1#a4cQrh0 z)F;MUJ(#9CTiS0@Hy8+{@wrkB0jc+% z){7fl1^b6WdrAtq3c|O#?V9=0J#H-ff0?_W@U1^(D@D+1;PNxTcKQvkiE7yXp8vkS zR_?R22ab`WnVvlk(V0($NN=;=>Z1f)QoZSYp2eX3)Uv@75>>v_}F zKGL}5pW~$M2Otf+Xox0U?iVo=X#op@?4fsxhRt55iJfn>n_H|t+Kcs}RJSBYtCyom z@9e%>N zL|tQ;)K&+Wb1ExzUH{9Yslzt@%aSwSE}TGa*e8L)wq(zRfqXXrQT6{ubYMvnfLCVP z5AGBkq%_bhW)4hvXnZlr3=J~oy$C0}tW5k@pj&OUsP%+nX0}^hI(;#+R&!m${v8%3 z8KQ6UOeVc+*Xs5l;mi~@R2Mfp+Z*4H;9OOaov?RarLm~>AlMH3Lo+BmNH=i(omBJn zZh2vV3sR1iBVJ$10v79^o)mZxE2G^@eNbc?yZym#$Ar1c9S_HAEGFQZ(?`iAI;D7g2uVts<%jLrCvZX=kr_MP^T|n!HhnB*X3&(uz&5W8hqVTqp9Vtx_L(%gN)9VV_!c=Gkmh!l^hgFQOyO871}<^)ST@Lu-z|MZlpV;c$p z9`Xkpvm44~ZW7H*G|i9Q52vGcAw8^KpSvw?wyg2iMHBzWuufI~{)Ym9`};G7P1BL4 zEqaODF4)r!h;Ig=8hzFpsT1-=m4gNaL;113T(f)?UZ}D53(K#=L|pN3EC|1&8x^}__Z%X z`lF~;Y~~N-;k}{vbmBse@4?59I4z`-V=B|DEvw%ToOmUf3@hJMPdacyJqIIN{&nv? z4ksA(@4l=apv9%_&pvM;`Rq66!VCa{Qh()t0M(ywSaIAfp(1+!8GECem*LCq^gDC7 zNqx+qPzjC6i{;BJNndg5H1TYbTPer222^8$P233F(ATf|->N(j3tc@?l|M1Z@&cmf zb`O_43|hz{R9OcKh)S%2UW6S4s?cLdb>YDRf+{^>wTicCeBk&0b%y9-LGTt}!FXhs z)onL?>m`t=sW_>iJdDYxIS*U~(BB{%uQtr0`j_ziYZYz3#@!&_u11gGM>tn+FlzhB zb`G}=1MtsbF?<_`e3con!#I#eewIC#Xko{{XDRAz0@=37iI<@75d+k*$GE&+!6DS+ z@iC^_|L3Ko#9826$$UI2{q8$3-JenpUEJoy_zMkTu)xosyIOrzb^;PQnpAM8Ek3-j7qm?`g;cd2u9tZ&{N3s4OW?8qF?#Qf|Ba-suf`eSy~b{0{# zXR`pm)&O4*L1Q4Zy`yE9vmU+2K@7Kh?^%k<7Ol>fZoAp??r|Df^#UEfECyWVO9sP} zg?+XvrG@v|%H)d$?jdG(1)g;_TGMIP_*Oif)>?u*GhN2tR8<{apy|r!Gf~y zP^xF67bQk^D?sX0dA#~*CdqDBXqJJD6brq4rFJOJzRKL{hZ<-12`rS-j*JoDCMoeD0ezRD8y{V8?3VGB>auvd(BUGlR;u)xqLS`oUALmsjA zXoRS4m>~}*+XU;+l48^-DEme_M`K_M*q8u`_}>`~#63|lBFY8>xu|Z`&7CQDxHSu~ z>uhfp7Z3i%=h_}V8&xHh6?*rd?N=}MkFOY+145sU6Et@l zR`GK&9<1Z&wYphldj-b0hWl0)8f{AU`wM2L2Ry$37y>v>V!KLXJX0ZnGIxXJT)60{n%NZOJBrm z>7udQ#41J}qx^so^n^KkHatwM@5bClp7mt(fno(kP4~v$kQNO>mHl~~lN16-T50y` z5L1}ZrO()nybkcP2O8<r%MQ)?c96(5=@qO49PO8SqkC56!-2#SEgTIHE*V{ zi7OrhEWKpad~C>ME`q_Q#kO!4W;(-fScb&@251G%EA=boL2iGv$9Z>!H@{QE#b*QF z=!$KbF=KWEbsnxcf(#qw_ORTu>7d0od(0z_dU1`_kv{xTA(VMR<8=-1ga5Z>;EN;x z`^QIoZ{pHociv8$o)jA5GupfFura?y(H3&`)bQGw_b7uozIqJ=^R}oBYoVm)=vtml z+*kA1@y^0wAdp8EV7_8nAS&$3ldCPH_y>S*cXg8k`&sLKyT1*ulr0rT)rhX>@FPHz z3jP8A@T`ryVIkcCU)wyoXU)hln%hrt%B|_q+@d@0ipP^XKDeT^K=q1OU^SmN<*SuO ztWb{6IUQiFGn->>{rVP>S)t?0RSXHgkbeg3MJ?8H@t@!#nx?gfsFvT7b3J<6@8D6{ z*t@Y-NvoAM+Jf9cbp{D*Z=FLJ%Yg`nnXB z4Hp@P!T9LqcWf$5tNc()^6B+c`Fs9L5x{CqD;%;Qq3ezFTdNrMwwn>C7Jy^5uV;8> zGPebUvJ|#6TNhQ1nL3pEcRKPC!qanpsq#~I@R>7MR+z;EmQ~%v_5d8FJKRqAYOOJO ztGLzY%2uv-zv0I&=~SVcRi=G$(Yx|WZiYMX?`s`myt7AtZ(p#=QVlU)2>AK#%O9x| zYESo1ozg`}yE7?Kaeu=nd;mvk!W)oi(47JMBe-bBSyNzdnbQR0_{>Q(o|D?ff=lf?}qdcmEOsqSMZ~cM3ZVM>V-1r?)C6_$M z(z%(VH5@AasKe#>?2&Ew2XnI6A%1oeRzM`#X}N{ zS>ZPyhHdduG(LeT0lL)7<$98Wy`Mym8wcygSz4Y#TzE z#ydY}TE#lVfXlw(g9E1M>3KhFK}ZL6p_DiAfR@pYZOzo%gH=g3ANTp9B2SuV%oRNw zm-cA}7@XLYr+7T{*dE_FOH1b}$SiV2tdw=^G1{iqWr!9Yu+au(uDx9+0u;6^YyWh6 zLnrJEnAQ<_XO5A{EVU06y&L*1drbD)<9osu6ijf6#IV;N%fg4b%2^B z2~C%G$OF1==hlE}_q6SLT(Y7ZrGc6uj18QhkwX!cjH1qME>-s*0cO&%aA%V62HB^- z#G@lCAJ}fmANa-hujI4$Wad=fk^3d@8|~cg5hA70-mPZ}t?=%u zJ5L96r%UeD9<)S{YfqScLHsxB+Z<5nJGxD@snYRq&f-B#4NZ4)Urzdk;?Eyc`sbjv zUa!ZmcV7`rf_WEMJv<$r{zd?p4C`e|F{Lr&*Sm9(@od(y7NTIPMDo~#R$>hjHsFn0h@`tR`c5=MC)?=OTnNC2K+%M-$ zHhG-U)*$k7sNv%)|9@-Lzl!@q?-w=1qOtnC;wCT8hd@#TYnrupo+OG7ezfjW)@W~8 zgxWh*r-+YoZre8xx!K`^!Y}C_AH%CXvs6kpjdRhM@w|n?^gu`jKncCY za^8f)9q}$d1uf2X;D74Jb?Q7;qF*j!xV z4#vd4PSGzm@$UQlqFW%R{VaTeC(PwR4rst5$PsQ&dFLGG9idi0|(l z>Vo?fsj#F`cbZ(wI6TNOkVE4~ptY;#!tsgV!)2mTx;vtQ94`JUSOLBe3!M;-L5dUV zgS$GVcvij?x|P(bhweOQ*DYAh*EvM9T)Rhj6``egzH~^6?waNYujJfcMmHF|v z00D9v(_tQ*-oVP7nTZtl9M4tg=Rh9@7v%DiiG%O-rKl=X05_YC1*u`vRk5>kfU1UW z6b{^ZEBky7>FlLQ`AJ&XA|=8`>Qzqzg9`^0dKyvx6HG6UobbQ%GQuj-Jw6$SgFuI6}=WVPGcLuF)1r) z!qWOyIppbyxZicco=5kDSC6X2L)3?$`+6$24;!QzohG8=8=4B44e8&;%znGXjLDeg zy-I`z2a%6ZYvhJ^ovFsZ(Ly8K(cpHsnmdkf4f7`5YMP}p&+8n1XCR}W8x-63>7?nW zRAYVG=LBz~sPjS>&+uHgq4LF7&cKJ|H@PJv~lrpQQ(rT(|bs2#3 zKtg5GSev3xORRl$UAg#UZ?;07Njyt8F&BoRCR)5Z^!mEAF;^((e67Hs z!?a@vJ*yb_C7#zW?y>js6vJS75U-9vTa%57JK$t}Ph?&_d7?hwg`vc}5%6P^ppSaw zgz>C9kMAtR!ZDtSmz*=Q$c`|uiIh}EKv+drF#3{Pd;W7s@=J}{Pur|o->qYJkNV{6 zR4-nxXNc+kxaHTh`@#zoe+^xX+Q-m2#{UxtyOY@78J<};E}x6mNE7o^vZQSbdlvTe zeT~0g?@zg$wtv(m^&08aKw`ad7mzl6K9|i>vq~%~n4@nwO^sK%poaAe##W{>Ftb;V zOT?ye3FS*X|7-tYiRaq(T!(Y`hLI@N?MDyB5Z8U$Z%aq%X9W?>-Pprf^I_J#kB8?J zOLJI$?eDRHIR0nG-8?EhNqLJ|peSPn6A*J<)(yg->K7B-&A;N5Q$+JCzK}-X5;}fF z1M;|3*G+D5cgswz9pZG_ZXiqU!msi9q6#yp*LZFT-g3->K}|OILE3CLtN^1x(jxJg z5h5s?we`g8I>aq+Bt_Zp{l}o+oLAk;T}C!vT%bj^l!nS}^RY^oc;EW$_=Tef>$RpV zlnspGL!+1pp}(7IbbKbZm97?tOjCgPt~@`N_fOcO*JNK?&1CgH^f|u2_UIEKJv_-0 z0g8efUU|CwBxVT4`o2Xu`bsWVJ$xGGK-ET3c`CZaO;QzoM>4Cs=VFrbGs>B@j;2*s zRSCNk#gjR_1-`e;wcg`p#@(?})?7=E$# zmy5B5Kz@C{f=dj(Y&&!3yO7e8UG|Bq(AOkd2M}nwfCDHNo_p=d9mtI2*shSxqSU+) z&vwk0Uu2VDe^T82&-Y4H!e-A%7n(_VFfg8Mt4q=1@LKd+8?L%>CXllV35gorr#@Cj zmPDV1){jH(1ukZW>veR?Lke~aIl10J`F-u-0dW-7+Wk{l9b8hr?D(|xn`!sDWdEI7 ze%qnul%L)|&qWIDT1GIsljM}H^H`$Y#WbBlyTWRh$Ky+oyn4OKSLo;eJ!8{=4d$Py zr1uBTe?IYkQAtzQ(Jx&qgBW&wEazHpE-B2-#M`cCGwL1KiAfGH!1?r}c1+Up3VYlB zu--_Hph4+bHszCgIwKoJ3Pur@F`8-?$2C?~D_fx_oLx*lYG_-0&+_Y2ySqyM_SMzV z+jZQbcRL@YuWmDdSAJW_Scn0$pVSH|1hof+-VS5Va69BuyZE6w9!UH?rxy zi*C)2J^Kqf_c53x9$Yf_C6u~zBKVTUCdEZ~e#rbV6r>PuaVVXB&BkC!nES*hJ~`{L zZFFE*EzHO7-SV|=Xv0&%=e@!AR8Q?g9(}^A7v-mOz;oNg(vm1s!=$)gH8<3Mw?)k` zaXCLxbd}3gDVLS&VPzeiizi+{X%hz*Ez+D^e_p)(8aqFtZ;*DjwRi9<>1^K3+<63l zCd{X2>Hc-{O&XeCQCrRVv1g$3f4czFx5oTNY@+n-oF;_l-AofHoukDbZWHeO+nV&Q zFpGC#$WgzM*2+$QcCg>np^2deJH%G5q9&{7j8+AECR1QB_#43}R-hF~03 zocm--U{y7a-F#~K@7NdXnC~~lLcJ;bQ?CbQ-%xe*?o!&`AWK>JLME+;iq1io?R^A= zdF~Bs=4O5Rvzg)$W2pBd9YzIm^c<(sEEa%4DYhbA`7Aq3#I6Wx6Pf`wIlUyq+Bf)&w&zw7><^1}ls+>jgr}0&RAHGRK)fV{i_=Vg^cRR9*nY zV%C}HPa4#>C6u09NSN0Q>md#aj`0w4kvty$#ai>gpylH-RO4kkIJc~X7F)t?Ep=yh zn&ka(sqK`sT}<*?8a2bL}Rl+44?M-;(YR@~Zun@MR2Nik3&=(w*Y@Y)dOEO!@I) zjkvh`ve5g=aDeve!mIPsW2(1FV|uo1lbAPXSn*labm5k)^D-!A2DE#o|R34&^PjM zV+gHt85E#kkM;)NND+)yhE8R^So(TGokZ7lzP z&eC|LXKE}_YFGyqi3;UqqBYmpc<%d%(n_nfk?lBT@2StiW^a9=8?Sr&Acl+{AH!U$ zDq_1=owuol{Y8E$mX}K@#BN(nZhNqdF6j{^IAIK}S4WQ!Yuq;>@44=~iTK9eCm)l| zO5T@S@PfnCcHW`uM%P$~Ww|tg^Rc0WXYMa%;40T0qp(+snX3na9Ol=ShYYtQvF2TJ>r@hp1@a_Lp50dsoHg5n<`D?SE6G^( zC#zBAO9ijG=pk(jtnV!~t}e|L@Scm4Mc(jwX(Rw19#D!uF(_dlf63W!%FOs$`NsU$8;b0k zf|{;R+RaIYMmBg=?^Z5J9-MUO9XI!n@qRS*f;L^$~QbwuAH^a ziFTqNB)2GDqwhw=yUNB&M6+!U^uUK&~H{zQLEFAsRU!=mDv^udcS<~p98LgugeIYjO zggNofKlKdA4n2BOuU8h7$bxAz%2uWwS+-JZI?nN0*POA7Xohu2+`hry^O$2=;LW8A z-i*v)g_?NhKx;CZcu~MnDPFY}y!{z927JfTkw*rEZG*XCZ}dd_UuX26Ord5+n9m(eNBir=xMEab>cb=Xum zr<`RMKje(^djEbA9?D%ooCEg7qs~^ZN!mP`a@BZ#H>&?+Z}Y6*8t~Dvc6+k(pR7Vu z9+qDT4K5RT_Wgx4Slo-MnJsNbHehwjClBtMzzv42a#-2%*1cVY79*WNqNKZZ@|32d zfA*n6poVX`pNh?!tpC)1l&Lpy>I=O5wY!zOKuL9MW)PDX{$ch&X4x@S{Y-QRk!qNZ zN$wg5XQRpr%}XkRXp8HAgyBCY;EmE9VgU_`Y9#kLc&ex6VD{El9_l;N8gpxN%vpSH zz3wqRA)D81+qFv6Aa9=?T9xlk6c-kKsdU`2VX%pssbTA+c1h@=Rv}cbzyfw9s{iTv zy~xXjGWB;4r5H;oR*=BqhNE3#kQ%5)Ky^~nl_ zd5VX*b$v00YCq&^^KDyETg;LY4_n2$Pi!`ptq;oRSZj4?#HaRc@27s!Md}^SZXb85 zdg)D1qaP_i`|@G?+N)(o2+!h|j!hbM^+e`%fBSU7$Yem#H!WHo>YJF4QL0^HXEPw7 zor>hOZvze=lleP!>@F@DX3xikBHO||t8QK9<*j!hZ@%&)2@OR&CmdWC)r+EnZkjhm zVS_?mis>P!cDo6iQ5h1Gc*sg*j(doGW`s>B6f+ zZ>~$L>z7(Hva+rJS^>1cXp^e6jJ)NY{Qi`MnyiKHq^_zFm`8BKp-+m;({>FR5AeCT zg(QLr8teSf55@?Gv52U05|VR~ThuppP1G?z)4$x8h^weywoM3_MwFbox?Tr#kWO}W z+S|_^V~CMKZJZ?9dfYfPWaFKhH*-c;AbIFW1VjjKM^Axe9Q;~jY9=+3yU%AT0PoJ_ ztkp5?Y2G;I*P9iEHq|EF@J&%{D%EjR+GP=T??~N$sCj_+LjBYRC8Yaw78j0AkYCs; zTJQL1PAPl5$;hFdwaI}u4XtP$`_rFj{pPrSz*ZMhiu*pD8{UAuS!X*Sof}O{w~=V^ zZ0xP{tbcdONovX2m`W$c>JShV(E~iHt{m6-U!~d?4mecM zJ5+xirE>Y|5XK-v>BZlK!Cvpr;5$8Idfh{2t2Zw)T8J;M`e)#arkTOm?RCT2a+8vT zvF^0v4Pj~UY}3QK)Zfq(-+84>661~KyZqn6H2USMY}R$!Kc-9M56LPw z`fpvRi1LMYImW7b|I^!)I0L#!pjdWswUNF(0Ef>9z$x@LT?@2^#?_XiLXkCP{W(r@OuY-_eAIW_$)TFAjC z7AqnX9%{VM!-SFMLJz(?|JxB@;9h+$45!9QaGN%jo2NqATBsSwJLnQ)g0n0+~o z&+fprbTHu}GIaf2kzG^Y{XZ|6pK!kG4pK(tB1Y`c5lw+)Ih(kF=OpLBvo>{|ujQ<| zfK1R&n=n*A&X;#-R(2ZWkzQH1w7o+D*0n#VQG~8x zb&0Fi`261IdTr{?or=FD*Zf8uhk+_!CnlhwH52C)a)7jtkr)++6IQ~A?niwd%_+dm zcLP`*$PhO=eq;6f%Y)rjGtgA)9(BjF&nm5MrZ(_=z`yDmtx~jEf2D8c!C)JCCYw8D zjd~s9(~K{DKk07j!rtcj$i5qIc*!M(j`+bo@ewjgxlO|T_F|ivBGq~+3YxanhnGjs z(JecB2|k(3Y~DvZriXL39WOrR{~E@) z!j`1fZIl~MJnZww3zqfETT(8|6=5G(E5hs7fY9chScR_-S5oA z65@@ohcD;SjRaKgUNT8bF?BrGjxVzFL5AM)6wczbW#rA`1b;)@hzm}k?-yRv7T@)~ zSCMs+)y|ll6Vk?O7n%?7A5BZU0xjD7=@A%wLGao-=auUq*^E!)p&jg_pvr)EVkF*h z)UCMDb{tc$IFlyyarkqkrQkK!%ErDx+c{Tp}tr4{^qYl_3k!x7qF00`xO z(46fj#ibNmh?E|I*_=;XA3$Zx>QZZD*@b2{*S3dVa%HF9P~<7WWfP_pKNkv0uWU|M zq`4>PyV~(Et;qSKl)=sG0DM2vyXkG3c&Jf06G}*cM0^u6>b1DfTct0*Eo#w33f>!U zm>jp@VPKG`?D;t7Tb1c_e(-N?;6((!GpW(ojofx#vdCmphqXt29uu30*rG;GjM8SXz{QiPDR zU&zH>(F#VODjPaIa7|z8)YY`wJa?{br+#X%D8O&tOf5d?PduYxY@um4P-UH$zd#S% zUl6pA>kZ6kd*$M1Bw(W*`pwk&N+nZ>+!Hi_=rK&y@ff*^_%gn>zL_UmOTWP*qDAlVITE7lB7wD! z2+qS_%0Cz8!3G{|^l}y+)baBtf_RJu7bK}|s19Wfst7LeR+%;i`wQ`8^GYgx)u=lx z-Ot#$ju1#YiEFGo8no->aj@a^>TL-nY{^0MMxh`Q>7Y{V_ILKzJGh6z=8E@lobuy> zWJ2-HsAOB1w9n#=b9MWBTn73d^irPbrz__XxD_P}i+dDgWue$B!C0Fh|1|~gxzDZI zE-QhW0n;yaqy>e6hVw73{=>T+C-p{6$uAgIEuT40=MgqNKJpur=fAnvU)I>w=>p$6 z5@Coa*zga_H;JPG4(z`1?KsR}o{dI2xx@g_WlCbkZ;db?kQ)}_y%xP-fdsuZkFnh@0%z6f+Ne7{sKht1hX=;YiYRX=T zd3c{E>$U!t5gQL7Kf?NL3Q{BDA{x@cyRbg+$c?CK&ty}yl{xN6vvH$u{}`%7@M-$v zNf#SWF+@IcX^iyR^>caL&*nb-mrqki(It8|ap!9SHWajdZHP3wV#FhKD( z#_oot1L4RgfVk`3ctRKpAGl_U4pu6FqU+XTunA+;uI9iyuU+?`%z8=?c%9zk+8YuPTTIC65H?7l3ks|0U9x0U@+PPVv4I+7Pu1pM{P`j!c{hX zVEeSw=1QcTnWAfO`RweW!P5H|x)X<8;M&KvB`)p{JC@U4ar%K=4}`B(yKw5&#p<&f z0n0MUeoHAnGJ!U^>K?>1sbF~V+$EyS&k)^@35}=+`f43S1=xYoG$%q~j^XlHN%%Y8 z6|!LPo(cfy zg`GU-RNFyH>TJsiaIB#fyG-|K(aTVRi80j-Raw!wn1AdHam_5>D9+2_G(%;W`l}zH zbbvVa<98FGj6_aP=kV1v@?dAp*RHz{i=T}+z(sD{v8r@W{FQlRuA^)I9k1!z9*khw z1ok5IW?j+E;7PXt=3?o#7|u=ILu}Fc~@tZaX*cy zy1&3y-$(S{%JK~IT)WUM;lY`!`|8bK)b*Q}Oe3ya6(qYc*u!Gvn7j*vm3RClkt|)r zdS}Prdh;_;hbgX z3vyX_j&o{sSze=XaQ8E{^>1HK8{-kcgmjd%TY;b%*+ICqT`h4 zFN^httf4^HV^r6Fz7pKpj?0bbCTpfkA%%ocJ8Dv?U&J( zs3`pG!J*0tb-{t6BAbf4!3-nF*TVkfZPskg(w#mzJA|VVak9=21#Gcbmy=o41wbrF z6vyuV_-R1v27!h93(ZmHi9|d@;gSy&Z3QWWqMQyAHdqCgW!C!yHL%tUvFv`x@_RhC zVy)i=Djf%KhuAWytU%K`!*>|#YT2@oH-o=tW*3tL`$N`r6hvI|zRo|!^uEOpUoZ3V z0}klBcwF)7(7PIEWp%lQv`*<3v?)@Gs)Rb*xVC*L=;?N4Lc))-4^Maf+A5utU5ATm zms?XRe~F1|OW-d&m5=yGYWZtF1u*0C>yJx&DY^PPq4TN$8}s&lI(Ec7QT>=*#H-XZQ~l=7Pty~%F1h&A zRS{ouKuM3=Wavp8nA~EI_0!4${W7al&n83zI@KWkN`<8cXBU0p>Qdz20td5YP9|u( zt}~Yc99|Y~)w)HT(X**t=c=s>Z_1GH-EC^(km07Jr6vQkRn`3Gh~sg| z#kI{DbEx@9A#24YPVDmgvFYg({TcQv!cWaB$x|U4Nx^Mx>kG+t0X+#wLe;JIo^kV?{1qE z`(Ees?|Ay#xJmX>GIIsg39d-H^O@if!l1~eZ8KQn*klia2XL+cBN;LYpb{m(Y-W7u z#+t66hon6|jBPV~UtYO zW`qoDGmlicb$^K*HadKd=;PZKDgM*!7=I2sJ)2w}%x8)5eFsx{_S^+&XH)%Uy|$ae zkns1ul$z3$l^cIx7r5hqD3P>|l^8zckqTBYKf}1-fBEn43IP9|b>HV4^2liJV3lj3)?ebGzkb}kM`sYi6vSROjqd0`Wd-Q_ zV2tmDwvBaOYQU#uCWS{ifHjyqPW(Sht>N9IpG2%xo$h__vT>xI-JOguS1;R%1eXQF zMWc0OO4?e;i{yHnC*T+K5HuURDfNaqaGr;epv%h`G@mvc8t$Q04U23TCzq?@OCik6Pnpm)Ka%%psU zP(szAl6LU1$3`nVf6~$G6Y!>PL1*Ppv_ANyz1?b=dSK3d+Oo1a5~VaL|9QDx)g^ev zI+91fz@UjC9}#3%2<3!v1TQHbA>13(CD(@Ye*eui4%=mmvDwxtk<$v0LAqP7?=EtU%4Xe zuRdq%UgB`_Cx948IT>3K{HX6zVtSt|eDt$zSFT@MZgIoML`b?;{2*!@`2E?kpo5v} z0iR?`h;Ym5V$QJnNaq42L{|W`0hF8Aw9XYK(=COZjB+S{x^{YQAvw4b9`eqm9xkTd z6Zk^^=AI2`^GSfp+qn%UIJP5DrIt*LL&=XO6gqm9)t3;tV3*3RU>lheSV}^ zmRyI3Pm!dr_dQ*^pCp4~bw84mqH3MrOHP;-1UA?OjB^i)YyDX{7v$zTqEAh|X-1dt zyiUHth-U-?^VTU)ZO!YiN1W9KQERErfYe>(Grj*}`rk4_r6c;8A)Fu+2YU7)Lol0~ zFUXbQ`!~J**LTt($(($;b%>LeqRrhX5&MvC&CPk38i0qU>g3FbTffbUJS`Ior3o&>%l;x_KT3IcnsEfa7zfoo+mbbZjVD+0Ih zYFz3)Vf^cIr7f&>FyXw~y<`vw5)%s0Cq}3y8YLvR)6!XLr-!mn)z0H19tBdQpR2c` z6WhCtAY4;FG{&g=OTOb?^QK4z}|mgC!YKAt2&c#pa9Bn?}32l zI!rR+a2K!FQF-!=5HqZ}y~UK`7#mqC?#s*yhe+p(k*FWZw$K<(*mYv#EPe zaY%)&Kp)gAd8Q8e(3K7>3~q|bn3NsMAsXl-vO05?818IrCQMOt^^~rd`gd6ES~y8*p)^m-SbjT*wozetAii*< zKz{{5&#CXdZv4y3oB_Ul*B_D34*{8OjI+y=Ww-BwAnOB@(V4}uVQj;X2;-fhECz>e z;zB5na!GauZ`X+UJwS&(%NcS}##x>itb9_(Z4z1*ax>M?fi5{_$MJr<7H1^xuGCo3 zcX4>v4gu!LfO<8QEK{3yriWQw{s}_a&_J0u(Li7- zU(aH)#@yHM6{XYFqG?MsQB8J!GM(5!ov5?8NcCmc_ zg6Dqa*~V5^d}Hus_9Wa zMRU}1+N9`zy8!0#2*%XN&o@5v_jPmKKYT$c)Y}Y5FK9JoyQwVWRA`7V0FJKOAWtde zx@iV)Di7Ls@Y~cD_WODAXP*2#kv4j^q6yh)-L?a~!5%Cz0R@q(=ts=iY^v1H1eX(P zVZ$FX-4ppVIb-dVipA+u7jo}A(2gKIg@bLxUpy#js}=iCJ-DJRq2zo^=5WpPydhCv zc2%B2R3p)9CnAq9?y&hRzoNvC4`bmtvlDdPeW^DuImCPVWx`pc6Kg=#1BE@KHuGxS zd;Wga{M!rBv49Y0dR+DQUn2)xgpBYHOgV!VtW6^Q|SHA<63UVJ7DJj8h%woDjF9c-Q`MC4H_o7D&6{n6_q_;%2$igY(Amgy-}` zm6ZG_$nOMXiYuTm1xvWxr4c!P;t1G08ldMLfyJI!#O7GXDEwKZsDG^8E(5i6Fgtvg zr5Qwv8gT@+x7LAa`=Kqh?xu{4Wb|*MT&F?AVr#hG?AuARXQj|AW=jFX#%21?^QvHfC9k+iiNHsRRQT$dcA8W zLH)kpIWu?eU-!;EGw(ROJA1A5tf#lNHtXfs37Tw>Inm{f0{Gw_*AJ;EB+ zPBsNF9PU)EprY*ba?Fw#QR=C($tRP!v!uRbv(>B}F;HlvVy@%}Ard%j@b;Z{lMU71h znzc^EMKVN;*m`G%>zUmZSx{bE)Uf|hcYV<5yiaZXTu;S&+A&Z&czwK$VlO|a3G|S< z1v|ST4)j+kw>Y%kC#U`;^PTIxE=6GD;+sBl%FWa31qvIlW}e8o#`$@TX_lC4Xr~E2 zz2C`t$~zA2Vp+S}+2fFux7uA5AL)XF z+mSnbZ(1KAt8WneFDvIud0=GuGstc&0h#*T`e7;Huaw1O`3Qa{hBbn#j2-B*#95-841eQ9;mEpamThCi6C88a^XE@)wTUiXp2 z$)YL=`MTJ9hx{6{gndDky_5@NOIk1baP= z-Fbb+Jg1*AiL_V+%p3!}0@`{;U)hiGdg}(CZPHCX0m{Vhay~?XOmg&CYGAI@&|D_7 zP-I~EaKw|Vu3ly9^@V!E=1;5Z%aYD_{+xV+OliySy1%=yH7>5XuyQ>Q~F!(pZErE~2HkB9k$k19Pcz}b_3_h{xg2hS+mkPwfe zcs$cmm}m`k$HcjZ&2kh4yyH!DYuGk<^5T~>Xc^jZDITBs!sGL{EM4wy^evjYrVcE`>>8%tY_7cgEqY42rPVE7h! z&zZHmlJ#%CmWktf~O_J@(T)=R^LQqikw`!J^|wDQp3iT6CGnZ)?HH{4(rb7D5I~!5r(f%W!9`VN8^l&x_*DZ zpzBvS*tzn=@Yw3--C&xtLKbLMxBBP^48lh4a=S%9#-*gMbipD{X zR`g~;IcMslPO83<+PKx?uCyyLqp>}-kDt_k%#S!_w(p z$jeo(ohJUPH66YKi_gB5h^4Hwz4q)|Xy%h}AoR^ODw*Vb*PZO@Zjo)obb9X2Jhatm z$n)>{di7}~XkE#4imY@@*EhXuF9ut9Tynv#bL{cuR)rPDz})utf(ygNgE^b4Q;$33 zix@w=-#=OY@sm_Px>dd|e^Db^@nmxxS&lvZ_?MjWh3luAHy`x6O5J>2wmARsu$bqY zx(7?L3#&K}hI?l|)i4Go(tS{`@cV7wQ#B*UvuSvPYSm)0bp-ToJ#zFoK{B3e;OicR zBB40-(?*zmC2}(rac=%|&sW~O{#G~MwZEgFv+hF1kH@gzrQ92fg%jrs0+z=2)#bjY zQLS-V6B1i}&)0EH&6G;QxzEg*W7|~bXBZEI)#CVE8y{Ej)#;L@jtId~(30KpWlTcS z^Lw~787vK?q?UWIYw0pU2eOlLf=vZJO}l$coqCeD^7XC)=zrwD^5tcc*b&yN=gSuD zdSf>hnz`(z_hs3Eo>M=LcHsS;EhD5It|N7N)}A1FQGyI!JD z^W7n!J}MyM*RKLwNxR>-IvsMiO4xoK^_jl*Q{A;z+MUOy>dPNpe%4MYQe+); z-g|Dsxagyjb#nHF*6gQxQO9B?Pv5oO3$09nv;|7COHjZNR}uGDKZJ7>ityyNKD=$o zNe;zi>XSnw9R*%2HH(EielMS0H=c^n>Q|ab=q?hg&Hqfl``aqu@Yr}ns}41;OXsb$ z;&CwFEiuD^%Fbw97Y&5=h>W$xFmx;?@Dy2n20dqU1}5xaO#+=W$_mw{^~ zPS!6SI%lVzG&Gv6C%UxTwA^ONw!g683BH@rn>s6R`F45bf?LZ4w|f3sLqk42JM+}! z=jf-aiJdFsR0`V~<&|q0Q2PRJt-u%On|1;W5(%6>D zcIbIDelj??*}>p;Wy?`vHxt_#?+k^ zMN)5@);SPhYpr~uRJxrz5Hf4x~2d6zz_2j6Q+KTuL@0yhKz_;$AJ@nFDvOn z+7_$krxMeel#<3rX3k$MoHtf_-PBCKvKAfv`c(X8fPI0_GHY$-1a{xS7@lZ(E#T5p zdaBHPZ`xUKEzx=TGVn!w!ZZI*V-FO$3MFz)hI5>D27`B%7JD4h-haJNFSZiU=a3dS zdQbfEe4u3FVlA zQ#?p_PQmuq)nTv353jx)YbD zN9)oc<=S_=++OkW^xa?f?JUfosa@r>!rWG}>2`IG@T3af1@MP!1Vw`)ykHf9lIRsT=5A&Nu%ltBX zUYN_t082^Mw^$vCPa6-)0Fl?hlt+3YKwpcVe}2uyYBM)ZX8)jk*Kc&;wQ!8H)tv_| zbfm6j@J&_2-x*pd&@qi_#KzJzXwN$#AfndLQ)>+MF^fCsJh$k#mHJ-k%pn9Wr>pHM zGPRD9+gZEUaVm65I8(}i0r)bNbY};x;RO?sH-p}QirVE@Yi0cX4`l;G$0tux0c^z- zU}pvUr(bp*5VmbT8(*8(^`W8`!~T@v;saqnJZ2tzkWCmLj*K_#7)L3rNXsl_$L07hPiCA<=RsZuM3iJGab(v< zOZE)|PZ@M2o3yNwZ(mIo55CxF_3h4ulmTv35KQ;NK%Mqje3at$bcg1R)-T>Opj^N$ zm};H6U~{EqfPzLZfg&$ez-RJH9IK@BWeW{OJ#Y%?p*tL z5{M5&WJM=r{O6KEgN&=DBZ|b;*?^!)dSPi0nC;BIn+XoO)MW^ zQH6^KyVQ``6wcLR&eIQsE{r7eMxOgz5cu=1nBU~my~&?8Kx?C|(kpfo7Js%RP##aL znnRmfi=*KHgFG>;1C2q`?HVJqa&wv!LqpuJM_xsk zhddu%Isii~3cGuJpdQT%4-c8@llBd(aO@G7`}zyM4qW~uE>mYx%Nn@x7c|%M2pdCK zR%9?+>QbqJk~^C&y#g1$>^%FBGcpN#_WGdQ!>Vhf=JsP-#ji&>a)OVyb zi-XQp`I6o6^@A5EvNI+ob=A)tqYPMnTWGnlrPTvc_Lj9&-QdMT2yg4w-IuGWM!jxI z`~rnMal6Y_{8!VvizcvAqbf-1b zK?Od?rzk=vpfAr&sRpEcI@2m02Xqv-m*PUwXb$^FSh#2SS<)TJDMF%ccrfUMUY_%S zxYBcXuMY-l2)hjm=Xi}ty}*R_`(Kcj(|rmkW{Z1Tzhwvx&ugkA4cwyLiRcs0)N$^X z{a#T|tLSU}-J(xy)B#j~&;h0e_dVF88zFtQeez%h96ECWSWyhz%$)>dz5^Wtf-xTw zTT^dmDli#ck(+x^-s+SzKkQb&uAU^}`p}cA9KY`dPn$PF9^An?N#~6f2JJGKTM5eN zH2MOfKc-5GkKQX|;5`GH)GzBZfC(`V$wO)C&I2J!?05^Rzb&t$<$1e{@?51Br++$( zG%0^QmYmfrV)ab^qfy2j8?LL3_9O>URiVLs$%e0`a+i<*Oi>2lyT`o*2^FEwAU~50 z@-)VrccX@5QY2Tn7}BDI|bE*zg0aJtkmnZT4BhT*rt`n zFoWZbE-jzuo!WR1s8agU5BIXh*aP}J#9Uup?y-$lpf5b6;G7Lw%o{^?O+ZX6FM3V> z8w@nOu?{t6O%NMzPa6Z07-^&i@nA7sh-8FececY*#g4_F5u;N=*4l}JvbitW`)ykl z?B2E{W;JO913^&*TyglO+K@ZE;0N1?op$0opMB?6b>_a8+*z9Jl)Ax@s{wR;0zS#q zI##jH|6A&XhM;{loAqKnF3QatXzNn}>aOh#RmSmB-hatEd zI-B`kDm5erkPWH=h$~MRmWRaI4BjeY%?mn$e)Wwv>~13-$p)%_Ue<@fks%krCaRmI z9uLrIep3H5IJPwR)zm6z;%&V8=ZDK-TQhFWYcCcm4#E)f4DR5pO4E2t*FfOLnUq?e z(VbYkT&LGh>w>_kab6#5`&QZ4w><%llEk^0Nd6A?D}bVodCVVz=m%o2eNB&MysIX{ z!q0d&d2;A7P*)8=mCZ+cyT_@q{(Eap82Fmx2+v{1jeryL_yL$QkIE9I&4g zKcs0ORa}af+x5NBBrj?Xj>ZUNS;c&j;t@Wmj_v@c;ZfMiP2r1SO$D8`gF$h>z}b_Y z7y1FdUwX#gaa`|bNd4Tw05qoVP#U;<#Wc~%ZsYmu_iCS={Gaq)Yqr!oIC_W+*pB6e zp*9Jk6r)60&@nA3IKTs^O>GAHWW=@}WJJN#BRP3!^b|fVEU%=mb(C{wb5L*Cy78wH8W>!+a+pGV6S9Gs?E`1@MYl3-_1s`GBFru z-_Q-v5)vn_r_^1c3VLJA%&VL^h&z23fggK(>V4Z|xBHS;tbB|b?3*4*_#{nNbm~P# zjiGmeu3w(})+V;MI?!1yCD7{Vd8lVB`|hyXs+_nWR9lc2b-g(a?$Nb@h#v)nL-ta5 zmoiW$l6{e^r$jAvo5IW7BgN~K(P{7`-mM>pY2R;WwOkb`2NzGb@Qg-lN8UQv;^39v z3mr$}RQ5VdTGRb3h-LyFbcKLb5tc&@X+Z*Uyt*-<9o6`n+N$gezj2{wo3k(W*9BB? z0jYsFi~AZ@*tpzYw-X!&+x5r#E1xq?4{oSm1lP^v+Q`ee;n+RjWuOUgl7riPxWvFg zoyEq@xyPyJPEEpON-nB4FMFg1lY_&gf_1gmb)9THJj&t|6Irr<$Zdh7R|CsPiv=Tz z=i$I!NQ$besX~rvD`M$bQ!ilIcS?Umlv@+nVlz?0FCyxq%rtiT;JnQMP>%AeHd z#w(GTug7s%F6jO=xN7>g3%jWX84gd7xW&Go4wsk44KkgAD~&)GAO&sBZM5@{dS`7T z!cbM>p*J}J0NinqqloNq>@+3e3xj6^2po{!KO+Dl-BXvo%xr_Jphi}HJbyREcIus; z+}zP-uHEYs-7iE6q+W}^Swi4$BF7)_KpJavx;#4B!sw2I0ks&_`itYh-{~D(C}1UR92J%ib`^H;Mv^Oo=4d_1Ew2SqmL<2c#7=|`1G!pD^XKrQn~Dv-s#i+8!Rs;9=cQL@5tZ2kSeA-jDhd z+tt&}4@Za%4xO&%2BIy$=FI^uie7{VZ3Tfo`^D$S$cRI;fsiH*gz=D)3Xt_>07IXB z^vAcaLw5L&vx`1N8hmK=U>Pt72DJkRt ztT>r!(Ys_0Tu?T+FeE(?-g>}+gHP&_UVHGs5Yv!L9Q=^7*s9K4qR!2eJEmp(CgE+& z8hJ6ifQgBTd`e!>kXpU(%;` z7IY*|k*6!&aLwlr^igxppu%;jy!WysZW|81X$?FXwR3+^iv{q}yu&c1KdL=qY?{lP zxDc1CX<(27uL#rLJ1|{OzO~U0^jf_}>IMlS9DoRQ;D{xW2?2d?@D6=Vv{SSVzw8C} zki8l?GK0w@ynN)y9;&t6w6(Q!va_>or~rZgZ*8xKuRFl&2~g<(nV1ant!kyhB$TGm zQcB}82@>!@kS?+V^tyXwK)`vy9&Pshy69B`FZhmFIkW7Ju4W`sgOjrO!gCXUpE+m$ z%J6WiX!DCfhv5a}qhmjbe2V~ImUI?vx@+>6>OlXqX16camrTb^*2$2l4ZiChv}2`- zuGGIy{*_1-GzM_(J_$1kraf^*hGEmJACiZNc73{>)}>D?lA;O_1>{8;FHqu^v8vIn zPOmv?zvt&yfbpI59wfej_8SVGh$AAt{kS#}*nBqWh?<(u5nh$+q>blcyRWOvxMXy0 z&}-6+DQEyE8Jgam)GR~98JO*~71+~=8?J(^SW8#?4UDu?D|~B_0cAGD&PwJ!;v-+6 zSi9LAo)DQ48hIw&H2TRi*O~9}!ikBA+8!Pr@yExDZj%)m)<_apLq5k2nq~n2R<||g zNC4>KC&|?fQe?%zng6d(RMm*HL&DGwRWnk`KIh~WYkg9S_9E_No;1hu>bB|>agO;& zU`+=PkkddhB;kxJX35Mh{n4fG@&Mo5m<%nNO_ID~el5(#xwFcSDz;?h?+ZAATq@}E4;vM=7REPQlmN~xH0HLi1B0> zXIFaPw;0uBQW%|ma_)|f?Cjz>HX_CN;d=Y>{V(bBw$W*hs%wz=T?1ZZ`jk1)Al3r- zbw7tmGy+iIq;XaXwMkF(kBf+~SU$oi;|nmg3g?EW+OdTXKx>dpN$MWc<8c^Xm{Ex^ zU}hIVoY8(MvO7eSVXvmJYL zv#{7`RFQc00qq5&Nq%De!UVW#ai;>^JZF?2OKYGde{bFrCo)c0F}7Rr^mH6+&xM;& zq*#p%ewcbZ;<;7-3wq5@W>E~&h-uy++1O4UIiS)a8S!O?MoZ1R*9`YJ69Gh>f^82l z3)YSmJn-I3H9w5FsAN=L#h^*kYpEMdT;^RCwwv=b$Vi6ib^oE?JH$6G$;GDiHKXcn`&y&PNFT1UFA>WOgo_Wl$7J{1N~o9Q($X@ED};iF0G{1wj4e z8L}rvU=;Ti)>K50G*wtQ_6eq&tZ+x}-EE9>j7D@7af3r;SmW9gz>B6uBocKzKqveJ zWcNDb-IaSq1o5h)X0dj)Yw9H8qTV3S?QcGE>1)ANnwaS7UTsoNnUUeWjo+DE6GAs zVm>8F>TWhG7G!WKalSaS zS}WcDqGRI|0W<1}7=cwq(%{QS+p1hHbe`|aUi}mnduvY};xwsBm7I^>>0eEWqAi{* z>^8f&TsU_pih^KBN6w>VFs!%nPGbDHF8RUBM$rUVFy9cGb8J;~IZAtb#a@S-I0Y`} zSe!X?_d3lFE#zfxjEy9<1R0rQJ-Tev`Pm z6l>`}Nb-%4Q*AzbtU`vn3^U2{Jo|@ONCf1zqckEn)KV8B7K^@d7xyS z<`i3~K{heb4PeJC0o||e_Hp7Q@ueron1QdPx&vV&Dk2#3!N1ujfIk&MU|1_uwVO$D z)j&28_}lFx6G;?+8K5p@l^o~M-26l9ae$i0*>9a*L@+&>OD`mR&vI~^7$`UW{1tfb zf5r~|W0{ApMLqbV50D{{S2BL*;X$IO6=Mx6GA{7%CSiE%6!!bxa=^Fbw_!OgBrN=3 zanaMd>aqGi+8+gMX-nOnpvVA=q$N~%PNNC5PM+IN{$pX$C>>Z)eEoMelPn4^9Vf{> z6rv;nV;Hb`Y!Cwq3@2jRBPF7^FEiYREtclT;2i%39Rj05Ziy3i^qzTS_Mtnkm^|MDa;73Ats?EH3i zb`Ox#iL5i5w+2Ym#7L8OkQ)$P`=iMzd8U7bC<98we9y?SB|c)*!wbKCS{0s9aGQ>*E(Vi-R3~aa0w~~ydx#m*#xD~PJVV#m6+E8KzoO=I`n~dBpcMho zlbyJX^TbCX%2*rcL+Y`Le?c%$`QUoD3d~uG&ZUHT9@r!O zP$cFlr)6jELiYnlR%^?kzyLxHAv{%H1~i#7W5pGFUL>8^cw-B+<|9J9!HwUnS~UPj?3s z9X=v3$Z{}|XtZ!3X(9~pC5&t>PXIj=9zMysn>gf*rAW=)gL8oV=dE1-GM_8Y=Yo@T zFETlwa!%ema`iuX14#~gr*evmKPC*k-ERt#7g5PT9s3Bp^&pKHdW4_sE>hd|5KK>c@`KeqQlu=QWa9Ji*xrm- z7Kne()ZDuO>+qi8!|s*vh`)zGg_wsLiSC5Dh& z6AOR%Cj>>u)Ly(O`@M=c5QkLh5rS<9T+awx|4a$e;*;y`ZhQajha-YXJ%&NOMWc=`x^mGHb9{# zZ0=RGh%7K~26RR9kOe4V{N)Mo7w!b`1?+K6e>jO?&@jS4s=eBWE&lUmP$B^0HW?{y z`X}NhOeO#nsi*iK(vtuh3I^!g;o>p>lVpJ2;D)yJ43;DJUtni|6ftYd#hi2hJ|A)b zQv`8xeE;jE5g7r04g3H;ul?_u*k%B<0>zl;e>?^I0q!CCk9)YM>Hb@D5cW+V9F$dX zORjP`C@#9d-}3tNe>7==w_;=UP{@1|GI)%HQo`+wxA2*fl3T5!~U4-I=5$L|8p*;Z1oK>dp-7055Tyie=; ze_#pt!hjVd<6mp#{reo>3Qh4Jju^o{lHq2+58#`$|FLE?z>@!g&!hkO6kH|&S}C2G z9skim5K2a%JT8mEXHpLE4~(Ref)~@?J_swO8v>k4?fXxovz8TSzWZ?FfTt9Y6o75yWMV4egBr-%O=d-I@vYP1_o zC-f)F3*NxRfN+VX2iviCte!*&P%Z$^M@u{S|HU5gr*TF~(Fa}rdYQ!cZC-#MW)c6s zi^1ChpgC`Ok(~3t;5>th7k}@oCE>6J;@>~{%)k6F4=N~yQlBmMqIBXs`9L4GeENe< z(*Ci#fO9s2?>Tb+i>MIrX*B0|6!EkFi6vwRU)t_$KM2_p0i3gIdz$(mHw}Zn3}#S0mF~U&5Bn3i9XOZZxlFM))xXcTAm(e$ zQvJtKBx3cOFffgB|Eb3Rk7*=jPX18fzntnnbR7CWrtuFhAj$kcrtz=NBmDLM%rv^6 zNq6b3Qr89Eu%U&;v|)+SUvq>~2Rim5yYU7iFRr`DG-BtRp&@;)Ydg!on)sbQP&=VE z>`|C!SVfD(goMMiJ?0W&<_$rC>Hk1j%Gfp+U6#qPa2&YjTsY$*6Jy1}h0TdHkBnm= z<_dysZ)#J9V(T|AKQj<*3J2vq%q-9@UQ*eiVN{Mo5>-@^nFw#Te-+K&*^eAgk2urv z;RDY%uUZfJkYkElA;*sP=`TI zuMw5Daks0!j~LBZQN2;TrKvEUBl}eaeH~_a2Ylq7205aQ9eZ)(y`c!u$u=VCb6v+9 z>1;e@P$K8kNqU;2X27l{$rvd4ajA zlDadn_~P*kK1mR}U|oM?rj?nm-S9d0pq4P=rZ8&r;0LX)^ys5Apc_I53L7%tmLbi6 zRU7?6P#woZ$kLQ<>yIT7wERUV@LHkE4JQ0w4|0C%Q6%da(E5iU*YP6bglZ%C+7W|! z?sTZwh&f8Wwsm);w3ATG!O2625EtzxB##h=*s)g3LAFSBG8hXOT#MCYLe8t*3?TYD zDEN&XMwOH+1phPyc$CiHG#>5{4Ax~H{BH+hWPK4T%tKjFa+aZp)6l_S$k4?y4*ll4 z|5}e&gkFTQA+2~~;u)n0OI6w@b5#OV11-SdN%#?CG@gZWmOy`;3UWU4Icm~^0s3F` zH2g+=Xkf?$7i5+(r{FJlKN8CM=mUU(UWopl~ zAPKEGs53Cd;3G2A-h{$6x)m_vufq=~3H}qDA3tV`z7wg=gT+z;Vf@&Ye}qTB-2PKbr+|RKvx1vmL~;S2d8D-6v4L_vedak`h62Ry z5(wXz$R_DdZGujABloTU{pE2HK>$Ik#8|7%Jq9R`whG+KS~smoI>sSoxtq825}`G@ zkn`xNTgvIgP(FiUXkFM_RqGBxEpwl{#ezPe>= zlB_sb0{WC%T3YUHXlZHnT{$XAj2DK)VcQ=&TQ!6>*3Vm6&6YEFutVxE81Uh52klRE z^zIw!9NhG^!1N&p({lLsdI8WwpmoTIHl_wa(5H@wP^da5JuTHTyDKme**5VcEhcI8Qa zzGxjbZtO)Yj|OXzf$iHu;p+hyQfND(f!--^ofiLe1}*uZKzX-B(4<66X(Vrd7X;}D z=zVnFxmqBj0@vfL;CZ8LN{}96AGky!AvcDM#Kt*D@io8MZwP80XJ*)vfq zS9`{e#YR6MV;Z6^2oeb^-e)f!j{sKYQ zBg|I$<8Dqjz0|p%O1IAY#Pu9q*OnS&i(#?RR3?wt#CHJ)(Bi< zMg{?fSSHQY^@p?Taji}724ub2ahkyBN4^p5e6K^^5$LD+M5O=LwXKEhns?q3(u5`r zybikm)(L0AXyBL#$IPw^^|$)v4u2-(zK~hi{xR#?vDUTj{Op&s!~M!>C{UQf5eie! z=`>g8*@K;V3z*J$HM$TI2t%Qm+WqIHo+xt=rwlPb_d7V&IyCB}?!Fq=NF(TrSD=k! z;OlI;c+kz*79%605QZ=-ZY<_9&^zka#wUDbao)6Fdk}`}mtyr&(g?1FG}F5guNJ2i zc4C?We4J;RxB5G|;rU3`1i27!^coP1q4YzW!|HmogYD8Rlm9)?3+gBarVhQ*GklcE@D$ESL1UMnpZ{*Q2+{6g(7ZM*Cn`4&7f{y_)!VBR zHp_<{SM}ni11{F{jDv6-bZ#2W?gJ+(J950kQzj*1CYPis2oW}DOe)G*@aahrsJuL3cwyMxQ?#%Z=th8NV* zf$!yVdLp_Au4ip1M;Y^swYxhbLkyD2c(|U@^@`t**o&Iw){Ci*36S!Xcon=ex$dbV z4-CP8V}MU%5*J!LaHT2OUiN)4BL&X}0ZqWaf>PCok={e)>snM%!lY?(AiA4RbdVs|tZ+&=A=eQW$9`3CjINJZd%`vS^^UcWl= zNE@!#v6&05jSTK-vyU}w&uPqHg(ePx>nTDLFaDGYk)k5-H0s@X&{^hQ%koaXb5!Rn zIRQP2%%YDSFEeY}y?PS8KQ92>s)PFyS8)kL)rNcwyhB&jYaviD8y-fSnN^1 zGxxJbN-tY@h8`PB9=-PO9NCwHx9Qhe-XTfJMo&+dL6 z`>i;q@GDu?r@ni3LeZy?3V|U*V1~HhhAffU$+@2Vi;eWqBb7)p_JRz1gW_gJxU&d_ zbCgl#psoR{iZXbc3~^N(<1)C=9UFejx>u22pPeuX{wUngK*})6=tk$G%Vc5XFccXK zReBUQR~M5~6vua#*?V+O%k{}_gZqzH6`rG7+E4ar;mD9!EFEGs`*QFi_uifNI04d9*WZMz3xk25?ds_Ci&(l3JyI)qcRcMnXd5%qZf2pGHKxt z$O%t}l<}LH24-dt@9-?Yi{*JG#V=*??d|J*;QdLk42=)9Z6KIWuGmC2`&#ejCu8jPxjy?duOwjc>jhKfGtYcVo|U!wGvP zLlkR+1?jAhdR|Yi%{1^MmvS4Zm{mzXjKq>54`IlnjP{{bLd22BbI!+mWoz}jUZW5y zCSVwYw}v}U1XfBLdOT{j7POIvMqsfFNMCE4pOP=+4E>aRB-xqee%!^g5+Qkr-CWI< zqjSp>5I*$74+cQ?fxBmi*4{2te2qNLq5eU}u#(-AEf`=%}<7WsDs~reDKa&-q)eN(7wrF7f>-?*|mbv(*aeY$Knw%)O;Kueh~bP`xr`J ztNi$wrqAy6muAvq=NmyS7Jwt z3;vpST{es^?*O!=VlZjLlB*o!_A+o!C*~aXg4zAwUZP%_nSD;yM?&a4aTdd0_cSSs zG12dYo7Hgw@zFNiaOo3OF(=chhzPPn9=DWGX$!Oah)@G#ba=@$3g2V76)TX{4f5^^ z*aSgSDrzII?4NIsRHW`Jrhq1ft6}&{Uf$s+~uO;aY{Ja1_h?1fm5FR%srH1A8Y!VtFO<( zpwM}OIYpIPkSH|siC7DN>zq2kh+8>P3lOkmaJ{4H@dGRdQhGOzl-d&s_qB#^E_RkC z%1E9Kdf^DbL|k!XC94Wvu1=&u+E>odAn`M*izG1f4IgBmM4PNuEP6&6S^R3WC87az zA~3{+YrVZ#HYb0Q#F;yY>=ktj(Ps^oWpJ?643M1e&r}M^81zt}_M@rnS~SHC{7Ou4u3%6tlb*eLlguqByI&1Do>L|b)m#~AJCh@Jq-$kg;11?W&t**AWQ-|AD9SS zkNKKlp1R8JI=ics>7cTG7r#No{r~*;BP0|(A>2xrDSf|Y{I)uN1gG+4$?Om$L5SPf z>;BdzuM+f9&uuTxJqi6FuY8h@m&kcU(f#*d9_=X&;H{cyPYXmGO~XP!Kh86QDR395#;WzxUcN06G26TW;$9#*qd4<>y%QHCLG zj|x^3c;`xWP3F0>{p4>Y9x{?h^WPgoBPj8)tT)x)-iK>ogaw#lF}QM=JsctqADw!%aapG4$L+W&hTL9&Stw|D?%g11D!|3GB37la`^*Hm$%w;(pVi-_-Nd6E0pj+23g zteq%x|MTA{NXW82M!ro_#>>efV=?ynp*lovXMyG9LvrOsv`>44{dbNgIxJ7O^n;iY zefVsQbiN&}4~6~RKI#U1)}SCvou*y^r11V7`A`ijVpyBBxf?k0j;=g6!~&v6dTFUB z-R)Pmsi}8=Z4BZOa6L4HyyAP3H}@6_If~2AKmKEwoJ*A3$?62r6(aHq9*3}q>UbZ= zM_qLNda^8P|APZSb$h^XaQ^VDz6LyipWjs!P~b!WdjVBnww2NR;?Le&p#}ijUDW(F zUp(w)-oF0@1(MHTTUmcd_~g!%IirxqaN#il_M!~ui(@4p+M|qu41Vl-N>HNbS^F=ihMNe-_bVW2X3!}^EOr&Tade^=;ds4Qp; zj;{o>2&}a>^xDm|F#v0=i1-=x_HMia^Xh8D+wHmFq7q&)F~f0qULh<0ma82+$|L)g zlk;dZsuoDKBRAIQu=(+x7ttU1!?hJxs(32)oVGFc zdh+ZX)9Bs#k#DgWO*zBLm&wNuf7-4}EaI}VnqGUT>VN_q^YxQZqKUJ}VbAY8x}lZR zg{Ew7dHhGTm+4Oj1Ih}buwLC%CP)R2QjO45-LfMPB4;f7^zVN3h zE&B0jB>wDcKQs^n1QKNdkUd#Z5G;4Tj_tF=tt(s~pQK^l$rvt6pY?dHy}R!g9U$H> z29&G&wun#YgI-7<>=H{e5%K3hl~dfBF6& zmkdfvaO=QmY#qaDtMAYsC|P*E?xgNpS zbP%D^6kB*rlqRo$gHvN1xH{>lF;@x?ybLj$lQHbg>XW?I5czR8D%yFfhk+G`JxA%- zf5+kmSh(P9_=?X|57+s^4!a{=VV$2j6u{yYbozzV&GZ%a%&&s-Bd_N#-|rpv6}k0W zqN_M^83>CbcRFKEc)5-cqMh#{;_94cw`V!^{`STjZ$zbl@;i@MBR|c5iAjXpKK84y z=?wOe-np3*?GXDUAu41|%lOWB1~s;6xwv^xgeZs|`qG#T(B)4#{)snaTVLd2z)Ey@ zcqWjSfaA3L3l&@_O8`tCxzS8H;R+q>Ut*jOzzPFvW%`f<(rM7uIT;E!Vj6isy6_O+bY_=_>2!21}PK9sn-iT?`V1sO*`9-)4ll8|C=u*_9-)*00A4$W{qBCPGFh=M zN9-4On<|&;5(V&EcwwjWWIJcCU8pQ!FR1RZd)`dF0aOYig3Q&2JXKzJ9g!+y51i=I z{jljXE2VLV&0Fjo@r}<8S8zk1MmuuKG|zEKMA%#cf+NQ;Kftvf0PfC7xMoG~O-l!_ za?jSPiyyK#4U@;2Q2<0tAm(FTIWK{wqJ@qEs~ExytBG%@c43!YIY>*((ygq9a3250 zA_&AS~@)S+fEv->Z349W({KhQs9L4c?p={py_W^&-z)c5N`53XnXa~Tfjys4*U zlK?&xQW(b@$!1_-VeyjQmu{hzvwCEhOX(hj)l)EwMM~XGZ5ac)1cs!lCXj_i+~dJ2 zv`at0#UD-hyxJ>hx~~BK3o`b&m}6uP&hL`GR8f~#@EEt;t-b72>~c}n1Je7*Q?Qp| zhit)gCqsF%31&igJ)}t-&@$tqMllH6(7p(1+kL@BKYm`BznXsCe$GwBg1ti>4`W`#S(QD7)Zfx7EF)_ef; z!XV8e0g2b`=`0o(BH7dlz7_C}&vhfuudd!ZEbf2&P8j>tcd8`t%ePx(E2pP=-v`dA z4&7Ovfexb~rghOD?r^;kA+u|lcF~r*;&w)(Fu;Z5$z3YG!W6EP;|Ia|#=wJQbQ^f- zWNa*y!IGT+Ky+U`gsqE^oGA_ae?5Jhp*!eQNE2YjFGtB73MFS=nb!of{QTHad+nw@ zc$%S#?Hq)z1PW5tTws%}s_fJ{F{Zh!n6I93O-@0t4U$?{L|;llb$8BHh1CnqYc^HA zaR&n-09EjT)+&>{S1!;v(UiDwfv*`s7^$|otiX0@t~^yYoenUcIC4m?{qVJ_MFF8P zO*Q`mf=a?|K%CKC$RU%&QxQM!N1xIgCq((`D9j`;&+#O|edk#}j#5tm=+Hw)0ENH! zGrQY62?c=9nv%iBQ1glnr$xiOuDT^8n771+FC8-RJCVL0EENY$`K+>T>RJ{>RNT%K z-W))AQ<*IB#!Y3*!DMO8FU{0-5If+x0F)!N3d2o5Jxe@Ja8p7?{RTHyho)Zt^`R`J zSras^`Xfe2^e`AD*p1{5;$Ba8DfkL^9H@Dqhj{j)C%GU8XiJssd2MzY1v~R_13iKF55lrtsSaMRJRjR(8i@+p58I?)n|j)i zub1-Odv>~KP0f3*{nqyirz?YMw_|=)+b~+#{(K=?y>(I!ny(sJ9n8lpZ1M2hqi_m1 z4sAoJAz-Bmzv%oC28Ajuh9Xt=2X;j7uo)&8yKQ!Me zSGvBqNN~>%>ESnrlg|an)MU6mk1L6;gv8`Fnee*VTl&-0Th1I8FbpWs2<&EK_LV4? z-sk6Xbsl|6qWAPc0St1K9+;gBa$TQED5B85BMqU345omXkIcWryuqr59tSz97&bdC zC@Vn*+oeqZf>%1=@xFO?jFO%}GHZxrBFe)}w$>6+$ByE##*~^ehF?oIZ+eb=bTUrh z$V=Q@HNSlMs1I-jJr2O@Y))rUBuW1wqy_BYYwB+p=z;D4nFIxOQBD}7I3UjuQ-W(a zKXjjVr-%U26bgSVZKxN%d1>B-j0S^7K_SUW?FaN<(W>PXup3Xw2M$(iGWz~{NC)8? z$N`%HpsG|7rj4UuR^+I5Vv}K`%I$krJR^|KAVZF$vF7_yL2$p=zA&FI2t9g;!7C(i zbN%w-V6VW_Velup?dglPiyINvh#=vs@sH^~{b+6OAo2edJy#`<0K`DR_`H9B<|d&X#cMXe)YJUl@q?XVn^-?SHWZu3s<+JuqkZ z^UYnF1e%NALGI^y-%G!9z33%4jEfS{9m~qesizUm$z-JQCOVq=jVJ_|=tFGh6`CR2 zo}v9zKM8~;gXdE^ke%jI8(J)i9uoZ+i;(qpJj)(C*?(=iZ`8w}HQaq?ZJ2iB_yv<# zXVp72Srjy^<_31nlOs!Wl^E86?LPYROYk$CUT5fRnl0z76!?XO_10umLl0DVb~|f5 z5<7+^kHOz%XfC*L`DlMbu=T^rt3A&oTs979#UD2ytHc}WfTK+d?1=jqeU5^#HOlOQ zlPnP39QlSu66w0|?K{cogoqjuPG%@Y9$w+gX%R%j8SwF{5y>7$SVua>`$8giR z$RtUjy7b+h)rR=t;prnBn=4ai>FDSleCfa2BC{SK6tmJ)V~2M7(7N1tR;gWT&>|b$ z)M^#>Mko3BNv4C;S?n^N(k*_HL4DFmQf^Q7=hLSJY5g7(DGkkF&s6)w+C2ecDbAvq z?iW&E3oYD;;&L5}s>&##6#7m6mY2Mr&$Utq+^F zNE8Qt8oA=SZ~wteUfsv$=GXZI?og}$y1)^TzJ9OY!FOYczwXBR{hf^|k=lUOgO>H) zkLq_eu7pR(2_&nYk<`2Lp)y}PL6~Q2?)iYfV5!!A->UX3^^Q!nh|}eqk#fw7S7=3@ z-%bda7HXP4^-pE8`}834l*{+4$0Rv2s8>`p9`=Yugz$8WQ5CwHS>wo42SmyjAF6hsp{_a!t3Pap4pwZZGAXfA^;;Y% zTWCqIWryW`>2GWn+j&gXPW1RYoUSL-y)7ee>GO zcf-2h8Dtf>_Sc4rg^2pWJ)^ovY8)qwNA!s>^}qBJ$Cum6^3frac$j=BRDPcv?xIk= zbAVSEYgKL{RA`XJdrbSmJ*$_#+}R4!K8r^exGGQIrSEv`R)6L${juC5Y7vHIm)@!{ zXoeMQ&0Z7J$o9M}VD3oo{*%T5IGdjw4i^D$5_r?@*%Sj&YxdMcHuT)z_c9ZFO+ddn zD&<6-^p2j0Y1KW==KOqqqZ%zH?WtL#(OliftXywd4j(>a`Qn<70M)rKk#4HzN}H(b zd>4*LI)B4Fds1N_9?dQjKKZ4{A-+2O!;cdlH=LSZaFYkv=$^~<=6~Jl$eRfs42Sl5)6R8p%cUYH-nwr6I^XS=Bc`J0@&u`@c zQk8ClpYzUatjuW8eR{kp!F=l@ANV|*l74LItIWIL7CqaIOWPE`neYdcxrg2)(y<@f zKWgwg8yF#U`Xe$|_ABakni|s)p*9sEG2jxp$oogns8G@$Ee%o3$UhzNi_Uup4||_e z>D-;<|9>&|mQhi7U(~P&h(RgRB_LfY9n#$}14xTBNT}{u_s%_MpS|}v=Va4l!^EK0!LI;X{CwE{1YQfRbF%n*40f$u zy1hm?oDu98+M`8~X()Yx{3SRW)p@UL2GvZ7RaOR(C@>jV9@#eDKb~WyxjC3&^s%Ca zlQx(z>+*CjM1&B5b~614<>jV=6yV-xv@23<{1~_Gfwxrf^=Xv-t!?olY~6r@~1@xv5_g0 zOr|9WU=$P6Q(Nxskk+zKFAoP2UZ&(qNDkdz&)=$og`D}$$=s4&o49xGJi8++Aqo{6 zf5vI_po_flfCV`AvVx-Nhy^lQhX>Mxl`AdBab@)yc>TWz2R5T)rBW<>cJla(R?N0di zA%TJEP1hG&t3fDDkMA?<${d_0^End4vT6?SWDu=zpy)Ck5R>2AL=t~WZmA=d%yJ~^zx)f9rk^2ni~y%^6HbkP}K*{3Tzz0efjz*bm=0+8BYB*pNps4rj}XkC^qP>()A0rSkK;KQ7X)jhd5<51;eCg)ym@ zJq2lDL{Ml{K*;?lFqj~6EmGJUMZ3zgHs_@saq7i!n)%x~HLDqwmIx zMJe=sJdxP_cv}=SRS_1AU4ADu!)kzCR;g(3>_-R>Q+nTsmNXnZq*Y9ciK3I0>yDzQ zcabw2fNP!caKg}*$q;jqt<5(V%rNgmy4fbjfK(wEwL;Xgf$V2YD%oh-tmW>gp)E!= z6#x?7?=4SA5B{1{5~$~lqsZQ%!hI^uw28U?QuWWp0p-<9t;0P9pTfzqA@D;+<))+? zR{A}VG7T*-mhDC%Y_63E!lNH&!+dQm-eSQ6w}1OSSsE`Gl>fX5cZ6b4%{)(StCUg% zAb51Z75b5xWK8Fo0EEZ-CQ2<^CWD`FqV%h3V^Th4-s5GkC5lj?!h2rGa#9oTva0$$BQ>IYXFlK}(eSe^ z+NVVs$rP0NQ2cdOAD3*0FlcdXu<7f&aQB?|#2azs^8Dl|AW1Uudn z^MmoTzdW=8?MECF$iDmiP9B}#Z<6`&q=^nt%AsGuh+EDN4l$cY4qTIN;E>MWPHk8( zd_w0m?K9WHBz|R_u2+Bhh9nx!;nq2zw|sNjG8|%;?x4in`Gf&)DXM5c+sMesVjXwK zq-}G&NS;Qz`&RFG{7-ub;rDsNfx^7e)xH$@9J$Go-NSOTpMmU3c2A2Vi|nK6IL*Jf z)D!VllbhJb*w&VrU~0lPD>k5|?3f1py(gEHV*j+S!vVQ@o?cFn#6VePi zlwX8ghGkClN;{{jRfWO)?#;dO&Kj3=p#m;&uslamzWjwwEtN0ENSR4#*~l-d%i#fX zfqdAdOC}c?zvpMQ961wiv98m|J8VSf_mxbD$PgV4jV?PR>IH8^%Lj3U&OU0CXl+40 z&w(A4`8vkg+qz`Ooy5e$dCDbPX(G;7JJs{+0hC_pV9DD9Gs}tQ%W$v`^p!o96hQ2m*8oydQOz7eml zgbv<}s{7U1OTD_IFymIhKUdhy`q!PV`#|3kl$OFt?6S3IL{xD81^YXcJ*%YTj9C3< zWP!vrLfRW@!z+BchaMB^(ZlkLe&nZVd{*4Ctjw^DSGZdh-^m`Qc`F5AlgFt0jhwXq zU@84nP{#lM{6KzlsxoB#bi`~x4b(@hMfK6K;2V=|wMANzO$gs!MS~w(R+3Bn&$uLR zYpERh_*|TF`g3~i5-gYBkJ(%eUyh9z zMNy93Y_*^S-wBKI&q(Tv>*B*4u$R@;uu zvKQ0iLY#on!vP0tyL=hF-=QnL@mZu3CB4A~y0xz)6%#FgKy$l))!3sRug~c{ zza3t&y>>89$bEs=Uw3*f39I3C~{sM3x z-o#rh(|X4>ZZ7L#2}f2%8mAS<+0!|X2;LRX#IlwAx2>O~0xK&++X8Tg*7}psov`>E zS2Dqv!}wQC$&r?0#jIEJ=1q7lLhAwop`%K`f(G@BvEm8Wb1!~V3crU%#-KKVU2U`T z{nz`~d^;0Ag>My3_Z%<2csOm1pYEe?DyIoqPaS~+@gINDs>D~=F=zaX22~UZlHT&8 zf22Sd%k8#Dh0noOy5M_&o|n(4AP%bjhE8yf=F)ILwE?F zkyJdpUKBo2qg&~hmA+7J_ag@F3cHMK^Cl6eKZ6VD_q-M0FML+}60JLi(Gwe_*bs4R z(aH>(1G+J^RvH;c1WEKhQ@0e%d-x7Ub_U5iN2V!QFrhKkV# zJc@kOYF|Z*U3UdP9H`8Bo-+W7Glu>reGEU>s}{t|<+_SmhgFrW+$~oZeeWn9_Lbh` z2%&*UUw!XuXT7B%EkM{&ZkqW&z5v2Sr3?xnSCYn7W1 zX9uA`KHGA5AE~-f>7lP(mk?4YAxmvkZehBEu$Br7h{e=s^T-!TP#yu#01VG~U-kb9 zO*8j@i%(v4qJ%>ai80>YtE@}e<+mH9YA*4p2KD67!AkGbu@!NkqiWoHCmRP}0zwM( z+s87Rx8HvRL5q?qtp;WJ)XopK;))Ekq6A_t0)Xz=A&h00P106jIrh^yOO;P9E$g`7 zhLrn#dg|*R7QG+l{*^F2@qT@y%0GpQvN5DncjwA^(tRF+<#u5MseH`FT0F=Po)+`ESR@@+2XfFJJUJ7N4E{{YW218JU_$G0ygnJNU2Q{i5k{lMhi(IE zAcZkytJV8lwEruhDsB9)ycWKrFJQWLp6o;&{6gpyX>%3mu5h#>HmyAN7tsN9FFBgB zDoMP7$1v*keBrOWV%g{LBXGO>3j_7%Oc2MtnaJRXgF%nJGoujUx`yeY8l$6e9b3P- zPvUPS$zT%|vPNwh-`VBh!yypO9I82Y}LZBa&?~CE&!O8{Q#AGt$H`r|`$0hGq z|3=Av8Z7Tve5C0z+lL&}Qi%I&YrWQdc>6cbL{>0GXzxyEU6 zO0UV^M=GtBr@OOR2Hr5GpWYvoysysbO@5}P=zmzjZytLPJBoi5rP1tFGcWk2C0*1P zVUKr=N+3?>`{g4ZF9D%6Sj_Wel~uRqMNv!*=VQAZg#=uT4muG(nZNX65O|h!KF)X{ zup!S5)1P)1M-~N|nwJ^u-p%dg0&Eh$Ym(wBnr0?8!l5@(g>PCpk|w5`0AhN74)rcK z9modH{zLx2fbTxuT}|wc?OQf6bs_98Vbv^Th#QXB-e8pq^OxFB$#-tiNN5>y41_&) z1wl!~sUM|A^82;NscN0m2Im0N?g^6M18m)A797CsK5NSV9(H@X(B?nX1oPNvW6{co zcB?_NWgb(Bv-ZEb+ua=I$)eUfP%Ut8zcK+?+2dS)X5_hF_}@`EosO5z2pn$A2yhoRY#Q?|4;L zWk%@{|#fyzn2@d9$SR14&#S+&MCl-|3_MK*{35>8+hvqs-EPv5#dk z@hF|@F4Ye;kl-p~_|h?vG~CBKk6R-xZ;U>o8XpoTzi!L)dt?D7>DJ{2={F< zxyX%Y=1W?k>zLJ_2I=x~;{)$|!5)j?0a#_Y>+w?lFfoT&ffP0K4#KbPI|H3*&>Ujv zl2X7$AyCknE8Iq}=^Z-2ruy*;wPppHdvl{@IcHiUv}7qT>W{EN`u9ZHNS7zSPey%$ zNF0tZ?zItsH5PrpO1+jAu{=gP!od7l`fS_ZNy(sakBhlnAEL3G6BIk~f0}3XrpnVV zmsF+rff%22yQTq5GDW~S#<-i^5M;3I_me`IfMa;Xb9Ba?0flAL${YDrG?{0BD;^!J zVAl96;kh6kYUkr^C}roN{izv}hxhsIet#^s;O)gUOI{c+u{F>xH?7RwmzprG{{V{m zdXIPtK~EVRikZB4b_2BnC}IPYD7n>;yxi)M>^{C1 zG}!jNc^Ml5H%54z?2hf!97rdt(nza0Rq$#@lY8h?dEv8b7qQhqN!}K@cboVbNweMV zj8ENM9V++4afRg9Cmt7i{9`gE3Nqgb>oZR{sLTB%t#@L?BIlP-3vZ6B{;%#h`ZC)f z`Dk-QnoO^t;tSvmRImTIXR?tCxI7ug$wE)PbJE(v;kp=Vm(4%4d9OGT0(r{xM&x;^ zPWA3+m7YMeS&|TpL^0)6ySUQc94ue5H;&@A1A79j1}Vr*4E)C|Sq=9-Kro-|O-s{A zg{KSaH)X$T55^NvIb8e6PFy=(F73L=Nk7-07B8#lFOrx3ONz(g#i8I=06K8G9DXchtgg2^iIo)WDVi*yeeH1)T4ggU*ZVI1 zi5aSz6&X4*T{0Oo-OW)<)+nk+GPmKsV)PHZW@0tk1i3FsANz( zZ_O!k9dyFj5POZguiaIWxvdpfFcxm`TAJ2{#cdz_!nihaU;JTj`77N=M&?D&M|&4^ z1ZSyY5_gUdDn|R38wf>AdU8#3TzifU^XJ1g8Y~l`$2M^O6iEJ-1yD;^>lBO#G@Du= zS?*k~HxVZ${>+-inakbL@+T`XwQzpTg8iK5P4H!TBk6PL$+BSoGKlLfit<`P(iOnz z4=0Hmb%1$-(Xt0xxo|d%hAgJ;;8RN?VYT=IAxiCZwG>!WY|unDTjMn=E{#am0Ln#r zEVG8Wb}s;NjGDzW!LZ0Wu~4(C8hcLLIo%-~fNu6TI?CycG9ZIUbE6ZU2vU;-f7tKx zzTm!%1jQ71A#k5wIH&cLgv&!AyVxGiHq^6OuhmfE9~N5`+8w3 zWU8;xNrG%k!%gBCOd5p=+L~jwEw*g>PswVL0^DA)LEeOcho0T&>Oe~JLjop-jRZlr zKhL-#&UDpWeD7gdtba2U+GvlIe)%#aC=20fo!mSlrd1Z06`#FaX+4tUvNyN0kC%76 za@1r#PWKTujEb70cNrnBS9Y1%XCkpLsU3CCE*3Ost_ zSOZas8%PBlCNsX~#WLuJ?#?%}lQ>-ckWK4N?Czh>bF~GCtjP2jN#{%o4~Uk^$1~*K zEk+VW7Q@D}sXNn^IRL844e&WnIZ=6}vhM)C3tI2O+MO}4Ut1ub4Ii0Z$0V7Xn`ARgJfv{P^Ek@~ZGB0)=;LL@MfE-tn50M5d|I{9uBUF7y@!vV< zvIlIfy78Q9%XyKK-DQymd8s!M?IBWN1y1DOEx)d$?_hMTa&SA|3wK{wCqxhoHSje? zE@CAKeuy$;I&*(9Fwm1B9`JaG}l5>BvEGAc&<}D9&+pn=g$+uq(x)Y^HZ!e`w{o_t_sRd}eKl3Qm_A+xaGO z4h|>IaAaXp`uX_>BzE2S6Veh&JYpdSjgj$J<;lplX+JP7o)&{E`D_h!&F07Hz=u~u zM)8M7c^Bhpyc?&#EwmxLVD>!(%@iz{RhS_9uo^vN_-Cj6BmDP0sQ!BoNj@}!=XQHQ z9FBp1VsB`=uyiO;X==z|S`S)v19TBUMUfgsx)%A^Y=+kunCYhd*)$T3^u~I;)@;Cb zSS9B7Q1xh$2#BzeFFtp=Y|O|yoVDAssRBK@GMM1v`s(oKC`i$ZUGHc}q;>6=W3`t! z{jOj;bEX4uNt51K2x;)Vufm8?Sf1rfi2*k1P(MO3j&6LFoUHuUUckx1-QAgyz zluDr%!$!69t2X47XnlMRxY6icwG=ml*-}{jIz1cyHz_IMaq`u!FKHBMLw@Qvik&gKo(%t%ajv>9bw`G!Z`3GcM9H@J zWY3a+{wgHKrWBG$8)RXO5LT^TCW?a5v zkJFtz*SQbUUx~)NX=K*Y)e#9r{l!Ah}<^>k0%BO*b!`&^bYyk2nK zYEwMWo9jl}ye*=+~f3Y6_2-qt1yxnzkR7*y_H;hJtS4~2BX&=P8 zNI~3G4p%mSg|t=SYRkDZEl8?lf01_G7e_d|Q}{*(Xcg2h1Xi^Qz4MG@DM`;n??$>6 z+jb;|RdhuT_g{bGtWiXDDVsV&o%d>@&xABxEBVfen<12#^bouKd(^;3jN%@VD zyi-ZV?w8uW3Zo|vgv?gf6%h)b=D5bsG2>}$a*|32J$@zXWJx*Kd2U2y>lzHB8`Qgm zF{u|mivs+TVMLmZ=s#<@5H2v|W0(^i;GiH#a;5mkV>XEyat>3P-%_poOiN4RiNGEE zV7RlAX$n&I%|_f;!l8Usnng~E{};+;D1DrNOu!zVYo%GD`X=%PU`HR)0-<&^ekb90 zq2-q+Wa-W%C;5 zSP|G4^!Fz>XR~OPTS|WN5(S09klP;sv&=~T2UovndF%YL7;n_c-u9!`SRGtwi(!i8 zMYs7$G3$qkI#;B7e!ywep)f1L%LIhMGA}*5$RtkSMwHnsv%YlciD4|adA)K^CI?hN zU&p!|+{c)xNd2IWuEzS!H!1Hoe8Q5sRjb}pZs?~C&&-AE7JZ*&)T!Jw%iNVRH6JZ_ zbaAZ+60qO{A7I|Me8v#)hfIz13{;+eIQldswh9i_{>#ht^Hs2;^~dvtCg$R4a%Hdg_M%{)dT0CmSm;SW>M6|u3WI3 z+lLJ!5@#y-rti|j_NFFq&V7qjt8!9wCS4TP z9n;UMQ%jB!} zqb+wXx07vRto3>2p`fX11MlF#z%Nf(34OZf9|IC-Z6n8X66HMKC!{>0IjmY7)HoOKhOLTQ786BL>a;$b7Q!Y*?E0X)p-C} znVm$G$(~GKtq=eiBjiuAZjbJbb^lal40F60ziDL3nw0hirgX0KYNltj`6E~{nt{)s20udj8R$D9x!*&@k0I@0C--1)wRe>+K!)wK zS=Sp8Bk7=8jy3zXR9EVnHJ~4^6>M3b)&r+lh(*h-e{xNXSh$$HYach zO|ldCt@+gp-@JT2zIG&uM0522(44zJC`I%FU&K;4+nN8*3(z~&b}Y!pG}Ne9W9MTz zo?k4BKS4YCKWWyeLc*DW&t31X7Pf%U@%RUwdKawYoe^mV*6sTE6u+eXmilUkhETQnJx>_{*_Z6`$X^zb(R;c3`B{dQdFP18D3a+PALglR=Rzl)CdKBQxy5de3WrGP=r4Cnr=KTo5lON!FcV-tHq{VWc90x4ysC-y3P;mxZ zU0P0a_rT9#vs>|xco&{n=~^lxkPt{LrqDzCa=d67H;>rG7SLvR!}|qXr%@=Duv)rW zC_&Ia+~7Syx)LHU`91q|zN^b-v>4`5X|d?NwvBH+T@hLIvxmG=YSdl1hw=E)@=ta| zuV2XgAahAWX8zi_<66E(Q4$60m)1`u56*dHixDYL=r9s=!x{#s!#?Wvpf@F6f_>}L zwi*o^CcPJ@MbBBS!S5Qn2-%n3|Bn91ypPEK^(f#S9#ZD)Vj-B9WqWf>5c-Y$VQDW^ zLxKi7{gI?l#!&hz0N|x9DG9NKh}L?A%}?k*q8c4q2O`-a(${SrVWj!er@LkEPIrVw zEJkwiuZ1j^Wfo(~|cP5=O!7!eDW%HVW1fyE^AZN4MlwABBMvSUzyKw9RNPX2de0v7W z!vFdk?f27Y1mre4DvEp!7$EZe9<4I-K7GQV%bzqWjaLf3o$ucnErM1R5%nlAQT+o=%3i-0bj`AC0Atflef$b5Vc7) zP8%~pH5zrUOPEgq>%yu-RtV)`WUMt@Lt_bCdm&KW@(QY=y*lKGR3+T)=IZ3%)~Ms9vKx*YO^`62=}| z3j5mWmas^SzJ{XjY`yEqmsa1Wr|fa9*fm(BJRfw$BBNMcUDh}2-Rwgxd(HvR3fa+vX=R{vRgujbxB@A9=f0M3VY;tq;fzxLDjrw&CysNxcsb`~9zPss1>mx$dnUT)!snOYqTW#BtVxul8%m zbJ-l;nN(q{Ye$KT&*jIY`FM}2SV%5h%NFCH;o6xEI=$H;bdDV}L6*|ScARlkEZ`*==UJ}BOJAm1vB{oj1s zd1k+4J}3B7MxscGi&Z9^QdkMB@Kf3DVShDUwWrdkHC|zX$M3!z7*O~R%||cZg7_O# zMC~dNfkSroTeL`ate*UDxx^A~sS)irj|u|&XiI5_!n3!ucLjjJ#Hv$$-`NPn9I6j* z-_pVwoq-=*iqlpJhzjLpQ-g{$pxU%TTao4=dv=M!-XebfyITEgTvWh^!iUl{HuI$r zE_YuVD6d+3DjIFF*ad<|&Ib4t-t`Lf`@&B}j$Y%x`B%WiLNTNK7Vs1P2U$ul>Y z%9hGOVrPn4+YvswIcB9Ypv$*$eJY&Z89@orol%{tv8S=<#{JD2zvXJym!P^8_1ViI zT#8E?J~RKJxVQV5Z+05aS8lP022*;?cHZt$YR9n;|Bd5eF#D@<)l!&uV#{{(MxvWQ_M(c=E!XR9f5dW27sjo`~)=bl;Up_|m)y z*_F&=LzEK%_<{?!jA~L8#x>)~G82o{!byL-se_exu~N63T$lPOE|>3T`<$-3WH_pG z)x}@x#3tW6NY3Zjlzfnn3OR{ovb61LL#6UZe&CB3T(rAYRL|7E76H5ekH;x47HS9> z>{_O(Hz_5X9y_$Znj8B|zuEgyOowpsc+b}uj>T0S@&IWs$ezIMSAHc>`ux;HfmqHumn%_5z_zRM%;@ zg7`L1UVk`={NAcK)*iQeA5npxju4hs*L8qV8#)l##IrgacZ@AK zf$FYEgZkI;3bUU!-M*Uuey9XLO;uHOP?hxj)?jr?;l5Q=z5ZuGoD(+XG|L5@^ce`K z)lP=fjK#N^6gg(5*&_jT=P>Pq%k+JPvCP+D36bn~_xTJb9acOE@=mJtBi@b&KpHzb zI^@RbOtb>9DRZF+H7f$<)6*pMA1qo-5|(v5N&z#~%C@orbbCCWh8^`nwa^z2d;5~v zDO31AsFfF6lG7`PXELS1=q9a!sk*|U{UO)mB(JqHoBm{;JT(xCPS)mYuaZ-%-rjhU z2)Klr2kI)$MT76s8BYG3#yYHTH{T2^W0Wv_RHZD}dE%~gfwQ%dRzL6QZRo~`i7mbT zbu5osb~Ch+`1KU3ar$5*GAVkWP%d5&DT{w-^8d%A5<@mmsAZErwAb3Ntt{~ zsTWB|b6|UIt-raO+iNX)rjid1qffOP+)9nLWe`VAaG@yjM{`BMe>j?3WEQ*(G_;|s z4~`aV11V+morZ>n&~GYK1zP2~T^3*OY3#mDr3vL5ChDI;Uc4x+|E^)L=YJ8g^NS+C_khV5n+?ZL61 zqDlH)0R?z5fK5{gGPSw@v-Ih*PA9!`YPONRv;{C775FV4e~)4+D~7U5*;*^Q+<3(( z!l_#uLFbxgzE0TIs+}JC9=GA5M1h9lg^C2*ED@zSocDvAYXQD9>rk5v_;eCkyfTrN z3$&U6Wx!0);;%%gD0SNe&{Y}`Yk9R`V~S}>Be@a1zrZVv_*c+Wow|Ix(+8F`t1K{6 zq|sZ` zf|0&6T^;@UY_078*zO+@yYtE=r1Z`0_~Hnqv+6V-oSB8�GCqR#OW#`*{JnxNX=H+mZaKl9BVI9$;KM$6lO?#VYClAoirk-bv( zKe9$C7JCcf!!AJThGqvk6*w6fA0(uCT?Fx0*p@KUe6%W70uJu!s+~m+!EX>dkz21Q zF0RRi@iG(Ue#-+P6;RN7rS+B_ekrE}Eg8C;JQFBu?!V{4M*cWdeeRB?lg+Ihs|&nA zy?1{F^$DFqKEDK;hwqfgoGZNoTmo<@5qkwstqZGv+`+&>x`q|8#^!Sd+SD;Jd;>xL zbz`_I3Q&@WO4SA#JX&aX{qf>cg{x9i(gPbHS`8hkA)p7Yr@3G zd7U`TrNQ<@e5WjX<`k`WWtbK99%lA)*{J-2F2+C{GF}l-*{KzptWm^*3WXC0l5ALQ zTnc9Wsp3dcN^o_>fg6u5l#uM3L)b*USDO^oV%J#qrqiMAKRd)>=jkKR3Gu2uEfL$U z*11da(NePFv1-NtW&ZaS8fhJ}=LomPhd@AblFmOci9T5Af$Tq_WdKb9UV6_}wHf2- zbSf=r!zg)_)TQTesHu}4c_c#p(S>NX2qt5JO1=5gJ{r(}&PfW}R;NI#2dx?X&OEyV%|2z8yc_x^CSt)LI*$gh=SVqPU9uldRU zu6cpX^RPgr^)AYYZowR*bW@cTbGWBg4MS2eKHmI+Iz)qQaa3LW4!GR=PZ%X1n00;e z;dk1WX%@VctSRlpcB{P*y(;@=dyC?JeLOKnFB$Qlh|kK!H=ntJ)2*rhpc7>t~bfbC~VwGN~6-bUN@} z+g!S&B0kiLj}?rWFJ{Z`!&b_!(b_(IK&nP+TJWeu2!UgrDW?^(Xl{TZ-!xQ-`Zi1$ z;L&#S*K#-%Lg}C@Z7?eU$1jA=)AYyVQ;Xr;oKb`tWXg3e<1YHcL4mynJYMvm^|dr_ zAYL>*8LrUh^%fG+V-xu8*8FZw4~*2|@?{+*_oE-b6m}2gygxS(VA-!ndLG( z(Rkxb855loujS(Jdg*ebmGD;VORT|F~EmgpwY;` z(x0r^dtiKNck;Q><75~Z6E|jR%fEAcM|njyqnhdVfC|4c?ui|+Bj@xda}U+Aj}OfX zxOx_U?R6h5K)hxxzx;mIXtdg=RYuHkb$6wu75G60*G1pFB14FNvm!3nWd}6xKKRMOfynZTAPVC>%fiK`B$*hlWMMnH^W8{kZuo%Z~!s96B3@-;!-# z$h~3e$un$|;xx|Z&B-cpe5?ECKJrI&!Yjg^Rw~TVmnIz1!KjP5@AB)MBYx%=(mF5a zr7bo*)OGy2b~gz+k{tB~=lu$s8B@_$APsVwZu80|aawrtC~as05x{=|v@xym}} z;xxKsh>=Uj?;$?ItVI84w|=-6z%M2lN z;Iwu&!{dyLIv;jFSyzI4y-{9zWCY*no@FdD6TAop!816I8W$WF7Mcm^K&f5pT@nt{ zij1Vgd#dRoq8H1+oNZRD`@JZeXWkRyjIDI|(tVU6R+Lk7KXFAXj#B<#DNLN~K+cUyVdjEv$f>JWMAbQg(8CiL)^w&E+w3v%9X1~rO3 zkxwnH_ZNvCGOI}xk$P1VqtrTmZU_Z>?IX%L7jvY^h2PC~E))HDBWv$lE4Gz$W;pPX z%y0g8CtYLZPTf}yy&4ywU+_9nY~+oSGcRzWs2Tuz2X=|AMQnLtm86@2x2HK<=Sm2( zwd4Q&3H{)5IpH{Uz?oQ639I%|K77)LL(ncd|OC)6#|WP>j`h zaSrGx!%)NT%ZUb8lG4KI)G3sY@}v$e_KtTmk0k)I<5l(VJPCRU<-ShjivlmiQ1;>a zP?k9$QZt*}w*oDb`c_kTjqXY3lyPVw_Ul!BFR<3j3)C6a3!vNvmB*t2G-K}E42g<`Fh3 zXFu*4_@d=#SL77G$MNEYcr=}~tQ6(tb6j5qB$>Do=qrRGoqS+u0A@54u;1?X94t^! z9%T3t_&jM3>CSzLDe*a4m9yOQ%tz*F-uvy*+HzHhh!XDK5PJ6aq?!tdWpoMSjXH_%c$Dhg#+^T_#BLkSC ziJEbfe@OmFvZI{O=Sohlp&?r54MN^ zqiA2miRjn}1QwpXeY{)JJo?`{N+pdaNe@VfX`)6{?2~eU3(#XmO=|O@wWs&x4iA-j zZ1r`ll%snAkO`o9HhOXiFEXlv278|%W6x(|OwCV_<(=7bX&KQrjKn5Rp+eo#cRB3h zFFP{PC|$ozZPLjJUxG})m~i$d@YO(dH-LevWE$zlMFRl;bYu3LM@{> zmK;zMoipo?xD}@ja7|@rzCgN*Xrvj)A6&jE&6G5zF+RWJ(m&;V zbaS~1m?ML>>Ul5uG<5_^ZQ+i9%S9})To}IBX#{u;w`a0pfLK7@PsE))>v_*Ztl1j- zlvl=Xnl+!JS+riI(Zm1glG2Z(x_}v6}iBU~tSs$!~m$P|(8^Tr77r{IpFThE z!)XAWR+DiIYMSOey_{{-8Ia52mK#n%c&A_U8dy@=X|1H}3 zp$;??Hm_BGqB*`e-ZSst4Lt&hB+VlJmHutuW#)@tx_qK3SIv?d$o;$`-uc?Y zWuzOA9!bX*nEz^hlPU24X(GgD*4I5v^1eAY3l808@?5jD5HtXtr9eTNs$=&9KaiNI zN^875mJ~t0-~!JR0+ciJf%o<&j9?U~k$9d37?d}{ zLIFSi9!WyBn=tkX4W!dFRbEB)H6?P%NQ5U|s_kd*6Rok2v;Gr#eC}IRAfixck8k%{ zm}->hOMb&5Jb(Y>ENl^~A+DATQv zD@K}tGQB_G)Aox`6WMPbyVe&Q0{;1|-t#WP=3@O?u?V0*>o<9df&(SIKJTQAd7(<} zP9ynH?)qd}?3{?EY{pI_9Q|-@1OOIE)r{>L&(m2~KZXg=OLpn36>^#rM#`;{RAGTI z?vA2}kC<#F)-Nf>ZOt478l)IJ^9E?dqgOGo+v`(gH~CbfRonkDgKLThy2g3>Yk&&c?m2dIwNukz$xKtza!g>f89iawF z=+7~n(@ht_+%_{JwhjAqW5p2ZO74WYbI*DzCJag-Hof zWQojxERlI*OxFi9Uhn*B(sPh~L?{UX;|N%qXOM=O!?oszLq%{Ht-yDo5D*$o=+%8T)=Hz!;$F-g>O#$;AikTeeZ%|`E;{c!Dz|gSkb2MCd`8&S2C5E>M_oX;uVaL%+L3s1|*cjmDUsC zn-jL_0_v&khvys}(~*{h(rLn0+>Y;mtfevMLYJL(UcPL4t!cWs!+P23ivewTqdF9$ z7I5wW)AJ={56}2UEqNCqv^tEOi;VT`Zm|NTEFeqH}SNNtxvUXHMLTKaO`{o;BEu{bn}C3Sx)bphO?agl|u1=h-aCqS*1Atymh&y)wi=XC+CC)h4fW;NsWIc zCoiBtm^}t#8aSu3+pzBT{>4gEXixaONgfXd#xplBqv!wGjRUH3Cx}8l4kYM*fC$L@3E{9?IVFa+SPoMQ@S-L=e;(gIJU4#&(BpW zSVrle0PMug4gL2Jpk}|PwgYg;Gm$O}8PQ~rs8sX{7`GO?hJ8>kAiBD6#sfXEbB9sj ze_N2?igoz?8H`>^<@#Sh(EYF3a>JuX=_LLr=?hQ2{CyWC3u&Irq@F&SgVEr|SK_iph__NRakt z9&S#`1+4u{gQU*XK6S2=?^ofI-rRld!3eaxOfIFASAMz5X}L8g|C@>vt0=Ce^?$iA zB94zA$xysHFDqoELj5s`^WFBJi1kmsGU=V#y_;hAOiTqA)F@%WutB%n zFXohYJsAt$sM>#sMvUY_+Jo``|6Sg@*KB$h{srnq1@f)HeE^yUE}$9jB$4I6b6Dj& zGejT|$CHOX`k)r&7w0%5V=X?*7X#JK2YB>qV5SFaBesUpKqQIlHgDtkYBKdTUzx}7 zkD%{9m1+V`XD>jv$JD`=>nG&>&G#U|EOP1~D;iP&3?X1k2li{rjol#-o8lN$Kb^f& zS@>=k!=}eS`|3pt#b08T{$B}piHn>)#$XkPBl)n}Vk8c9Z5yWfi9kA$zPp#X*qdVD zfiih1*w7*$D6N$U;aB9}Fva0++5hMAr$Z+N$z zFahZK<@@>4=sAzQ*S#h44<0Up!j<{2S1$nt?FhF9G@Yafe)!bXeEwRm`Gy?kdd&~U z3o+`WO&lpj(_z{7Uv5*br8j&yd}tAiM=O_YMsifih}4ZpvHt}oeYDN7B5@GwPgIgP zNVlgNJM|y=RD~m_(9-mg9Z3TCfHeDm1+(lPetmP-{SC5vOry*{kFDNiCli=W^O)Nvv7H~;;f-lKos&J~3lEvO61nUUzeNZp}fNn0(5STXK#+wC=JR2{h3Wr~VDIkiW zNSHKLV~Co<`{9$<)xI>+5W=9|1LZVcS5aIV>h?sssncE}sl8BQ;3ySoVYvH?FpT;< zg6!Yzc|0cKwYyWKTPl`)bb&?6=y9@rqK#RBg>-iNHw3YkiU4=E;XMXJbmZ-!GmBt+ zMXG+51n1Ie0QspM2L4pXUv_HJomOSaL2SMaaeY%^M-@i#S}vPbKy5N3I5(xmbzUzk z82^VVeD_IHtNXuSV} zO;1id0Q)iHESMFwIaHq4NPucBo(fKp%J!W+3vM`6J^@7z42@IM1ViHn+nA6^EtvL) zyP)?C{9t(?M^;p+mBXz|B|}N$8);td((U;|fHWA(^==UgDuzaJ;D{bCngYGm`L=07 znf3nKgg_VWI?l@MUoC{RR56gJSQZ@{6|KFgmNalaW8Z?iC-_6&h_wd%3xaSdh2=Svo`q1p(%fxUPAZhw z$G4f!$7wzs5R%Zi_)lv!-B6&M#xT`|YY&b#V|lKOcsgeVFR-J2K+&UhTcWyaJ1f zd*5d3_r;Cr=yq#qtjI^;SH!VCG_-VvcfONQ{p&60zuRx=`s3;%thwW$t?32I1;T`( zkdOVbA^0veoGWXZU^VsD2Q8c!2Rvd0iZS`7T=S{YZ|0f z9w9(>rvLw6;syT3cr|*NX{(8*b5pEOCAGWN4i@a?e6rqhc)`UeP9QxG=0?gb$FM$p z=KLS4S%TDMYm9kow4k0Xn6l(MZWS2UX}-8RkL56vmEw6`Fx-ZOV%48KF8Vz(ELk$H zL|cP`HLg7lU7FoQ8LY#Otxf3A5PGi8h`TB!y5&a-MiX|^R?tO@j=!{czQzZlx;I!P z8!$v2%^zZL}Ad1fY z0aCsk_@<{FHHP@_!GRJSz|g(l_SarF?%B8D zVYiC`doQuW3k1PfhhZ-`l4!ApQE`XF zvzzAgUxSK98BjYd(8~jtjZ7o_j+j-#=*8MT-J`<%N`eVwrDOK;_w5J3@}e zfr8KnlFGoOta1N328jKLc18^Om2-{as-oNRmQKv?-gtIKHpCIyx$r~Sh#iW;b39RItOV62TQgiUbC^Fv0Vl)DV;z-w``m-~ha)(j1^s(RH zwx1*~%B&;C88-eSL6(tMa=qAi_vLGpc3Byir4u&uGb~s$>gbKpZ~x#+Abi=K_igj{#m$K*w90yga4& z5MKmMBKU(BERM%i%y4KYV%(4OcBiXTI-|)MUk-iY3Be*{k{9thAq@Zy6X_gi!p>HyOtZbxim^nid&@Qyla?Ib?5G=yU$F`CKl*0jK~J12Xgcioe5; z$q9ijp4I$rLASVaC2G6ne1yM>6ExG=S=H-(^mQ>tE9ZqqjN{D>A9WNG9NA5YDMrLD zPf?tW$HRzNdpxK`Uy(ix4aL7|IqQ8GORq${yy!UJC}W}6E2ra)w?^37rItFj+P%I3 zQft{et*wPTSPDP2ba@3%h)0Y#ai{gds70WOy#|CbK^e~nuU>N7L5 zH{ng?bxs=GKrHTS!3tcXMNrq?aTU>%pvo1~_ao;8Dj+5f1!~vF2&7E6I>guK)Nv-@ z>cV>AbAzZ%=zp$=Wr7?9Pp&>2XsNhrgImHa;s?^~*Dn+i*P+Xi_1w$ROo0=`T+avl_&x}^dYpOm68Ga!P7J{i$f}sYmX`kq+!ugotDDT_U00KFNW!Lx7{dpt zg?YUo6Y0^zeio(X|6H*)RRcgCNGRAmDAh)YO^c@|C8QFqb%6Z z3h;1cZ|;sZC1`o-R{xJh+>iQ`>##Ymxvb}hE2=0$m zQCF6CC%feeyrqKEO`oGyY!ddkLOhQ z53GPm(fiicDOyKcrtGk{QUp;dG17ATStcUhf+B{^UN}U|hOvz>W&Eu1)V@8Hg#zEF z5lJh}kP!_STvx%dvmKzIM!|VF?sEQYL3EQa7P9VT*Ohf!3Bqxg) zS440kOT^qN)|T5`XhuJd8%ucYT4Rp6Zdi_>7nEmw?q+~locShNonCs0J$9ytaivS% z-sdO^QROs2_t?4s0zv-!_v?s@3TY`4ySE6x|Dadu?&?Y+BQ_{~l)Q;~?l4_U6fi(a z`pV;&g;Lao9~Za##eB0Z2{7x*sn1~$90ulZ-h3P8o>s&AUoT-E=OHuDqp$Gs7WBQ$ zh%oTxFeFi@6cX$^-blA^wBYe{ocm0lDrx@Hl1$J%WV9Np>g~PT*_E?o>)lBo_U4;e z$HJADJLIB9K<#kvl|#+C_AieoCiG(~V`9z@}`)Jm!&Bl0j{*)wXSAC+;A?R{X*$5Nhqp8R;}&$*w&MsVKgfZT0H^uABFU80U^zso^&{el~TsAO-^lcbiFQsn@dt655&k+DzZ$HF-h+RgfR#voY0STq zSEGfqRio|o+Qok&$Fh$O*BbGIFvHDO5llfj<^^Mir}~4#!??>W--EnFdBYb1{~ zAYD%;Uo}Y*KJ&0qH#Ilz)t=*kU+(6^tNklGc3X1|STI8*wp{E9*^JICK0KdYduTZH z8grmliX(HiH-7kQXx2y(Z8)E8e5p1VJF*z(V}oM?Z{5SH>xnmSz|F_@b!?{R7tGw}S&z|W3C*e#^WvVZMpU*CHNiR)SP zDnNBA@G#uAR1D{gk>N3t@j>WOXKL(~U?VVDWjKsyRk6&}?6!FiQ1L{tiOI7_-x4Sl z6Y+VAa46&&4X>|*cUpMNPW(i;a}MR-J=UddThG>!ZyD-X&pb}7lKpqY!f$Q}X^Omg zuw5$$xBN3TDdItSf|`p_LC0flk8A!yHG@|tnI4^c!S}*ma$br5DGjhAb8rK8--v&S z`Q%|w3X)4B%x+M`DfLFlc-VjuPHlvu`PgP?p(rD#g!l253PipS=N&%Os(S!h>iCvR zaQPc)sIDOA=Ne3FU5eFV$sw8`XRYvFF@0VfF){x4O1JFNdgs$8w=So!FU}8(8N;#L zbxJ0`I!|pyqEAt8g^9vgYRdZ|?jI=q2RZeJ{VV?BDzWzw|9 zgvLL>T%)dHYFfa+$@pL`@~kARn>MlyvZ?tMH@=3BQVF5 zHkbR}gwio|wQoPNBaF(oq~Ex@UB1TDu6Jmj5{#w&@}I9~&+1fsVs2xjl0U^aM+yFOl?8(`* z8nkfx3gW`-d~ihGbSgdc-WZZ7s-|Z5J6#U0;kBb|KAw22xFSKPu0?WL=wEY{)gi~E z6#t;-{6{q{d4xzb@Bh`O-mIrmcU+%Dg>yr{@}6tzgHby8DjO^Y>T* z6}mboOtuLJ+K5>7hS90N=3$}La@e?pnNR(qxrL1#;dA77{m0KRQq$bcOL5{&FoDg8 zU=IKlg?_ERs@DsJ&SuHn3e6_#$>Sxpjfur~C@3WRlZ8eXWMv)lv!vc)j2pE&b4_TJ z%?v^uPN7y8?#LSNjHQTAwn&!bE^AI$xnV8KL!+9hqTci4?M+MJSrc)FogSSMv+7T~ zM6{zd=AqGP6rg?=eD24Y~ogtEQLWLKnf@C5#v7S&`*UAudjMGo|KW@-*~*-r%A-G5l;M& z#a`646FXVVFMafoLcmp1xo=_04?XT7lRf3lT4o7atQ}yFN@9`pc^a}dS7v7ny;C>7 zH3Ki^f~xUnDzNAB27rPI#A2@SEfXbS(nFCEx+rD z*L_MMeU-t_yKNpUJyEEXkPbHYA+K2UG`%Kc()Er;fVfa-+;k(W#dV$i>jdjJ zvJi?$eEe<90qv9I}oy=HMK|6XWqZzg`iR3(4>6NF@WPo;u~AQb~= z@{U<5k9U-V5Io}u$FCASN2x2|V`oDzP~Z`Dv3{`9rCk}=5~NMC741)}*f zf*4gP3cy%u6x3&`03y;;Q&V?1Vr=6-`>!nFAx4Ds+80T3?a;HnbbsmauP%KGg(@|V zI`m^wElp4c$^N){t0&>3cmSoKI*g>7wUZ`VR7hJ=3r(I0Y(}5Nn>_Y%0nJieto0-f zJSQ=D#Tmf$&mI((P^*=NyqBO{{{HbPO2=9pi(dY>@C4Ek>g$6pYVAgCg{&OsgZK2W zv6c9*!U)-ZjxnaHIax9&#VhltgAcw+$Dfn3*`wM0R%_fU$iqk;xM$L2i4#fwCz@c^ zWGBrb8>vlUuGyQfh=2(96nZ0@0j?oAI+PI&gy+i)Wo_5yqQ%|Dy0=cR+L2h^Lz8cH zC}CL=c4Hm3v)dDUgaVo=^;QxD45;KD0gaV~)Z0uvuY3S7IIFzIUeW0#9^f&v$GJUK z;q4jwUPEauwsw#k0F|3>P`9JS`0vLebWH*-KSO}E@_ru{&fvs%i>ruoz4-mHEkMU%?-z@1Kq7AYJl-`-)a#)Uf_#LfOk@9ra z1j-k(ePDC?`I58USB1vQi!p+S50pdS)lgVgwMc$)%?fm z*E`yu+mE$E*K~*|rOoOcCspeOC>)(Qi$|yckR1BXAApODf`VxfYU+EIC;M@SNj6G* zT;Ogkg3NtWEpcfKn@5@Cfsypb*1uk<&msK;OC?S4&60l{W@nUOO5KiJ&Gr`Lu=|OEF~oIFI0>72-vX(i`|sT0*tB$Vjp;Cmtyhe_KW4}C z{=cRxHTlckR=3TnDZj-QA95UuN8-ghUm}?P4;Mi7&B)|m3ytRS|MiH@hFt0#dLMOl z>U~R_MM=jrbCrt(3N5$?vcH(IQwYBLN*919JC*Ws0L_fcs$pV+9H+ay|MlWC#Mv&b z%0UCSJvl`__yy6TW(NGeVBBlUaNCvzn!zNFU=7v7iSB>pqWvCBdau8U6&(#kX*b28 ztjL`yOpO#T~Dm9IZZ08_OE~ThtI0nmiqE+J>y6t;1ZLUiC

q zZupGUhJR3foHm9KN`#RROd1$BTRx0q*6OsHauAv9AlpVcH#CoWmfAff!$8NWe2nU@ zT%uh>R`9*W=c3TNUMTM0?O+dm@+8r1P(k)30~2Ub;VHwcXw1 zWF0079Vzo1n!UhPq;JY^p~jjHhniQJ|6$jTPF#c8;M4u-a5H<8@m8+5zvlMZg^BaS zj^j;skLqqIG*CHm(|G$%{5NTY62RRrGv#v>Mvv5gV#lYm{tVy6oPK=8r}Bne&E<9hgA{VCu?B2o&I`SL(i$%z{P*n6UVLw&)~4=p6lv) z8*|Etd{XzB`86(6R4@*?LoUOBtNO2a;7}xf3s!D^U{LjrX$4>L^77K^cWhDqIoB-d z-M7cvlUx_cHCI2sYjuCrpetC{*II-q9#9FoczOKX;P6^c^ZapnfpE_qR`olXejXpu zzg80~u(1x#_dK>?+c$e{ruL>PM=RbwLf#TYt@BKl5*HUDd+Mn+=9;@F=Jc+jwZHrS z^s*m!W+-JET-T$nJPl;4l3I>t(nVc)HBj-ZzDyMpC2SwaqExLv?-%(uPrT#D2b|*E z{jk$jy)!k3)4L+Bt-&x$@+ON;BX=IyimmcEAdV|SJX;*5y3l7ycdlrrW6rNP&&|1g z>c-!2?0j&TM+CscLKTv3-4OoY)Ta$UMlp-#!K`XL&w7!@QinIV6f_Rdj_o9g+LJvr z`P{ww)IaQ#Qm**PY!ETWMSKewoMT(bVu8GZ{j&?+=T}Z-jbm7+=}kU6tQ|vN>fUJ-b`3s@6P+W~N4a zA^DMy?45|1?f(FvfUlu|Wlnrd;Y$z7LVz7-E=mko@kzYY zi#Il2e>SCHDX6*GUeu59W;HT+9=0%?)1|)6YtP;DSRHs{HI2*o_-q1BI_RvfN5w68 z&sPD4tG$Rwr(vyy0tdQ}Z~Xn>Nm8sK9FDrLoXoG0L&(eO^fQu7gaOz62!?6Q;XhG7 z^sC4Ip0zZ47kU37QB|AedK5`hqVLFl4U$6bs;^zDQ8xVc%b#mW_>EI|?C;D@%%5lf zug@vu_o=a-jfefkF1MfgUyrRRi3ZE!9z)Xcbe&;*Q+vb!o!dO@zjI~)$g#gCtv?r0 z-)-1b7uZeDv4<`hSFa2M69cg1?6YV#csrv1;ftH-HyZ;{do-b?$^h@t_N3zS8{PYe z%N>Zo-%w2h?{nNh{0LMcDZ^|xYN2YcB5Lc;m6Z=n8$%ImUkCq{*a`%~w7-bedi_@j zh&2C(7w;44yAvJSN%)*nKDMWttbTqy?`VU1Se<7mgkSipw-QD%lzdAMI{~M|v0@#i z0=7qgLqd)n#*uH~VCECC{t!f2PJ9z|0_vh<6cvWbJbQX$T;y!(wK)%)GY2TKQG4lX|;EWZ(0cQiJB? zcNRA6B3?)L#|{ZGHLv8+_`YQjDkYIznz&TD6%Y^sDoK z%jkuKtx$nH-hY%lW#W(0Dtm0>TtUU<@!;&Do_OmLim>%GL?14F(KCGV=4z2VwXu7n z7L(!2a>etXDz#v`SQ>T}>sslqazSs|aqt1tY;% zrxhp7Zye2DRfF&IRbGu z^g$N%|C&`&*bO@nJUoB+_c&qjf z`14Qj%S|lT?F!9_21#f=F3ESKcy3gp`ND}_xEb%IT3h}E%fT6u?`w%tS0jzb z5!*HQKCBI8Nx7YmYPqfU=m^5vf5JX0RiG#=8ZYjnx<+r9N*qhvveTn$u)kJ!bE~Nf zULQy)7$1*QP*&zX-Wl%Lbas+w;;|*>s~)QyhJ*&W-I<%Ww}vGv3ZkflsG8QX(YIw$ zF0kj|p}$=vZ3>!Ri%7A{UlD&Vj{P>ptVeF2IDzbD8q=i75IasCIX>!m$U4dJ`0^U-Y*xCkh0gzt7A7 z7awvkl^V8R&$AaUP)*PE_pht$z{17Nvg}W>y{3+8-Wo67!lfKg8tb*2E{?0>!tRV2 z6uXGPr4~@mEWG)qBxlJtAmTAbYD2do02aj`IBq62bW2LdoA1uJ$5O;sVP2iCMA!I$ z_s#meCl6Yb-nDyoigDAaO>`!*I&#jdAeE&DAeNv9ij9sn>68{YwB!OE>6`8A15nj( zl+`bI*fyIJ7&Tc&#xw6Q?arc){K}-EH~O4~5BON?tpywLlgqx9lV4zCvdee2oD+Nm zNmiqHgT6GVp}BrsBCuGzIFcwgIszMPuPeuAuU0y*8iH?}K*q|z(M925WS=&%Y#Tn} zV_YOy@WuTh8tk1`kcYrWoQ7!TZ^)`e2OM)DF9fzF4(GrU=+Uy%ZTUz?6~}AU6XFMD zROmvQWwNoyq42A0wHCex)O(6!fTyr@hPP0_|G_)JjF`!_%C=IDA0dY#A(3u^OGdc> zu6P#)ndH!Y-9mL*4(#hZN;)A;)0sS#ccHX�I{A80I6!QJ{#|%$QnA>vD}m;BwhZ z1n@sllZ11$9Q`+iw63SzovKyme<^ z1hU|aUz-3q@pK4`9wqSsFhJXMIMcsD>(2H${gt(5Z=TwX4Flj}(+Ymg)*6O4Zr< z;M2r@eX_i{MT5YIW;YZLHj(H)#}LjK{2`SW7*fak>;q`j6TP{VCeErry*!5IU1>n;KT6LWJC1#m(LP>iV6GrcgK*ukC zq-lGbfuN>mf4Vx@>(AEa!IggmA!(4n?7)h;D|+Y+B=anAKvv<`B*lJqvX=)c8~VVc z&#$D+VS=d74}B_IxEc0i^`M}9L z;9=kEI>gW}G5p4?j&yvY?#a%9iTE517aYVrNRZ4geY~~^cV}upz|pLiH%|Szx_UgC zSUGSs9q-;2_8t}a$$F8tMqiKX%GyIEj;U6^R$06-f8AG}<0YmUZ}IS+$%W=Id7IpY zZqjULcb!I2-)_Ef-1W>OVf{fXd7D}WA2 z;_GY#^vH#yf^OZnBa`Poj1*}-2hBCjHI>($p9$K0TY_tY36s_ya98t9n!Opp)B&KV z-ql?+a}iXyj|xqmyCW^5;IAwG^7mREl-=8@1=m;Swq++ZEEXRPa+AEqA3U2gd40pi zLhs-JKk)F3WReM@FXL~(euZP}_SAv`N#VkJziFE%j*@y5H9cIf=xVrLNJ#cPx4h(l ztyQ8sI#xF7Uhnhx7(abwKy1= zfYsCgeDuL+5jKb}O6fENmU6`?J0TmA<_YpoK2r$8-k46>B~8RT9+zrLO+!N?Ke7-( zG6UhUCK#REF(+^;Kmf%RKw7*-CaUa7ni-RHn?f1PVw5(P8H)Z+$0X;*r!a0aAS1Al zvd1K{z42^3rLH#ltmYxywiBsj=V$m4M-L!~4x^hh6?CF17QF&m@{J>MubMkxElRfw zM|c-GQvh}TsU#*eLbuKmN=(PNOHEXa>d(gt3Tl7f&={5wZe45^5o! z)hdfU6R84_K(bN=T$yXvWh2^pU*7;*<=LC06M2#}D9B{`Kk#58>ZG2S8|@9cu_Y=e zw?9l#?1hFg9RcW`{qN(lqT%a=2RdP3aC80Jzk^HR5$Zsih!VIoyKP;ch7yEEiDuw* zrZ39E2N+MAsoB(n?NygN0nHm|85eVFtQUt&U0KOg5b z{yLbcaMyiUvgm( z&J*(79|8gkW*`0O^%K_bsb`cA)f?hkTSoI6@x2FzLwh(Y2601QPm!4RdMjxf@nCtogf|l>b?7s+MFEcd)BSQuyEuZJ! z%Mxq_vnTRGXP#X0(49HWv^paHf5pUx4V z_k5GrDj{1y|o|FJVNMIDGX%H%`o!e?*6rk~O_7mJJ|aHiI^ zTwwEzXobpz81?Vp3&$A(A{lT(t9+#xg^gN8R&+wy@#)pklyjY$ehb+I%TL!~L*cCc zl)*0*wYJvq_EJ?oo?&hF{u32{J5W>k-|}(ZtLKTMezg}5(3!@ZVwNz_ESz=)-8bev z7kTV5NXclP-njb?y0b)e;yr}UFtrPQr|?O^v2JXi%hO6?;X~k@Gu#LI$E4_|PDU!Vtu}x z{rR=jGpKUe=jRi*WlQ=}_%dux&kd_?-j3mVHKlIt!7@o#iMp0xh-sy}w|K4D>(uB< zeneQmF$09?pCAD=K0k`(Q6fgTTga-#u;{QvKS3fZYRc^T@9zUvGf>sTL1cLfAVN!c=Bjm?~mCgu8tB)19D(Uj}PzFjMpBp+ORcpn)u35 zLSH{#scFC2zrNHRLuch9{d!c}5)55xZ6|fpvbBUz(R~iS@ie*}-21i@In1Pgj2f^9 zG|X7R);fo2NmvrPSPT1{pNyv(X0q_HB1+YKi+ebg-(lpM@Fo09H^iVT|HwAVa0e-n+KD4DlX3AiL8;z^)*>u$=<Q+dW+UwH59|N31X zpc4Vfz)3Fk{D>PtJc!?ohxaS>l8pD=mXPZS1$=4)6=M8}qD@|L>B2tQ8s?Y3cF-07 z8oHEC5N&X9S`2^-(E3uem(=~{RwSFfGy&IS*d|*gyw33ZJMA=-X^4sS`1$m?81Tgr z3Jjouzc<04M6Y$5c7fvZep$aEz!QPw?HiF$5&dJ;SmNKliG_w9tX{p=-(Rfw>jW`w z+<8%rWZ-zR(iJ_5o!~VBm3wP0FWlNJZCzuX%ESY8Nn4yasmC*QT%Tst*7Yaxa!c?i zl4=qWYdluuZWaNfjRg9Gh46n+WF(ZM$V)OC`mYog5Q~i(e^W$aZRv`=b!}G>MMFq zH2!jGkIqqc`fgD5B4$mU>kfr{H+JdckLkn>zd143{FXk;y(Y?jIhv=Wba!Du{0xgQ zL@x*9UWNGh!Q#i(35sb)NGSs}u*wcQ#0h=%&PJN}ho>jc!fO{kS#uFFYkpm|K<`9& zVa{tbZT778<{xVp8zsfF8C#x<_`>qID^dyEV#<7)= z%S7!IKyHU%0BNOfP;VbTX2U)B>4nc1?8GNhu0VoYXiLKJkQ}>;%d(G~o70=hc#oF( zUIloTKz5MfXBA4-z{@kNA;7QHpcl>pUK;7h;g3>rl?EwvJbdg`*!zxEnaD;`0O?0^PdHWf9ij_UhqF7s!H5*VQxl$X0ci@Nw<$g36Un zOjC_49KO-Xp{$NVpC_3|$a63AZ{UxK%sZ4KmZiOqZw!k=&=Jj;udv5ah^D=N^Tml4!nxn) zUP^7PA8SIYjZ;3_&B#oMDo#wt6D@U^|6HVD=6?^RhB{;hIg-_igY$W(%&u|t-*=|X zE^nZrFMV9`8N?ih+D6cGDi3O`-du9JoZ>KSwycd6L>~C#V$o|&QVF}D!otYWH9#I} zJRKv5@4niecsRO0+gjl=&XQNidEsd{{jrdwR1Z(e;)zT+aUR$L7jZ{U@QJww?pbmm4BTp$^@3@!`*IUA>`5y;iKMY3wFoC4Wci{Onx zD)cFI=6#LtQ8W2o1F?UkW|Qj&f8$c3Vauz#{mBBOUpH1&menb_JScdqhT3YRT)$kN z!B)_%zH>NFNso-bn}@~R?LkSJt%o{-V6o$I57TaOA@6}YKg;22@5ku|F3TSI&C$Gq z*dk5M9zteji(+lbBZT{YbefiBx#8CP-Hp8$-AhQ~MKC)%(hhVyrklKV%~o(em;)WL7IH zP_O+R%0asOsmZj}*SPnc^lkIv*rJ-9ZamS;cQD&0GEHm&cuhD|7inSk?Gjk5DhQe5 zk8feNi|pXCUZE_;s`v^)?Y&BVxh;U9+xR8uPJR@foh|raE^o8m{0wn|4wJ>P%yLUh zzlm6Ro>Nq*bwyH|y|lc4F8kybQ@_RchbJp2O7pSbKUi;$(8t?J&h;OVJIX#no2rg+ zd-mo@S~R7Y=>WF&|KS2Gn=8dt^7ouePgp>qF7k>bQ`9ex08#!#LpLGeZG#syA` z9!>C-(P{jpPp*+f#9zAnsgA?|$9%Yr*M33y4OYr?8fi;!;I2t}uXnEjpG4LmqPWk0 z%o;gB ztp7)K%B!;^yF09%GE8q1u0-b~wW~j*M#S_h#zO-cCoYF(>FH@jmS(W<1fxqYas5e2 z)8Rqg9~k%TS=(2hq9P#6Ht;wv%#S)KM(~*}`wbOg%%DhovgFKNp4dS5f_x3p&4j@s zz`$bd)FVJ*SfkEicCXe#)hR<}`i^=}L%U^8V0anMxun7!b=r1(8*Meb{Q!JSp7qN8~LgkdC!cvR5#m^S8V_%>LqI1O4IG*^^wbUF?Jc6hX94_ zlR1i@OG!zk2DX=FTwz64#EcbcNSV1YnPvq#c*$zPlV{CqsBhjk)1K|XN*KG9r* zPYR@J2X*9C1u=IQwok&f3x;?SUK)>|I98%&Gc?@*WguhI?Oa<61Tk zrlOdwNV3RsZs1ji;cgVk`F6RY9K>uCaR7m31KWbsJx2t6*R|TR|6z%U?RJgfILpk{ zH+d($dP_?-IoJMdg>FBq;; zHksXv$71hU-0DwM2hl+uhBJQ{t1i?2$lorio)kXz<<>%ctS;d|T=C%m4DP%1Axh!2 zFBIGi62>hkfh0=op2v79GzVbC>eaJR96?<3La`U8@BV%e_Sj;&lE^Qyx!22IF z-Qph+N*I|%LM>A?Maa>2Ck^Y1;Wz*zoQhe*qaiVL{dnzI3r}EbCi(Ihr<;W(-6Bp4_C)U@iTjbywZYR%rAYQ_i*72ypka9&{`q`33r#CWXb5W?39T1s#!MSl|VN8F*{TwlMs_+!13IINgwZFdEZ z)ZyoN8~ZL#?(GdhX>-qR9YrC)9S0!M2JIPF;IWh+)Bdu9v0$@rjLPE%O&Kn!4|T?$ z%`wkTwJW6)7-Nl^z2bQ7=L&!7`y<^Am;{3Ec*|oA-tG^9cEvzb7Dx-ulmoo7Fwf65 zJvCUK5AHnO9&~}t)ooh_E0+i>rCQfg@c`sx96E5;=xL2+5~&2ssLhHaCNq2XF9Nk- zo|Vk)oUL@i+%;g|%GEs+RCa6qY})lXCK8)$pr^C!+CbXlDZ<$H!?k`M&78DKjFaF< z+^fBX=Es73kGA`hMY_}r%8-7c?Rf?M)p$N(nbOg`dU{}n$SY>Xh5DP>f}6JwUxj3v zpvNN$@*QWVmY4U`^MM40AY68Eyb#PnLGkBu0cB6688avWz@t)DuuiSf1D|1V?$eo? z0AH-T?Ln`-a5t0je7K}oE&48PnS{KLLAMfTfswGRa>b0D9*FS~Vh}34j zQ?U5^sSyf7n`95|UrC?Qn!JeT-@rRcID2b)<=_v<7)22AmZej7_!Am7sl2h+*;l7- zl=D9YltOh~*1;pBfbcr^R-&NqrUT~{6v2(MU0~ip(DHESrbOop_WS%(PM1|^U`jo) zWtW6M=D#;rl%w>eLl(`k%bBKwx>T;NkC7i2kzUh$aWot#QAS*PlQcv+aQ_!r=vg1H zEY*ms+e~xjy;%&lop@_q)3U%&QP5Txp-iHOx*fdV3uaDa@n0)9yJl$3&KBmIVnyU| ziw$d&xwk3k?U2E1*w-~DmOxREUUwN-OmVwK(wy6QzCubP-T%@$CVn)Poi_vo&0%!GJVZ);Z~awY$Ta$nrWUQTa>Be zimUyatr6fHVdf$#wQ`Es#0-1|$CMc-QjIqeKv(V4$U!nm#+pvA=++&zwjEb%Ha^Ij_9E{;&jHDL(^7~LxyIdOW2B$Y994vF7 zXBL{+friqWX8!bGrAB0(Nw3^De;#=5R(tPvUw&zq(S``tc_06(M@)Dt#XGZp{tJHB zK6<9n34Z_pWPikXraBdKzE2FxBQf`jT-(nM7!>coB!(e_MYP>Zx9-al^0HP#)t&`r zOezl9xrT33DIdiE_(_C-^_cQDUnyV24AQr^-9wfmuA`iUS~fG4Lef2`@>%K zvF#f%9k6_7rZf)`M&O}1)w;|6($Xo-{#zK+nJj_Py}nl$tbRtt8`3EpIdZHrI6(`j zBpQ8_A6&rv`k~@2?@tDY#Esl`RlOn22c34AsBFE@2hzESh$3lTFTH!hEb>Xjqzym8 zO!}o)6HM~6dLdn1Ks!^wL?V53c}YgLz^DBPU&Ev5^EwWK{d1=C3s`Jhr4~_RYHArV zXgBUz0ri@|?N^(GpwpAq$FepT&VlwArQDnnG!~WFKW21+0yCzVn@Du2YByV_gSDfD z30?{uk|j?q3FcbmP;nCC+tn%YQez(1IcSw1qcx6uAlb(iu9=&)QBp15kWK1p>#mn| ze`?zQxF!>mz{r!v%nkzs%~ti?ZL`Su!lM#f;r@krYu}le+jtBhLx@D;p_{+=qN6w*l zoZmi^4tKN#lN02!c++Qu2ofX@5SjYT>*@IK>uin1uFgL&R9CXd=+UmoF|n-*C{!y+mb1n9XTBqLM*k1Kk5mmV=&6c2)arYA3x>~ zfPLHM5etQHe+rLIxk`9j1udkdZp@Y4DIEvBQ|#E66W(WE<%_lI%a@a~bBD@fx?_sv zTJhu zEVn%Hv>S}l_Qe1BN9P~0fN43{0%CJn5|bQ)qPvA(9@=P_o?*R`K092^fe}M}TER?P z*|)7NV63*pxrQsdm@$Hyy)Twag!x`s+hx}`vHS-Rm8X%pG%e=3R`Y%tNt@`?RETuk z0Ldq`8ruR5tN8|>;OuMiw2dW~`#R1kiD=C|LjyS_Q zyHNxI{+sZ}-|tY9<0yFh0-ZBjT3#)MH%^@0?tXVF+P7fFo5N0-eASo}dqc}~n(G{? z*V$Gt$yM%d9oC#US_oDd^URPZ1W#8RhsQ=h@+hdwHx&5iC;Fqb>uJqz{)6`Q!w}d% z_Y9-w>QPd4=zABcL(8DbrV_`?79br)%3Y~pO*c;r4yxrOdH=nXJ6}!6*aRk4L9O5U zu9i|Df%FPW2WFLodG@;eHzD2$n(>AeA(s0u0+;zBnnf>WDuL>hD&BXYip_=bbkCsC8F>w;#|zNAKSe9;h1k-oe%ps>xU?#<0HD|_lll( zae2po@APk7zxM{~F0S(n^>@TH=5MPgF&}i_x!S27e0^Rpm?&|)b>-?{H&+o582DDH zE31cvHrqO0;1Q+JXf?q~y&c$7zdg>d2zb7s#9j!#wQ^*x8H$lAv!8^(mzz@1t)}nb zt@008sF3a5e2fzNYiqiM4^XIYWqr06xH3^=Ce2=29~YY6tZV^QU7rPsS54$?dtPt6 zt%M$>AnGjx&7+-pNb{mCtqv2%Q=|;BA)H00Yp;Lg@)g7W%DL-{uOs!837>>=o}cF| zym-c_XFfy#@tm_O0teEiL|*{62_Ox1U~p|T+wE(BaZ61vmaq0)*)hk9A#rQzM};iy z?GwzV@Bf}}v)?kUR}Ej_kfOJ3O>$xyOyM)tzqmt^^6>G!S}V3@_lPuDRyUmUw8=bqPN(>4)qZf(O0^lzPzNpb&@ z+^E4Oj-a_xaaWb8V=fpEFS}DEndxIg4_<)DQRd1;NHH}Pce_~)ZnjsY7p{>3hllhc z(f{P>Ki+d-5I)Zo93>dLfqa#ySko#$2t?=_!@CO)KuEGSzFf%xAcUsO4UL^oC8YDU z=BHz2dTQbC)k^;fepBIs>pY(a@z~}KL7jQa*-Z;f@q`#UBF}4)VUAbrUStUMcx@1VP_aU@OhAJ2nuAvV7h!7Uk6 zM%DC@G7{qdDRyM}?{n3J-Me|Og9W?0Q0=*_x$Lvy`rNzCRduK7(~TBbUy5)PWl29< zL?uFE!_e!4mAR^n!ic*b!4>Y?sZ2ZvY@O;Ysa#5T5*sZCyzs{!n6bY<-cQUb<5YFD z>Hr2%$x`HcW?uk0BR=VkJDL+^Sfhyp6gQ^}n>^&0XMaSjsTX)WoPUzVtW^N{wm6Yv z7#Va~_a0R=Kk6k2yx$t_w=p#P^h^u)LmWuw`dix!x?usM#?U5h5Jm9D7ry z*V>{hHh1EUEBh8UHY2dEcT*yv>A?3SO=|o95%t$$S#96U&Q%#s%zk9E@PVgXY6dWZYfF)l?^G>-lmoyJ_^(U<@+f!nF%<$CsZg(t}5x%(dk0gJPBUKIu@bo*-zgw%2eH1w-7 z4c{-HO7VWf?*QHdX`wQ((|-vP7jwhVC*WoP9J$$=Ps3Q37vZ`w`1Ob-slogx^g-e_ zAPpddla#*I&fe$~5~9_Q2V^Qg5YAMfKC7oq7ne-VK|2=C++VF@-wULogDDQ?^GeCave`%CB zB<@M$wlA9V8w1P~?)Ret@A#$e(-Yx5_8-@C`eiH9I_0{Ifq=CT@CR*n)J^j8k+@jQ zVbb0c$eUwv`V!c$Hmm$4D3>+*;38>ukFkMkgvU;OL``M2J%p)L6-g{?U@B3oTK(jH zG#h=j7R|r1*4kx+RD1Se&cnn9;WLMmQ*TN3J4K?qKi^pPk6Y^GZbHo)97SzYS*l#o z4DK1NI2x|)OeHlpz;uRTGYVfHw3#vLfL^}#m|Y{8|479<*BO{|IXx?{9E<5W0uL9j zGrj}QST0aAl$2pLCYZP7v)O2aO)!6Z@NDk1M6f913t+Mow%wlEKuf~`dlTi9A>fL-YS&=+%=C@Ta$K)y-PJDA)e}Iq zifmi^@WQZ=dDFW!|3rkvXI+y-csKf~Jm=jfF#f=`Lq1no+y+V9>QxS;YxlV00%kht zr6?@-_Z+`q4v^QS&cLpYNa^nY2TIJhROc)B^nc@Ce;^@Y(rKI}G5o9I@^xk-^D7LK z^%Rwfl8>(fItk|x=}pZU74qMQ7T4C+*h*na-o?Z%)X?RUIO+oOvTi)MAxho=+l4YZ z(pOYVniU4(gd7&Lq{Wd%|3JgP=Em9#`S)Z$ftk`xBQ!J)4yRZOGK>_g^6-dTHy=$z zXRpxlLK8+Hm8`~Zqy9}tjS59&&VX?Vjn{7@pVB$)ZEOtp5AgX^G3WL^?ynb_4#)r0 zL8GmMX+={ORzyY11rQxXL+NcZ07=cv>e-A^pwNh$oqea(iJ|C*Vylgkz> z@8-3BS}dqjzer@?Tm08?A%I>LfX5&!$Ozy4xHsI4aAw={!g2_I+1f2CETJ^K3}@ld zTfnW@)-Ak5F;&RVyOcihxO?qjw<2BVS}0OoDo<=MSt(m9pXmGdk&#p_&R5B0L=~j}6JeqEU}js?Iz+H1f8eN)IPH`SLF&-%IF~x zaKJT+nR*n^2lOkhNp1d*R&MVbjhmLNv+id9<<6sICU`QmmWjZh6@$l^%mQd106pk4hFBP|24exLBo6AdxuMXy{UoDurtVG zqe``*%>1VP`Yt96u~>y#`Vf6e^AKhOOtU+O=dF8 zJ^C1=Ru5tm#fmG-ElMNdr^&u>+4hyZxDoWs&{guweL5oLH1MdWa;|L}Q|UNftpOb( zY~wp=(VQK`;0_aus+G=AiKR@)jjKJsDAU>v)0i_ z<#jA=8^XpB;I7WO=k+>0tvGhc-m{Oq(Mt1u&-2H1%DcPcg5=t9PVW1^J+y`d^JABb z9hskeJQl|vn|NT3xrLG#kX{AnnVRfhY&p{ARPY|nX$**P1Opdf{`5absbW7}Wf&Ii z*3m-#yqqtRuTcnaqsatqY!LUGd~V5Jir@eC;97H|g^yiAVsuyYkj|4%*p((~$x1sB zhGdrF$;sEngK=@XB}Tc;4ZbhBl0vz%F~zF!vSZY&|KKy7=eGN6t~T@Kk{p&}?DiYk zJdVs`X>`nJ!P^{*AFx#y1If2L ziSdFeNZNe!0uwUO_5PYY;-imXo|TJ=aL#a=kUjs`&?+@TDLZ04A%4#TIl$FnyU8kH zqdJ$0L}b_ajFu8psV|(X`BZ^g_u2eaqCV<`VRMO!C|`sDnz0HwL`i+cHSP*;LX1YK z;GLp@gkIpl&n^x5wIQz#qTX7*KC6J%pHysl?_7Oy(Qv@&GlN*?|KGi%Tpa3_JmXt! zEpi?d<&k%cup(HS@8|z_={V%;iTxANy{KE|Ab44-D#>#+i@QdDR-$}EseA^;%*(ew z;<^JP*hzqRcE%n#l<{m~!z$O_tEDh9(~ zM{r%!B2*>G;&YWUg>gyz`V9{oMo|*R=J#}Ae`;?O5fA9;j&iIm-W^Qk3!FrSJ0H#` zEgQ4zECc)4%s$(x0Ln1}-LFJ#Ft4O`ebEr$H56C>{iFVFmh7d7Y%&Re+TYPN=zmiFlh}&8_mETiFNw^TO)d9 z=Lul+d1yxq3~Q-&nM?5!54uf7z^At#ZF~>0=X46-aP6<&6vZJRAlTEVhL_Scg^j7u zYtXUjO3w6yX!kq4%MYoPF{*kG47S!%qOfaRC5DEr$+Y*ZM%^M!^P_7iU<$_^^w5d? za?P0b|G3HOf}@-aIatwxla6Uv>ab0@rif&brq8Qf-KjtjDcPVNN$ifXFCNb8ali0=!ayS9ycf?K9#kZUq3tzHlt+XK=!(91ou){)m zjuD?w))-z~3LN)P9-}$F$JD3IZOO;i(xMecn1U#fcg(QdaZzIsk`%4kYAlta;qyLe z-<}v`nN2~rzhSEH8kcTV>_JOMqoA4eKjqVUAXLzT$H}s6NxnHp<7Pf#k7zX4a))XE zt~9tPTzyaV>jQRcdRJbTZA5oq1}rR{hp+E=dV12q5q?s_(gCh8dE13@XIp~50tgAp z0DysdGj!VBh_8}G<+$Fi6Tl`HVFqTmnl$VPz*HmmD}^a3nN!ph6n9hnrSN{;B>-$= zNHR!#m<8$oi_sS?%0;z zxI}WQHmKUHyV0-P5~e^0%N5x5SVkQN$TD~wJGtR__2tt>^=kP*y3^yHqv`Y0C2+YLP5t;oy-xGVf!p;C5bg@VMCl;;)hagWiA2}g zX=|!qHfgB^<`$&#g~3NvLD5>G%#&xN9Q=k7&JA#oBk|BFg-A;Oev*oFW`HFR`b*)K z_&Zj)Ot-1B9 zcP({NBDMQG5I%CIVZ|BwV;(K24Meh_D$x6EUrz}pcK$ytz`=};A8l&bAw$W21Mo_! zR*1y|zY;j?Uw0u=Y@wv8gweBTwASn12K*&$adxz0*K3m`0mIE? z5SJt<*MTU;D)y4(H)w^};!iz4Njr8tHYSIc;WyubsVXMznS=?41cT?3qqwj9-9N5G z|Ab~Sd`1+A18Agf-Y?l;PHm3_DVs;5Mp|F*v(2_feTS&Jk9v=vGH40Upgxvh9s`wo zncUrAcxx75g%RCHGv?rm;?jKJ6HrYQkPirXDZ^TuGK0OBuJewak1(eeXzPA`%#5SY zcdIh!;2$$I5`ymLW_XCcJzNqWsWi$j?eafKvk1mQ8Xvkj;!)0O_Y#D8?tJn6dhV6$ zVy07NRMOsKws_Sg4v>uZ*6xhhKo<2{gwBV)pIm%Z1T|jj2)YKZhcmkNTc;9cgZfx* z2b?-5UdSt-!^<6pfY$M++s4ffb}d<~?PWERa1syCE>_NvagsZ2(P? z+&;WJOy0>htq=OTEpwl@Dl2&eRP`mJH#lBo7vVCyByoHWqfo~Vx`aEl*8`!{4Zz!5 z@7>^|GDYx%4DNN_p0aeUa!=7^8ry#)v z{Gue_@L$b(MUtkX($?Tz$6K!fhp_WoH26h2QdT9ivPmKNCPn)d#^G0wqxnoAdMObI z0c=nkC41>ONp?I2o&M^qayhegv#NJ9wdhfY++`4J zcY#%mP&Cp!bDuKg3< zm<(6N$N5$gb41H7O^@0F+#l6%oQi=+=Zh*wG!|w}mWXg^3hwUTap9XD&q3$B<>euW zi5Nm%n!ceB{uq_T>iWvKs9>v-AiwKYs?Lm!ZwRicBT zPY#DLslVJ|Yraj2H?M2pB`; zenr`3kp5uY-pm;i@}*UuRiD$-j6H}kQ2Q8PC6c4}yX}OzY$- z*buLr?w693RLNfN$Qa0zO9($(05M5wZf=~9msb-ynjfz4mdgYtZfEpFaUshg{en);NmB9Nx5JSFqy&RrUOe-FOdaCqxeSHtqFC`r(BoZe^ zue4^Uy$dHrH zM}$Oqg@Dc$&9O8k@kYzHEx=E#c_$({DI5gn|9rw@9p#crX2uolH%0t~v~;~1ndO<_ z#Q|MQME?KGb^FXBiYl{sxWk~WZ%X?96y0U6ED7hmF$f`k$jK7)TxS|wxitS}^+$s3 z@wE^mkF>+oe%3WRO2zgdzVX|YSB+s>D*}yu{s48=+sb4WD&FW~PEM~|nIRPy_uk9J zGe!E=&*@~V56Rt29k#(7^b{;wo0}DQNXm|Y#=!*S8VhJi2O(k3zJ zN-fUXMG5jg2pueIq_mV+P36G`muZW>Lt>XBk({%lyX#zVMIVW5kKQmOh#8G5+k4N$ z@eJCCuQN`lYWU35pQXGZ$C5zq5a;m>m^}%Geeg;XAm@RdL;A-G@E>vItq+doXkp1D z3pxnWyF_`5?$P2G=^PGQjpi^_=oU9R4Aa3zaav8rf&vQfmlISQE{Ww!gS|#AN3$sc z%^rYM0AI9%x6k`}KMwEM_)Rrh&|$mHoJ}H&p%6`)PyQ`w_Ae}T^$EDT$AboGLGAKa z`0&QoLmt?ACKDk2+_4pLBJsOFu{dVTR%7MGuh}Ndv)WN%KnGLporw& z_%vgwRe{3(6%6S2swmN$mAf0|K~aBIybfB3WRD;fNaAyHz9r&1l(ZJHVK? zj_${7OCfM6JM;{)FuWKXsNx^b-S-#cLeTv#^2@_Y3&)hnaXBlgYS`sm7REDYs|*c| zh5r-&Jl>^dEno=0&o5Ogma~-;%k4;@2jE`)!T6;};W_AERGphc{Gb7MT^kH8iawqP z|Ht`K0dj6S>F5k3ycTZ&LQUiGjK;cw5!%ORGB8=*&kLnPjtvd1qBip=<5dTP8!>-S z7Ha(G2P)iD4fqwP$hw!zv`v^kgt*=zAUB-K#6i1s*7iRrcJ=9e@6U=)7k?5#hmCiD zS`YdEm|gU9X59$FLJ!zFpqVhq(Ea#-tCN?i8}v5M!uto&B^qZ7V)(ZqWrYzVV{>H-FoFA51QuDf!j=#r#=9)ZZ_$U^9kAR#qQ zkR6YGv>QX17967X3*9~u5|bXsVC%q(FyZU4K%2Y zgHP=Dau}O?xvwUKo_q7__oWbzl0?Od_}S+x;wy9;Cx7?YG}JpYi>lkHL$>;SQ1F=J z`9Z>BuQkZv98nAEd?8Rxnfxqq%I(!5)!-n^*q4#|7i~H8{R=l11}{A|4}cJb-& z(f*DCqn@Cm5SX^(yss#QH#3rZ-$oq};o;v%rWP^}WWb}7(2m98&IzPshDJlYQ>o`7 z$EL}YG3!F9JD$KfUdfjR^al<$OzOmKBh9&G8+HO71!|7~q{sMz9h{4k<)4J+Z#vhx(2ZNCFi0f?fx6WPx)Ab1k{@p>!f zdS*g+iu`$l_N%{vY_fuB?(la?YV)D*aUi!cecfLmcsIZ;R8c~T!-n&7-T3TetdX%7 zk{M{`%o@A~2;E_mrUJNMbat{PLulc8YxS?UE@6%hP?d zJirPoxB~4?7;u#yolHfUr>&MI4F7@pp>|aJxGkYrXzm2>W{h5|Y=5>$KBbb=DR%K+ zz;An4XMu?Qbd?Z8&_A|*f^p^}r_MMh(dO>(hQwB)mB?QRA`)_V*E5q+JHkdfMiT22&w`!j{srGTs9usmf;tf`!-}pAOgyLA%?V0*C96PxZu}KH|_V zeY42~B*NGG+&VDK@*M=Ap}VtPZ4c-F5dI$`cJq?pw;KqjiXCVLR0`u48yU7=d525K z)1Uk|X^$UY+s$?A3mmMnWswQ`sDQ|nqzlnYXh@ulgeRY%rZ_L{U?d@k;XL42dGGYQVO>C}cNK15yVnG;jbpc8tszo;P_ ztY<<-l9}**n){$pbWnKLL=FNl?31kUrzm5{{{qz=(dCw@?~vSe#pX-6&;oL28#QY* z4vVIGN$bGM*W!MZEahpB z?fV6tgf|Ryw74bTK7!VVPKP({W(AYw2iyHhN>YE5g zcE?UtypN-zeJr4yT^^QN6Wsh4amwFyJwX^A?|xNXUrh)V}Kv+Mm$ z@f{<+j*H)?dX>Ij$Vf;U=n=sx1fnJ08zxc&-<^v_O5EcP!tPsSSfzZ=fM+=!V0YAi z$c7xJ0RGLw#>AJ#r$wlW8A-sAM#gbhca^pWD)KIzb%@`-RdG!= z8w_JC^z8^k)klmkc~TujrAG9K$CIKe2|65RD=*!#?;VvzqN~Vuz5ozIFHmUgc{JUU zP}=R)aSKE%EvVKDm_pozk1a4IpILER-Fn z%j-!C*O(aHEFfZ|%s{`A7b%cUzvRjL;Uh59!V?4PwbZxD#o8_EZ(&R{!# zi0=;RFF0|hnr<#oQWR&X6VgxpR{ovWEwC4fd{i@>kwdMB93eH8Q6Qac@@+7;_M4$1 zRJ)DwL<5otr;;Et>d%2NFHuS}v5U{Ym%C7EfT>C;>EG&af~hy`-*~MDBuZUY_PeZGMek0ib`+# z(;n>TF-why1flC>F@NOky!zeqN(W47^4(?=1gEAF85R&~6t552@Sgvg0_VE22fh9O z6j%c87}90H3tDGur{Oc+UxPf1@7;cK|2n!j<{Lit@<@9wF@3f3dr!>u>9`k9tn1BzC8r1yc5 zL>omL3d%`8OnZIG+)dUF`kwUTTvZ1ujWx2Py(M~N^6pXXioc~WVxqq)_s*rH)L{ji z_sm+JsgTDpfjgodc`dZh=b) zHBp12?dKHg-i!PgL}p%16kKKrazFLS*lFp3iWoj}0sr1}KZfc+bnOn!bpJ`k7y`EG zS>tcCKR^X9%i%|w>A@a>7zw=?q*Us4g$Eb~q|{Z%7q!J!wU76AVTeS0E?=V>9Z1A+jcZ? z_TTmOoSiyWeZ1(4USI#d3@%|J7M3`*oUz%=?-koW5`GB8B;h4$+8_fgXzDdixfF~P zE-cFJ=}ZbhVI0ruY0fYRMO}DnM?qe}|55;DCDQ?V!lP@D9I<&r8A z3+(@oJV2AsvXw&8;}TH*sa&|zT&~^m4B>RY>lFY27V}U@AKab zViqj5F|mTSZ}L!y1&O9ARvK+_Qp>1*m8;h1mq0cTfJ?7|%-HckNRRci(Aal2!Ev*W zLdMs2+FX9Nyyioj$6`SdG;n^>($X`jGuc@2p50kSZ+LY@XLKsO-A`_NBkT-d-Ixq( z^oA-eILSKpfDm|Kp5&_r4*gU-OtbWT$^^E{+?qBK5rt(CGq1n z*e%f~Gns6xcgzB+aMPWNAK|E*QAr&>uPYc+nS4(wTMkuedxe03lnO%{jQE=OD^%Iv z*Tmo6`Xd-2Ncw%$_(wziSNHC8<%|}X8^6?%dP?16;m#B(6-s`~c$InmT|oW5sg?Bq zv9)9_qtzdj1LigGI?QlR4i4-K2PZ1=cm9kwn+YeL?n~g{3XRs84d)fWYex<#%-!>< zsa`u$OR!MhW2NxUk_`cec1E9Ql@fXhS1&bH;^E_W!i4e)MuGWGL3Z9>Ni5R;uBcz} z1aouas1EuM82|k>Av`lTQ91q3bq|##9=L0#k=IsWZ&Zv6rm%l&x71=J(P{_PnvY4n zc%(t)mS;9>Fn8Ca@;NYXYjp|44>V`s$!_x|sw9U!k=0120o3C?&+%#KmlBJFZTH#ha)d>;YWeTsp5O~7`l3soa$qFI-2RTJMJh%E=(`rx;uJ1 z>av^h40wWz?}~-HPFpv#9CkRbWDphrad|%xQr_5=<$H;=LQ%HPsTJ#c4>_5Jc&65A z%rx=Iz7I$-hiULD3SA&H^P|VISKMboH zoWfH>Hv2@DX_xye;V8GR>a6{Wd16HbuBw4UGSm2JZ+)+5)}F(bEDlWMa1WCk4*=a& z1E|&*o*y8(OBcbA506Hf#qI}BifKcEis^Dds)&+IgLZzvt)CtC`S?%sN_Q@}i z*KM~7{;Za?RdR7|K`)`UR;LnSz0p3M@zbNbEmsS^VTNP+t!x5K@v%VJ4?S*OrgLB%}r&{ea$E6n1xQ zTi&wls5=m}gz>glPNwBq;_!FJ$A%&t@{3-rDexl}q^T1Xrl%|-s5V$-62<$^GhiGh z4L8F3Q$h$QeUA+L`O}WpeNG{`L$e_<-0Sx@XJ^(DR=rBnyC$<}q69$WwTiHXLhFWGLHm5PSFN2z?S1$loi#a2hS6^nNg zPNa;A9T&4?oKDiXTrPeJNE_(;jWtXI=0^KfRVIJytkxw$)tUifII-{LFwNy;FvfgL zxf1$QF-uR8=)YhLp~b`VR367Hdoo2`F?!}G7noA7=Mg7L7(|NKts;7dMU%z}4pt@z zF3d4@xO z7f7Pq`MP{?M_n#d_tfwi4Q!76>u4 z_L~8f#Uu>3!&mn<`pnXgTDQs9`(Dw-B3Uaw5E+$jya%i8%3HRR^SuQKw!8;3aI@n7!{h8e9aP{#Ls5hSTWy{64fI z=^6wTx*15lWOx4aMMTQ=?PY#`og4PQe&>mXN zDb50DU)r+#c!aEd0PjfHR!Sn_@wN#?qqeVF&U=mE66(q>AzKmU zl-r|)v-XUWd^i+*i{pRXJDZ_91H{j}-($ATVO=fe_mGwkgzOfTQ?IUPgw&S3yiC&Z z^;I>ssBC76qHq_-J@$_)DT6yEEapYyJ=eMD%_pmwPfA4j+O%bqg2gf~ck30Zv`RoU zSXgCs!cnzu;js1OkD@|oQvFQnd$pkV^z;lB9xrW})9>Yt+K!zEMTyYREf#AQzw&3G zJlZz0DYCgr>c59$CvL|aObVf{C*pX*le!DkaoK9>iVC9{@WVa3X&#KVW*3c(_%COd z=;i^(^Ah$Cbk3zJXVZU!jrLhpbHqz%1?N z)Y@eBJv3TV2k_AL+hR99Tx(*Dtt97^ysSE?;^O^Cr+tlQ_NENt_CqVT5*Ql z7YwMu^4H1bt#B(=N@oV}k*1E&C#6;IBPKop%Lg;%l=s9O5TQYge9$H|F!=n)wbbaI z-r)H&%jfi9&m*hy1;J{T?FSkAF8iAfGc;l5R#_&ah!mdaH_t(syd@W!xn$Z#)Bn=~ zB&_pa20_6fn&H1J%F$bjh(L8x*yW4w`B!9-B^*SbSBjzyoxj=%V;xt%*a9BzfXx=ayXOk+E%273jo%-Mc0TtmKg zdrDqiGilKFmP+Z|YZ}t^<2G~MR+r9T{y6;kNoyBsvY&5Rn$>TabHltia~^!8Mb}3g zEY#;`>JH1+{zptrAA_;$6@6AtsQnt&%6^oY0gl&A)652W=5Ts@y$3-T#=q5IaEq{6 zs><~vXs4rFMtO6m)8q?DwPIDh??Jcsuo}T?_0T%Znt2>QB9?hBNw1x947Q%JY}Rph zx;gZeJQ7_c3(!c+K$F#jrp(DFOy*wNzO?J@csC{3N!lY}nV+^idwbolw zgRSb>^1+3zFhLHaP7-Y%^AzQ*jeJg92Ia0VGOr_Dwny-#yLBhXN?$gzjLLA8n#r-S zz^W#Uxc!XaKYDiCkY%Zx=>5c0AD5F9P^nmn#e z?~Z*yIy|~@xYe9a4F=}GSWPBfZ4O!Y0pmZDwbKs=gF*0OdlTs?br(N4#P_=Xst!=^ z4;`mFw6g8p7-Z`UtGji*(MM)j$(_jUnaz<3quG#`PC39LRI&Q*eKY3%#Kf~<*B09M zxHa$%K1GrL&$PCp_)-O<>WI3bdSZPDu-{Xez$8`{AX^Ozx$D}&Bi%gxAQTC%RgwY(DGxe++sBd zToXNxd-B(Juv$-V-)XpB`jK-Hl#rGH^G@P~(7n2wI~ke{nWx?^PnE~J3I9VnLY~tn z5rL4p6Rq?oy=9v7OL>-G0%z^ij$IwSHld|o8@s^ZfU#9BF1^syJw2`1aw1tfW+J}p z^Fz(*r!B?#GDU+JFzt+JB)#MVJm@$1Wo2rb1H;MDji&C|o7>tm6GFG`KLT2lGaWGM zx1)5Xx`b=p(`YVp>U6FJC!y=^vO*e)GZF%Y#Bb{+YAsZ)?|u8jj;LqO#3;D3H)2;8 zzVjlz@vP*dR)T4j=&YpG3S*jzCv8}t7yOa%nWmlp=ndtgk7iTrw7hHGZQ!}>la*bHihTdp8lf%9i>CP6(IDGe6^`9Z> zoJg6_MKV*iwkWIquml3yVSHBtjbgIl(`7*Bz17pMll6RiOR?kT!&|kgB(7F`SX{0qt5-(|{5LAG>uJ+Fjm| z`(4M)&NC;AKW{19O4$kx0@0>9fZSeAICdqGJ{ye5%}hnRu6_91U8GnDrdj{q=ug_# zCm33OA^r3_5U%6HWVjj&JX0%22k_Bpzj}2I7|dz)EDQ{zQ8KvD*)Yop{Qj$t?2ha2 zv*{i`wLn}w_)dBGTKlG4mJf~kD2-K zoUqD8fBWb*Y?$fygLsYwj4g=VPjMD!;p;-1tt_tWV3H{s7n6N%XYW+E3_);)ie$9Vu9fvvp1DD=^JNRm!gm^{ zXTRIw5*5!vGX*WzRDoN#Eq3{p$!03LB~(lqrlq`{o(E(`px>(`;U1mY-|#&x^;ttF zde3|ImWi{uRNm~s1*etsYxJf4WR9qi_V7Is+O9tcB%SAYbD8tY-N_8uD&QHM85R#B z(svk)EIbj64)>^&%G`%H)EtNu0Ra`=#0?G8U@&!;ZNFTPR0@OmT80U#EUZD{7)d|o z0}|{ZR(tE0%Jdx*lsJaWnjb$3KMA^hkRPk_Sl+EOiCPgU)9(~|yjHWt%>gZcEzR~0 zF+ApvM(GG>P#C1#3T_)qCFJ}oGPiwe?Ov`U`TKS&uHHBCGfD!2E_Jl|2Jectg-oF>-JHim z0N)FWd3x|UPICe|^a(~b*HKO48zsRT^-{3O2oK|J>%f%48NXE_4$Om9{YS8=9~sdN zK&TZR@MPZSJtDVsb1}(Tf&eNo{+?rAepPBm5RpB5`QKEF+F<0HB})pKorh_v*tBe7 zElRN9jFGDmls+w&&ngqx>KpIohLR;>2>(@%=KX`n_ssH{?(SHWRK(l=`RS{dQM$Uj zR3Au3#az4`Y28RY9WIqewynPV#nM>~pK_R0(E2DxN7{OPTBgCUbAQ&^lw3A1LNDD8 zSAvhC^>nR|H=dk`Dbj5hc)B)`+@w`5Gi)YlUa03XTqtG!-T_-AvNy)q0kjg(+a!K*BGN53n+z3K%@mRZQ-`irJ7I4T{ zwLyqVD=L_*JlweZ#+p`%KE7_qNqy@KOGP2l@o`biL|MhyPnHxKb`%>VQj!+{GySl(Upwtx||K9P9Jkc^O9$Yr|Y9GoxB^YBl5FH|X#`_Gm# z&>%{bwqE3tB0DXOXv!Rw51BfaL}u;;!HpOudKzysx^6GaPt%QM!52P;|6sp-SSgcV zH(0JM%PFYgEfN9@A}qB?2SH@M8a*mKeK|`hbwGd@3{Z*1nMR++#?|a)eKr6_eh2lC zEpF(3+6MxQBC9}kAt{r;jTb|R6cnCoUd55bo}VMWfJDwMJefH$<5-NYf|oCB3C6iR zJwk&Dm6blZo$yw_)+}5h<2)9SR3}_R++vOPfRg;GW^hDB+TG|(X(C83AixLuQ1Vvu zUxw_4xsZ)sFci!?q<#8ttqN7ADJ!k+#K7_9+Gy?KuT@@>K(^Lq`M$TzQ%gP=9!yO#jY}hHfh^AojU-p-41;PEAyd%s;4RD1 z52PvX>z~6vnPDUFWOhvZB@djCVGt8DM4B9EihR5GUjY{Z)AUt_Yozyl^C^c2#azik z?C8r9rh{>=NRRkORIMt`cGCjV&?#c|1H9QXc;2)yXZ-KPMA;iyg#(M{obtTt^P{{H z-@k&ylYOk=gEGpkfv6_ZzuYE&ol@CMy+<-9_(8JdBP3491^c0JHn1l>-oyi%8m2Vu zXT#BYe)($=sK6nl{jdiv+Xb0~0mX9oxexxA*n^gfHKw5tts1`~vnR^E`iVLmr{mh+ zOPQM&s;jGiPwWWVDQs=kDJv^0zgEWt#i(bLMTLeH!}$xc0(TC7RLW@=-R10MyselrIv9*ElrL1t?+_dHgXsSBI*`zmXWiv`Lk*r%Bt*o;R68YO;pK~C zh+Q=x{%FAb3LzOE5>gNi3O|b9Isx8OTuD{f2`gb~udzjaon6>q5Wl=}IupG8xZ+_h zz}#k+lW6Gg^+i!W91)~Cj z3ynK6W-HQ7HCG-WO@Ml*z67cdwec3yZ5c0cMWOrbG0H}2{-;1zO3-6fn!C;|L8Xd1 zRTMl9V_-AJCEc!V_qx+h5J>s53Ob-}Oqw0xUw!Mn*>wOvr=bx5Lru4+Nk1cn59zuy zu;1~%lI4w&AOJ@$Q3@=T@+`RZLjRJ4Q;mKDDsM_;MxAEmyW6vrn`kiqj0ZFL5r?)D zi_GHm_Uy@0_BfUt65y9J2X6Yv5{pmwUV5I)u z=QI+$P+p{dzjIr*byRpfT=7gFs(=9UK){F{Mq7wEPxwOO|9A89FZPE!IqIQrgfch< z@2tHrX}u|uRP+rZzPW-k4HlXwP<^a;T%n@8y?43Ke*2{9SAi@xN3F0)|IKuyXfF! zXzy10AV>!?Z$<<*0gzSJJJZKueF4GC_--4c;<=t|TZ95NFb_fXpc*~@d&|q7PgiX^ zh|YI4%3(HkIMMc1$p;O5DjoO*Q&=sdR`Un_~UlG*~;Z03C{vVMOv+&ZFazJ zJzEmDcZBQ=ebOsd5?`bXy#-ThZf?#F(7;Xq_TGc)C5INq-Q9hwWe0M zrk0kxhK9yti|s3k~~+0bCItYKt>JKCn|DfNi*|8{g2E zlIQ6J-s^Js_&)2sm!$tyn1mVs4&A z7LowA!@z1d;O8e!<#&&0kQe|F04E2KcT7jWE(ua4Q-p>>y@zLx#)e-$A;-XLQK3kF zh~gmwG=H?;g)`WID1Pe(M4N4V349|W!1h`S3#21aq3o}hQUlTV$*)y#mEDPZ_E$&e z-_N1ibJ;tYI9idmv9+yueivpV5!cU-4ZeAe>;W&v4N)?p!1&jT?Ot_Y03(6NqO*PI zwcKdW01A*H=CKAhM52+FqgZbU2?p^K_TR%FRok1PH1h7#@jFbq4e=mkPUg~vb6ph;#0fJ7-@Ivf zfQv*fU@v2M4%(Ij_n*%0Xi@>QKmADTmyne@3yT;*NNn-mXxu9dJDqCB~Pd02k1X2;3*%IdgG7V`7n!HI-LQHUY@Du~LE)1v55 z{4}c~Evu-gcp$|3!J<9DIZz!p-`U~8M;sp9RYC*Cl`=W@tXr<41td})=frObnr|9c zB9Zdg%a$H(N2e7}D|o?v{`#Cj#InY)r`KQtD!=zGo*%sYSmxMOhkWX#@>^;@_Mz@e z7ee0Z)pCV+;+N+!fjRiGr*bP<4kgdoCIzp$^to7KgPRN`kO0{iR92V0t1Y$Quav*9 zWv7Nn%$XQ9#@wc`d3{EdBKEo6LI(ed?kMnnS7~vdN#F@^yHJOqVG#Sp13c3I?PQQC z6joUIr^bnciy8P6*Jm;umg|RCQ4{JG5eW!oH2kF8f9{fQi-JGU44P=`Um(X2jWAE; z%XhnqtSsD;#J=@j6O%okWC#x2fGS1qA~Qxt_FbyzgiFCwYpax6YYM~1`fx>?8TkvJ0S0Fl2iI(ybW)s2mfPrHnJ#QuCv zgU2vZ2&za$mv>ZP74;xhtN5@c!>R>4zrlU^MJYvIbG4qqxSP2Bip~HR>lT;;QS~vp zUqorT2E%-HC`DMxQBq5B@?Zr=cYFEW7r-iW;lrwYQ^Q;3`V*+$W<*a;**^dBj6Tq9 zar%rXJL?mt|6hC885dQO?H$M2b#~_c#+h+v#+lt6lH_o2N1B|Yl7qk%=u3AaAWi62 z5F~>VCAJ_rN)iy13<3fUf`TAO5)ma!5Xm6=y6dtZXWy6ialcg6Ip=>)g?s9ps(bqf zXXI`i?0OJ+PxFM1o2;p_=IVsKTirb86ova4g!R=@a1oJ6lytekj31qi9xeV}0;=MM zrkrm^hP+b0UTk_675rPnNyuNz`3+^~xQA74`ZaFg68d_?b-+O65>#<~(*(Ma&rP9g zC+*`5oj8r>A@l99=Lb`=ya0Uc<<}$C6Z1xw9Ub3TjWP;vR?FnwAXGl<`tj)TlSpg% zKa0rppPbtk>Vph}x{jG_Qmh2p9i3Pdt|w@O<`nl` z52^m|)7+FMYW_u`T;w%Y@y)seJ+=G&dbLegaNJEPtuiq`apz?DGg98axEbn3Y+_1r ziO92?l3^xur%r@l4}9TL?KT+s$tHFz;IsU}!YRpt9&^ZoO^)z|$TV{0d{l4BE{SWT!r;hzHZ7Mr1n*U^JP&Da>E8OYd{i^7<_Mgbx z{-eU-`EzoVy42%$g|pf-r)O{CDeBoLkNwcr-*N9kt640f`h40^B+&YAZfZ?g{2#a- zuGOCx4(UhM{HLz?9LtrA+fnJC$=a}IC)XF;lE^Nhh!i<9cjNKLPITD?sw(Y{a~3(H zM@t5;7g#SU{LJ;LZ-n{=DqY$u)xKQSFIPGn%DXCv{_TW93|buTC=!(Lb-wUl%{E)^ zNhRheI~ITIFUAcbv@dE^pLgKjd!P)bS?3cX76$flvA1Tl%egzMHgJ3|(nq|OE|xWQ zTPa^!a)un}sLq#pwEizY8r9Y1NEJ$ooO9pkn($4-vGS+zBGH&Dh@Cuc^fnUU@2t9+k5s$nGuYM={ zPR%EBbH}eUy3*f=mj;e!J3YKECtmIrxz1`V{_Yg5!hyZEpH? z$t=%R#7^lQvyDpAU*X;d3dH07b#j^erlE)Zb2B|l|Gv83 zG+ud#?&V#fC!>(}Q5aYTg`QqutF!LkMZODB>Hq4NGiDnCd|&>1B7xr@J!TRk`WGB8 z$nX~IPYdC%PjJ5-5xTzla|ovU3qSWZ9**+;_@|~01@;L?g3J>2LcsT({{;T;Aphf& zInMXrmHbcc!*%Na`sDwyYybatH=-dyJ96a6_qrOYX8-QgKB4vV7e{d~f9w#T>K!@q z6P>Mjj!kuUzv$#ZWjnbsWiB&m(sXYJ>P07-C)G>BndQbj;wx~O?#bdx1>^)^0Z9Qm z)4`4A#B}76D2wgMYe@B|UWWjU^cT!J7N@7GuH zo@&c$L*%oTtBP*IfG_I~SU6C^Xw7sQ1@Ijl00$k9AYUAnWPO-yaRv z>B(aNgseb?QHRNeGh_VV>x0wTt>GB6UD z+>)CZfU$h@R9!tC@d&!tuKamN@an_TO}TBuUDuYkULpLiwv|76<-G5kcKdkSytWT$ z(3N5u(|9eI(NlX}-QraRf|wt)f7+g!?tgqXHU@|&JJs1^uJ;e#mjV)PlCE33|He!| zFB%a*_h!mmZv$J9p?=AGc>~c;R3d%1AsLC^OkRT9>>1Zg~m- z>60?$XIJ+(8~3ZXfi*^!>#`l&_xkC}dylUwQpai6m$6Z zUY3Q;1V5eMuLV$W|7*4MS*P&lkpPYhNm{pMmWygLfB;(;Ndw? zL!Aj&oFWY-stv?VT6$-;-OIGJaw@3`o(k6MJujOqciu|O6hQ!-@>Km(FXB0`2Pm{W z?~#Jcv;AfZ1Zil`LYPxRXFkCPyQ`BKUKNEHr075)m9di=%A+7i5J5s>n+-4-(f%{w z(l27Jk2~xHH3MvN8>mRVj-}%%8MPR16$A&EUY^3EVaURj0E4XoF|=bv0S2$rs*$~| z4%&p+moJL8^0wqF_tHjhpI+)%eUZhq=hI#;gXiW1wWQ~@8b7@q->fVRdRG&sp(E{) zOM&fEatQqzD+v2WM=ri(IbjZ%a?VsDr`*(vta&2FQ7?t@>I?>-G|MF=-w zI=_2vQUr_%z6Iqd5Ne~lLQM|+)*Zwt7!GiX2ip|JWswPE!1xY2LU`4 z7v=d>;XXhS^I+Lac`rp{0fl18h!B_`?IfPaXLm&t>`@4syrtZ2oSm-`<246N5J8E1 zp9$1(w~RMpgkqnx;p5UVajwflS>J1G4Z-Qy7A9klhdc?ylxYpMx5EhPnQ5e_9S2x= zH(w>y-r2QJPnMM~i6=H1JayM0ju=OX- z^AsR*^JVcG6-QH`jVAaOtxp;)_32BHhQcj7`2iR18>z%5ip37pBVGTjD8g(&5TT69 zY;+Z%Xc=Vz&XY@C)8_ThF$RKElbGct71?)gtxiZ*rA4i;ZHAp5}3 z!`5iPf^#`$!5X%A6P6BL?}2;^&A~{=Jvd2%%0B?KvBW_FZ7@v;Dz?^*ucetXpC#I8 zRYkRrGndaVY++y!Hekx|GUg8SX~9>c={Gq!9kOHULziSguQ+TT1O=c%Mi}ZIw-L4@ zf?TsxMpHfdUQ1zqNCP1TBE|AqUFgMTjAbrr(mAR;Fhv~q?GX`+=TysU>RO4vi2+Ok z$P5sK#$S+^SJ%aYJrpISIh~esW~}f)ND6<|SS^~>W6Uts;!n-Plmx+Oh^p;slmaZa zLq_q9E9(IRc|tea#GCx961+8_V#Kq(ze6z>; zEVh5>?cDM%{|GD-3jmvbAxQ7=B5s|-vKv?M@{y95CE`QhCRL8DOJB-}#xnGYLSa3pDB{gVNP(H(%L0Mf42 zJvycaN@^Iw=nKafGA1EdqdNC6b~+Sf4;!v0FyyAJEBt_r;x%7m(bz~yut7YZ^XHAd zazD6ZH>Quu5tDPk6NGRGsz1EQ14H=kJ0fC;Kw9Cj=7NYN&}Vd%e8jQ}MC{_stOKTO z$F)L)nIs>t`qr`#8y%$!i`RsYR1Z88qaN&&w)QaltI88FeB^n93kshj)Nfjy9ndmScc{CKv_Y@2T8$vtIpGv!(_K&JZ|PLZr`< zK**4BP6jII`r@iZ!C2!1%IN2fwG}K$(z1YzhLtOJ2Ti^D0k>XbF^Rf6U(uM%nwLog zVnTcosEH$Cl+%bnX1^jJp#m(x$Z~ijL{hi+0(@E(oT-7$kDMWHGt~vSgtNIGGYDD; zu<+UjU{XE+5=9u{DPmKxv-{7JR|r1ZKqmVlK@(wuCgC0iC+7na0!oBTYBRzIGvEX-2D<4 zwN~bsG%zn&k2xzzSXYPoyj|iJmYLCs%>p`DXBdYtTnH@bwa091F6i{(bpaa*z5*NY zOm}~B_wLh~WkWzSI*Ke@m(0$&1=w=R4zkG{b=g;#0IWSkU-f5Z_*vweJvO8VDM}c0 zspZ{EW)6Btll9+; zU--z^n=w^-X3-UJA@U#f@u@A0j}3n6!%p}1mM?#{VHxd@HMnq;TzgCJw6D#!c~sV1 z(FX=mdnL=BNolg)ZYTH72w0R<(@!H4J(MJKggN8U$m2o89w3rE?FwqJ@Gw<9_e1!# zXdhyTtQJ)h;K(VW!k4CZ%|Gs8ApxW%7zGguLM3uzoYh~3JTK)rMCKwtPAxKG=Ct^ATpy8m{rA@6}sgb_6v7>|8 z1nvZpE#)1(IOFSQwGG5w!(Jc)i3SeCwBQs3#WE42*<@tD9`iN)@Y4cdjfgoMK3b)| z-=I!AFj)a?JqVX_)29;gJROCwmmZ%FBZIYr$f_~bMm%eI5{P&s0(*Q6s!Pv3lBWk#LHaq+ak#*4@ffqzG1T7cD~Rts4d6I^ z7sod9M~krlLP%gSYylryuzshbx@p5dK~LGb2ZZ#o=@!s%*cdezv663=wI9@Pb0_t0 z2rY(91V0@ZzJ;KVI>+k!fd!PpnF&bh4gdssb-7^8pt}DuCTf=8{>RkB*}xGze~j=h z;Nh{OIUFy3=4ta3!Zy37<)j=tyCY|0IIhk)FJ+RV_?~;023SI_P~iE5+j?5P{XQd_ znu;{vPqtphbeZRlplcYX70koIz@s3%;FD%Npm8FnXM)UjS_X~US@QgcDa!WE)k2UV zAiCHk;PMcIdoiwgiUt-$n*wV5+-lHY@E8K+u^r*J-jM8xGEe$9}gvh8#)ADGr$f& z9j7e}211$;WFV+d%mSRo*9|zd`)wW7siVm)X#PWlA-H+dD;_&rlClE<0N0yzaj(A( zEF*-5XvIBf1;;Pg!G62`0p)a=h~#sBM1ZtgGo93&fY&!Lfj}hVz9x}V(=niWy6PD` ze0vQH1CKc;{rq-jmzb}hhcHEV8D?PpfKv@brogRYw{Z?!2W;AjG^zu$htbH{$V7&0 zaSK5NJN!UuU~G8D#gyBiMY0+Fz_yq1{MgB=O>IFK{HLS){X?H5UK{8V_LW(c9CdfD zQ>-~m1^CBcG*GL@<@#u#&bWx3KFGsQIwSjJTe|?48l{TWwT1r9+)S_Fl>=xf)83$( z*Mgw6tvfqYR13g{5#&HG@CA0$q5(Nb6cE9w6aeU@r|YndN5WDuo)L>6Ag#hgRsnM} z7rgVi2AEw9{%Le47HEIAIy_h^3t<_JA4N+0)>mtGtm?-&ApzW?eq=e~_TqN{13 JQLbhm{J$t~5mx{J literal 0 HcmV?d00001 diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.rst new file mode 100644 index 0000000000000..a9e7b4bd69bc7 --- /dev/null +++ b/docs/source/design/arch_overview.rst @@ -0,0 +1,274 @@ +.. _arch_overview: + +Architecture Overview +====================== + +This document provides an overview of the vLLM architecture. + +.. contents:: Table of Contents + :local: + :depth: 2 + +Entrypoints +----------- + +vLLM provides a number of entrypoints for interacting with the system. The +following diagram shows the relationship between them. + +.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png + :alt: Entrypoints Diagram + +LLM Class +^^^^^^^^^ + +The LLM class provides the primary Python interface for doing offline inference, +which is interacting with a model without using a separate model inference +server. + +Here is a sample of `LLM` class usage: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + # Define a list of input prompts + prompts = [ + "Hello, my name is", + "The capital of France is", + "The largest ocean is", + ] + + # Define sampling parameters + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Initialize the LLM engine with the OPT-125M model + llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct") + + # Generate outputs for the input prompts + outputs = llm.generate(prompts, sampling_params) + + # Print the generated outputs + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +More API details can be found in the :doc:`Offline Inference +` section of the API docs. + +The code for the `LLM` class can be found in `vllm/entrypoints/llm.py +`_. + +OpenAI-compatible API server +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The second primary interface to vLLM is via its OpenAI-compatible API server. +This server can be started using the `vllm serve` command. + +.. code-block:: bash + + vllm serve + +The code for the `vllm` CLI can be found in `vllm/scripts.py +`_. + +Sometimes you may see the API server entrypoint used directly instead of via the +`vllm` CLI command. For example: + +.. code-block:: bash + + python -m vllm.entrypoints.openai.api_server --model + +That code can be found in `vllm/entrypoints/openai/api_server.py +`_. + +More details on the API server can be found in the :doc:`OpenAI Compatible +Server ` document. + +LLM Engine +---------- + +The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of +the vLLM system, handling model inference and asynchronous request processing. + +.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png + :alt: LLMEngine Diagram + +LLMEngine +^^^^^^^^^ + +The `LLMEngine` class is the core component of the vLLM engine. It is +responsible for receiving requests from clients and generating outputs from the +model. The `LLMEngine` includes input processing, model execution (possibly +distributed across multiple hosts and/or GPUs), scheduling, and output +processing. + +- **Input Processing**: Handles tokenization of input text using the specified + tokenizer. + +- **Scheduling**: Chooses which requests are processed in each step. + +- **Model Execution**: Manages the execution of the language model, including + distributed execution across multiple GPUs. + +- **Output Processing**: Processes the outputs generated by the model, decoding the + token IDs from a language model into human-readable text. + +The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_. + +.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py + +AsyncLLMEngine +^^^^^^^^^^^^^^ + +The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class. +It uses `asyncio` to create a background loop that continuously processes +incoming requests. The `AsyncLLMEngine` is designed for online serving, where it +can handle multiple concurrent requests and stream outputs to clients. + +The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo +API server that serves as a simpler example in +`vllm/entrypoints/api_server.py`_. + +.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py + +The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_. + +.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py + +Worker +------ + +A worker is a process that runs the model inference. vLLM follows the common +practice of using one process to control one accelerator device, such as GPUs. +For example, if we use tensor parallelism of size 2 and pipeline parallelism of +size 2, we will have 4 workers in total. Workers are identified by their +``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while +``local_rank`` is mainly used for assigning the accelerator device and accessing +local resources such as the file system and shared memory. + +Model Runner +------------ + +Every worker has one model runner object, responsible for loading and running +the model. Much of the model execution logic resides here, such as preparing +input tensors and capturing cudagraphs. + +Model +----- + +Every model runner object has one model object, which is the actual +``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various +configurations affect the class we ultimately get. + +Class Hierarchy +--------------- + +The following figure shows the class hierarchy of vLLM: + + .. figure:: /assets/design/hierarchy.png + :alt: query + :width: 100% + :align: center + +There are several important design choices behind this class hierarchy: + +1. **Extensibility**: All classes in the hierarchy accept a configuration object +containing all the necessary information. The `VllmConfig +`__ +class is the main configuration object that is passed around. The class +hierarchy is quite deep, and every class needs to read the configuration it is +interested in. By encapsulating all configurations in one object, we can easily +pass the configuration object around and access the configuration we need. +Suppose we want to add a new feature (this is often the case given how fast the +field of LLM inference is evolving) that only touches the model runner. We will +have to add a new configuration option in the `VllmConfig` class. Since we pass +the whole config object around, we only need to add the configuration option to +the `VllmConfig` class, and the model runner can access it directly. We don't +need to change the constructor of the engine, worker, or model class to pass the +new configuration option. + +2. **Uniformity**: The model runner needs a unified interface to create and +initialize the model. vLLM supports more than 50 types of popular open-source +models. Each model has its own initialization logic. If the constructor +signature varies with models, the model runner does not know how to call the +constructor accordingly, without complicated and error-prone inspection logic. +By making the constructor of the model class uniform, the model runner can +easily create and initialize the model without knowing the specific model type. +This is also useful for composing models. Vision-language models often consist +of a vision model and a language model. By making the constructor uniform, we +can easily create a vision model and a language model and compose them into a +vision-language model. + +.. note:: + + To support this change, all vLLM models' signatures have been updated to: + + .. code-block:: python + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + + .. code-block:: python + + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + + This way, the model can work with both old and new versions of vLLM. + +3. **Sharding and Quantization at Initialization**: Certain features require +changing the model weights. For example, tensor parallelism needs to shard the +model weights, and quantization needs to quantize the model weights. There are +two possible ways to implement this feature. One way is to change the model +weights after the model is initialized. The other way is to change the model +weights during the model initialization. vLLM chooses the latter. The first +approach is not scalable to large models. Suppose we want to run a 405B model +(with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should +only load 50GB weights. If we change the model weights after the model is +initialized, we need to load the full 810GB weights to every GPU and then shard +the weights, leading to a huge memory overhead. Instead, if we shard the weights +during the model initialization, every layer will only create a shard of the +weights it needs, leading to a much smaller memory overhead. The same idea +applies to quantization. Note that we also add an additional argument ``prefix`` +to the model's constructor so that the model can initialize itself differently +based on the prefix. This is useful for non-uniform quantization, where +different parts of the model are quantized differently. The ``prefix`` is +usually an empty string for the top-level model and a string like ``"vision"`` +or ``"language"`` for the sub-models. In general, it matches the name of the +module's state dict in the checkpoint file. + +One disadvantage of this design is that it is hard to write unit tests for +individual components in vLLM because every component needs to be initialized by +a complete config object. We solve this problem by providing a default +initialization function that creates a default config object with all fields set +to ``None``. If the component we want to test only cares about a few fields in +the config object, we can create a default config object and set the fields we +care about. This way, we can test the component in isolation. Note that many +tests in vLLM are end-to-end tests that test the whole system, so this is not a +big problem. + +In summary, the complete config object ``VllmConfig`` can be treated as an +engine-level global state that is shared among all vLLM classes. diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst deleted file mode 100644 index 58a888b17ba53..0000000000000 --- a/docs/source/design/class_hierarchy.rst +++ /dev/null @@ -1,74 +0,0 @@ -.. _class_hierarchy: - -vLLM's Class Hierarchy -======================= - -This document describes the class hierarchy of vLLM. We will explain the relationships between the core classes, their responsibilities, and the design choices behind them to make vLLM more modular and extensible. - -1. **Entrypoints**: vLLM has two entrypoints: `command line usage `__ with ``vllm serve`` for launching an OpenAI-API compatible server, and `library-style usage `__ with the ``vllm.LLM`` class for running inference in a Python script. These are user-facing entrypoints that end-users interact with. Under the hood, both create an engine object to handle model inference. - -2. **Engine**: Each vLLM instance contains one engine object, orchestrating and serving as the control plane for model inference. Depending on the configuration, the engine can create multiple workers to handle the inference workload. - -3. **Worker**: A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their ``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while ``local_rank`` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. - -4. **Model Runner**: Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. - -5. **Model**: Every model runner object has one model object, which is the actual ``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various configurations affect the class we ultimately get. - -The following figure shows the class hierarchy of vLLM: - - .. figure:: ../assets/design/hierarchy.png - :alt: query - :width: 100% - :align: center - -There are several important design choices behind this class hierarchy: - -1. **Extensibility**: All classes in the hierarchy accept a configuration object containing all the necessary information. The `VllmConfig `__ class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily pass the configuration object around and access the configuration we need. Suppose we want to add a new feature (this is often the case given how fast the field of LLM inference is evolving) that only touches the model runner. We will have to add a new configuration option in the `VllmConfig` class. Since we pass the whole config object around, we only need to add the configuration option to the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. - -2. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the constructor accordingly, without complicated and error-prone inspection logic. By making the constructor of the model class uniform, the model runner can easily create and initialize the model without knowing the specific model type. This is also useful for composing models. Vision-language models often consist of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. - -.. note:: - - To support this change, all vLLM models' signatures have been updated to: - - .. code-block:: python - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - - To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: - - .. code-block:: python - - class MyOldModel(nn.Module): - def __init__( - self, - config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: - ... - - from vllm.config import VllmConfig - class MyNewModel(MyOldModel): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - super().__init__(config, cache_config, quant_config, lora_config, prefix) - - if __version__ >= "0.6.4": - MyModel = MyNewModel - else: - MyModel = MyOldModel - - This way, the model can work with both old and new versions of vLLM. - -3. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model weights after the model is initialized. The other way is to change the model weights during the model initialization. vLLM chooses the latter. The first approach is not scalable to large models. Suppose we want to run a 405B model (with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should only load 50GB weights. If we change the model weights after the model is initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea applies to quantization. Note that we also add an additional argument ``prefix`` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where different parts of the model are quantized differently. The ``prefix`` is usually an empty string for the top-level model and a string like ``"vision"`` or ``"language"`` for the sub-models. In general, it matches the name of the module's state dict in the checkpoint file. - -One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set to ``None``. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. - -In summary, the complete config object ``VllmConfig`` can be treated as an engine-level global state that is shared among all vLLM classes. diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst index bfca702b9267a..5a96cc8b3a464 100644 --- a/docs/source/design/plugin_system.rst +++ b/docs/source/design/plugin_system.rst @@ -8,7 +8,7 @@ The community frequently requests the ability to extend vLLM with custom feature How Plugins Work in vLLM ------------------------ -Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`class_hierarchy`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. How vLLM Discovers Plugins -------------------------- @@ -59,4 +59,4 @@ Guidelines for Writing Plugins Compatibility Guarantee ----------------------- -vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. \ No newline at end of file +vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/docs/source/index.rst b/docs/source/index.rst index b04acbbce4169..c2afd806c50f9 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -157,7 +157,7 @@ Documentation :maxdepth: 2 :caption: Design - design/class_hierarchy + design/arch_overview design/huggingface_integration design/plugin_system design/input_processing/model_inputs_index diff --git a/format.sh b/format.sh index a57882d2ac3f9..b3dcdc15bf948 100755 --- a/format.sh +++ b/format.sh @@ -299,6 +299,10 @@ echo 'vLLM shellcheck:' tools/shellcheck.sh echo 'vLLM shellcheck: Done' +echo 'excalidraw png check:' +tools/png-lint.sh +echo 'excalidraw png check: Done' + if ! git diff --quiet &>/dev/null; then echo echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" diff --git a/tools/png-lint.sh b/tools/png-lint.sh new file mode 100755 index 0000000000000..a80fe9837342f --- /dev/null +++ b/tools/png-lint.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +# Ensure that *.excalidraw.png files have the excalidraw metadata +# embedded in them. This ensures they can be loaded back into +# the tool and edited in the future. + +find . -iname '*.excalidraw.png' | while read -r file; do + if git check-ignore -q "$file"; then + continue + fi + if ! grep -q "excalidraw+json" "$file"; then + echo "$file was not exported from excalidraw with 'Embed Scene' enabled." + exit 1 + fi +done diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 92fa87c7fa45b..ee4b6addfd466 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -793,7 +793,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=str, default=[], help="The pattern(s) to ignore when loading the model." - "Default to 'original/**/*' to avoid repeated loading of llama's " + "Default to `original/**/*` to avoid repeated loading of llama's " "checkpoints.") parser.add_argument( '--preemption-mode', From 25f9c78961daae10b9084d78901d71bc56691aa1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 19 Nov 2024 02:43:21 -0800 Subject: [PATCH 055/255] [misc][plugin] improve plugin loading (#10443) Signed-off-by: youkaichao --- vllm/plugins/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index fdc848cedf054..05a9739d99e71 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -9,12 +9,19 @@ logger = logging.getLogger(__name__) +# make sure one process only loads plugins once +plugins_loaded = False + def load_general_plugins(): """WARNING: plugins can be loaded for multiple times in different processes. They should be designed in a way that they can be loaded multiple times without causing issues. """ + global plugins_loaded + if plugins_loaded: + return + plugins_loaded = True import sys if sys.version_info < (3, 10): from importlib_metadata import entry_points From b4614656b832aa8ac95e5450ca7b861f46049635 Mon Sep 17 00:00:00 2001 From: Yuan Date: Tue, 19 Nov 2024 21:16:43 +0800 Subject: [PATCH 056/255] [CI][CPU] adding numa node number as container name suffix (#10441) Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 14756b5964aaf..f0128f091b742 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -13,26 +13,26 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile. numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup -remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } +remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; } trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ - --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2 function cpu_tests() { set -e # offline inference - docker exec cpu-test-avx2 bash -c " + docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " set -e python3 examples/offline_inference.py" # Run basic model test - docker exec cpu-test bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pip install pytest pytest-asyncio \ decord einops librosa peft Pillow sentence-transformers soundfile \ @@ -45,20 +45,20 @@ function cpu_tests() { pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # Run compressed-tensor test - docker exec cpu-test bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test - docker exec cpu-test bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # online inference - docker exec cpu-test bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=$1 From f028dff33d3d0b0dfe71e0e0354b355b8232a4ec Mon Sep 17 00:00:00 2001 From: COSMOPlat Date: Tue, 19 Nov 2024 21:42:50 +0800 Subject: [PATCH 057/255] [BugFix] Fix hermes tool parser output error stream arguments in some cases (#10395) (#10398) Signed-off-by: xiyuan lee --- .../openai/tool_parsers/hermes_tool_parser.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index faa6f653b835c..18816cd665b3e 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -12,8 +12,6 @@ FunctionCall, ToolCall) from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) -from vllm.entrypoints.openai.tool_parsers.utils import ( - extract_intermediate_diff) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import random_uuid @@ -190,8 +188,11 @@ def extract_tool_calls_streaming( diff = self.prev_tool_call_arr[self.current_tool_id].get( "arguments") if diff: - diff = json.dumps(diff).replace( - self.streamed_args_for_tool[self.current_tool_id], "") + diff = diff.encode('utf-8').decode( + 'unicode_escape') if diff is str else diff + diff = json.dumps( + diff, ensure_ascii=False + )[len(self.streamed_args_for_tool[self.current_tool_id]):] logger.debug( "Finishing tool and found diff that had not " "been streamed yet: %s", diff) @@ -307,22 +308,20 @@ def extract_tool_calls_streaming( # last case -- we have an update to existing arguments. elif cur_arguments and prev_arguments: + if isinstance(delta_text, str) and len(delta_text.rstrip( + )) >= 1 and delta_text.rstrip()[-1] == '}': + delta_text = delta_text.rstrip()[:-1] + + logger.debug("got diff %s", delta_text) - cur_args_json = json.dumps(cur_arguments) - prev_args_json = json.dumps(prev_arguments) - logger.debug("Searching for diff between\n%s", cur_args_json) - logger.debug("and\n%s", prev_args_json) - argument_diff = extract_intermediate_diff( - cur_args_json, prev_args_json) - logger.debug("got argument diff %s", argument_diff) delta = DeltaMessage(tool_calls=[ DeltaToolCall(index=self.current_tool_id, function=DeltaFunctionCall( - arguments=argument_diff).model_dump( + arguments=delta_text).model_dump( exclude_none=True)) ]) self.streamed_args_for_tool[self.current_tool_id] \ - += argument_diff + += delta_text # handle saving the state for the current tool into # the "prev" list for use in diffing for the next iteration From 11fd7ea639cf3c4fae29322d8e5c839ff6f8a1ca Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 19 Nov 2024 18:33:06 +0100 Subject: [PATCH 058/255] [Pixtral-Large] Pixtral actually has no bias in vision-lang adapter (#10449) --- vllm/model_executor/models/pixtral.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f7f46770057e2..d14b89d6b3f85 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -331,6 +331,7 @@ class VisionEncoderArgs: num_attention_heads: int rope_theta: float # for rope-2D image_token_id: int + adapter_bias: bool = True def _reshape_for_broadcast(freqs_cis: torch.Tensor, @@ -595,10 +596,10 @@ def __init__(self, args: VisionEncoderArgs, dim: int): self.w_in = nn.Linear( args.hidden_size, dim, - bias=True, + bias=args.adapter_bias, ) self.gelu = nn.GELU() - self.w_out = nn.Linear(dim, dim, bias=True) + self.w_out = nn.Linear(dim, dim, bias=args.adapter_bias) def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w_out(self.gelu(self.w_in(x))) From 1ea291a4173a82c537ab42487e23375be4926d30 Mon Sep 17 00:00:00 2001 From: Manjul Mohan <49657164+mikejuliet13@users.noreply.github.com> Date: Tue, 19 Nov 2024 23:04:57 +0530 Subject: [PATCH 059/255] Fix: Build error seen on Power Architecture (#10421) Signed-off-by: Manjul Mohan Signed-off-by: B-201 Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: youkaichao Signed-off-by: ismael-dm Signed-off-by: Andrew Nesbitt Signed-off-by: mgoin Signed-off-by: yan ma Signed-off-by: Angus Wang Signed-off-by: Lucas Wilkinson Signed-off-by: rickyx Signed-off-by: Jee Jee Li Signed-off-by: Mengqing Cao Signed-off-by: Travis Johnson Co-authored-by: Manjul Mohan manjul.mohan@ibm.com Co-authored-by: B-201 Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: youkaichao Co-authored-by: ismael-dm Co-authored-by: Andrew Nesbitt Co-authored-by: Michael Goin Co-authored-by: Yan Ma Co-authored-by: Angus Wang Co-authored-by: Lucas Wilkinson Co-authored-by: Ricky Xu Co-authored-by: Kevin H. Luu Co-authored-by: Jee Jee Li Co-authored-by: Mengqing Cao Co-authored-by: Travis Johnson Co-authored-by: Russell Bryant --- cmake/cpu_extension.cmake | 14 ++++++++++---- csrc/cpu/attention.cpp | 12 ++++++++++-- csrc/cpu/quant.cpp | 6 ++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 5912c5c02ede7..426189481575b 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -16,10 +16,16 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # # Check the compile flags # -list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-mf16c" - "-DVLLM_CPU_EXTENSION") +if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") +else() + list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-mf16c" + "-DVLLM_CPU_EXTENSION") +endif() execute_process(COMMAND cat /proc/cpuinfo RESULT_VARIABLE CPUINFO_RET diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e73eca1b345fd..e6c03dcb034fd 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -24,12 +24,20 @@ struct KernelVecType { template <> struct KernelVecType { +#ifdef __powerpc64__ + // Power architecture-specific vector types + using q_load_vec_type = vec_op::FP32Vec8; + using k_load_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures, including x86 using q_load_vec_type = vec_op::FP16Vec8; - using q_vec_type = vec_op::FP32Vec16; using k_load_vec_type = vec_op::FP16Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +#endif + using q_vec_type = vec_op::FP32Vec16; using k_vec_type = vec_op::FP32Vec16; using qk_acc_vec_type = vec_op::FP32Vec16; - using v_load_vec_type = vec_op::FP16Vec16; }; #ifdef __AVX512BF16__ diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index f42fa2361a2db..d9aed657a3113 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -25,7 +25,13 @@ struct KernelVecType { template <> struct KernelVecType { +#ifdef __powerpc64__ + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures using load_vec_type = vec_op::FP16Vec16; +#endif using azp_adj_load_vec_type = vec_op::INT32Vec16; using cvt_vec_type = vec_op::FP32Vec16; }; From fd9f124971c58376ca294091951dfcc96cc03474 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 19 Nov 2024 12:48:30 -0500 Subject: [PATCH 060/255] [Doc] fix link for page that was renamed (#10455) Signed-off-by: Russell Bryant --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b41c23704b7ff..936c2fe415375 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -105,7 +105,7 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: msg = ("vLLM model class should accept `vllm_config` and `prefix` as " "input arguments. Possibly you have an old-style model class" " registered from out of tree and it is used for new vLLM version. " - "Check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " + "Check https://docs.vllm.ai/en/latest/design/arch_overview.html " "for the design and update the model class accordingly.") logger.warning(msg) logger.warning( From 803f37eaaa11568f65acbf0bcd1044fb9b1610bf Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 19 Nov 2024 10:09:03 -0800 Subject: [PATCH 061/255] [6/N] torch.compile rollout to users (#10437) Signed-off-by: youkaichao --- .../piecewise_compilation_config.json | 5 -- tests/compile/piecewise/test_simple.py | 18 +++---- tests/compile/piecewise/test_toy_llama.py | 45 +++++++----------- tests/compile/test_basic_correctness.py | 13 +++-- tests/compile/utils.py | 4 +- .../model_executor/test_enabled_custom_ops.py | 4 +- tests/tpu/test_compilation.py | 47 ++++++++++++++----- tests/tpu/test_custom_dispatcher.py | 10 ++-- vllm/config.py | 43 ++++++++--------- vllm/engine/arg_utils.py | 29 +++++++++--- vllm/engine/llm_engine.py | 4 +- vllm/envs.py | 8 ---- vllm/platforms/tpu.py | 4 +- vllm/plugins/__init__.py | 14 +----- vllm/v1/worker/gpu_model_runner.py | 22 ++------- 15 files changed, 129 insertions(+), 141 deletions(-) delete mode 100644 tests/compile/piecewise/piecewise_compilation_config.json diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json deleted file mode 100644 index 798a34e8dd92d..0000000000000 --- a/tests/compile/piecewise/piecewise_compilation_config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "use_cudagraph": true, - "non_cudagraph_ops": ["silly.attention"], - "cudagraph_copy_inputs": true -} \ No newline at end of file diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 45f56cbbd4b16..0e40e3b4ebc96 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -2,7 +2,6 @@ Test the piecewise compilation with a simple model so that we can exactly calculate the expected output and side effects. """ -import os import torch from torch import nn @@ -11,7 +10,7 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import CompilationLevel, VllmConfig +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.plugins import set_current_vllm_config from vllm.utils import direct_register_custom_op @@ -77,12 +76,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_simple_piecewise_compile(): - directory = os.path.dirname(__file__) - config = os.path.join(directory, "piecewise_compilation_config.json") - os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) - - vllm_config = VllmConfig() + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + non_cudagraph_ops=["silly.attention"], + cudagraph_copy_inputs=True, + )) with set_current_vllm_config(vllm_config): model = SillyModel(vllm_config=vllm_config, prefix='') @@ -109,6 +108,3 @@ def test_simple_piecewise_compile(): output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) - - # clean up to avoid side effects for other tests - del os.environ["VLLM_TORCH_COMPILE_CONFIG"] diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 8032304e95806..356d119a40334 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -6,7 +6,6 @@ if the config `tractable_init` is set to True. Otherwise, the weights are initialized randomly with a fixed seed. """ -import os from dataclasses import dataclass from typing import Optional, Tuple @@ -18,7 +17,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import CompilationConfig, CompilationLevel, VllmConfig -from vllm.plugins import set_compilation_config, set_current_vllm_config +from vllm.plugins import set_current_vllm_config from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -254,23 +253,17 @@ def run_model(llama_config, split_attn: bool = False) -> torch.Tensor: if use_compile: - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str( - CompilationLevel.PIECEWISE) - + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + ) if split_attn: - set_compilation_config( - CompilationConfig( - use_cudagraph=True, - non_cudagraph_ops=["silly.attention"], - )) - else: - set_compilation_config(CompilationConfig(use_cudagraph=True, )) + compilation_config.non_cudagraph_ops = ["silly.attention"] else: - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str( - CompilationLevel.NO_COMPILATION) - set_compilation_config(None) + compilation_config = CompilationConfig( + level=CompilationLevel.NO_COMPILATION, ) - vllm_config = VllmConfig() + vllm_config = VllmConfig(compilation_config=compilation_config) with set_current_vllm_config(vllm_config): model = LlamaModel(config=llama_config, vllm_config=vllm_config, @@ -288,10 +281,6 @@ def run_model(llama_config, input_ids[:2].zero_() output = model(input_ids[:2], positions[:2]) - # manual cleanup - del os.environ["VLLM_TORCH_COMPILE_LEVEL"] - set_compilation_config(None) - output = output.cpu() if llama_config.tractable_init: @@ -361,7 +350,6 @@ def test_toy_llama(): @torch.inference_mode def benchmark(): - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) from triton.testing import do_bench # similar to llama 3.1-8B @@ -387,15 +375,16 @@ def benchmark(): for piecewise in [False, True]: if piecewise: - set_compilation_config( - CompilationConfig( - use_cudagraph=True, - non_cudagraph_ops=["silly.attention"], - )) + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + non_cudagraph_ops=["silly.attention"], + ) else: - set_compilation_config(None) + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, ) - vllm_config = VllmConfig() + vllm_config = VllmConfig(compilation_config=compilation_config) with set_current_vllm_config(vllm_config): model = LlamaModel(config=llama_config, vllm_config=vllm_config, diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 08747ebc58b75..c0db2e78824be 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -96,31 +96,36 @@ def test_compile_correctness(test_setting: TestSetting): final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ ["-tp", str(tp_size)] + all_args: List[List[str]] = [] all_envs: List[Optional[Dict[str, str]]] = [] for level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE, ]: - all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)}) + all_args.append(final_args + ["-O", str(level)]) + all_envs.append({}) # inductor will change the output, so we only compare if the output # is close, not exactly the same. compare_all_settings( - model, [final_args] * 2, + model, + all_args, all_envs, method=method if method != "generate" else "generate_close") all_envs.clear() + all_args.clear() for level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE, ]: - all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)}) + all_args.append(final_args + ["-O", str(level)]) + all_envs.append({}) if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: # "DYNAMO_ONCE" will always use fullgraph all_envs[-1][ "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore - compare_all_settings(model, [final_args] * 3, all_envs, method=method) + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 729f10676888b..078c6bf9ea1df 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -4,7 +4,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.config import CompilationLevel +from vllm.config import CompilationConfig, CompilationLevel from vllm.platforms import current_platform TEST_MODELS = [ @@ -65,7 +65,6 @@ def check_full_graph_support(model, optimization_level, tp_size=1): # make sure these models can be captured in full graph mode - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" # The base meta llama uses too much memory. @@ -86,6 +85,7 @@ def check_full_graph_support(model, enforce_eager=True, tensor_parallel_size=tp_size, disable_custom_all_reduce=True, + compilation_config=CompilationConfig(level=optimization_level), **model_kwargs) outputs = llm.generate(prompts, sampling_params) diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index c3219bc50646b..c54e30995da49 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -1,4 +1,3 @@ -import os from typing import List import pytest @@ -53,9 +52,8 @@ class Relu3(ReLUSquaredActivation): ]) def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], default_on: bool): - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level) vllm_config = VllmConfig(compilation_config=CompilationConfig( - custom_ops=env.split(","))) + level=torch_level, custom_ops=env.split(","))) with set_current_vllm_config(vllm_config): assert CustomOp.default_on() == default_on diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index 941abe17a3378..65bee85e7a1ea 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -1,24 +1,47 @@ import glob import os -import runpy import tempfile import depyf -from vllm.config import CompilationLevel - -# disable custom dispatcher, let Dynamo takes over -# all the control -os.environ['VLLM_TORCH_COMPILE_LEVEL'] = str(CompilationLevel.DYNAMO_AS_IS) +from vllm.config import CompilationConfig, CompilationLevel temp_dir = tempfile.mkdtemp() with depyf.prepare_debug(temp_dir): - cur_dir = os.path.dirname(__file__) - parent_dir = os.path.dirname(cur_dir) - root_dir = os.path.dirname(parent_dir) - example_file = os.path.join(root_dir, "examples", - "offline_inference_tpu.py") - runpy.run_path(example_file) + from vllm import LLM, SamplingParams + + prompts = [ + "A robot may not injure a human being", + "It is only with the heart that one can see rightly;", + "The greatest glory in living lies not in never falling,", + ] + answers = [ + " or, through inaction, allow a human being to come to harm.", + " what is essential is invisible to the eye.", + " but in rising every time we fall.", + ] + N = 1 + # Currently, top-p sampling is disabled. `top_p` should be 1.0. + sampling_params = SamplingParams(temperature=0.7, + top_p=1.0, + n=N, + max_tokens=16) + + # Set `enforce_eager=True` to avoid ahead-of-time compilation. + # In real workloads, `enforace_eager` should be `False`. + + # disable custom dispatcher, let Dynamo takes over + # all the control + llm = LLM(model="google/gemma-2b", + enforce_eager=True, + compilation_config=CompilationConfig( + level=CompilationLevel.DYNAMO_AS_IS)) + outputs = llm.generate(prompts, sampling_params) + for output, answer in zip(outputs, answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text.startswith(answer) compiled_code = sorted( glob.glob(os.path.join(temp_dir, "__transformed_code*.py"))) diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 53b10c06135a1..df348258efcba 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -13,7 +13,9 @@ def test_custom_dispatcher(): compare_two_settings( "google/gemma-2b", - arg1=["--enforce-eager"], - arg2=["--enforce-eager"], - env1={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_ONCE)}, - env2={"VLLM_TORCH_COMPILE_LEVEL": str(CompilationLevel.DYNAMO_AS_IS)}) + arg1=["--enforce-eager", "-O", + str(CompilationLevel.DYNAMO_ONCE)], + arg2=["--enforce-eager", "-O", + str(CompilationLevel.DYNAMO_AS_IS)], + env1={}, + env2={}) diff --git a/vllm/config.py b/vllm/config.py index ea9ec43cc5a15..e69cbd3eb402a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2174,8 +2174,14 @@ class CompilationConfig(BaseModel): enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr + @classmethod + def from_cli(cls, cli_value: str) -> "CompilationConfig": + """Parse the CLI value for the compilation config.""" + if cli_value in ["0", "1", "2", "3"]: + return cls(level=int(cli_value)) + return CompilationConfig.model_validate_json(cli_value) + def model_post_init(self, __context: Any) -> None: - self.level = envs.VLLM_TORCH_COMPILE_LEVEL count_none = self.custom_ops.count("none") count_all = self.custom_ops.count("all") @@ -2249,26 +2255,6 @@ def init_during_runtime(self): "inductor_specialize_for_cudagraph_no_more_than is None") self.compile_sizes = self.inductor_compile_sizes - @staticmethod - def select_and_init_config() -> "CompilationConfig": - """The order of selecting config is: - 1. Use the config specified in environment variable. - 2. Use the config specified in plugins. - 3. Use the default config. - """ - config_path = envs.VLLM_TORCH_COMPILE_CONFIG - if config_path is not None: - with open(config_path) as json_file: - config = CompilationConfig.model_validate_json( - json_file.read()) - else: - from vllm.plugins import get_compilation_config - predefined_config = get_compilation_config() - config = predefined_config if predefined_config is not None else ( - CompilationConfig()) - - return config - @dataclass class VllmConfig: @@ -2354,8 +2340,19 @@ def __post_init__(self): self.model_config, self.load_config) if self.compilation_config is None: - self.compilation_config = CompilationConfig.select_and_init_config( - ) + self.compilation_config = CompilationConfig() + if envs.VLLM_USE_V1: + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + self.compilation_config.custom_ops = ["none"] + self.compilation_config.use_cudagraph = True + self.compilation_config.non_cudagraph_ops = [ + "vllm.unified_v1_flash_attention" + ] + self.compilation_config.use_inductor = True + self.compilation_config.enable_fusion = False current_platform.check_and_update_config(self) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ee4b6addfd466..a3ae1889774f3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -8,12 +8,13 @@ import torch import vllm.envs as envs -from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, - DeviceConfig, HfOverrides, LoadConfig, LoadFormat, - LoRAConfig, ModelConfig, ObservabilityConfig, - ParallelConfig, PoolerConfig, PromptAdapterConfig, - SchedulerConfig, SpeculativeConfig, TaskOption, - TokenizerPoolConfig, VllmConfig) +from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, + DecodingConfig, DeviceConfig, HfOverrides, LoadConfig, + LoadFormat, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, PoolerConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig, TaskOption, TokenizerPoolConfig, + VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -189,6 +190,7 @@ class EngineArgs: override_neuron_config: Optional[Dict[str, Any]] = None override_pooler_config: Optional[PoolerConfig] = None + compilation_config: Optional[CompilationConfig] = None def __post_init__(self): if not self.tokenizer: @@ -868,6 +870,20 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="Override or set the pooling method in the embedding model. " "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'") + parser.add_argument('--compilation-config', + '-O', + type=CompilationConfig.from_cli, + default=None, + help='torch.compile configuration for the model.' + 'When it is a number (0, 1, 2, 3), it will be ' + 'interpreted as the optimization level.\n' + 'NOTE: level 0 is the default level without ' + 'any optimization. level 1 and 2 are for internal ' + 'testing only. level 3 is the recommended level ' + 'for production.\n' + 'To specify the full compilation config, ' + 'use a JSON string.') + return parser @classmethod @@ -1142,6 +1158,7 @@ def create_engine_config(self) -> VllmConfig: decoding_config=decoding_config, observability_config=observability_config, prompt_adapter_config=prompt_adapter_config, + compilation_config=self.compilation_config, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index e72dc81f35b67..2a5eaf1340762 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -262,7 +262,8 @@ def __init__( "num_scheduler_steps=%d, chunked_prefill_enabled=%s " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, " - "mm_processor_kwargs=%s, pooler_config=%r)", + "mm_processor_kwargs=%s, pooler_config=%r," + "compilation_config=%r", VLLM_VERSION, model_config.model, speculative_config, @@ -297,6 +298,7 @@ def __init__( use_cached_outputs, model_config.mm_processor_kwargs, model_config.pooler_config, + vllm_config.compilation_config, ) # TODO(woosuk): Print more configs in debug mode. self.model_config = model_config diff --git a/vllm/envs.py b/vllm/envs.py index 716e835a555f1..853c49bc4dbc1 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -67,8 +67,6 @@ VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False - VLLM_TORCH_COMPILE_LEVEL: int = 0 - VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ENABLE_V1_MULTIPROCESSING: bool = False @@ -209,12 +207,6 @@ def get_default_config_root(): "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), - "VLLM_TORCH_COMPILE_LEVEL": - lambda: int(os.environ.get("VLLM_TORCH_COMPILE_LEVEL", "0")), - - # Path to the config file for torch compile - "VLLM_TORCH_COMPILE_CONFIG": - lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None), # local rank of the process in the distributed setting, used to determine # the GPU device id diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 9057afb6514e4..2a7ca9fb8c576 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,4 +1,3 @@ -import os from typing import TYPE_CHECKING import torch @@ -40,7 +39,8 @@ def inference_mode(cls): def check_and_update_config(cls, vllm_config: VllmConfig) -> None: from vllm.config import CompilationLevel compilation_config = vllm_config.compilation_config - if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: + if compilation_config.level == CompilationLevel.NO_COMPILATION: + # TPU does not support NO_COMPILATION compilation_config.level = CompilationLevel.DYNAMO_ONCE assert compilation_config.level < CompilationLevel.PIECEWISE,\ "TPU does not support Inductor." diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 05a9739d99e71..dc183dbfc9b96 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -5,7 +5,7 @@ import vllm.envs as envs if TYPE_CHECKING: - from vllm.config import CompilationConfig, VllmConfig + from vllm.config import VllmConfig logger = logging.getLogger(__name__) @@ -54,18 +54,6 @@ def load_general_plugins(): logger.exception("Failed to load plugin %s", plugin.name) -_compilation_config: Optional["CompilationConfig"] = None - - -def set_compilation_config(config: Optional["CompilationConfig"]): - global _compilation_config - _compilation_config = config - - -def get_compilation_config() -> Optional["CompilationConfig"]: - return _compilation_config - - _current_vllm_config: Optional["VllmConfig"] = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d60f93a44f6dd..1f9b544637bf7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,13 +8,12 @@ import torch.nn as nn from vllm.compilation.compile_context import set_compile_context -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.config import CompilationLevel, VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalKwargs -from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, is_pin_memory_available) @@ -508,20 +507,6 @@ def execute_model( return model_runner_output def load_model(self) -> None: - if self.use_cuda_graph: - # NOTE(woosuk): Currently, we use inductor because the piecewise - # CUDA graphs do not work properly with the custom CUDA kernels. - # FIXME(woosuk): Disable inductor to reduce the compilation time - # and avoid any potential issues with the inductor. - set_compilation_config( - CompilationConfig( - custom_ops=["none"], - use_cudagraph=True, - non_cudagraph_ops=["vllm.unified_v1_flash_attention"], - use_inductor=True, - enable_fusion=False, - )) - logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 self.model = get_model(vllm_config=self.vllm_config) @@ -562,9 +547,8 @@ def profile_run(self) -> None: def capture_model(self) -> None: if not self.use_cuda_graph: logger.warning( - "Skipping CUDA graph capture. Please set " - "VLLM_TORCH_COMPILE_LEVEL=%d to use CUDA graphs.", - CompilationLevel.PIECEWISE) + "Skipping CUDA graph capture. Please add " + "-O 3 to use CUDA graphs.", CompilationLevel.PIECEWISE) return start_time = time.perf_counter() From efa9084628b32787ae1901a2d1e9b80f7d08809b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 19 Nov 2024 16:05:25 -0500 Subject: [PATCH 062/255] [Core] Avoid metrics log noise when idle (#8868) Signed-off-by: Russell Bryant --- vllm/engine/metrics.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 47472c274ccb6..5bfd6a9f4b386 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -421,6 +421,11 @@ def get_throughput(tracked_stats: List[int], now: float, class LoggingStatLogger(StatLoggerBase): """LoggingStatLogger is used in LLMEngine to log to Stdout.""" + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.last_prompt_throughput: Optional[float] = None + self.last_generation_throughput: Optional[float] = None + def log(self, stats: Stats) -> None: """Called by LLMEngine. Logs to Stdout every self.local_interval seconds.""" @@ -445,8 +450,14 @@ def log(self, stats: Stats) -> None: now=stats.now, last_log=self.last_local_log) - # Log to stdout. - logger.info( + log_fn = logger.info + if not any((prompt_throughput, generation_throughput, + self.last_prompt_throughput, + self.last_generation_throughput)): + # Avoid log noise on an idle production system + log_fn = logger.debug + + log_fn( "Avg prompt throughput: %.1f tokens/s, " "Avg generation throughput: %.1f tokens/s, " "Running: %d reqs, Swapped: %d reqs, " @@ -472,11 +483,16 @@ def log(self, stats: Stats) -> None: self._format_spec_decode_metrics_str( self.spec_decode_metrics)) - # Reset tracked stats for next interval. - self.num_prompt_tokens = [] - self.num_generation_tokens = [] - self.last_local_log = stats.now - self.spec_decode_metrics = None + self._reset(stats, prompt_throughput, generation_throughput) + + def _reset(self, stats, prompt_throughput, generation_throughput) -> None: + # Reset tracked stats for next interval. + self.num_prompt_tokens = [] + self.num_generation_tokens = [] + self.last_local_log = stats.now + self.spec_decode_metrics = None + self.last_prompt_throughput = prompt_throughput + self.last_generation_throughput = generation_throughput def _format_spec_decode_metrics_str( self, metrics: "SpecDecodeWorkerMetrics") -> str: From b00b33d77e33c5516e73de663539dff96e8b61a4 Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Tue, 19 Nov 2024 22:31:12 +0100 Subject: [PATCH 063/255] [Model][Quantization] HQQ support through Marlin kernel expansion (#9766) Signed-off-by: ElizaWszola --- benchmarks/kernels/benchmark_machete.py | 3 +- benchmarks/kernels/benchmark_marlin.py | 4 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 277 ++++++++++----- csrc/torch_bindings.cpp | 2 +- tests/kernels/test_marlin_gemm.py | 88 ++++- tests/weight_loading/models.txt | 3 +- vllm/_custom_ops.py | 8 +- vllm/model_executor/layers/linear.py | 3 +- .../layers/quantization/__init__.py | 2 + .../layers/quantization/hqq_marlin.py | 325 ++++++++++++++++++ .../layers/quantization/utils/marlin_utils.py | 6 +- 11 files changed, 632 insertions(+), 89 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/hqq_marlin.py diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index a0342d08f1db8..46bab74ae8adf 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -210,7 +210,8 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: size_m=bt.a.shape[0], size_n=bt.w_ref.shape[1], size_k=bt.w_ref.shape[0], - is_k_full=True) + is_k_full=True, + is_zp_float=False) else: assert bt.a.dtype == torch.int8 assert bt.wtype == scalar_types.uint4b8 diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index 536c133bb3341..8fb44e3a3dbd8 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 6dbf9594e8492..0c698ced7713d 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -54,9 +54,10 @@ template shared // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? > __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk @@ -82,7 +83,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& workspace, vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp) { + bool is_k_full, bool has_zp, bool is_zp_float) { TORCH_CHECK_NOT_IMPLEMENTED(false, "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); return torch::empty({1, 1}); @@ -516,10 +517,11 @@ template shared // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const bool has_zp, // whether zero-points are enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks = -1, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? > __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk @@ -692,8 +694,10 @@ __global__ void Marlin( int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; // Zero-points sizes/strides - int zp_gl_stride = (prob_n / pack_factor) / 4; - constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4; + int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = is_zp_float + ? 16 * thread_n_blocks / 8 + : ((16 * thread_n_blocks) / pack_factor) / 4; constexpr int zp_tb_groups = s_tb_groups; constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; int zp_gl_rd_delta = zp_gl_stride; @@ -768,9 +772,16 @@ __global__ void Marlin( constexpr int num_ints_per_thread = 8 / pack_factor; int zp_sh_rd; if constexpr (has_zp) { - zp_sh_rd = num_ints_per_thread * num_col_threads * - ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + if constexpr (is_zp_float) { + if constexpr (group_blocks != -1) { + zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + } + } else { + zp_sh_rd = num_ints_per_thread * num_col_threads * + ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } } // Precompute which thread should not read memory in which iterations; this is @@ -832,6 +843,7 @@ __global__ void Marlin( FragS act_frag_s[2][4][4]; // For act-order int frag_qzp[2][num_ints_per_thread]; // Zero-points FragZP frag_zp; // Zero-points in fp16 + FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ // Zero accumulators. auto zero_accums = [&]() { @@ -1126,7 +1138,7 @@ __global__ void Marlin( // has_zp implies AWQ, which doesn't have act_order, static_assert(!has_zp || group_blocks != 0); - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { int pipe = full_pipe % stages; if constexpr (group_blocks == -1) { @@ -1170,11 +1182,44 @@ __global__ void Marlin( } } } + + else if constexpr (has_zp && is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + int cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + reinterpret_cast(&frag_zpf[k % 2])[0] = + sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride]; + } + } + } }; // Execute the actual tensor core matmul of a sub-tile. auto matmul = [&](int k) { - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { FragB frag_zp_0; FragB frag_zp_1; int zp_quant_0, zp_quant_1; @@ -1219,10 +1264,14 @@ __global__ void Marlin( frag_b1 = dequant(b_quant_1); // Apply zero-point to frag_b0 - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { sub_zp(frag_b0, frag_zp[j], 0); } + else if constexpr (has_zp && is_zp_float && group_blocks != -1) { + sub_zp(frag_b0, frag_zpf[k % 2][j], 0); + } + // Apply scale to frag_b0 if constexpr (has_act_order) { scale4(frag_b0, act_frag_s[k % 2][0][j], @@ -1235,10 +1284,14 @@ __global__ void Marlin( } // Apply zero-point to frag_b1 - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { sub_zp(frag_b1, frag_zp[j], 1); } + else if constexpr (has_zp && is_zp_float && group_blocks != -1) { + sub_zp(frag_b1, frag_zpf[k % 2][j], 1); + } + // Apply scale to frag_b1 if constexpr (has_act_order) { scale4(frag_b1, act_frag_s[k % 2][0][j], @@ -1510,7 +1563,7 @@ __global__ void Marlin( fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); } - if constexpr (has_zp && group_blocks == -1) { + if constexpr (has_zp && !is_zp_float && group_blocks == -1) { if (i == 0) { fetch_zp_to_shared(); } @@ -1697,23 +1750,27 @@ __global__ void Marlin( } #define __CALL_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, NUM_THREADS) \ + HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, NUM_THREADS, \ + IS_ZP_FLOAT) \ else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ thread_n_blocks == THREAD_N_BLOCKS && \ thread_k_blocks == THREAD_K_BLOCKS && \ has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute( \ - Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ - num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + if constexpr (!IS_ZP_FLOAT || std::is_same::value) { \ + cudaFuncSetAttribute( \ + Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + Marlin \ + <<>>( \ + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ + num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ + } \ } typedef struct { @@ -1905,51 +1962,96 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, } #define GPTQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) #define AWQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, false) + + // We currently have 4-bit models only with group_blocks == 4 + #define HQQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + true) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + true) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + true) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, true) template void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, @@ -1958,7 +2060,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, vllm::ScalarType const& q_type, bool has_act_order, bool is_k_full, bool has_zp, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, - int sms, int max_par, bool use_fp32_reduce) { + int sms, int max_par, bool use_fp32_reduce, bool is_zp_float) { if (has_zp) { TORCH_CHECK( q_type == vllm::kU4 || q_type == vllm::kU8, @@ -2111,6 +2213,11 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, AWQ_CALL_IF(vllm::kU8, 8, 8, 256) AWQ_CALL_IF(vllm::kU8, 8, 4, 128) AWQ_CALL_IF(vllm::kU8, 4, 8, 128) + + HQQ_CALL_IF(vllm::kU4, 16, 4, 256) + HQQ_CALL_IF(vllm::kU4, 8, 8, 256) + HQQ_CALL_IF(vllm::kU4, 8, 4, 128) + HQQ_CALL_IF(vllm::kU4, 4, 8, 128) else { TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]", ", has_act_order = ", has_act_order, @@ -2135,7 +2242,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, bool has_zp, - bool use_fp32_reduce) { + bool use_fp32_reduce, bool is_zp_float) { vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); if (has_zp) { TORCH_CHECK( @@ -2148,6 +2255,12 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, b_q_type.str()); } + if (has_zp && is_zp_float) { + TORCH_CHECK(a.scalar_type() == at::ScalarType::Half, + "Computation type must be float16 (half) when using float zero " + "points."); + } + int pack_factor = 32 / b_q_type.size_bits(); // Verify A @@ -2257,12 +2370,22 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, if (has_zp) { int rank = b_zeros.sizes().size(); TORCH_CHECK(rank == 2, "b_zeros rank = ", rank, " is not 2"); - TORCH_CHECK(b_zeros.size(0) == num_groups, - "b_zeros dim 0 = ", b_zeros.size(0), - " is not num_groups = ", num_groups); - TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor, - "b_zeros dim 1 = ", b_zeros.size(1), - " is not size_n / pack_factor = ", size_n / pack_factor); + if (is_zp_float) { + TORCH_CHECK(b_zeros.size(1) == size_n, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not size_n = ", size_n); + TORCH_CHECK(num_groups == b_zeros.size(0), + "b_zeros dim 0 = ", b_zeros.size(0), + " is not num_groups = ", num_groups); + TORCH_CHECK(num_groups != -1, "num_groups must be != -1"); + } else { + TORCH_CHECK(b_zeros.size(0) == num_groups, + "b_zeros dim 0 = ", b_zeros.size(0), + " is not num_groups = ", num_groups); + TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not size_n / pack_factor = ", size_n / pack_factor); + } } // Verify workspace size @@ -2282,7 +2405,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce, is_zp_float); } else if (a.scalar_type() == at::ScalarType::BFloat16) { marlin::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), @@ -2291,7 +2414,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce, is_zp_float); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index e4cc7ec951848..3dccdf61abf3b 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -244,7 +244,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, " "int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " - "bool has_zp, bool use_fp32_reduce) -> Tensor"); + "bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); // conditionally compiled so impl registration is in source file // gptq_marlin repack from GPTQ. diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index b6dd68cc51a9f..3899ad1a325cf 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -29,6 +29,7 @@ marlin_qqq_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) +from vllm.scalar_type import scalar_types ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] @@ -40,6 +41,8 @@ MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] +HQQ_SUPPORTED_GROUP_SIZES = [64] + MNK_FACTORS = [ (1, 1, 1), (1, 4, 8), @@ -226,7 +229,7 @@ def test_gptq_marlin_gemm( torch.ops._C.gptq_marlin_gemm, (a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices, workspace.scratch, quant_type.id, a_input.shape[0], b_weight.shape[1], - a_input.shape[1], is_k_full, False, use_fp32_reduce), + a_input.shape[1], is_k_full, False, use_fp32_reduce, False), test_utils=DEFAULT_OPCHECK_TEST_UTILS) output = ops.gptq_marlin_gemm( @@ -244,6 +247,7 @@ def test_gptq_marlin_gemm( is_k_full=is_k_full, has_zp=False, use_fp32_reduce=use_fp32_reduce, + is_zp_float=False, ) output_ref = torch.matmul(a_input, w_ref) @@ -441,6 +445,7 @@ def test_awq_marlin_gemm( is_k_full=is_k_full, has_zp=has_zp, use_fp32_reduce=use_fp32_reduce, + is_zp_float=False, ) output_ref = torch.matmul(a_input, w_ref) @@ -451,6 +456,87 @@ def test_awq_marlin_gemm( assert max_diff < 0.04 +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.") +@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) +@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) +@pytest.mark.parametrize("group_size", HQQ_SUPPORTED_GROUP_SIZES) +@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) +def test_hqq_marlin_gemm( + k_chunk, + n_chunk, + group_size, + mnk_factors, + use_fp32_reduce, +): + m_factor, n_factor, k_factor = mnk_factors + + size_m = m_factor + size_k = k_chunk * k_factor + size_n = n_chunk * n_factor + + quant_type = scalar_types.uint4 + + a_input = rand_data((size_m, size_k)) + dev = a_input.device + + b_weight = torch.randint(0, + 10, (size_n, size_k), + dtype=torch.uint8, + device=dev) + scale = rand_data((size_n, size_k // group_size)) + zero = rand_data((size_n, size_k // group_size)) + + gptq_w_q = gptq_pack(b_weight.transpose(1, 0), 4, size_k, size_n) + + sort_indices = torch.empty(0, dtype=torch.int, device=dev) + marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n, + 4).to(dev) + marlin_s = marlin_permute_scales(scale.transpose(1, 0), size_k, size_n, + group_size).to(dev) + marlin_zp = marlin_permute_scales(zero.transpose(1, 0), size_k, size_n, + group_size).to(dev) + + g_idx = marlin_make_empty_g_idx(dev) + g_idx_sort_indices = marlin_make_empty_g_idx(dev) + + workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + output = ops.gptq_marlin_gemm( + a_input, + marlin_w_q, + marlin_s, + marlin_zp, + g_idx, + g_idx_sort_indices, + workspace.scratch, + quant_type, + a_input.shape[0], + b_weight.shape[0], + a_input.shape[1], + is_k_full=True, + has_zp=True, + use_fp32_reduce=use_fp32_reduce, + is_zp_float=True, + ) + + b_flat = b_weight.reshape(-1, group_size) + zp_flat = zero.reshape(-1, 1) + s_flat = scale.reshape(-1, 1) + dequant = (b_flat - zp_flat) * s_flat + + output_ref = torch.matmul(a_input, + dequant.reshape(b_weight.shape).transpose(1, 0)) + + torch.cuda.synchronize() + + max_diff = compute_max_diff(output, output_ref) + + assert max_diff < 0.04 + + @pytest.mark.skipif(not is_quant_method_supported("qqq"), reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index a4ee9538d646b..2afffb5b9d1c8 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -27,4 +27,5 @@ fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main marlin, nm-testing/zephyr-beta-7b-marlin-g128, main marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main qqq, HandH1998/QQQ-Llama-3-8b-g128, main -qqq, HandH1998/QQQ-Llama-3-8b, main \ No newline at end of file +qqq, HandH1998/QQQ-Llama-3-8b, main +hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index aa89010ca8ecd..782dc6aed1b8c 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -343,7 +343,8 @@ def _gptq_marlin_gemm_fake(a: torch.Tensor, size_k: torch.SymInt, is_k_full: bool, has_zp: bool = False, - use_fp32_reduce: bool = False) -> torch.Tensor: + use_fp32_reduce: bool = False, + is_zp_float: bool = False) -> torch.Tensor: return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) @register_fake("_C::ggml_dequantize") @@ -601,11 +602,12 @@ def gptq_marlin_gemm(a: torch.Tensor, size_k: int, is_k_full: bool, has_zp: bool = False, - use_fp32_reduce: bool = False) -> torch.Tensor: + use_fp32_reduce: bool = False, + is_zp_float: bool = False) -> torch.Tensor: return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros, g_idx, perm, workspace, b_q_type.id, size_m, size_n, size_k, is_k_full, - has_zp, use_fp32_reduce) + has_zp, use_fp32_reduce, is_zp_float) # fp8 marlin diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 9da38d4857d6d..2471c160d66b7 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -27,7 +27,8 @@ "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", - "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod" + "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", + "HQQMarlinMethod" ] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index da841d052d728..ff342c4f9479e 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -21,6 +21,7 @@ GPTQMarlinConfig) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) +from vllm.model_executor.layers.quantization.hqq_marlin import HQQMarlinConfig from vllm.model_executor.layers.quantization.ipex_quant import IPEXConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config @@ -48,6 +49,7 @@ "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, + "hqq": HQQMarlinConfig, "experts_int8": ExpertsInt8Config, "neuron_quant": NeuronQuantConfig, "ipex": IPEXConfig, diff --git a/vllm/model_executor/layers/quantization/hqq_marlin.py b/vllm/model_executor/layers/quantization/hqq_marlin.py new file mode 100644 index 0000000000000..28538d2993355 --- /dev/null +++ b/vllm/model_executor/layers/quantization/hqq_marlin.py @@ -0,0 +1,325 @@ +from typing import Any, Dict, List, Optional + +import torch + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, QuantizeMethodBase) +from vllm.model_executor.layers.quantization.utils.marlin_utils import ( + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, + marlin_make_empty_g_idx, marlin_permute_scales) +from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( + MarlinWorkspace) +from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack +from vllm.model_executor.parameter import (BasevLLMParameter, + GroupQuantScaleParameter, + PackedvLLMParameter) +from vllm.scalar_type import scalar_types + +logger = init_logger(__name__) + + +class HQQMarlinConfig(QuantizationConfig): + """Config class for HQQ Marlin""" + + def __init__( + self, + weight_bits: int, + group_size: int, + skip_modules: Optional[List[str]] = None, + ) -> None: + assert group_size == 64, ("The only supported HQQ group size is " + "currently 64.") + assert weight_bits == 4, ("The only supported HQQ quantization " + "bitsize is currently 4.") + + self.weight_bits = weight_bits + self.group_size = group_size + self.pack_factor = 32 // weight_bits # packed into int32 in GPTQ format + self.quant_type = scalar_types.uint4 + self.skip_modules = skip_modules + + def __repr__(self) -> str: + return (f"HQQMarlinConfig(quant_type={self.quant_type}, " + f"group_size={self.group_size})") + + @classmethod + def get_name(cls) -> str: + return "hqq" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.half, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + return 80 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return ["quantize_config.json"] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "HQQMarlinConfig": + wq_params = (config["quant_config"]["weight_quant_params"]) + weight_bits = cls.get_from_keys(wq_params, ["nbits"]) + group_size = cls.get_from_keys(wq_params, ["group_size"]) + skip_modules = config["skip_modules"] + return cls(weight_bits, group_size, skip_modules) + + def is_layer_skipped(self, prefix: str) -> bool: + # Split the prefix into its dot-separated components + components = prefix.split('.') + + # Check if any of the skip modules exactly matches any component + return self.skip_modules is not None and any( + module_name in components for module_name in self.skip_modules) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["QuantizeMethodBase"]: + if isinstance(layer, LinearBase): + if self.is_layer_skipped(prefix): + return UnquantizedLinearMethod() + return HQQMarlinMethod(self) + return None + + +# Empty HQQ parameter, will be ignored during loading +class HQQEmptyParameter(BasevLLMParameter): + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + pass + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + pass + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + pass + + +def error_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None: + raise ValueError("No loader provided for HQQ parameter!") + + +# HQQ packing creates issues with sharding - therefore, prior to loading, we +# repack to GPTQ. We also reshape the weights to their proper GPTQ shape. +class HQQweightParameter(PackedvLLMParameter): + + # unpack function from https://github.com/mobiusml/hqq + def unpack_4bit_u8(self, + W_q: torch.Tensor) -> torch.Tensor: # uint8/2 > uint8 + assert self.weight_bits == 4, "Unsupported quant bitsize (must be 4)" + + dtype = torch.uint8 + step = W_q.shape[0] + tmp = torch.empty([2 * step, W_q.shape[1]], + dtype=dtype, + device=W_q.device) + tmp[:step] = (W_q & 0b11110000) >> 4 + tmp[step:] = W_q & 0b00001111 + return tmp + + def __init__(self, packed_factor: int, packed_dim: int, weight_bits: int, + **kwargs): + super().__init__(packed_factor, packed_dim, None, **kwargs) + self.weight_bits = weight_bits + self.input_shape = self.shape[self.input_dim] * self.packed_factor + self.output_shape = self.shape[self.output_dim] + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + loaded_weight = self.unpack_4bit_u8(loaded_weight) + loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose( + 1, 0) + loaded_weight = gptq_pack(loaded_weight, self.weight_bits, + loaded_weight.shape[0], + loaded_weight.shape[1]) + super().load_merged_column_weight(loaded_weight, **kwargs) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + loaded_weight = self.unpack_4bit_u8(loaded_weight) + loaded_weight = loaded_weight.reshape(self.output_shape, + -1).transpose(1, 0) + loaded_weight = gptq_pack(loaded_weight, self.weight_bits, + loaded_weight.shape[0], + loaded_weight.shape[1]) + super().load_row_parallel_weight(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + loaded_weight = self.unpack_4bit_u8(loaded_weight) + loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose( + 1, 0) + loaded_weight = gptq_pack(loaded_weight, self.weight_bits, + loaded_weight.shape[0], + loaded_weight.shape[1]) + super().load_qkv_weight(loaded_weight, **kwargs) + + +# Zero points and scales in HQQ must also be reshaped to correspond to W_q's +# GPTQ shape (transposed - we transpose them too when processing weights). +class HQQZeroScaleParameter(GroupQuantScaleParameter): + + def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs): + loaded_weight = loaded_weight.reshape(-1, self.shape[1]) + super().load_merged_column_weight(loaded_weight, **kwargs) + + def load_row_parallel_weight(self, loaded_weight: torch.Tensor): + loaded_weight = loaded_weight.reshape(self.shape[0], -1) + super().load_row_parallel_weight(loaded_weight) + + def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs): + loaded_weight = loaded_weight.reshape(-1, self.shape[1]) + super().load_qkv_weight(loaded_weight, **kwargs) + + +class HQQMarlinMethod(LinearMethodBase): + """Linear method for HQQ Marlin. + """ + + def __init__( + self, + quant_config: HQQMarlinConfig, + ): + self.quant_config = quant_config + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ) -> None: + self.output_size_per_partition = sum(output_partition_sizes) + self.input_size_per_partition = input_size_per_partition + + weight_loader = extra_weight_attrs.get("weight_loader", error_loader) + + self.scales_and_zp_size = (input_size_per_partition // + self.quant_config.group_size) + + qweight = HQQweightParameter( + data=torch.empty( + self.input_size_per_partition // self.quant_config.pack_factor, + self.output_size_per_partition, + dtype=torch.int32, + ), + input_dim=0, + output_dim=1, + packed_dim=0, + packed_factor=self.quant_config.pack_factor, + weight_bits=self.quant_config.weight_bits, + weight_loader=weight_loader) + + zeros = HQQZeroScaleParameter(data=torch.empty( + self.output_size_per_partition, + self.scales_and_zp_size, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + scales = HQQZeroScaleParameter(data=torch.empty( + self.output_size_per_partition, + self.scales_and_zp_size, + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) + + layer.register_parameter("W_q", qweight) + layer.register_parameter("zero", zeros) + layer.register_parameter("scale", scales) + + # Ignore extra parameters in the HQQ model. + # To be added as needed. + ignore_parameters = ("axis", "channel_wise", "compute_dtype", + "encoded_state_dict", "group_size", "nbits", + "offload_meta", "optimize", "packing", + "quant_scale", "quant_zero", "round_zero", + "shape", "stores_quant_config", + "unpack_view_dtype", "view_as_float") + for name in ignore_parameters: + layer.register_parameter( + name, + HQQEmptyParameter(data=torch.empty(0), + weight_loader=weight_loader)) + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + dev = layer.W_q.device + + # Repack to Marlin + sort_indices = torch.empty(0, dtype=torch.int, device=dev) + marlin_w_q = ops.gptq_marlin_repack( + layer.W_q, + sort_indices, + self.input_size_per_partition, + self.output_size_per_partition, + self.quant_config.weight_bits, + ).to(dev) + marlin_s = marlin_permute_scales(layer.scale.transpose(1, 0), + self.input_size_per_partition, + self.output_size_per_partition, + self.quant_config.group_size).to(dev) + marlin_zp = marlin_permute_scales(layer.zero.transpose(1, 0), + self.input_size_per_partition, + self.output_size_per_partition, + self.quant_config.group_size).to(dev) + + layer.g_idx = marlin_make_empty_g_idx(dev) + layer.g_idx_sort_indices = marlin_make_empty_g_idx(dev) + + layer.marlin_qweight = marlin_w_q + layer.marlin_zeros = marlin_zp + layer.marlin_scales = marlin_s + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + workspace = MarlinWorkspace(self.output_size_per_partition, + GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + scales = layer.marlin_scales + zeros = layer.marlin_zeros + orig_type = x.dtype + + if orig_type != torch.float16: + x = x.to(torch.float16) + scales = scales.to(torch.float16) + zeros = zeros.to(torch.float16) + + marlin_out = ops.gptq_marlin_gemm( + x, + layer.marlin_qweight, + scales, + zeros, + layer.g_idx, + layer.g_idx_sort_indices, + workspace.scratch, + scalar_types.uint4, + x.shape[0], + self.output_size_per_partition, + self.input_size_per_partition, + True, # is_k_full + True, # has_zp + True, # use 32-bit reduce + True, # use float zp + ) + + if orig_type != torch.float16: + marlin_out = marlin_out.to(orig_type) + + if bias is not None: + marlin_out.add_(bias) + + return marlin_out diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py index 9a1defa409714..c9366ca97d149 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py @@ -303,7 +303,8 @@ def apply_gptq_marlin_linear( size_k=input_size_per_partition, is_k_full=is_k_full, has_zp=False, - use_fp32_reduce=use_fp32_reduce) + use_fp32_reduce=use_fp32_reduce, + is_zp_float=False) if bias is not None: output.add_(bias) # In-place add @@ -340,7 +341,8 @@ def apply_awq_marlin_linear( size_k=input_size_per_partition, is_k_full=True, has_zp=True, - use_fp32_reduce=use_fp32_reduce) + use_fp32_reduce=use_fp32_reduce, + is_zp_float=False) if bias is not None: output.add_(bias) # In-place add From a324d3a1a74ab0a3fafc0f2d19860bd1d1301a85 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Tue, 19 Nov 2024 22:16:54 -0300 Subject: [PATCH 064/255] Change granite chat template to keep json list formatting for tool calls (#10452) Signed-off-by: Max de Bayser --- examples/tool_chat_template_granite.jinja | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/tool_chat_template_granite.jinja b/examples/tool_chat_template_granite.jinja index 2cc19e77188dc..467dcb2d10237 100644 --- a/examples/tool_chat_template_granite.jinja +++ b/examples/tool_chat_template_granite.jinja @@ -21,11 +21,7 @@ {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|> ' }} {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %} - {{- '<|start_of_role|>assistant<|end_of_role|>' }} - {% for tc in message.tool_calls %} - {{- '<|tool_call|> ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} - {% endfor %} - {{- '<|end_of_text|> + {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message.tool_calls|map(attribute='function')|list|tojson(indent=4) + '<|end_of_text|> ' }} {%- elif message['role'] == 'assistant' %} {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|> From d5b68aba2ff6dd17060a62c0cb799c0acedb524f Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Tue, 19 Nov 2024 19:19:59 -0600 Subject: [PATCH 065/255] [CI/Build] Update Dockerfile.rocm (#10434) Signed-off-by: Alexei V. Ivanov --- Dockerfile.rocm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 8fb79afaebe97..62d4a9b4909c3 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -51,9 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ *"rocm-6.2"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ - torch==2.6.0.dev20240918 \ + torch==2.6.0.dev20241113+rocm6.2 \ 'setuptools-scm>=8' \ - torchvision==0.20.0.dev20240918 \ + torchvision==0.20.0.dev20241113+rocm6.2 \ --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ *) ;; esac From d200972e7f4969da50f533b46c856c5ff5a9d27d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 19 Nov 2024 22:40:33 -0500 Subject: [PATCH 066/255] [Bugfix] Marlin 2:4 temp fix for large M dim (>256) (#10464) Signed-off-by: Lucas Wilkinson --- .../marlin/sparse/marlin_24_cuda_kernel.cu | 15 +++++++++++---- tests/kernels/test_marlin_gemm.py | 2 ++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index a33e2660d760e..8fce76eb52f9b 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -910,13 +910,16 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, // than better compute utilization thread_k = 128; thread_m = 128; - } else if (prob_n <= 256) { + } else { thread_k = 64; thread_m = 256; - } else { - thread_k = 32; - thread_m = 512; } + // Also had + // if prob_n > 256 + // thread_k = 32; + // thread_m = 512; + // but this is broken, + // TODO(Lucas, Alex M): figure out why } int thread_k_blocks = thread_k / 32; // 2:4 version with m16n8k32 instruction @@ -1079,6 +1082,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, // Verify A device and strides TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + TORCH_CHECK(a.dtype() == torch::kFloat16, + "A is not float16, currently only float16 is supported"); // Verify B device and strides TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); @@ -1091,6 +1096,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, // Verify scales device and strides TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + TORCH_CHECK(b_scales.dtype() == torch::kFloat16, + "A is not float16, currently only float16 is supported"); // Alloc C matrix const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 3899ad1a325cf..5e047f4b099f1 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -50,6 +50,8 @@ (13, 17, 67), (26, 37, 13), (67, 13, 11), + (257, 13, 11), + (658, 13, 11), ] DTYPES = [torch.float16, torch.bfloat16] From 9e05252b46a92a5d14e4e6fd02b75383c5cf243b Mon Sep 17 00:00:00 2001 From: Yanyi Liu Date: Wed, 20 Nov 2024 12:44:57 +0800 Subject: [PATCH 067/255] [Misc] Add __setitem__ for LazyDict (#10469) Signed-off-by: Yanyi Liu --- vllm/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/utils.py b/vllm/utils.py index 5d0514cd9d168..2bbdc8d1ebde8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1491,6 +1491,9 @@ def __getitem__(self, key) -> T: self._dict[key] = self._factory[key]() return self._dict[key] + def __setitem__(self, key: str, value: Callable[[], T]): + self._factory[key] = value + def __iter__(self): return iter(self._factory) From ad44437ba33e8d31962d272be238eeed4a1b4f84 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 20 Nov 2024 13:04:05 +0800 Subject: [PATCH 068/255] [Bugfix] Fix Mamba model initialization and MLP Speculator weights loading (#10456) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/mamba.py | 8 ++------ vllm/model_executor/models/mlp_speculator.py | 3 ++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 405b8f7787ba8..ac0d265a961f0 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,5 +1,5 @@ """PyTorch MAMBA model.""" -from typing import Iterable, List, Optional, Set, Tuple +from typing import Iterable, List, Optional, Tuple import torch from torch import nn @@ -243,10 +243,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") @@ -258,5 +256,3 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index f2aa2653c4f5c..d49da5f29aa14 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -193,7 +193,8 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: - param = params_dict.get(name.replace("speculator.", "")) + name = name.replace("speculator.", "") + param = params_dict.get(name) if param is not None: weight_loader = getattr(param, "weight_loader", default_weight_loader) From b4be5a8adba95020187ae3cb43a7db7eef20c0ff Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Nov 2024 13:12:51 +0800 Subject: [PATCH 069/255] [Bugfix] Enforce no chunked prefill for embedding models (#10470) Signed-off-by: DarkLight1337 --- docs/source/serving/compatibility_matrix.rst | 69 ++++++++++++++++---- vllm/engine/arg_utils.py | 6 +- 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index f629b3ca78318..5fc86ab0a11d5 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -39,12 +39,13 @@ Feature x Feature - :abbr:`prmpt adptr (Prompt Adapter)` - :ref:`SD ` - CUDA graph + - :abbr:`emd (Embedding Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` - :abbr:`logP (Logprobs)` - :abbr:`prmpt logP (Prompt Logprobs)` - :abbr:`async output (Async Output Processing)` - multi-step - - :abbr:`MM (Multimodal)` + - :abbr:`mm (Multimodal)` - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` @@ -64,6 +65,7 @@ Feature x Feature - - - + - * - :ref:`APC ` - ✅ - @@ -80,6 +82,7 @@ Feature x Feature - - - + - * - :ref:`LoRA ` - `✗ `__ - ✅ @@ -96,6 +99,7 @@ Feature x Feature - - - + - * - :abbr:`prmpt adptr (Prompt Adapter)` - ✅ - ✅ @@ -112,6 +116,7 @@ Feature x Feature - - - + - * - :ref:`SD ` - ✗ - ✅ @@ -128,6 +133,7 @@ Feature x Feature - - - + - * - CUDA graph - ✅ - ✅ @@ -144,6 +150,24 @@ Feature x Feature - - - + - + * - :abbr:`emd (Embedding Models)` + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - + - + - + - + - + - + - + - + - + - * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✗ - `✗ `__ @@ -151,6 +175,7 @@ Feature x Feature - ✗ - `✗ `__ - ✅ + - ✅ - - - @@ -166,7 +191,8 @@ Feature x Feature - ✅ - ✅ - ✅ - - ✅ + - ✅ + - ✗ - ✅ - - @@ -183,7 +209,8 @@ Feature x Feature - ✅ - `✗ `__ - ✅ - - ✅ + - ✗ + - ✅ - ✅ - - @@ -199,6 +226,7 @@ Feature x Feature - ✅ - ✗ - ✅ + - ✗ - ✗ - ✅ - ✅ @@ -215,6 +243,7 @@ Feature x Feature - ✅ - ✗ - ✅ + - ✗ - ✗ - ✅ - `✗ `__ @@ -224,14 +253,15 @@ Feature x Feature - - - - * - :abbr:`MM (Multimodal)` - - `✗ `__ + * - :abbr:`mm (Multimodal)` + - ✅ - `✗ `__ - `✗ `__ - ? - ? - ✅ - - ✗ + - ✅ + - ✅ - ✅ - ✅ - ✅ @@ -247,6 +277,7 @@ Feature x Feature - ✅ - `✗ `__ - ✅ + - ✗ - ✅ - ✅ - ✅ @@ -263,6 +294,7 @@ Feature x Feature - ✅ - `✗ `__ - ✅ + - ✗ - ✅ - ✅ - ✅ @@ -279,6 +311,7 @@ Feature x Feature - ? - ✅ - ✅ + - ✗ - ? - ✅ - ✅ @@ -353,6 +386,14 @@ Feature x Hardware - ✅ - ✗ - ✅ + * - :abbr:`emd (Embedding Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✗ * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✅ - ✅ @@ -361,7 +402,7 @@ Feature x Hardware - ✅ - ✅ - ✗ - * - :abbr:`logP (Logprobs)` + * - :abbr:`mm (Multimodal)` - ✅ - ✅ - ✅ @@ -369,7 +410,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - * - :abbr:`prmpt logP (Prompt Logprobs)` + * - :abbr:`logP (Logprobs)` - ✅ - ✅ - ✅ @@ -377,29 +418,29 @@ Feature x Hardware - ✅ - ✅ - ✅ - * - :abbr:`async output (Async Output Processing)` + * - :abbr:`prmpt logP (Prompt Logprobs)` - ✅ - ✅ - ✅ - ✅ - ✅ - - ✗ - - ✗ - * - multi-step - ✅ - ✅ + * - :abbr:`async output (Async Output Processing)` - ✅ - ✅ - ✅ - - `✗ `__ - ✅ - * - :abbr:`MM (Multimodal)` - ✅ + - ✗ + - ✗ + * - multi-step - ✅ - ✅ - ✅ - ✅ - ✅ + - `✗ `__ - ✅ * - best-of - ✅ diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a3ae1889774f3..9288cd22c0036 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1014,7 +1014,8 @@ def create_engine_config(self) -> VllmConfig: use_spec_decode = self.speculative_model is not None if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora - and not self.enable_prompt_adapter): + and not self.enable_prompt_adapter + and model_config.task != "embedding"): self.enable_chunked_prefill = True logger.warning( "Chunked prefill is enabled by default for models with " @@ -1031,6 +1032,9 @@ def create_engine_config(self) -> VllmConfig: "errors during the initial memory profiling phase, or result " "in low performance due to small KV cache space. Consider " "setting --max-model-len to a smaller value.", max_model_len) + elif self.enable_chunked_prefill and model_config.task == "embedding": + msg = "Chunked prefill is not supported for embedding models" + raise ValueError(msg) speculative_config = SpeculativeConfig.maybe_create_spec_config( target_model_config=model_config, From 709c9f1f257fd15545ad19b89ed5019cb5ea338b Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 20 Nov 2024 00:35:31 -0500 Subject: [PATCH 070/255] [CI/Build] Add sphinx/rst linter for docs (#10366) --- .github/workflows/sphinx-lint.yml | 32 +++++++++++++++++++++++++++++++ format.sh | 6 ++++++ requirements-lint.txt | 1 + tools/sphinx-lint.sh | 3 +++ 4 files changed, 42 insertions(+) create mode 100644 .github/workflows/sphinx-lint.yml create mode 100755 tools/sphinx-lint.sh diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml new file mode 100644 index 0000000000000..e0bb24276a653 --- /dev/null +++ b/.github/workflows/sphinx-lint.yml @@ -0,0 +1,32 @@ +name: Lint documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + pull_request: + branches: + - main + paths: + - "docs/**" + +jobs: + sphinx-lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Linting docs + run: tools/sphinx-lint.sh diff --git a/format.sh b/format.sh index b3dcdc15bf948..0b196de9d0773 100755 --- a/format.sh +++ b/format.sh @@ -41,6 +41,7 @@ MYPY_VERSION=$(mypy --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) ISORT_VERSION=$(isort --vn) CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') +SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') # # params: tool name, tool version, required version tool_version_check() { @@ -57,6 +58,7 @@ tool_version_check "mypy" "$MYPY_VERSION" tool_version_check "isort" "$ISORT_VERSION" tool_version_check "codespell" "$CODESPELL_VERSION" tool_version_check "clang-format" "$CLANGFORMAT_VERSION" +tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" YAPF_FLAGS=( '--recursive' @@ -313,3 +315,7 @@ if ! git diff --quiet &>/dev/null; then else echo "✨🎉 Format check passed! Congratulations! 🎉✨" fi + +echo 'vLLM sphinx-lint:' +tools/sphinx-lint.sh +echo 'vLLM sphinx-lint: Done' diff --git a/requirements-lint.txt b/requirements-lint.txt index f9132bbf96437..711bb50a0e936 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -6,6 +6,7 @@ ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 +sphinx-lint==1.0.0 # type checking mypy==1.11.1 diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh new file mode 100755 index 0000000000000..04f8075c5527f --- /dev/null +++ b/tools/sphinx-lint.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sphinx-lint --disable trailing-whitespace,missing-final-newline docs From 7629a9c6e5e29d60be9ef60e4afb9842effcdc73 Mon Sep 17 00:00:00 2001 From: wchen61 Date: Wed, 20 Nov 2024 13:35:50 +0800 Subject: [PATCH 071/255] [CI/Build] Support compilation with local cutlass path (#10423) (#10424) --- CMakeLists.txt | 17 +++++++++++++++-- docs/source/getting_started/installation.rst | 12 ++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5acbd762ee957..bfe435937e3bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -206,7 +206,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use") - FetchContent_Declare( + # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided + if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) + set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR}) + endif() + + if(VLLM_CUTLASS_SRC_DIR) + if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR) + get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE) + endif() + message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation") + FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR}) + else() + FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_TAG v3.5.1 @@ -216,7 +228,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE GIT_SHALLOW TRUE - ) + ) + endif() FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index f02626bda4c64..e3dbbc9affe66 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -170,6 +170,18 @@ To build vLLM using an existing PyTorch installation: $ pip install -e . --no-build-isolation +Use the local cutlass for compilation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. +To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. + +.. code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . + + Troubleshooting ~~~~~~~~~~~~~~~ From ed701ca9637306a44ba8403ba9e85be024e0dafd Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 19 Nov 2024 19:36:03 -1000 Subject: [PATCH 072/255] [ci/build] Combine nightly and optional (#10465) --- .buildkite/test-pipeline.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 24bf223fb12c0..501743c887596 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -9,8 +9,7 @@ # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only -# nightly(bool): run this test in nightly pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for test. incompatbile with command. # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] @@ -336,7 +335,7 @@ steps: - pytest -v -s models/embedding/vision_language -m core_model - label: Language Models Test (Extended) # 50min - nightly: true + optional: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language @@ -362,7 +361,7 @@ steps: - pytest -v -s models/encoder_decoder/vision_language -m core_model - label: Multi-Modal Models Test (Extended) # 1h15m - nightly: true + optional: true source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language @@ -513,6 +512,7 @@ steps: - label: Distributed Tests (A100) # optional gpu: a100 + optional: true num_gpus: 4 source_file_dependencies: - vllm/ @@ -526,6 +526,7 @@ steps: - label: LM Eval Large Models # optional gpu: a100 + optional: true num_gpus: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: From 343041c4c4db93b4693ba437df7ae8bea485d18e Mon Sep 17 00:00:00 2001 From: Sky Lee <46676799+skylee-01@users.noreply.github.com> Date: Wed, 20 Nov 2024 14:05:55 +0800 Subject: [PATCH 073/255] [model] Reduce medusa weight (#10454) Signed-off-by: skylee-01 <497627264@qq.com> --- vllm/model_executor/models/medusa.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index b4ed6538bddac..66bdcb89a0213 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -61,14 +61,25 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.truncated_vocab_size = config.truncated_vocab_size self.unpadded_vocab_size = self.truncated_vocab_size - self.lm_heads = nn.ModuleList([ - ParallelLMHead( + if getattr(config, "original_lm_head", False): + self.lm_head = ParallelLMHead( self.unpadded_vocab_size, config.hidden_size, org_num_embeddings=self.truncated_vocab_size, padding_size=DEFAULT_VOCAB_PADDING_SIZE, - ) for _ in range(self.config.num_heads) - ]) + ) + self.lm_heads = [ + self.lm_head for _ in range(self.config.num_heads) + ] + else: + self.lm_heads = nn.ModuleList([ + ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=self.truncated_vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + ) for _ in range(self.config.num_heads) + ]) logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, @@ -172,6 +183,9 @@ def load_weights(self, weights: Iterable[Tuple[str, requires_grad=False) elif name in params_dict: weights_map[name] = loaded_weight + elif (getattr(self.config, "original_lm_head", False) + and name == "lm_heads.0.weight"): + weights_map["lm_head.weight"] = loaded_weight for name, loaded_weight in weights_map.items(): if "lm_head" in name and self.token_map is not None and\ From 09dbf9ff16410d0f83adcc9705764ea1c7f5f017 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 20 Nov 2024 14:45:08 +0800 Subject: [PATCH 074/255] [Bugfix] Handle conflicts between modern and legacy fields (#10471) Signed-off-by: DarkLight1337 --- vllm/transformers_utils/config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 054845584c2ef..59096753c395d 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -107,6 +107,15 @@ def patch_rope_scaling(config: PretrainedConfig) -> None: def patch_rope_scaling_dict(rope_scaling: Dict[str, Any]) -> None: + if "rope_type" in rope_scaling and "type" in rope_scaling: + rope_type = rope_scaling["rope_type"] + rope_type_legacy = rope_scaling["type"] + if rope_type != rope_type_legacy: + raise ValueError( + f"Found conflicts between 'rope_type={rope_type}' (modern " + f"field) and 'type={rope_type_legacy}' (legacy field). " + "You should only specify one of them.") + if "rope_type" not in rope_scaling and "type" in rope_scaling: rope_scaling["rope_type"] = rope_scaling["type"] logger.info("Replacing legacy 'type' key with 'rope_type'") From d5b28447e005a79dec417a706900db0dad4e1a47 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 20 Nov 2024 14:52:13 +0800 Subject: [PATCH 075/255] [Platforms] Refactor xpu code (#10468) Signed-off-by: MengqingCao --- vllm/executor/xpu_executor.py | 27 --------------------------- vllm/platforms/xpu.py | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 36b7e2265efab..ba6177e51a453 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,8 +1,5 @@ from typing import Callable, List, Optional, Tuple, Type, Union -import torch - -from vllm.config import ModelConfig, ParallelConfig from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger @@ -23,7 +20,6 @@ def _init_executor(self) -> None: assert self.speculative_config is None, ( "Speculative decoding not yet supported for XPU backend") - self.model_config = _verify_and_get_model_config(self.model_config) GPUExecutor._init_executor(self) def _get_worker_module_and_class( @@ -53,26 +49,3 @@ async def execute_model_async( output = await make_async(self.driver_worker.execute_model )(execute_model_req=execute_model_req) return output - - -def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - if config.dtype == torch.bfloat16: - logger.warning( - "bfloat16 is not fully supported on XPU, casting to float16.") - config.dtype = torch.float16 - if not config.enforce_eager: - logger.warning( - "CUDA graph is not supported on XPU, fallback to the eager " - "mode.") - config.enforce_eager = True - return config - - -def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig: - if (config.distributed_executor_backend is not None - and config.distributed_executor_backend != "ray"): - logger.warning( - "%s is not supported on XPU, fallback to ray distributed executor " - "backend.", config.distributed_executor_backend) - config.distributed_executor_backend = "ray" - return config diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index d0b3dca9a4195..62db285f6696a 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,9 +1,16 @@ +from typing import TYPE_CHECKING + import torch from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum, _Backend +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + logger = init_logger(__name__) @@ -34,3 +41,17 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @staticmethod def inference_mode(): return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + # check and update model config + model_config = vllm_config.model_config + if model_config.dtype == torch.bfloat16: + logger.warning( + "bfloat16 is not fully supported on XPU, casting to float16.") + model_config.dtype = torch.float16 + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on XPU, fallback to the eager " + "mode.") + model_config.enforce_eager = True From 63f1fde277d063fbd36ccf43cb709fafca754ed5 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 20 Nov 2024 18:57:39 +0800 Subject: [PATCH 076/255] [Hardware][CPU] Support chunked-prefill and prefix-caching on CPU (#10355) Signed-off-by: jiang1.li --- .buildkite/run-cpu-test.sh | 9 +- .../getting_started/cpu-installation.rst | 10 +- docs/source/serving/compatibility_matrix.rst | 4 +- .../basic_correctness/test_chunked_prefill.py | 63 ++- vllm/attention/backends/torch_sdpa.py | 189 +++++-- vllm/attention/ops/ipex_attn.py | 150 ++++-- vllm/platforms/cpu.py | 15 +- vllm/worker/cpu_model_runner.py | 488 ++++++++---------- 8 files changed, 559 insertions(+), 369 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index f0128f091b742..4f1729d46dae2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -25,6 +25,7 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg function cpu_tests() { set -e + export NUMA_NODE=$2 # offline inference docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " @@ -57,6 +58,12 @@ function cpu_tests() { pytest -s -v \ tests/quantization/test_ipex_quant.py" + # Run chunked-prefill and prefix-cache test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v -k cpu_model \ + tests/basic_correctness/test_chunked_prefill.py" + # online inference docker exec cpu-test-"$NUMA_NODE" bash -c " set -e @@ -75,4 +82,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 25 mins. export -f cpu_tests -timeout 25m bash -c "cpu_tests $CORE_RANGE" +timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 69530fd778c55..649de1cd9b53c 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -5,11 +5,11 @@ Installation with CPU vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: -- Tensor Parallel (``-tp = N``) -- Quantization (``INT8 W8A8, AWQ``) - -.. note:: - More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. +- Tensor Parallel +- Model Quantization (``INT8 W8A8, AWQ``) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) Table of contents: diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index 5fc86ab0a11d5..a4300761d2635 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -344,7 +344,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - ✗ + - ✅ - ✅ * - :ref:`APC ` - `✗ `__ @@ -352,7 +352,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - ✗ + - ✅ - ✅ * - :ref:`LoRA ` - ✅ diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index cc5bc2aca27c9..469d18a4dd7af 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -12,6 +12,7 @@ import pytest from tests.kernels.utils import override_backend_env_variable +from vllm.platforms import current_platform from ..models.utils import check_logprobs_close, check_outputs_equal from ..utils import multi_gpu_test @@ -206,12 +207,14 @@ def test_models_with_fp8_kv_cache( # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("dtype", ["half"]) def test_with_prefix_caching( vllm_runner, max_tokens: int, enforce_eager: bool, chunk_size: int, tensor_parallel_size: int, + dtype: str, ) -> None: """ Checks exact match decode with and without prefix caching @@ -233,7 +236,7 @@ def test_with_prefix_caching( for enable in (True, False): with vllm_runner( model, - dtype="half", + dtype=dtype, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=True, enable_prefix_caching=enable, @@ -260,3 +263,61 @@ def test_with_prefix_caching( name_0="w/o prefix caching", name_1="with prefix caching", ) + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) +@pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"]) +@pytest.mark.cpu_model +@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") +def test_models_cpu( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + chunked_prefill_token_size: int, + enforce_eager: bool, + attention_backend: str, + monkeypatch, +) -> None: + test_models( + hf_runner, + vllm_runner, + example_prompts, + model, + dtype, + max_tokens, + chunked_prefill_token_size, + enforce_eager, + 1, + attention_backend, + monkeypatch, + ) + + +@pytest.mark.parametrize("max_tokens", [16]) +@pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("chunk_size", [30, 32]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.cpu_model +@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") +def test_with_prefix_caching_cpu( + vllm_runner, + max_tokens: int, + enforce_eager: bool, + chunk_size: int, + dtype: str, +) -> None: + test_with_prefix_caching( + vllm_runner, + max_tokens, + enforce_eager, + chunk_size, + 1, + dtype, + ) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 563178d3ab60d..3d025df26a7a1 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -7,18 +7,14 @@ from torch.nn.functional import scaled_dot_product_attention from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, AttentionType) + AttentionMetadata, + AttentionMetadataBuilder, + AttentionType) from vllm.attention.backends.utils import CommonAttentionState +from vllm.attention.ops.ipex_attn import PagedAttention from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.platforms import current_platform - -if current_platform.is_cpu(): - try: - from vllm.attention.ops.ipex_attn import PagedAttention - except ImportError: - from vllm.attention.ops.paged_attn import PagedAttention -else: - from vllm.attention.ops.paged_attn import PagedAttention +from vllm.utils import make_tensor_with_pad +from vllm.worker.cpu_model_runner import ModelInputForCPUBuilder class TorchSDPABackend(AttentionBackend): @@ -39,6 +35,10 @@ def get_metadata_cls() -> Type["AttentionMetadata"]: def get_state_cls() -> Type["CommonAttentionState"]: return CommonAttentionState + @staticmethod + def get_builder_cls() -> Type["TorchSDPAMetadataBuilder"]: + return TorchSDPAMetadataBuilder + @staticmethod def get_kv_cache_shape( num_blocks: int, @@ -71,9 +71,15 @@ class TorchSDPAMetadata(AttentionMetadata, PagedAttentionMetadata): """ # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. - is_prompt: bool - slot_mapping: torch.Tensor - seq_lens: Optional[List[int]] + chunked_prefill: bool + seq_lens: Optional[List[int]] = None # For non-chunked prefill + + # For chunked prefill only + max_query_len: Optional[int] = None + max_kv_len: Optional[int] = None + query_start_loc: Optional[torch.Tensor] = None + kv_start_loc: Optional[torch.Tensor] = None + prefill_block_tables: Optional[torch.Tensor] = None # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation @@ -123,20 +129,14 @@ def is_all_cross_attn_metadata_set(self): @property def prefill_metadata(self) -> Optional["TorchSDPAMetadata"]: - # Currently chunked prefill is not supported - if self.num_decode_tokens == 0: - assert self.num_prefills > 0 - return self - - return None + if self.num_prefill_tokens == 0: + return None + return self @property def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: - # Currently chunked prefill is not supported - if self.num_prefills > 0: - assert self.num_decode_tokens == 0 + if self.num_decode_tokens == 0: return None - return self def get_seq_lens( @@ -274,6 +274,105 @@ def get_seq_len_block_table_args( raise AttributeError(f"Invalid attention type {str(attn_type)}") +class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): + + def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: + self.chunked_prefill = input_builder.chunked_prefill + self.input_data = input_builder.input_data + + def build(self, seq_lens: List[int], query_lens: List[int], + cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: + input_data = self.input_data + prefill_seq_lens = seq_lens[0:input_data.num_prefills] + prefill_query_lens = query_lens[0:input_data.num_prefills] + slot_mapping = torch.tensor(input_data.slot_mapping, + dtype=torch.long, + device="cpu") + + # For chunked-prefill + if self.chunked_prefill and input_data.num_prefill_tokens != 0: + prefill_block_tables = make_tensor_with_pad( + self.input_data.prefill_block_tables, + pad=0, + dtype=torch.int32, + device="cpu", + ) + query_lens_tensor = torch.tensor(prefill_query_lens, + dtype=torch.int32, + device="cpu") + kv_lens_tensor = torch.tensor(prefill_seq_lens, + dtype=torch.int32, + device="cpu") + query_start_loc = torch.zeros(input_data.num_prefills + 1, + dtype=torch.int32, + device="cpu") + kv_start_loc = torch.zeros(input_data.num_prefills + 1, + dtype=torch.int32, + device="cpu") + torch.cumsum(query_lens_tensor, + dim=0, + dtype=torch.int32, + out=query_start_loc[1:]) + torch.cumsum(kv_lens_tensor, + dim=0, + dtype=torch.int32, + out=kv_start_loc[1:]) + max_query_len = max(prefill_query_lens) + max_kv_len = max(prefill_seq_lens) + else: + prefill_block_tables = None + query_start_loc = None + kv_start_loc = None + max_query_len = None + max_kv_len = None + + # For paged attention + if input_data.num_decode_tokens != 0: + seq_lens_tensor = torch.tensor( + input_data.seq_lens[input_data.num_prefills:], + dtype=torch.int32, + device="cpu", + ) + block_tables = make_tensor_with_pad( + self.input_data.decode_block_tables, + pad=0, + dtype=torch.int32, + device="cpu", + ) + else: + block_tables = torch.tensor([]) + seq_lens_tensor = torch.tensor([]) + + # For multi-modal models + placeholder_index_maps = None + if len(input_data.multi_modal_inputs_list) != 0: + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + input_data.multi_modal_placeholder_maps.items() + } + + attn_metadata = TorchSDPAMetadata( + chunked_prefill=self.chunked_prefill, + seq_lens=prefill_seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=max_query_len, + max_kv_len=max_kv_len, + query_start_loc=query_start_loc, + kv_start_loc=kv_start_loc, + max_decode_seq_len=input_data.max_decode_seq_len, + num_prefills=input_data.num_prefills, + num_prefill_tokens=input_data.num_prefill_tokens, + num_decode_tokens=input_data.num_decode_tokens, + block_tables=block_tables, + prefill_block_tables=prefill_block_tables, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, + ) + + return attn_metadata + + class TorchSDPABackendImpl(AttentionImpl[TorchSDPAMetadata]): def __init__( @@ -409,19 +508,35 @@ def forward( assert key.shape[0] == num_prefill_tokens + num_decode_tokens assert value.shape[0] == num_prefill_tokens + num_decode_tokens + output = torch.empty_like(query) if prefill_meta := attn_metadata.prefill_metadata: assert attn_metadata.seq_lens is not None - if (kv_cache.numel() == 0 - or prefill_meta.block_tables.numel() == 0): - output = self._run_sdpa_forward(query, - key, - value, - prefill_meta, - attn_type=attn_type) + if not prefill_meta.prefill_metadata.chunked_prefill: # type: ignore + self._run_sdpa_forward(output, + query, + key, + value, + prefill_meta, + attn_type=attn_type) else: # prefix-enabled attention - raise RuntimeError( - "Torch SDPA backend doesn't support prefix decoding.") + assert not self.need_mask + import intel_extension_for_pytorch.llm.modules as ipex_modules + output = torch.empty_like(query) + ipex_modules.PagedAttention.flash_attn_varlen_func( + output[:prefill_meta.num_prefill_tokens, :, :], + query[:prefill_meta.num_prefill_tokens, :, :], + key_cache, + value_cache, + prefill_meta.query_start_loc, + prefill_meta.kv_start_loc, + prefill_meta.max_query_len, + prefill_meta.max_kv_len, + self.scale, + True, + prefill_meta.prefill_block_tables, + self.alibi_slopes, + ) if decode_meta := attn_metadata.decode_metadata: assert attn_type != AttentionType.ENCODER_ONLY, ( @@ -433,8 +548,9 @@ def forward( block_tables_arg, ) = decode_meta.get_seq_len_block_table_args(attn_type) - output = PagedAttention.forward_decode( - query, + PagedAttention.forward_decode( + output[attn_metadata.num_prefill_tokens:, :, :], + query[attn_metadata.num_prefill_tokens:, :, :], key_cache, value_cache, block_tables_arg, @@ -453,12 +569,13 @@ def forward( def _run_sdpa_forward( self, + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attn_metadata: TorchSDPAMetadata, attn_type: AttentionType = AttentionType.DECODER, - ): + ) -> None: if self.num_kv_heads != self.num_heads: key = key.repeat_interleave(self.num_queries_per_kv, dim=1) value = value.repeat_interleave(self.num_queries_per_kv, dim=1) @@ -479,7 +596,6 @@ def _run_sdpa_forward( attn_masks = [None] * len(seq_lens) attn_metadata.set_attn_bias(attn_masks, attn_type) - output = torch.empty_like(query) query = query.movedim(0, query.dim() - 2) key = key.movedim(0, key.dim() - 2) value = value.movedim(0, value.dim() - 2) @@ -502,7 +618,6 @@ def _run_sdpa_forward( scale=self.scale).squeeze(0).movedim(query.dim() - 2, 0) output[start_q:end_q, :, :] = sub_out start_q, start_kv = end_q, end_kv - return output def _make_alibi_bias( diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 8df6d4ced9dc6..cbc6c74acf09a 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -1,12 +1,17 @@ from typing import Dict, List, Optional, Tuple -import intel_extension_for_pytorch.llm.modules as ipex_modules +try: + import intel_extension_for_pytorch.llm.modules as ipex_modules + _use_ipex = True +except ImportError: + _use_ipex = False + import torch from vllm import _custom_ops as ops -class PagedAttention: +class _PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: @@ -22,6 +27,105 @@ def get_kv_cache_shape( ) -> Tuple[int, ...]: return (2, num_blocks, block_size * num_kv_heads * head_size) + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + *args, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = 16 // kv_cache.element_size() + num_blocks = kv_cache.shape[1] + + key_cache = kv_cache[0] + key_cache = key_cache.view(num_blocks, num_kv_heads, head_size // x, + -1, x) + value_cache = kv_cache[1] + value_cache = value_cache.view(num_blocks, num_kv_heads, head_size, -1) + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + *args, + ) -> None: + ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping.flatten(), + kv_cache_dtype, + k_scale, + v_scale, + ) + + @staticmethod + def forward_decode( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + k_scale: float, + v_scale: float, + *args, + ) -> None: + tp_rank: int = 0 + blocksparse_local_blocks: int = 0 + blocksparse_vert_stride: int = 0 + blocksparse_block_size: int = 64 + blocksparse_head_sliding_step: int = 0 + block_size = value_cache.shape[3] + + ops.paged_attention_v1( + output, + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + kv_cache_dtype, + k_scale, + v_scale, + tp_rank, + blocksparse_local_blocks, + blocksparse_vert_stride, + blocksparse_block_size, + blocksparse_head_sliding_step, + ) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + *args, + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) + + +class _IPEXPagedAttention(_PagedAttention): + @staticmethod def split_kv_cache( kv_cache: torch.Tensor, @@ -55,6 +159,7 @@ def write_to_paged_cache( @staticmethod def forward_decode( + output: torch.Tensor, query: torch.Tensor, key_cache: torch.Tensor, value_cache: torch.Tensor, @@ -68,8 +173,7 @@ def forward_decode( k_scale: float, v_scale: float, *args, - ) -> torch.Tensor: - output = torch.empty_like(query) + ) -> None: block_size = value_cache.shape[2] head_mapping = torch.arange( 0, @@ -83,41 +187,5 @@ def forward_decode( scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes) - return output - - @staticmethod - def forward_prefix( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache_dtype: str, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - subquery_start_loc: torch.Tensor, - prompt_lens_tensor: torch.Tensor, - context_lens: torch.Tensor, - max_subquery_len: int, - alibi_slopes: Optional[torch.Tensor], - *args, - ) -> torch.Tensor: - raise NotImplementedError - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - *args, - ) -> None: - raise NotImplementedError - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], - *args, - ) -> None: - key_caches = [kv_cache[0] for kv_cache in kv_caches] - value_caches = [kv_cache[1] for kv_cache in kv_caches] - ops.copy_blocks(key_caches, value_caches, src_to_dists) +PagedAttention = _IPEXPagedAttention if _use_ipex else _PagedAttention diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index f9a34a47959ec..43cbafe709d84 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -53,11 +53,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config - if cache_config.enable_prefix_caching: - logger.warning( - "Prefix caching is not supported on CPU, disable it.") - cache_config.enable_prefix_caching = False - kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE if kv_cache_space >= 0: @@ -74,10 +69,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: f" {kv_cache_space}, expect a positive integer value.") scheduler_config = vllm_config.scheduler_config - if scheduler_config.chunked_prefill_enabled: - logger.warning( - "Chunked prefill is not supported on CPU, disable it.") - scheduler_config.chunked_prefill_enabled = False + if ((scheduler_config.chunked_prefill_enabled + or cache_config.enable_prefix_caching) + and model_config.dtype == torch.half): + logger.warning("Chunked-prefill on the CPU backend only does not" + " support fp16 for now, cast to bf16.") + model_config.dtype = torch.bfloat16 parallel_config = vllm_config.parallel_config if (parallel_config.distributed_executor_backend is not None diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index d3e1202c15e61..66bd844c94901 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -2,8 +2,8 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Type, TypeVar, + Union) import torch from torch import nn @@ -19,7 +19,6 @@ MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -104,65 +103,223 @@ def from_broadcasted_tensor_dict( class ModelInputForCPUBuilder(ModelRunnerInputBuilderBase[ModelInputForCPU]): + class ModelInputData: + + def __init__(self, use_mrope: bool): + self.use_mrope = use_mrope + self.input_tokens: List[int] = [] + self.input_positions: Optional[ + List[int]] = [] if not self.use_mrope else None + self.seq_lens: List[int] = [] + self.query_lens: List[int] = [] + self.prefill_block_tables: List[List[int]] = [] + self.decode_block_tables: List[List[int]] = [] + self.max_decode_seq_len: int = 0 + self.num_prefills: int = 0 + self.num_prefill_tokens: int = 0 + self.num_decode_tokens: int = 0 + self.slot_mapping: List[int] = [] + self.multi_modal_inputs_list: List[MultiModalKwargs] = [] + self.multi_modal_placeholder_maps: Dict[ + str, MultiModalPlaceholderMap] = defaultdict( + MultiModalPlaceholderMap) + self.input_mrope_positions: Optional[List[List[int]]] = [ + [] for _ in range(3) + ] if self.use_mrope else None + def __init__(self, runner: "CPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner + + self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled + or runner.cache_config.enable_prefix_caching) self.model_input_cls = self.runner._model_input_cls self.attn_backend = self.runner.attn_backend - self.sliding_window = self.runner.sliding_window - self.block_size = self.runner.block_size - self.device = self.runner.device self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper + self.input_data = ModelInputForCPUBuilder.ModelInputData( + self.runner.model_config.uses_mrope) + self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()( + self) def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) + def set_seq_group_list( + self, seq_group_metadata_list: List[SequenceGroupMetadata]): + self.seq_group_metadata_list = seq_group_metadata_list + def build(self) -> ModelInputForCPU: + self._build_input_data() + + input_data = self.input_data + input_tokens = torch.tensor(input_data.input_tokens, + dtype=torch.long, + device="cpu") + input_positions = torch.tensor( + input_data.input_positions + if not input_data.use_mrope else input_data.input_mrope_positions, + dtype=torch.long, + device="cpu") + + # For multi-modal models multi_modal_kwargs = None - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = self.seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) = self._prepare_prompt( - self.seq_group_metadata_list) - else: - (input_tokens, input_positions, - attn_metadata) = self._prepare_decode( - self.seq_group_metadata_list) - seq_lens = None + if len(input_data.multi_modal_inputs_list) != 0: + multi_modal_kwargs = MultiModalKwargs.batch( + input_data.multi_modal_inputs_list) + + attn_metadata = self.att_metadata_builder.build( + input_data.seq_lens, input_data.query_lens, -1, -1) return self.model_input_cls( input_tokens=input_tokens, input_positions=input_positions, + seq_lens=input_data.seq_lens, + query_lens=input_data.query_lens, attn_metadata=attn_metadata, multi_modal_kwargs=multi_modal_kwargs, - # query_lens is not needed if chunked prefill is not - # supported. Since CPU worker doesn't support chunked prefill - # just use seq_lens instead. - seq_lens=seq_lens, - query_lens=seq_lens, ) - def _compute_multi_modal_input( - self, - seq_data: SequenceData, - computed_len: int, - seq_group_metadata: SequenceGroupMetadata, - ): + def _build_input_data(self): + for seq_group_metadata in self.seq_group_metadata_list: + for seq_id, seq_data in seq_group_metadata.seq_data.items(): + if seq_group_metadata.is_prompt: + self._compute_prompt_input_tokens(self.input_data, + seq_group_metadata, + seq_data, seq_id) + if seq_group_metadata.multi_modal_data: + self._compute_multi_modal_input( + seq_group_metadata, seq_data) + else: + self._compute_decode_input_tokens(self.input_data, + seq_group_metadata, + seq_data, seq_id) + + def _compute_decode_input_tokens(self, data: ModelInputData, + seq_group_metadata: SequenceGroupMetadata, + seq_data: SequenceData, seq_id: int): + """ + Compute decode input tokens, positions, block table and slot mapping. + """ + block_size = self.runner.block_size + + block_table = seq_group_metadata.block_tables[seq_id] + seq_len = seq_data.get_len() + context_len = seq_data.get_num_computed_tokens() + + tokens = seq_data.get_last_token_id() + token_positions = seq_len - 1 + block_number = block_table[token_positions // block_size] + block_offset = token_positions % block_size + slot = block_number * block_size + block_offset + + # For paged_attention kernel + if self.runner.sliding_window: + start_idx = max(0, seq_len - self.runner.sliding_window) + start_block = start_idx // block_size + start_idx = start_block * block_size + seq_len = seq_len - start_idx + block_table = block_table[start_block:] + + # For MRotaryEmbedding + if data.input_positions is None: + next_pos = MRotaryEmbedding.get_next_input_positions( + seq_data.mrope_position_delta, + context_len, + seq_len, + ) + for idx in range(3): + data.input_mrope_positions[idx].extend( # type: ignore + next_pos[idx]) + else: + data.input_positions.append(token_positions) # type: ignore + + # Update fields + data.input_tokens.append(tokens) + data.max_decode_seq_len = max(data.max_decode_seq_len, seq_len) + data.num_decode_tokens += 1 + data.slot_mapping.append(slot) + data.decode_block_tables.append(block_table) + data.query_lens.append(1) + data.seq_lens.append(seq_len) + + def _compute_prompt_input_tokens(self, data: ModelInputData, + seq_group_metadata: SequenceGroupMetadata, + seq_data: SequenceData, seq_id: int): + """ + Compute prompt input tokens, positions, block table and slot mapping. + """ + token_chunk_size = seq_group_metadata.token_chunk_size + block_size = self.runner.block_size + + block_table = seq_group_metadata.block_tables[seq_id] + seq_len = seq_data.get_len() + context_len = seq_data.get_num_computed_tokens() + seq_len = min(seq_len, context_len + token_chunk_size) + + # For prefix caching + prefix_cache_block_num = len(seq_group_metadata.computed_block_nums) + if prefix_cache_block_num > 0: + prefix_cache_len = (prefix_cache_block_num * + self.runner.block_size) + if prefix_cache_len <= context_len: + # We already passed the cache hit region, + # so do normal computation. + pass + elif context_len < prefix_cache_len < seq_len: + # Partial hit. Compute the missing part. + context_len = prefix_cache_len + token_chunk_size = seq_len - context_len + elif seq_len <= prefix_cache_len: + # Full hit. Only compute the last token to avoid + # erroneous behavior. FIXME: Ideally we should directly + # mark all tokens as computed in the scheduler and do not + # schedule this sequence, so this case should not happen. + context_len = seq_len - 1 + token_chunk_size = 1 + + tokens = seq_data.get_token_ids() + tokens = tokens[context_len:seq_len] + token_positions = range(context_len, seq_len) + + # For encoder-only models, the block_table is None, + # and there is no need to initialize the slot_mapping. + if block_table is not None: + slot_mapping = [_PAD_SLOT_ID] * len(token_positions) + for i, pos in enumerate(token_positions): + block_number = block_table[pos // block_size] + block_offset = pos % block_size + slot = block_number * block_size + block_offset + slot_mapping[i] = slot + data.slot_mapping.extend(slot_mapping) + + # The MROPE positions are prepared in _compute_multi_modal_input + if data.input_positions is not None: + data.input_positions.extend(token_positions) + + # Update fields + data.input_tokens.extend(tokens) + data.num_prefills += 1 + data.num_prefill_tokens += len(tokens) + data.query_lens.append(len(tokens)) + data.prefill_block_tables.append(block_table) + data.seq_lens.append(seq_len) + + def _compute_multi_modal_input(self, + seq_group_metadata: SequenceGroupMetadata, + seq_data: SequenceData): + computed_len = seq_data.get_num_computed_tokens() + seq_len = self.input_data.seq_lens[-1] + # NOTE: mm_data only includes the subset of multi-modal items that # intersect with the current prefill positions. mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group_metadata, - range(computed_len, len(seq_data.get_token_ids())), - ) + seq_group_metadata, range(computed_len, seq_len)) if not mm_data: - return None, None, None + return if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data @@ -173,8 +330,10 @@ def _compute_multi_modal_input( ) # special processing for mrope position deltas. - mrope_positions = None if self.runner.model_config.uses_mrope: + assert not self.chunked_prefill, \ + "MROPE on CPU does not support chunked-prefill." + image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -198,226 +357,15 @@ def _compute_multi_modal_input( context_len=computed_len, ) seq_data.mrope_position_delta = mrope_position_delta - return mm_kwargs, placeholder_maps, mrope_positions - def _prepare_prompt( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - BatchedTensorInputs]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - input_mrope_positions: List[List[int]] = [[] for _ in range(3)] - - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - multi_modal_kwargs_list: List[MultiModalKwargs] = [] - multi_modal_placeholder_maps: Dict[ - str, - MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) - - for seq_group_metadata in seq_group_metadata_list: - assert seq_group_metadata.is_prompt - seq_ids = list(seq_group_metadata.seq_data.keys()) - assert len(seq_ids) == 1 - seq_id = seq_ids[0] - - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - computed_len = seq_data.get_num_computed_tokens() - seq_len = len(prompt_tokens) - - seq_lens.append(seq_len) # Prompt token num - input_tokens.extend(prompt_tokens) # Token ids - - mrope_positions = None - if seq_group_metadata.multi_modal_data: - ( - mm_kwargs, - placeholder_maps, - mrope_positions, - ) = self._compute_multi_modal_input(seq_data, computed_len, - seq_group_metadata) - - multi_modal_kwargs_list.append(mm_kwargs) - for modality, placeholder_map in placeholder_maps.items(): - multi_modal_placeholder_maps[modality].extend( - placeholder_map) - - # Token position ids - # NOTE(woosuk): Here we assume that the first token in the prompt - # is always the first token in the sequence. - if mrope_positions: - for idx in range(3): - input_mrope_positions[idx].extend(mrope_positions[idx]) - else: - input_positions.extend(list(range(computed_len, seq_len))) - - # Compute the slot mapping. - block_table = seq_group_metadata.block_tables[seq_id] - # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, seq_len - sliding_window). - # For example, if the prompt len is 10, sliding window is 8, and - # block size is 4, the first two tokens are masked and the slot - # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. - start_idx = 0 - if self.sliding_window is not None: - start_idx = max(0, seq_len - self.sliding_window) - - for i in range(computed_len, seq_len): - if i < start_idx: - slot_mapping.append(_PAD_SLOT_ID) - continue - - # For encoder-only models, the block_table is None, - # and there is no need to initialize the slot_mapping. - if block_table is not None: - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - if any(input_mrope_positions): - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore + for i in range(3): + self.input_data.input_mrope_positions[ # type: ignore + i].extend(mrope_positions[i]) - num_prompt_tokens = len(input_tokens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) # type: ignore - input_positions = torch.tensor(input_positions - or input_mrope_positions, - dtype=torch.long, - device=self.device) # type: ignore - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) # type: ignore - placeholder_index_maps = { - modality: placeholder_map.index_map() - for modality, placeholder_map in - multi_modal_placeholder_maps.items() - } - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=True, - seq_lens=seq_lens, - seq_lens_tensor=torch.tensor([]), - max_decode_seq_len=0, - num_prefills=len(seq_lens), - num_prefill_tokens=num_prompt_tokens, - num_decode_tokens=0, - block_tables=torch.tensor([]), - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps, - ) - - multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - - return (input_tokens, input_positions, attn_metadata, seq_lens, - multi_modal_kwargs) - - def _prepare_decode( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata]: - assert len(seq_group_metadata_list) > 0 - input_tokens: List[int] = [] - input_positions: List[int] = [] - input_mrope_positions: List[List[int]] = [[] for _ in range(3)] - slot_mapping: List[int] = [] - seq_lens: List[int] = [] - block_tables: List[List[int]] = [] - - for seq_group_metadata in seq_group_metadata_list: - assert not seq_group_metadata.is_prompt - assert seq_group_metadata.token_chunk_size == 1 - - seq_ids = list(seq_group_metadata.seq_data.keys()) - - for seq_id in seq_ids: - seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append(generation_token) - - seq_len = seq_data.get_len() - position = seq_len - 1 - if seq_data.mrope_position_delta is not None: - context_len = seq_data.get_num_computed_tokens() - next_pos = MRotaryEmbedding.get_next_input_positions( - seq_data.mrope_position_delta, - context_len, - seq_len, - ) - for idx in range(3): - input_mrope_positions[idx].extend(next_pos[idx]) - else: - input_positions.append(position) - - seq_len = seq_len if self.sliding_window is None else min( - seq_len, self.sliding_window) - seq_lens.append(seq_len) - - block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) - - if self.sliding_window is not None: - sliding_window_blocks = (self.sliding_window // - self.block_size) - block_table = block_table[-sliding_window_blocks:] - block_tables.append(block_table) - - if any(input_mrope_positions): - input_positions = None # type: ignore - else: - input_mrope_positions = None # type: ignore - - max_decode_seq_len = max(seq_lens) - - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - input_positions = torch.tensor(input_positions - or input_mrope_positions, - dtype=torch.long, - device=self.device) - slot_mapping = torch.tensor(slot_mapping, - dtype=torch.long, - device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - - block_tables = make_tensor_with_pad( - block_tables, - pad=0, - dtype=torch.int, - device=self.device, - ) - - attn_metadata = self.attn_backend.make_metadata( - is_prompt=False, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - seq_lens=seq_lens, - seq_lens_tensor=seq_lens_tensor, - max_decode_seq_len=max_decode_seq_len, - num_prefill_tokens=0, - num_decode_tokens=len(input_tokens), - num_prefills=0, - block_tables=block_tables, - ) - return ( - input_tokens, - input_positions, - attn_metadata, - ) + self.input_data.multi_modal_inputs_list.append(mm_kwargs) + for modality, placeholder_map in placeholder_maps.items(): + self.input_data.multi_modal_placeholder_maps[modality].extend( + placeholder_map) class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): @@ -436,8 +384,6 @@ def __init__( **kwargs, ): ModelRunnerBase.__init__(self, vllm_config) - # Currently, CPU worker doesn't support chunked prefill. - assert self.scheduler_config.chunked_prefill_enabled is False model_config = self.model_config cache_config = self.cache_config @@ -479,8 +425,7 @@ def _prepare_model_input_tensors( """ builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) - for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) + builder.set_seq_group_list(seq_group_metadata_list) return builder.build() # type: ignore @@ -537,22 +482,19 @@ def execute_model( "CPU worker does not support multi-step execution.") model_executable = self.model - execute_model_kwargs = { - "input_ids": - model_input.input_tokens, - "positions": - model_input.input_positions, - "kv_caches": - kv_caches, - "attn_metadata": - model_input.attn_metadata, - **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, - device=self.device), - "intermediate_tensors": - intermediate_tensors, - } - - hidden_states = model_executable(**execute_model_kwargs) + multimodal_kwargs = {} + if model_input.multi_modal_kwargs is not None: + multimodal_kwargs = MultiModalKwargs.as_kwargs( + model_input.multi_modal_kwargs, device=self.device) + + hidden_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **multimodal_kwargs, + ) # Compute the logits. logits = self.model.compute_logits(hidden_states, From 772a66732d0ff58a43dbd1ae79c0d165659aa96d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Nov 2024 09:13:28 -0800 Subject: [PATCH 077/255] [platforms] restore xpu check for parallel config (#10479) Signed-off-by: youkaichao --- vllm/platforms/xpu.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 62db285f6696a..c3c4746d3cc25 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -55,3 +55,13 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "CUDA graph is not supported on XPU, fallback to the eager " "mode.") model_config.enforce_eager = True + + # check and update parallel config + parallel_config = vllm_config.parallel_config + if (parallel_config.distributed_executor_backend is not None + and parallel_config.distributed_executor_backend != "ray"): + logger.warning( + "%s is not supported on XPU, fallback to ray distributed" + " executor backend.", + parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend = "ray" From 5f1d6af2b619b07b2af3151d6aa59f9adc17e1eb Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Wed, 20 Nov 2024 11:06:56 -0800 Subject: [PATCH 078/255] [perf bench] H200 development (#9768) Signed-off-by: simon-mo --- .../benchmark-pipeline.yaml | 23 +++++++++++++++++++ .../convert-results-json-to-markdown.py | 5 ++++ .../scripts/run-performance-benchmarks.sh | 11 ++++----- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index eec2a51e2f8fd..5c069b38b2d7d 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -9,7 +9,9 @@ steps: - image: badouralix/curl-jq command: - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - wait + - label: "A100" agents: queue: A100 @@ -41,6 +43,27 @@ steps: - name: devshm emptyDir: medium: Memory + + - label: "H200" + agents: + queue: H200 + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: 4,5,6,7 + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN + + # - label: "H100" # agents: # queue: H100 diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 7cf05610b9953..d640563252a0c 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -157,6 +157,11 @@ def results_to_json(latency, throughput, serving): throughput_results, serving_results) + # Sort all dataframes by their respective "Test name" columns + for df in [latency_results, serving_results, throughput_results]: + if not df.empty: + df.sort_values(by="Test name", inplace=True) + # get markdown tables latency_md_table = tabulate(latency_results, headers='keys', diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index d397b05cdff23..0d16a83781ab2 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -6,6 +6,7 @@ # Do not set -e, as the mixtral 8x22B model tends to crash occasionally # and we still want to see other benchmarking results even when mixtral crashes. +set -x set -o pipefail check_gpus() { @@ -85,11 +86,7 @@ kill_gpu_processes() { ps -aux lsof -t -i:8000 | xargs -r kill -9 - pkill -f pt_main_thread - # this line doesn't work now - # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9 - pkill -f python3 - pkill -f /usr/bin/python3 + pgrep python3 | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB @@ -289,7 +286,7 @@ run_serving_tests() { # run the server echo "Running test case $test_name" echo "Server command: $server_command" - eval "$server_command" & + bash -c "$server_command" & server_pid=$! # wait until the server is alive @@ -322,7 +319,7 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - eval "$client_command" + bash -c "$client_command" # record the benchmarking commands jq_output=$(jq -n \ From 0cd3d9717e38c7a122ed01fe2a8fddd8b37dff4b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Nov 2024 11:20:38 -0800 Subject: [PATCH 079/255] [7/N] torch.compile, reduce compilation time (#10460) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 2 +- tests/compile/piecewise/test_toy_llama.py | 4 ++-- vllm/compilation/backends.py | 2 +- vllm/config.py | 17 ++++++++++------- vllm/worker/worker.py | 18 +++++++++++++----- 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 0e40e3b4ebc96..0db12d6b6a43c 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -79,7 +79,7 @@ def test_simple_piecewise_compile(): vllm_config = VllmConfig(compilation_config=CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, - non_cudagraph_ops=["silly.attention"], + splitting_ops=["silly.attention"], cudagraph_copy_inputs=True, )) with set_current_vllm_config(vllm_config): diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 356d119a40334..cfe661b8871e0 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -258,7 +258,7 @@ def run_model(llama_config, use_cudagraph=True, ) if split_attn: - compilation_config.non_cudagraph_ops = ["silly.attention"] + compilation_config.splitting_ops = ["silly.attention"] else: compilation_config = CompilationConfig( level=CompilationLevel.NO_COMPILATION, ) @@ -378,7 +378,7 @@ def benchmark(): compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, - non_cudagraph_ops=["silly.attention"], + splitting_ops=["silly.attention"], ) else: compilation_config = CompilationConfig( diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 0cf1e3a95fcba..416cffd326489 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -447,7 +447,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.add_passes_to_config() self.split_gm, self.piecewise_graphs = split_graph( - graph, self.compilation_configs.non_cudagraph_ops) + graph, self.compilation_configs.splitting_ops) from torch._dynamo.utils import lazy_format_graph_code logger.debug("%s", lazy_format_graph_code("before split", self.graph)) diff --git a/vllm/config.py b/vllm/config.py index e69cbd3eb402a..3d0c616868225 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2089,13 +2089,15 @@ class CompilationConfig(BaseModel): - 'none,+op1,+op2' to enable only op1 and op2 By default, all custom ops are enabled when running without Inductor and disabled when running with Inductor (compile_level >= Inductor). + - splitting_ops: a list of ops to split the full graph into subgraphs, used in piecewise compilation. - CudaGraph capture: - use_cudagraph: whether to use cudagraph inside compilation. - False: cudagraph inside compilation is not used. - True: cudagraph inside compilation is used. It requires - that all input buffers have fixed addresses. - Note that this is orthogonal to the cudagraph capture out - side of compilation. + that all input buffers have fixed addresses, and all + splitting ops write their outputs to input buffers. + Note that this is orthogonal to the cudagraph capture logic + outside of compilation. TODO: move outside cudagraph logic into compilation. torch.compile will handle cudagraph capture logic in the future. - cudagraph_capture_sizes: sizes to capture cudagraph. @@ -2149,6 +2151,11 @@ class CompilationConfig(BaseModel): level: int = 0 backend: str = "" custom_ops: List[str] = Field(default_factory=list) + splitting_ops: List[str] = Field(default_factory=lambda: [ + "vllm.unified_flash_attention", + "vllm.unified_flash_infer", + "vllm.unified_v1_flash_attention", + ]) use_inductor: bool = True inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None @@ -2157,7 +2164,6 @@ class CompilationConfig(BaseModel): inductor_passes: Dict[str, str] = Field(default_factory=dict) use_cudagraph: bool = False - non_cudagraph_ops: List[str] = Field(default_factory=list) cudagraph_num_of_warmups: int = 0 cudagraph_capture_sizes: Optional[List[int]] = None cudagraph_copy_inputs: bool = False @@ -2348,9 +2354,6 @@ def __post_init__(self): # and avoid any potential issues with the inductor. self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True - self.compilation_config.non_cudagraph_ops = [ - "vllm.unified_v1_flash_attention" - ] self.compilation_config.use_inductor = True self.compilation_config.enable_fusion = False diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d3ca6d9d0b17e..80fd7bc3b67cc 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,6 +1,7 @@ """A GPU worker class.""" import gc import os +import time from typing import Dict, List, Optional, Set, Tuple, Type, Union import torch @@ -189,6 +190,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: torch.cuda.reset_peak_memory_stats() free_memory_pre_profile, total_gpu_memory = torch.cuda.mem_get_info() + start_time = time.time() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. @@ -229,12 +231,18 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_gpu_blocks = max(num_gpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) + end_time = time.time() logger.info( - "Memory profiling results: total_gpu_memory=%.2fGiB" - " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" - " memory_usage_post_profile=%.2fGiB" - " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" - " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), + "Memory profiling results: " + "duration=%.2f seconds, " + "total_gpu_memory=%.2fGiB, " + "initial_memory_usage=%.2fGiB, " + "peak_torch_memory=%.2fGiB, " + "memory_usage_post_profile=%.2fGiB, " + "non_torch_memory=%.2fGiB, " + "kv_cache_size=%.2fGiB, " + "gpu_memory_utilization=%.2f.", end_time - start_time, + total_gpu_memory / (1024**3), (total_gpu_memory - free_memory_pre_profile) / (1024**3), (peak_memory - non_torch_allocations) / (1024**3), total_allocated_bytes / (1024**3), From c68f7ede6a4aef0cd31f531b5d7ec22ab224de95 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Wed, 20 Nov 2024 22:42:21 +0100 Subject: [PATCH 080/255] [Bugfix]: allow extra fields in requests to openai compatible server (#10463) Signed-off-by: Guillaume Calmettes --- tests/entrypoints/openai/test_chat.py | 26 +++++++++++++------------- vllm/entrypoints/openai/protocol.py | 18 ++++++++++++++++-- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 8d13f64dce01c..843d15e768093 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -899,19 +899,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_extra_fields(client: openai.AsyncOpenAI): - with pytest.raises(BadRequestError) as exc_info: - await client.chat.completions.create( - model=MODEL_NAME, - messages=[{ - "role": "system", - "content": "You are a helpful assistant.", - "extra_field": "0", - }], # type: ignore - temperature=0, - seed=0) - - assert "extra_forbidden" in exc_info.value.message +async def test_extra_fields_allowed(client: openai.AsyncOpenAI): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?", + "extra_field": "0", + }], # type: ignore + temperature=0, + seed=0) + + content = resp.choices[0].message.content + assert content is not None @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index b7b064ae01f05..a82212677f63a 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -9,12 +9,15 @@ from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +from vllm.logger import init_logger from vllm.pooling_params import PoolingParams from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, RequestOutputKind, SamplingParams) from vllm.sequence import Logprob from vllm.utils import random_uuid +logger = init_logger(__name__) + # torch is mocked during docs generation, # so we have to provide the values as literals _MOCK_LONG_INFO = Namespace(min=-9223372036854775808, max=9223372036854775807) @@ -35,8 +38,19 @@ class OpenAIBaseModel(BaseModel): - # OpenAI API does not allow extra fields - model_config = ConfigDict(extra="forbid") + # OpenAI API does allow extra fields + model_config = ConfigDict(extra="allow") + + @model_validator(mode="before") + @classmethod + def __log_extra_fields__(cls, data): + if isinstance(data, dict): + extra_fields = data.keys() - cls.model_fields.keys() + if extra_fields: + logger.warning( + "The following fields were present in the request " + "but ignored: %s", extra_fields) + return data class ErrorResponse(OpenAIBaseModel): From 2f77b6cfec32c8054f996aee4b021f511630ea6f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 20 Nov 2024 13:54:15 -0800 Subject: [PATCH 081/255] [TPU] Implement prefix caching for TPUs (#10307) Signed-off-by: Woosuk Kwon --- requirements-tpu.txt | 6 +- vllm/attention/backends/pallas.py | 66 ++++++---- vllm/worker/tpu_model_runner.py | 211 +++++++++++++++++++----------- vllm/worker/tpu_worker.py | 4 +- 4 files changed, 182 insertions(+), 105 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index f9a0770804e55..3d1e80f6be620 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -16,8 +16,8 @@ ray[default] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241028+cpu -torchvision==0.20.0.dev20241028+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028-cp310-cp310-linux_x86_64.whl +torch==2.6.0.dev20241114+cpu +torchvision==0.20.0.dev20241114+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241114-cp310-cp310-linux_x86_64.whl jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 6fee81de14420..eeab8731a2c39 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -65,6 +65,7 @@ class PallasMetadata(AttentionMetadata): # or all decoding. block_tables: Optional[torch.Tensor] = None context_lens: Optional[torch.Tensor] = None + effective_query_lens: Optional[torch.Tensor] = None @property def prefill_metadata(self) -> Optional["PallasMetadata"]: @@ -72,8 +73,6 @@ def prefill_metadata(self) -> Optional["PallasMetadata"]: return None assert self.num_decode_tokens == 0 - assert self.block_tables is None - assert self.context_lens is None return self @property @@ -186,29 +185,50 @@ def forward( query = query * self.scale if attn_metadata.num_prefills > 0: - assert seq_len % 16 == 0, ( - "Pallas FlashAttention kernel requires seq_len to be a " - f"multiple of 16 but got {seq_len}") - - # Handle GQA/MQA. - if self.num_kv_heads != self.num_heads: - key = key.repeat_interleave(self.num_queries_per_kv, dim=-2) - key = key.view(batch_size, seq_len, self.num_heads, - self.head_size) - value = value.repeat_interleave(self.num_queries_per_kv, + if attn_metadata.block_tables is None: + # Prefill without paged KV cache. + assert seq_len % 16 == 0, ( + "Pallas FlashAttention kernel requires seq_len to be a " + f"multiple of 16 but got {seq_len}") + + # Handle GQA/MQA. + if self.num_kv_heads != self.num_heads: + key = key.repeat_interleave(self.num_queries_per_kv, dim=-2) - value = value.view(batch_size, seq_len, self.num_heads, + key = key.view(batch_size, seq_len, self.num_heads, self.head_size) - # FlashAttention requires [batch_size, num_heads, seq_len, d_model] - # while the input is [batch_size, seq_len, num_heads, d_model]. - # Permute the input to match the required format. - output = torch.ops.xla.flash_attention( - query.permute(0, 2, 1, 3), - key.permute(0, 2, 1, 3), - value.permute(0, 2, 1, 3), - True, - ) - output = output.permute(0, 2, 1, 3) + value = value.repeat_interleave(self.num_queries_per_kv, + dim=-2) + value = value.view(batch_size, seq_len, self.num_heads, + self.head_size) + # FlashAttention kernel requires the input shape to be + # [batch_size, num_heads, seq_len, d_model] + # while the input is [batch_size, seq_len, num_heads, d_model]. + # Permute the input to match the required format. + output = torch.ops.xla.flash_attention( + query.permute(0, 2, 1, 3), + key.permute(0, 2, 1, 3), + value.permute(0, 2, 1, 3), + True, + ) + output = output.permute(0, 2, 1, 3) + else: + # Prefill with paged KV cache. + # TODO(woosuk): Tune the below knobs. + num_kv_pages_per_compute_block = 16 + num_queries_per_compute_block = 16 + assert seq_len % num_queries_per_compute_block == 0 + output = torch.ops.xla.multi_queries_paged_attention( + query, + key_cache, + value_cache, + attn_metadata.context_lens, + attn_metadata.block_tables, + attn_metadata.effective_query_lens, + num_kv_pages_per_compute_block, + num_queries_per_compute_block, + use_kernel=True, + ) else: # Decoding run. assert kv_cache[0].numel() > 0 diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index d7a641857a613..9a054eb8a4cf7 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -1,3 +1,4 @@ +import enum import time from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, @@ -11,7 +12,6 @@ import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput @@ -39,6 +39,15 @@ _MAX_NUM_SAMPLES = 128 +class ExecutionMode(enum.Enum): + PREFILL = enum.auto() + DECODE = enum.auto() + PREFIX_PREFILL = enum.auto() + + def is_prefill(self) -> bool: + return self in (ExecutionMode.PREFILL, ExecutionMode.PREFIX_PREFILL) + + @dataclass(frozen=True) class ModelInputForTPU(ModelRunnerInputBase): token_ids: torch.Tensor @@ -140,16 +149,21 @@ def load_model(self) -> None: model = get_model(vllm_config=self.vllm_config) model = model.eval() xm.wait_device_ops() - self.model = ModelWrapper(model, self.vllm_config) + model = ModelWrapper(model) + self.model = torch.compile(model, + backend="openxla", + fullgraph=True, + dynamic=False) def _dummy_run( self, batch_size: int, seq_len: int, kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], - is_prompt: bool, + exec_mode: ExecutionMode, ) -> None: - if is_prompt: + exec_mode = ExecutionMode(exec_mode) + if exec_mode.is_prefill(): seq_len = (seq_len + 15) // 16 * 16 token_ids = torch.zeros((batch_size, seq_len), dtype=torch.int32, @@ -160,18 +174,38 @@ def _dummy_run( slot_mapping = torch.zeros((batch_size, seq_len), dtype=torch.int64, device=self.device) - attn_metadata = self.attn_backend.make_metadata( - num_prefills=batch_size, - num_prefill_tokens=batch_size * seq_len, - num_decode_tokens=0, - slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None, - block_tables=None, - context_lens=None, - ) input_lens = torch.ones((batch_size, ), dtype=torch.int32, device=self.device) + if exec_mode == ExecutionMode.PREFILL: + attn_metadata = self.attn_backend.make_metadata( + num_prefills=batch_size, + num_prefill_tokens=batch_size * seq_len, + num_decode_tokens=0, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, + block_tables=None, + context_lens=None, + effective_query_lens=None, + ) + else: + context_lens = torch.ones((batch_size, ), + dtype=torch.int32, + device=self.device) + block_tables = torch.tensor(self.block_tables[:batch_size], + dtype=torch.int32, + device=self.device) + effective_query_lens = torch.ones_like(context_lens) + attn_metadata = self.attn_backend.make_metadata( + num_prefills=batch_size, + num_prefill_tokens=batch_size * seq_len, + num_decode_tokens=0, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, + block_tables=block_tables, + context_lens=context_lens, + effective_query_lens=effective_query_lens, + ) else: assert seq_len == 1 token_ids = torch.zeros((batch_size, seq_len), @@ -204,7 +238,7 @@ def _dummy_run( ) t = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) p = torch.ones((batch_size, ), dtype=torch.float32, device=self.device) - num_samples = _MAX_NUM_SAMPLES if is_prompt else 1 + num_samples = _MAX_NUM_SAMPLES if exec_mode.is_prefill() else 1 # NOTE(woosuk): There are two stages of compilation: torch.compile and # XLA compilation. Using `mark_dynamic` can reduce the torch.compile @@ -213,7 +247,7 @@ def _dummy_run( # be re-compiled for every different shapes. This overhead is inevitable # in the first run, but can be skipped afterwards as we cache the XLA # graphs in the disk (VLLM_XLA_CACHE_PATH). - if is_prompt: + if exec_mode.is_prefill(): # Prefll torch._dynamo.mark_dynamic(token_ids, 1) torch._dynamo.mark_dynamic(position_ids, 1) @@ -229,15 +263,8 @@ def _dummy_run( torch._dynamo.mark_dynamic(t, 0) torch._dynamo.mark_dynamic(p, 0) # Dummy run. - self.model(token_ids, - position_ids, - attn_metadata, - input_lens, - t, - p, - num_samples, - kv_caches, - is_prompt=is_prompt) + self.model(token_ids, position_ids, attn_metadata, input_lens, t, p, + num_samples, kv_caches) def warmup_model( self, @@ -248,13 +275,13 @@ def warmup_model( start = time.time() for batch_size in [1]: seq_len = 16 - while True: - self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=True) + while seq_len <= self.model_config.max_model_len: + self._dummy_run(batch_size, + seq_len, + kv_caches, + exec_mode=ExecutionMode.PREFILL) xm.wait_device_ops() logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) - - if seq_len >= self.model_config.max_model_len: - break num_tokens = batch_size * seq_len if num_tokens >= self.scheduler_config.max_num_batched_tokens: break @@ -263,12 +290,39 @@ def warmup_model( end = time.time() logger.info("Compilation for prefill done in %.2f s.", end - start) + # Prefix prefill + if self.cache_config.enable_prefix_caching: + logger.info("Compiling the model with different input shapes for " + "prefix prefill...") + start = time.time() + for batch_size in [1]: + seq_len = 16 + while seq_len <= self.model_config.max_model_len: + self._dummy_run(batch_size, + seq_len, + kv_caches, + exec_mode=ExecutionMode.PREFIX_PREFILL) + xm.wait_device_ops() + logger.info("batch_size: %d, seq_len: %d", batch_size, + seq_len) + num_tokens = batch_size * seq_len + if (num_tokens >= + self.scheduler_config.max_num_batched_tokens): + break + seq_len = seq_len * 2 + end = time.time() + logger.info("Compilation for prefix prefill done in %.2f s.", + end - start) + # Decode start = time.time() seq_len = 1 batch_size = 8 # Must be in sync with _get_padded_batch_size() while True: - self._dummy_run(batch_size, seq_len, kv_caches, is_prompt=False) + self._dummy_run(batch_size, + seq_len, + kv_caches, + exec_mode=ExecutionMode.DECODE) xm.wait_device_ops() logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) @@ -287,9 +341,11 @@ def _prepare_prompt( input_tokens: List[int] = [] input_positions: List[int] = [] prompt_lens: List[int] = [] + context_lens: List[int] = [] slot_mapping: List[int] = [] - for seq_group_metadata in seq_group_metadata_list: + for batch_idx, seq_group_metadata in enumerate( + seq_group_metadata_list): assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) assert len(seq_ids) == 1 @@ -298,19 +354,31 @@ def _prepare_prompt( seq_data = seq_group_metadata.seq_data[seq_id] # Could include output tokens when a request is preempted. prompt_tokens = seq_data.get_token_ids() + seq_len = len(prompt_tokens) + + num_computed_blocks = len(seq_group_metadata.computed_block_nums) + num_computed_tokens = num_computed_blocks * self.block_size + if num_computed_tokens > 0: + prompt_tokens = prompt_tokens[num_computed_tokens:] + context_lens.append(seq_len) + else: + context_lens.append(0) + prompt_len = len(prompt_tokens) prompt_lens.append(prompt_len) input_tokens.extend(prompt_tokens) - input_positions.extend(list(range(prompt_len))) + input_positions.extend(range(num_computed_tokens, seq_len)) assert seq_group_metadata.block_tables is not None block_table = seq_group_metadata.block_tables[seq_id] - for i in range(prompt_len): + for i in range(num_computed_tokens, seq_len): block_number = block_table[i // self.block_size] block_offset = i % self.block_size slot = block_number * self.block_size + block_offset slot_mapping.append(slot) + if num_computed_tokens > 0: + self.block_tables[batch_idx, :len(block_table)] = block_table # Add paddings to EACH prompt to the smallest power of 2 that is # greater than or equal to the prompt length. @@ -338,14 +406,21 @@ def _prepare_prompt( prompt_lens = torch.tensor(prompt_lens, dtype=torch.int32, device="cpu") + context_lens = torch.tensor(context_lens, + dtype=torch.int32, + device="cpu") + block_tables = torch.tensor(self.block_tables[:num_prefills], + dtype=torch.int32, + device="cpu") attn_metadata = self.attn_backend.make_metadata( num_prefills=num_prefills, num_prefill_tokens=0, # NOTE: This is not used. num_decode_tokens=0, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, - block_tables=None, - context_lens=None, + block_tables=block_tables, + context_lens=context_lens, + effective_query_lens=prompt_lens, ) return input_tokens, input_positions, attn_metadata, prompt_lens @@ -550,6 +625,10 @@ def execute_model( # process them separately. This is a temporary hack that should be # optimized by using SplashAttention. orig_slot_mapping = model_input.attn_metadata.slot_mapping + orig_block_tables = model_input.attn_metadata.block_tables + orig_context_lens = model_input.attn_metadata.context_lens + orig_effective_query_lens = \ + model_input.attn_metadata.effective_query_lens batch_size = model_input.input_lens.shape[0] start_idx = 0 next_token_ids = [] @@ -568,18 +647,24 @@ def execute_model( attn_metadata.num_prefills = 1 attn_metadata.slot_mapping = orig_slot_mapping[ None, start_idx:end_idx].to(self.device) + if orig_context_lens[i].item() > 0: + attn_metadata.context_lens = orig_context_lens[i:i + 1].to( + self.device) + attn_metadata.block_tables = orig_block_tables[ + i].unsqueeze(0).to(self.device) + attn_metadata.effective_query_lens = \ + orig_effective_query_lens[i:i + 1].to(self.device) + else: + attn_metadata.context_lens = None + attn_metadata.block_tables = None + attn_metadata.effective_query_lens = None input_lens = model_input.input_lens[i:i + 1].to(self.device) t = model_input.t[i:i + 1].to(self.device) p = model_input.p[i:i + 1].to(self.device) - output_token_ids = self.model(token_ids, - position_ids, - attn_metadata, - input_lens, - t, - p, + output_token_ids = self.model(token_ids, position_ids, + attn_metadata, input_lens, t, p, model_input.num_samples, - kv_caches, - is_prompt=True) + kv_caches) next_token_ids.append(output_token_ids[0]) start_idx = end_idx @@ -624,15 +709,10 @@ def execute_model( input_lens = model_input.input_lens.to(self.device) for i in range(num_steps): slot_mapping = attn_metadata.slot_mapping - output_token_ids = self.model(token_ids, - position_ids, - attn_metadata, - input_lens, - t, - p, + output_token_ids = self.model(token_ids, position_ids, + attn_metadata, input_lens, t, p, model_input.num_samples, - kv_caches, - is_prompt=False) + kv_caches) self.cached_step_outputs.append(output_token_ids) if i < num_steps - 1: @@ -667,34 +747,11 @@ def execute_model( return [sampler_output] -class ModelWrapper(TorchCompileWrapperWithCustomDispatcher): +class ModelWrapper(nn.Module): - def __init__(self, model: nn.Module, vllm_config: VllmConfig): + def __init__(self, model: nn.Module): + super().__init__() self.model = model - compiled_callable = torch.compile(self.forward, - backend="openxla", - fullgraph=True, - dynamic=False) - super().__init__( - compiled_callable, - compilation_level=vllm_config.compilation_config.level) - - def __call__(self, *args, is_prompt: bool, **kwargs): - if len(self.compiled_codes) < 3 or not self.use_custom_dispatcher: - # not fully compiled yet, or not using the custom dispatcher, - # let PyTorch handle it - return self.compiled_callable(*args, **kwargs) - # the 3 compiled codes are: - # 0: for profiling - # 1: for prompt - # 2: for decode - # dispatch to the compiled code directly, skip PyTorch - if is_prompt: - with self.dispatch_to_code(1): - return self.forward(*args, **kwargs) - else: - with self.dispatch_to_code(2): - return self.forward(*args, **kwargs) def forward( self, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 096cb23416909..8754f7538f251 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -13,7 +13,7 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size -from vllm.worker.tpu_model_runner import TPUModelRunner +from vllm.worker.tpu_model_runner import ExecutionMode, TPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerBase, WorkerInput) @@ -112,7 +112,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: batch_size=1, seq_len=self.scheduler_config.max_num_batched_tokens, kv_caches=kv_caches, - is_prompt=True, + exec_mode=ExecutionMode.PREFILL, ) # Synchronize before measuring the memory usage. xm.wait_device_ops() From 388ee3de665c3055fbe610b66ebeef096a23cfe1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Nov 2024 18:36:33 -0800 Subject: [PATCH 082/255] [torch.compile] limit inductor threads and lazy import quant (#10482) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 2 + tests/quantization/utils.py | 4 +- tests/test_lazy_torch_compile.py | 68 ++++++++++ vllm/_custom_ops.py | 3 - vllm/config.py | 8 +- .../layers/quantization/__init__.py | 124 +++++++++++------- vllm/model_executor/models/internvl.py | 4 +- vllm/model_executor/models/qwen2_vl.py | 7 +- vllm/platforms/cuda.py | 2 + vllm/platforms/rocm.py | 11 ++ vllm/plugins/__init__.py | 9 ++ 11 files changed, 178 insertions(+), 64 deletions(-) create mode 100644 tests/test_lazy_torch_compile.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 501743c887596..c436d2b48d20f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,7 +50,9 @@ steps: - tests/multimodal - tests/test_utils - tests/worker + - tests/test_lazy_torch_compile.py commands: + - python3 test_lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py index 061a077592e80..8ebd8dd2be0d5 100644 --- a/tests/quantization/utils.py +++ b/tests/quantization/utils.py @@ -1,4 +1,4 @@ -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.model_executor.layers.quantization import get_quantization_config from vllm.platforms import current_platform @@ -10,6 +10,6 @@ def is_quant_method_supported(quant_method: str) -> bool: capability = current_platform.get_device_capability() assert capability is not None - min_capability = QUANTIZATION_METHODS[quant_method].get_min_capability() + min_capability = get_quantization_config(quant_method).get_min_capability() return capability.to_int() >= min_capability diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py new file mode 100644 index 0000000000000..b8ac4dd93732b --- /dev/null +++ b/tests/test_lazy_torch_compile.py @@ -0,0 +1,68 @@ +# Description: Test the lazy import module +# The utility function cannot be placed in `vllm.utils` +# this needs to be a standalone script + +import contextlib +import dataclasses +import sys +import traceback +from typing import Callable, Generator + + +@dataclasses.dataclass +class BlameResult: + found: bool = False + trace_stack: str = "" + + +@contextlib.contextmanager +def blame(func: Callable) -> Generator[BlameResult, None, None]: + """ + Trace the function calls to find the first function that satisfies the + condition. The trace stack will be stored in the result. + + Usage: + + ```python + with blame(lambda: some_condition()) as result: + # do something + + if result.found: + print(result.trace_stack) + """ + result = BlameResult() + + def _trace_calls(frame, event, arg=None): + nonlocal result + if event in ['call', 'return']: + # for every function call or return + try: + # Temporarily disable the trace function + sys.settrace(None) + # check condition here + if not result.found and func(): + result.found = True + result.trace_stack = "".join(traceback.format_stack()) + # Re-enable the trace function + sys.settrace(_trace_calls) + except NameError: + # modules are deleted during shutdown + pass + return _trace_calls + + sys.settrace(_trace_calls) + + yield result + + sys.settrace(None) + + +module_name = "torch._inductor.async_compile" + +with blame(lambda: module_name in sys.modules) as result: + import vllm # noqa + +assert not result.found, (f"Module {module_name} is already imported, the" + f" first import location is:\n{result.trace_stack}") + +print(f"Module {module_name} is not imported yet") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 782dc6aed1b8c..41892e4dddf7e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -19,9 +19,6 @@ except ImportError as e: logger.warning("Failed to import from vllm._C with %r", e) -if current_platform.is_rocm(): - import vllm._rocm_C # noqa: F401 - supports_moe_ops = False with contextlib.suppress(ImportError): import vllm._moe_C # noqa: F401 diff --git a/vllm/config.py b/vllm/config.py index 3d0c616868225..7522486782cc9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -14,7 +14,8 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, + get_quantization_config) from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform from vllm.tracing import is_otel_available, otel_import_error_traceback @@ -370,7 +371,7 @@ def _parse_quant_hf_config(self): return quant_cfg def _verify_quantization(self) -> None: - supported_quantization = [*QUANTIZATION_METHODS] + supported_quantization = QUANTIZATION_METHODS rocm_supported_quantization = [ "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", "fbgemm_fp8" @@ -392,7 +393,8 @@ def _verify_quantization(self) -> None: quant_method = quant_cfg.get("quant_method", "").lower() # Detect which checkpoint is it - for _, method in QUANTIZATION_METHODS.items(): + for name in QUANTIZATION_METHODS: + method = get_quantization_config(name) quantization_override = method.override_quantization_method( quant_cfg, self.quantization) if quantization_override: diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index ff342c4f9479e..dd10c434f0752 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,65 +1,87 @@ -from typing import Dict, Type +from typing import Dict, List, Type -from vllm.model_executor.layers.quantization.aqlm import AQLMConfig -from vllm.model_executor.layers.quantization.awq import AWQConfig -from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.quantization.bitsandbytes import ( - BitsAndBytesConfig) -from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 - CompressedTensorsConfig) -from vllm.model_executor.layers.quantization.deepspeedfp import ( - DeepSpeedFPConfig) -from vllm.model_executor.layers.quantization.experts_int8 import ( - ExpertsInt8Config) -from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config -from vllm.model_executor.layers.quantization.fp8 import Fp8Config -from vllm.model_executor.layers.quantization.gguf import GGUFConfig -from vllm.model_executor.layers.quantization.gptq import GPTQConfig -from vllm.model_executor.layers.quantization.gptq_marlin import ( - GPTQMarlinConfig) -from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( - GPTQMarlin24Config) -from vllm.model_executor.layers.quantization.hqq_marlin import HQQMarlinConfig -from vllm.model_executor.layers.quantization.ipex_quant import IPEXConfig -from vllm.model_executor.layers.quantization.marlin import MarlinConfig -from vllm.model_executor.layers.quantization.modelopt import ModelOptFp8Config -from vllm.model_executor.layers.quantization.neuron_quant import ( - NeuronQuantConfig) -from vllm.model_executor.layers.quantization.qqq import QQQConfig -from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig -QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = { - "aqlm": AQLMConfig, - "awq": AWQConfig, - "deepspeedfp": DeepSpeedFPConfig, - "tpu_int8": Int8TpuConfig, - "fp8": Fp8Config, - "fbgemm_fp8": FBGEMMFp8Config, - "modelopt": ModelOptFp8Config, +QUANTIZATION_METHODS: List[str] = [ + "aqlm", + "awq", + "deepspeedfp", + "tpu_int8", + "fp8", + "fbgemm_fp8", + "modelopt", # The order of gptq methods is important for config.py iteration over # override_quantization_method(..) - "marlin": MarlinConfig, - "gguf": GGUFConfig, - "gptq_marlin_24": GPTQMarlin24Config, - "gptq_marlin": GPTQMarlinConfig, - "awq_marlin": AWQMarlinConfig, - "gptq": GPTQConfig, - "compressed-tensors": CompressedTensorsConfig, - "bitsandbytes": BitsAndBytesConfig, - "qqq": QQQConfig, - "hqq": HQQMarlinConfig, - "experts_int8": ExpertsInt8Config, - "neuron_quant": NeuronQuantConfig, - "ipex": IPEXConfig, -} + "marlin", + "gguf", + "gptq_marlin_24", + "gptq_marlin", + "awq_marlin", + "gptq", + "compressed-tensors", + "bitsandbytes", + "qqq", + "hqq", + "experts_int8", + "neuron_quant", + "ipex", +] def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in QUANTIZATION_METHODS: raise ValueError(f"Invalid quantization method: {quantization}") - return QUANTIZATION_METHODS[quantization] + + # lazy import to avoid triggering `torch.compile` too early + from .aqlm import AQLMConfig + from .awq import AWQConfig + from .awq_marlin import AWQMarlinConfig + from .bitsandbytes import BitsAndBytesConfig + from .compressed_tensors.compressed_tensors import ( # noqa: E501 + CompressedTensorsConfig) + from .deepspeedfp import DeepSpeedFPConfig + from .experts_int8 import ExpertsInt8Config + from .fbgemm_fp8 import FBGEMMFp8Config + from .fp8 import Fp8Config + from .gguf import GGUFConfig + from .gptq import GPTQConfig + from .gptq_marlin import GPTQMarlinConfig + from .gptq_marlin_24 import GPTQMarlin24Config + from .hqq_marlin import HQQMarlinConfig + from .ipex_quant import IPEXConfig + from .marlin import MarlinConfig + from .modelopt import ModelOptFp8Config + from .neuron_quant import NeuronQuantConfig + from .qqq import QQQConfig + from .tpu_int8 import Int8TpuConfig + + method_to_config: Dict[str, Type[QuantizationConfig]] = { + "aqlm": AQLMConfig, + "awq": AWQConfig, + "deepspeedfp": DeepSpeedFPConfig, + "tpu_int8": Int8TpuConfig, + "fp8": Fp8Config, + "fbgemm_fp8": FBGEMMFp8Config, + "modelopt": ModelOptFp8Config, + # The order of gptq methods is important for config.py iteration over + # override_quantization_method(..) + "marlin": MarlinConfig, + "gguf": GGUFConfig, + "gptq_marlin_24": GPTQMarlin24Config, + "gptq_marlin": GPTQMarlinConfig, + "awq_marlin": AWQMarlinConfig, + "gptq": GPTQConfig, + "compressed-tensors": CompressedTensorsConfig, + "bitsandbytes": BitsAndBytesConfig, + "qqq": QQQConfig, + "hqq": HQQMarlinConfig, + "experts_int8": ExpertsInt8Config, + "neuron_quant": NeuronQuantConfig, + "ipex": IPEXConfig, + } + + return method_to_config[quantization] __all__ = [ diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 7ea2f9be2191d..5d38b4b1ef14b 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -19,8 +19,8 @@ from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) -from vllm.model_executor.layers.quantization import (AWQConfig, - QuantizationConfig) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0ac81387b1bd8..531608a877f2f 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -51,9 +51,10 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization import (GPTQConfig, - GPTQMarlinConfig, - QuantizationConfig) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.gptq import GPTQConfig +from vllm.model_executor.layers.quantization.gptq_marlin import ( + GPTQMarlinConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 9c5212ace1346..d2911ef650743 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -10,6 +10,8 @@ import torch from typing_extensions import ParamSpec +# import custom ops, trigger op registration +import vllm._C # noqa from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 022256996f97b..bb3a49c8b73bc 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -9,6 +9,17 @@ logger = init_logger(__name__) +try: + import vllm._C # noqa: F401 +except ImportError as e: + logger.warning("Failed to import from vllm._C with %r", e) + +# import custom ops, trigger op registration +try: + import vllm._rocm_C # noqa: F401 +except ImportError as e: + logger.warning("Failed to import from vllm._rocm_C with %r", e) + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", None) in ["fork", None]: logger.warning("`fork` method is not supported by ROCm. " "VLLM_WORKER_MULTIPROC_METHOD is overridden to" diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index dc183dbfc9b96..d5056b18fe968 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,4 +1,5 @@ import logging +import os from contextlib import contextmanager from typing import TYPE_CHECKING, Optional @@ -18,6 +19,14 @@ def load_general_plugins(): processes. They should be designed in a way that they can be loaded multiple times without causing issues. """ + + # all processes created by vllm will load plugins, + # and here we can inject some common environment variables + # for all processes. + + # see https://github.com/vllm-project/vllm/issues/10480 + os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' + global plugins_loaded if plugins_loaded: return From 6c1208d083fbaaf89c6d812f4d3424e15182f652 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Wed, 20 Nov 2024 19:56:47 -0800 Subject: [PATCH 083/255] [Core] Add Sliding Window Support with Flashinfer (#10462) Signed-off-by: Pavani Majety --- .../block/e2e/test_correctness_sliding_window.py | 12 ++++++++++-- vllm/attention/backends/flashinfer.py | 13 ++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 9320a9ef62314..415d0bd8237df 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -3,6 +3,7 @@ import pytest +from tests.kernels.utils import override_backend_env_variable from vllm import LLM, SamplingParams from .conftest import get_text_from_llm_generator @@ -28,8 +29,9 @@ @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, - batch_size, seed): + batch_size, seed, backend, monkeypatch): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). @@ -38,6 +40,8 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, Additionally, we compare the results of the v1 and v2 managers. """ + override_backend_env_variable(monkeypatch, backend) + sampling_params = SamplingParams( max_tokens=1024, ignore_eos=True, @@ -84,7 +88,9 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) -def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed): +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) +def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, + backend, monkeypatch): """ This is similar to test_sliding_window_retrival, however, it doesn't compare against the v1 block manager since v1 doesn't support @@ -93,6 +99,8 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed): The results with and without chunked prefill are not the same due to numerical instabilities. """ + override_backend_env_variable(monkeypatch, backend) + sampling_params = SamplingParams( max_tokens=10, ignore_eos=True, diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 107e3bbf79666..b61c660e3e280 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -757,9 +757,8 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes - if sliding_window is not None: - raise ValueError("Sliding window is not supported in FlashInfer.") - self.sliding_window = (-1, -1) + self.sliding_window = ((sliding_window - 1, + 0) if sliding_window is not None else (-1, -1)) self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap @@ -865,6 +864,8 @@ def unified_flash_infer( assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens + window_left = window_size[0] if window_size is not None else -1 + prefill_output: Optional[torch.Tensor] = None decode_output: Optional[torch.Tensor] = None if prefill_meta := attn_metadata.prefill_metadata: @@ -895,7 +896,8 @@ def unified_flash_infer( logits_soft_cap=logits_soft_cap, causal=True, k_scale=k_scale, - v_scale=v_scale) + v_scale=v_scale, + window_left=window_left) if decode_meta := attn_metadata.decode_metadata: assert attn_metadata.decode_metadata is not None assert attn_metadata.decode_metadata.decode_wrapper is not None @@ -905,7 +907,8 @@ def unified_flash_infer( sm_scale=softmax_scale, logits_soft_cap=logits_soft_cap, k_scale=k_scale, - v_scale=v_scale) + v_scale=v_scale, + window_left=window_left) if prefill_output is None and decode_output is not None: # Decode only batch. From 9d827170a3aa586dfb458bf28d18fd279bdbf580 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Thu, 21 Nov 2024 12:44:20 +0800 Subject: [PATCH 084/255] [Platforms] Add `device_type` in `Platform` (#10508) Signed-off-by: MengqingCao --- vllm/config.py | 17 ++--------------- vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 1 + vllm/platforms/neuron.py | 1 + vllm/platforms/openvino.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + 10 files changed, 11 insertions(+), 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 7522486782cc9..0ed92f370cf50 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1193,21 +1193,8 @@ class DeviceConfig: def __init__(self, device: str = "auto") -> None: if device == "auto": # Automated device type detection - if current_platform.is_cuda_alike(): - self.device_type = "cuda" - elif current_platform.is_neuron(): - self.device_type = "neuron" - elif current_platform.is_hpu(): - self.device_type = "hpu" - elif current_platform.is_openvino(): - self.device_type = "openvino" - elif current_platform.is_tpu(): - self.device_type = "tpu" - elif current_platform.is_cpu(): - self.device_type = "cpu" - elif current_platform.is_xpu(): - self.device_type = "xpu" - else: + self.device_type = current_platform.device_type + if self.device_type is None: raise RuntimeError("Failed to infer device type") else: # Device type is assigned explicitly diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 43cbafe709d84..0c4c916406223 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -19,6 +19,7 @@ class CpuPlatform(Platform): _enum = PlatformEnum.CPU + device_type: str = "cpu" @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index d2911ef650743..07562a8c3d71e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -109,6 +109,7 @@ def device_id_to_physical_device_id(device_id: int) -> int: class CudaPlatform(Platform): _enum = PlatformEnum.CUDA + device_type: str = "cuda" @classmethod def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 1e0888a30ba96..36d944b3f24b8 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -5,6 +5,7 @@ class HpuPlatform(Platform): _enum = PlatformEnum.HPU + device_type: str = "hpu" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index f4849fa2ccfb0..68abec28ad71e 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -56,6 +56,7 @@ def to_int(self) -> int: class Platform: _enum: PlatformEnum + device_type: str def is_cuda(self) -> bool: return self._enum == PlatformEnum.CUDA diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 07d8398eda525..57e3c0dfae84c 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -3,6 +3,7 @@ class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON + device_type: str = "neuron" @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index ad69ced5417b3..130b8eec1b386 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -10,6 +10,7 @@ class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO + device_type: str = "openvino" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index bb3a49c8b73bc..c62241d8bb47b 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -29,6 +29,7 @@ class RocmPlatform(Platform): _enum = PlatformEnum.ROCM + device_type: str = "cuda" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 2a7ca9fb8c576..863875ef5c2d6 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -16,6 +16,7 @@ class TpuPlatform(Platform): _enum = PlatformEnum.TPU + device_type: str = "tpu" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index c3c4746d3cc25..536e17a5f93e8 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -16,6 +16,7 @@ class XPUPlatform(Platform): _enum = PlatformEnum.XPU + device_type: str = "xpu" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: From 8b0fe06c890a202eba24d517cc77562e4a8b0d0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Thu, 21 Nov 2024 00:44:57 -0500 Subject: [PATCH 085/255] [torch.compile] Inductor code caching fix (#10273) Signed-off-by: luka Signed-off-by: Luka Govedic --- tests/compile/backend.py | 16 +- tests/compile/test_functionalization.py | 95 +++++++++ tests/compile/test_fusion.py | 11 +- tests/compile/test_pass_manager.py | 35 ++++ vllm/compilation/backends.py | 236 ++-------------------- vllm/compilation/fix_functionalization.py | 177 ++++++++++++++++ vllm/compilation/fusion.py | 13 +- vllm/compilation/inductor_pass.py | 100 ++++++--- vllm/compilation/pass_manager.py | 77 +++++++ vllm/compilation/reshapes.py | 8 +- vllm/compilation/vllm_inductor_pass.py | 53 +++++ vllm/config.py | 60 ++++-- vllm/utils.py | 9 - vllm/v1/worker/gpu_model_runner.py | 2 +- 14 files changed, 604 insertions(+), 288 deletions(-) create mode 100644 tests/compile/test_functionalization.py create mode 100644 tests/compile/test_pass_manager.py create mode 100644 vllm/compilation/fix_functionalization.py create mode 100644 vllm/compilation/pass_manager.py create mode 100644 vllm/compilation/vllm_inductor_pass.py diff --git a/tests/compile/backend.py b/tests/compile/backend.py index 9d5c68274374e..8fa10e5bd1b37 100644 --- a/tests/compile/backend.py +++ b/tests/compile/backend.py @@ -1,7 +1,9 @@ from copy import deepcopy -from typing import Callable +from typing import Callable, Union -import torch +from torch import fx + +from vllm.compilation.inductor_pass import InductorPass class TestBackend: @@ -11,19 +13,21 @@ class TestBackend: It also saves the graph before and after the custom passes for inspection. """ - def __init__(self, *args: Callable[[torch.fx.Graph], None]): - self.custom_passes = args + def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], + None]]): + self.custom_passes = list(passes) from torch._inductor import config self.current_config = config.shallow_copy_dict() + self.current_config['force_disable_caches'] = True self.current_config['post_grad_custom_post_pass'] = self.post_pass - def __call__(self, graph: torch.fx.GraphModule, example_inputs): + def __call__(self, graph: fx.GraphModule, example_inputs): from torch._inductor.compile_fx import compile_fx return compile_fx(graph, example_inputs, config_patches=self.current_config) - def post_pass(self, graph: torch.fx.Graph): + def post_pass(self, graph: fx.Graph): self.graph_pre_pass = deepcopy(graph) for pass_ in self.custom_passes: pass_(graph) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py new file mode 100644 index 0000000000000..5036189077be2 --- /dev/null +++ b/tests/compile/test_functionalization.py @@ -0,0 +1,95 @@ +import pytest +import torch + +import vllm.envs as envs +from vllm import LLM, SamplingParams +from vllm.compilation.fix_functionalization import FixFunctionalizationPass +from vllm.compilation.fusion import (FusionPass, find_auto_fn, + find_auto_fn_maybe) +from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.compilation.vllm_inductor_pass import is_func +from vllm.config import CompilationConfig + +from .backend import TestBackend + +OPS_IN_MODEL = [ + torch.ops._C.rotary_embedding.default, + torch.ops._C.fused_add_rms_norm.default, + torch.ops._C.silu_and_mul.default, +] + +RMS_OP = torch.ops._C.rms_norm.default + +RMS_QUANT_OPS = { + "static_fp8": [ + torch.ops._C.rms_norm_static_fp8_quant.default, + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + ], +} + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +@pytest.mark.parametrize("model", + ["nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"]) +@pytest.mark.parametrize("do_fusion", [True, False]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", + reason="Only test on CUDA") +def test_fix_functionalization(model: str, do_fusion: bool): + torch.set_default_device("cuda") + + config = CompilationConfig.PassConfig(enable_fusion=do_fusion, + enable_reshape=True) + reshape_pass = RedundantReshapesPass(config) + fusion_pass = FusionPass.instance(config) + + passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass] + func_pass = FixFunctionalizationPass(config) + backend_func = TestBackend(*passes, func_pass) + backend_no_func = TestBackend(*passes) + + # instantiate a full engine and manually compile the model 2x + # (with and without FixFunctionalizationPass) + llm = LLM(model=model, enforce_eager=True) + model_runner = llm.llm_engine.model_executor.driver_worker.model_runner + orig_model = model_runner.model + # TODO mark inputs dynamic? (currently torch.compile is triggered 4x) + # Can only do that by using the decorator but then we'd have to instantiate + # 2 LLM instances. + + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + model_runner.model = torch.compile(orig_model, + fullgraph=True, + backend=backend_func) + gen_func = llm.generate(prompts, sampling_params) + + model_runner.model = torch.compile(orig_model, + fullgraph=True, + backend=backend_no_func) + gen_no_func = llm.generate(prompts, sampling_params) + + for output_func, output_no_func in zip(gen_func, gen_no_func): + assert output_func.outputs[0].text == output_no_func.outputs[0].text + + # OPS_IN_MODEL always appear. RMS_OP is fused away if we run fusion, + # and replaced by fused quantized ops in RMS_QUANT_OPS. + ops = OPS_IN_MODEL + (RMS_QUANT_OPS["static_fp8"] + if do_fusion else [RMS_OP]) + + for op in ops: + find_auto_fn(backend_no_func.graph_post_pass.nodes, op) + assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, + op) is None # noqa: E501 + + # make sure the ops were all de-functionalized + found = dict() + for node in backend_func.graph_post_pass.nodes: + for op in ops: + if is_func(node, op): + found[op] = True + assert all(found[op] for op in ops) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 4db79b070fd8d..f92ec8d0de5f1 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -38,12 +38,6 @@ def forward(self, x): return y3 -# Init does pattern registration, which can only happen once -config = CompilationConfig(enable_fusion=True) -reshape_pass = RedundantReshapesPass(config) -fusion_pass = FusionPass.instance(config) - - @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) @pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) @@ -58,6 +52,11 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): pytest.skip("Only test eps=1e-5 for now") # Reshape pass is needed for the fusion pass to work + config = CompilationConfig.PassConfig(enable_fusion=True, + enable_reshape=True) + reshape_pass = RedundantReshapesPass(config) + fusion_pass = FusionPass.instance(config) + backend = TestBackend(reshape_pass, fusion_pass) model = TestModel(hidden_size, eps) diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py new file mode 100644 index 0000000000000..03e7535093c5d --- /dev/null +++ b/tests/compile/test_pass_manager.py @@ -0,0 +1,35 @@ +import pickle + +import pytest +import torch +from torch._inductor.codecache import BypassFxGraphCache + +from vllm.compilation.config import CompilationConfig +from vllm.compilation.inductor_pass import (CallableInductorPass, + as_inductor_pass) +from vllm.compilation.pass_manager import PostGradPassManager + + +def simple_callable(graph: torch.fx.Graph): + pass + + +@as_inductor_pass(files=(__file__, )) +def callable_decorated(graph: torch.fx.Graph): + pass + + +@pytest.mark.parametrize( + "works, callable", + [(False, simple_callable), (True, callable_decorated), + (True, CallableInductorPass(simple_callable, "simple_callable"))]) +def test_pass_manager(works: bool, callable): + config = CompilationConfig().pass_config + pass_manager = PostGradPassManager([callable]) + pass_manager.configure(config) # Adds default passes + + if works: + pickle.dumps(pass_manager) + else: + with pytest.raises(BypassFxGraphCache): + pickle.dumps(pass_manager) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 416cffd326489..464bc2af8fd6d 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,6 +1,5 @@ import copy import dataclasses -import operator from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch @@ -11,205 +10,15 @@ import vllm.envs as envs from vllm.config import CompilationConfig from vllm.logger import init_logger -from vllm.utils import combine_fx_passes, weak_ref_tensors +from vllm.utils import weak_ref_tensors from .counter import compilation_counter -from .fusion import FusionPass -from .reshapes import RedundantReshapesPass +from .inductor_pass import InductorPass +from .pass_manager import PostGradPassManager logger = init_logger(__name__) -def fix_functionalization(graph: fx.Graph): - """ - Rewrite the graph module to replace the pattern involving - torch._higher_order_ops.auto_functionalize.auto_functionalized - with a direct call to the inplace custom op. - - # TODO: check if PyTorch nightly has fixed this issue - """ - - # debug code, if we want to see the graph before the transformation - # with open("before.py", "w") as f: - # print(graph.python_code(root_module="self", verbose=True).src, file=f) - - nodes_to_remove = [] - - for node in graph.nodes: - # Identify the auto_functionalized node - if node.op == 'call_function' and node.target == torch._higher_order_ops.auto_functionalize.auto_functionalized: # noqa - if node.args[0] == torch.ops._C.rotary_embedding.default: - # manual replace for rotary_embedding - - # Now, collect the arguments - kwargs = node.kwargs - - query = kwargs['query'] - mm_node = query.args[0].args[0] - - # Create a new call to torch.ops._C.rotary_embedding.default - with graph.inserting_before(node): - # just insert the call to the custom op - # NOTE: don't run dead code elimination, - # otherwise this op will be removed - graph.call_function(torch.ops._C.rotary_embedding.default, - kwargs=kwargs) - - # Remove the auto_functionalized node - # Since the node may have outputs, we need to handle its users - # Replace uses of the outputs (getitem nodes) with mm_node - for user in list(node.users): - if user.op == 'call_function' and user.target == operator.getitem: # noqa - # Remove the getitem node - for getitem_user in list(user.users): - if (getitem_user.op == 'call_function' - and getitem_user.target - == torch.ops.aten.slice_scatter.default): - # Replace the uses of slice_scatter node - # with mm_node - getitem_user.replace_all_uses_with(mm_node) - nodes_to_remove.append(getitem_user) - nodes_to_remove.append(user) - nodes_to_remove.append(node) - - elif node.args[0] == torch.ops._C.fused_add_rms_norm.default: - # manual replace for fused_add_rms_norm - # this is the most effective optimization for llama - # failing to do this will result in many unnecessary copies - - kwargs = node.kwargs - - input = kwargs['input'] - residual = kwargs['residual'] - - # Create a new call to torch.ops._C.rotary_embedding.default - with graph.inserting_before(node): - # just insert the call to the custom op - # NOTE: don't run dead code elimination, - # otherwise this op will be removed - graph.call_function( - torch.ops._C.fused_add_rms_norm.default, kwargs=kwargs) - - for user in list(node.users): - if user.op == 'call_function' and user.target == operator.getitem: # noqa - # Remove the getitem node - if user.args[1] == 1: - replace_node = input - elif user.args[1] == 2: - replace_node = residual - user.replace_all_uses_with(replace_node) - nodes_to_remove.append(user) - nodes_to_remove.append(node) - elif (node.args[0] == - torch.ops._C.fused_add_rms_norm_static_fp8_quant.default): - # manual replace for fused_add_rms_norm_static_fp8_quant - # this is the most effective optimization for llama - # failing to do this will result in many unnecessary copies - - kwargs = node.kwargs - - result = kwargs['result'] - residual = kwargs['residual'] - - # Create a new call to - # torch.ops._C.fused_add_rms_norm_static_fp8_quant.default - with graph.inserting_before(node): - # just insert the call to the custom op - # NOTE: don't run dead code elimination, - # otherwise this op will be removed - graph.call_function( - torch.ops._C.fused_add_rms_norm_static_fp8_quant. - default, - kwargs=kwargs) - - for user in list(node.users): - if user.op == 'call_function' and user.target == operator.getitem: # noqa - # Remove the getitem node - if user.args[1] == 1: - replace_node = result - elif user.args[1] == 2: - replace_node = residual - user.replace_all_uses_with(replace_node) - nodes_to_remove.append(user) - nodes_to_remove.append(node) - - elif node.args[0] == torch.ops._C.rms_norm.default: - # manual replace for rms_norm - - kwargs = node.kwargs - - replace_node = kwargs['result'] - # Create a new call to torch.ops._C.rms_norm.default - with graph.inserting_before(node): - # just insert the call to the custom op - # NOTE: don't run dead code elimination, - # otherwise this op will be removed - graph.call_function(torch.ops._C.rms_norm.default, - kwargs=kwargs) - - for user in list(node.users): - if user.op == 'call_function' and user.target == operator.getitem: # noqa - user.replace_all_uses_with(replace_node) - nodes_to_remove.append(user) - nodes_to_remove.append(node) - - elif node.args[ - 0] == torch.ops._C.rms_norm_static_fp8_quant.default: # noqa - # manual replace for rms_norm_static_fp8_quant - - kwargs = node.kwargs - - replace_node = kwargs['result'] - # Create a new call to torch.ops._C.rms_norm_static_fp8_quant.default # noqa - with graph.inserting_before(node): - # just insert the call to the custom op - # NOTE: don't run dead code elimination, - # otherwise this op will be removed - graph.call_function( - torch.ops._C.rms_norm_static_fp8_quant.default, - kwargs=kwargs) - - for user in list(node.users): - if user.op == 'call_function' and user.target == operator.getitem: # noqa - user.replace_all_uses_with(replace_node) - nodes_to_remove.append(user) - nodes_to_remove.append(node) - - elif node.args[0] == torch.ops._C.silu_and_mul.default: - # manual replace for silu_and_mul - - kwargs = node.kwargs - - input = kwargs['input'] - out = kwargs['out'] - - # Create a new call to torch.ops._C.silu_and_mul.default - # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa - with graph.inserting_before(node): - # just insert the call to the custom op - # NOTE: don't run dead code elimination, - # otherwise this op will be removed - graph.call_function( - torch.ops._C.silu_and_mul.default, - args=(out, input), - ) - replace_node = out - - for user in list(node.users): - if user.op == 'call_function' and user.target == operator.getitem: # noqa - user.replace_all_uses_with(replace_node) - nodes_to_remove.append(user) - nodes_to_remove.append(node) - - # Remove the nodes all at once - for node in nodes_to_remove: - graph.erase_node(node) - - # debug code, if we want to see the graph after the transformation - # with open("after.py", "w") as f: - # print(graph.python_code(root_module="self", verbose=True).src, file=f) - - def wrap_inductor(graph, example_inputs, additional_inductor_config, @@ -368,12 +177,8 @@ class VllmBackend: The major work of this backend is to split the graph into piecewise graphs, and pass them to the piecewise backend. - This backend also handles custom passes and adds them to Inductor config. - The order of the post-grad post-passes is: - 1. post_grad_passes (constructor parameter) - 2. config["post_grad_custom_post_pass"] - 3. fix_functionalization - This way, all passes operate on a functionalized graph. + This backend also adds the PostGradPassManager to Inductor config, + which handles the post-grad passes. """ compilation_configs: CompilationConfig @@ -402,7 +207,9 @@ def __init__( # streams, it might not be safe to share a global pool. # only investigate this when we use multiple streams self.graph_pool = global_graph_pool - self.post_grad_passes = [] + + # Passes to run on the graph post-grad. + self.post_grad_pass_manager = PostGradPassManager() self.sym_tensor_indices = [] self.input_buffers = [] @@ -412,24 +219,19 @@ def __init__( # `torch.compile` is JIT compiled, so we don't need to # do anything here - def add_passes_to_config(self): + def configure_post_pass(self): config = self.compilation_configs - passes = list(self.post_grad_passes) - - passes = passes + [RedundantReshapesPass(config)] - - if config.enable_fusion: - passes = passes + [FusionPass.instance(config)] + self.post_grad_pass_manager.configure(config.pass_config) + # Post-grad custom passes are run using the post_grad_custom_post_pass + # hook. If a pass for that hook exists, add it to the pass manager. inductor_config = config.inductor_compile_config - if "post_grad_custom_post_pass" in inductor_config: - passes = passes + [inductor_config["post_grad_custom_post_pass"]] - - # add the fix_functionalization pass last, so that all other - # passes operate on a functionalized graph - passes = passes + [fix_functionalization] - combined_pass = combine_fx_passes(passes) - inductor_config["post_grad_custom_post_pass"] = combined_pass + PASS_KEY = "post_grad_custom_post_pass" + if PASS_KEY in inductor_config: + # Config should automatically wrap all inductor passes + assert isinstance(inductor_config[PASS_KEY], InductorPass) + self.post_grad_pass_manager.add(inductor_config[PASS_KEY]) + inductor_config[PASS_KEY] = self.post_grad_pass_manager def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: @@ -444,7 +246,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # we get the sizes to capture for cudagraph # from compilation context self.compilation_configs.init_during_runtime() - self.add_passes_to_config() + self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( graph, self.compilation_configs.splitting_ops) diff --git a/vllm/compilation/fix_functionalization.py b/vllm/compilation/fix_functionalization.py new file mode 100644 index 0000000000000..3584cc3608caf --- /dev/null +++ b/vllm/compilation/fix_functionalization.py @@ -0,0 +1,177 @@ +import operator +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import torch +from torch._higher_order_ops.auto_functionalize import auto_functionalized + +from vllm.logger import init_logger + +from .vllm_inductor_pass import VllmInductorPass, is_func + +logger = init_logger(__name__) + + +class FixFunctionalizationPass(VllmInductorPass): + """ + This pass defunctionalizes certain nodes to avoid redundant tensor copies. + After this pass, DCE (dead-code elimination) should never be run, + as de-functionalized nodes may appear as dead code. + + To add new nodes to defunctionalize, add to the if-elif chain in __call__. + """ + + def __call__(self, graph: torch.fx.Graph): + self.begin() + self.dump_graph(graph, "before_fix_functionalization") + + self.nodes_to_remove: List[torch.fx.Node] = [] + count = 0 + for node in graph.nodes: + if not is_func(node, auto_functionalized): + continue # Avoid deep if-elif nesting + + kwargs = node.kwargs + at_target = node.args[0] + + if at_target == torch.ops._C.rotary_embedding.default: + query = kwargs['query'] + mm_node = query.args[0].args[0] + + # rotary_embedding is a special case: the two mutating inputs + # are query and key, which are slices of mm_node. + # While functionalized, results at[1] and at[2] are scattered + # back into mm_node. After de-functionalization, we can just + # use mm_node directly. + for idx, user in self.getitem_users(node).items(): + for user_of_getitem in user.users: + if is_func(user_of_getitem, + torch.ops.aten.slice_scatter.default): + user_of_getitem.replace_all_uses_with(mm_node) + self._remove(user_of_getitem) + self._remove(user) + + self.insert_defunctionalized(graph, node) + self._remove(node) + + # These 2 replacements avoid the most copies for LLaMa. + elif at_target == torch.ops._C.fused_add_rms_norm.default: + mutated_args = {1: 'input', 2: 'residual'} + self.defunctionalize(graph, node, mutated_args) + elif at_target == torch.ops._C.fused_add_rms_norm_static_fp8_quant.default: # noqa: E501 + mutated_args = {1: 'result', 2: 'residual'} + self.defunctionalize(graph, node, mutated_args) + + elif at_target in [ + torch.ops._C.rms_norm.default, + torch.ops._C.rms_norm_static_fp8_quant.default + ]: + mutated_args = {1: 'result'} + self.defunctionalize(graph, node, mutated_args) + + elif at_target == torch.ops._C.silu_and_mul.default: + mutated_args = {1: 'out'} + # Because we have an 'out', need to specify args directly + self.defunctionalize(graph, + node, + mutated_args, + args=('out', 'input')) + else: + continue # skip the count + + count += 1 + + self.dump_graph(graph, "before_fix_functionalization_cleanup") + + # Remove the nodes all at once + count_removed = len(self.nodes_to_remove) + for node in self.nodes_to_remove: + graph.erase_node(node) + + logger.debug("De-functionalized %s nodes, removed %s nodes", count, + count_removed) + self.dump_graph(graph, "after_fix_functionalization") + self.end_and_log() + + def _remove(self, node_or_nodes: Union[torch.fx.Node, + Iterable[torch.fx.Node]]): + """ + Stage a node (or nodes) for removal at the end of the pass. + """ + if isinstance(node_or_nodes, torch.fx.Node): + self.nodes_to_remove.append(node_or_nodes) + else: + self.nodes_to_remove.extend(node_or_nodes) + + def defunctionalize(self, + graph: torch.fx.Graph, + node: torch.fx.Node, + mutated_args: Dict[int, Union[torch.fx.Node, str]], + args: Optional[Tuple[Union[torch.fx.Node, str], + ...]] = None): + """ + De-functionalize a node by replacing it with a call to the original. + It also replaces the getitem users with the mutated arguments. + See replace_users_with_mutated_args and insert_defunctionalized. + """ + self.replace_users_with_mutated_args(node, mutated_args) + self.insert_defunctionalized(graph, node, args=args) + self._remove(node) + + def replace_users_with_mutated_args(self, node: torch.fx.Node, + mutated_args: Dict[int, + Union[torch.fx.Node, + str]]): + """ + Replace all getitem users of the auto-functionalized node with the + mutated arguments. + :param node: The auto-functionalized node + :param mutated_args: The mutated arguments, indexed by getitem index. + If the value of an arg is a string, `node.kwargs[arg]` is used. + """ + for idx, user in self.getitem_users(node).items(): + arg = mutated_args[idx] + arg = node.kwargs[arg] if isinstance(arg, str) else arg + user.replace_all_uses_with(arg) + self._remove(user) + + def getitem_users(self, node: torch.fx.Node) -> Dict[int, torch.fx.Node]: + """ + Returns the operator.getitem users of the auto-functionalized node, + indexed by the index they are getting. + """ + users = {} + for user in node.users: + if is_func(user, operator.getitem): + idx = user.args[1] + users[idx] = user + return users + + def insert_defunctionalized(self, + graph: torch.fx.Graph, + node: torch.fx.Node, + args: Optional[Tuple[Union[torch.fx.Node, str], + ...]] = None): + """ + Insert a new defunctionalized node into the graph before node. + If one of the kwargs is 'out', provide args directly, + as node.kwargs cannot be used. + See https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 + + :param graph: Graph to insert the defunctionalized node into + :param node: The auto-functionalized node to defunctionalize + :param args: If we cannot use kwargs, specify args directly. + If an arg is a string, `node.kwargs[arg]` is used. + """ # noqa: E501 + assert is_func(node, auto_functionalized), \ + f"node must be auto-functionalized, is {node} instead" + + # Create a new call to the original function + with graph.inserting_before(node): + function = node.args[0] + if args is None: + graph.call_function(function, kwargs=node.kwargs) + else: + # Args passed as strings refer to items in node.kwargs + args = tuple(node.kwargs[arg] if isinstance(arg, str) else arg + for arg in args) + graph.call_function(function, args=args) diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index e6a3afef85e1b..5efa410fab6a0 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -6,10 +6,11 @@ from torch._inductor.pattern_matcher import (Match, PatternMatcherPass, fwd_only, register_replacement) -from vllm.compilation.inductor_pass import InductorPass from vllm.config import CompilationConfig from vllm.logger import init_logger +from .vllm_inductor_pass import VllmInductorPass, is_func + logger = init_logger(__name__) @@ -90,8 +91,6 @@ def empty_fp32(*args, **kwargs): # Utilities for post-processing multi-output matches -def is_func(node: torch.fx.Node, target) -> bool: - return node.op == "call_function" and node.target == target # Returns the first auto_functionalized node with the given op (if it exists) @@ -127,7 +126,7 @@ def find_getitem(node: torch.fx.Node, idx: int) -> torch.fx.Node: return ret -class FusionPass(InductorPass): +class FusionPass(VllmInductorPass): """ This pass fuses a pre-defined set of custom ops into fused ops. It uses the torch pattern matcher to find the patterns and replace them. @@ -142,7 +141,7 @@ class FusionPass(InductorPass): _instance: 'Optional[FusionPass]' = None @classmethod - def instance(cls, config: CompilationConfig): + def instance(cls, config: CompilationConfig.PassConfig): """ Get the singleton instance of the FusionPass. If the instance exists, the config is updated but @@ -154,7 +153,7 @@ def instance(cls, config: CompilationConfig): cls._instance.config = config return cls._instance - def __init__(self, config: CompilationConfig): + def __init__(self, config: CompilationConfig.PassConfig): assert self.__class__._instance is None, \ "FusionPass singleton instance already exists" super().__init__(config) @@ -278,6 +277,7 @@ def process_matches(self, graph: torch.fx.Graph): for node in match.nodes) def __call__(self, graph: torch.fx.Graph): + self.begin() self.dump_graph(graph, "before_fusion") count = self.patterns.apply(graph) @@ -289,3 +289,4 @@ def __call__(self, graph: torch.fx.Graph): logger.debug("Post-processed %s matches", len(self.matches)) self.dump_graph(graph, "after_fusion") self.matches.clear() + self.end_and_log() diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 8082a08b40019..f6846c08ac841 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,38 +1,84 @@ +import hashlib +import inspect +import types from abc import ABC, abstractmethod +from typing import Any, Callable, Optional, Union import torch - -from vllm.config import CompilationConfig -# yapf: disable -from vllm.distributed import get_tensor_model_parallel_rank as get_tp_rank -from vllm.distributed import ( - get_tensor_model_parallel_world_size as get_tp_world_size) -from vllm.distributed import model_parallel_is_initialized as p_is_init -# yapf: enable -from vllm.logger import init_logger - -logger = init_logger(__name__) +from torch import fx class InductorPass(ABC): + """ + General custom inductor pass interface. + TODO(torch==2.6) use torch._inductor.custom_graph_pass.CustomGraphPass + """ @abstractmethod def __call__(self, graph: torch.fx.Graph): + """ + Execute the pass on the given graph. + """ raise NotImplementedError - def __init__(self, config: CompilationConfig): - self.config = config - - def dump_graph(self, graph: torch.fx.Graph, stage: str): - if stage in self.config.dump_graph_stages: - # Make sure filename includes rank in the distributed setting - parallel = p_is_init() and get_tp_world_size() > 1 - rank = f"-{get_tp_rank()}" if parallel else "" - filepath = self.config.dump_graph_dir / f"{stage}{rank}.py" - - logger.info("Printing graph to %s", filepath) - with open(filepath, "w") as f: - src = graph.python_code(root_module="self", verbose=True).src - # Add imports so it's not full of errors - print("import torch; from torch import device", file=f) - print(src, file=f) + def uuid(self) -> Any: + """ + Provide a unique identifier for the pass, used in Inductor code cache. + This should depend on the pass implementation, so that changes to the + pass result in recompilation. + By default, the object source is hashed. + """ + return InductorPass.hash_source(self) + + @staticmethod + def hash_source(*srcs: Union[str, Any]): + """ + Utility method to hash the sources of functions or objects. + :param srcs: strings or objects to add to the hash. + Objects and functions have their source inspected. + :return: + """ + hasher = hashlib.sha256() + for src in srcs: + if isinstance(src, str): + src_str = src + elif isinstance(src, types.FunctionType): + src_str = inspect.getsource(src) + else: + src_str = inspect.getsource(src.__class__) + hasher.update(src_str.encode("utf-8")) + return hasher.digest() + + +class CallableInductorPass(InductorPass): + """ + This class is a wrapper for a callable that automatically provides an + implementation of the UUID. + """ + + def __init__(self, + callable: Callable[[fx.Graph], None], + uuid: Optional[Any] = None): + self.callable = callable + if uuid is None: + uuid = InductorPass.hash_source(callable) + self._uuid = uuid + + def __call__(self, graph: torch.fx.Graph): + self.callable(graph) + + def uuid(self) -> Any: + return self._uuid + + def __getstate__(self): + """ + Pickling occurs in the Inductor code cache if a pass is not given to + the pass manager but is instead directly added to config as a pass. + See PostGradPassManager for more. + + TODO(torch==2.6), use the `uuid` method in CustomGraphPass instead. + """ + return self._uuid + + def __setstate__(self, state): + raise ValueError("Cannot unpickle CallableInductorPass") diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py new file mode 100644 index 0000000000000..fb522ae053e97 --- /dev/null +++ b/vllm/compilation/pass_manager.py @@ -0,0 +1,77 @@ +from typing import List + +from torch import fx as fx + +from vllm.config import CompilationConfig +from vllm.logger import init_logger + +from .fix_functionalization import FixFunctionalizationPass +from .fusion import FusionPass +from .inductor_pass import InductorPass +from .reshapes import RedundantReshapesPass + +logger = init_logger(__name__) + + +class PostGradPassManager: + """ + The pass manager for post-grad passes. + It handles configuration, adding custom passes, and running passes. + It also supports pickling, which is used by the Inductor code cache. + TODO(torch==2.6), use CustomGraphPass + (torch._inductor.custom_graph_pass.CustomGraphPass) + + The order of the post-grad post-passes is: + 1. passes (constructor parameter) + 2. default passes (RedundantReshapesPass, FusionPass) + 3. config["post_grad_custom_post_pass"] (if it exists) + 4. fix_functionalization + This way, all passes operate on a functionalized graph. + """ + + def __init__(self): + self.passes: List[InductorPass] = [] + + def __call__(self, graph: fx.Graph): + for pass_ in self.passes: + pass_(graph) + + # always run fix_functionalization last + self.fix_functionalization(graph) + + def configure(self, pass_config: CompilationConfig.PassConfig): + self.pass_config = pass_config + if pass_config.enable_reshape: + self.passes += [RedundantReshapesPass(pass_config)] + + if pass_config.enable_fusion: + self.passes += [FusionPass.instance(pass_config)] + + self.fix_functionalization = FixFunctionalizationPass(pass_config) + + def add(self, pass_: InductorPass): + assert isinstance(pass_, InductorPass) + self.passes.append(pass_) + + def __getstate__(self): + """ + Custom pickling for the pass manager, as some passes cannot be pickled. + Pickling occurs because the pass manager is set as the value of + `config["post_grad_custom_post_pass"]` in the Inductor config. + The config is pickled to act as a key in the Inductor code cache. + Any other passes in the config are pickled as well. + + TODO(torch==2.6), use the `uuid` method in CustomGraphPass instead. + """ + state = {"pass_config": self.pass_config.uuid(), "passes": []} + for pass_ in self.passes: + state["passes"].append(pass_.uuid()) + state["passes"].append(self.fix_functionalization.uuid()) + return state + + def __setstate__(self, state): + """ + Do not allow unpickling of the pass manager. + If this is needed in the future, it should properly pickle the passes. + """ + raise ValueError("Cannot unpickle PostGradPassManager") diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py index 36597e119d2e1..63a369fe8d966 100644 --- a/vllm/compilation/reshapes.py +++ b/vllm/compilation/reshapes.py @@ -3,14 +3,14 @@ import torch.fx from torch import SymInt -from vllm.compilation.fusion import is_func -from vllm.compilation.inductor_pass import InductorPass from vllm.logger import init_logger +from .vllm_inductor_pass import VllmInductorPass, is_func + logger = init_logger(__name__) -class RedundantReshapesPass(InductorPass): +class RedundantReshapesPass(VllmInductorPass): """ This is an inductor pass that removes redundant reshape operations. It is required for RMSNorm-quant fusion to work properly. @@ -31,6 +31,7 @@ class RedundantReshapesPass(InductorPass): """ def __call__(self, graph: torch.fx.Graph): + self.begin() self.dump_graph(graph, "before_reshapes") count = 0 # Remove no-op reshapes/views: @@ -56,6 +57,7 @@ def __call__(self, graph: torch.fx.Graph): logger.debug("Removed %s no-op reshapes", count) self.dump_graph(graph, "after_reshapes") + self.end_and_log() def dims_equivalent(self, dim: Union[int, torch.fx.Node], i_dim: Union[int, SymInt]) -> bool: diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py new file mode 100644 index 0000000000000..dbf6b8f7789e1 --- /dev/null +++ b/vllm/compilation/vllm_inductor_pass.py @@ -0,0 +1,53 @@ +import time + +import torch + +from vllm.config import CompilationConfig +# yapf: disable +from vllm.distributed import get_tensor_model_parallel_rank as get_tp_rank +from vllm.distributed import ( + get_tensor_model_parallel_world_size as get_tp_world_size) +from vllm.distributed import model_parallel_is_initialized as p_is_init +# yapf: enable +from vllm.logger import init_logger + +from .inductor_pass import InductorPass + +logger = init_logger(__name__) + + +def is_func(node: torch.fx.Node, target) -> bool: + return node.op == "call_function" and node.target == target + + +class VllmInductorPass(InductorPass): + """ + An inductor pass with access to vLLM PassConfig. + It provides timing, logging, and dumping utilities. + """ + + def __init__(self, config: CompilationConfig.PassConfig): + self.config = config + self.pass_name = self.__class__.__name__ + + def dump_graph(self, graph: torch.fx.Graph, stage: str): + if stage in self.config.dump_graph_stages: + # Make sure filename includes rank in the distributed setting + parallel = p_is_init() and get_tp_world_size() > 1 + rank = f"-{get_tp_rank()}" if parallel else "" + filepath = self.config.dump_graph_dir / f"{stage}{rank}.py" + + logger.info("%s printing graph to %s", self.pass_name, filepath) + with open(filepath, "w") as f: + src = graph.python_code(root_module="self", verbose=True).src + # Add imports so it's not full of errors + print("import torch; from torch import device", file=f) + print(src, file=f) + + def begin(self): + self._start_time = time.perf_counter_ns() + + def end_and_log(self): + self._end_time = time.perf_counter_ns() + duration_ms = float(self._end_time - self._start_time) / 1.0e6 + logger.debug("%s completed in %.1f ms", self.pass_name, duration_ms) diff --git a/vllm/config.py b/vllm/config.py index 0ed92f370cf50..b2785e1ce2d5f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,5 +1,6 @@ import copy import enum +import hashlib import json import warnings from dataclasses import dataclass, field, replace @@ -13,6 +14,7 @@ from transformers import PretrainedConfig import vllm.envs as envs +from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) @@ -2120,12 +2122,7 @@ class CompilationConfig(BaseModel): name because the config uses json format. If we pass the config from Python, functions can also be passed directly via Python object constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` - - custom inductor passes: - - dump_graph_stages: list of stages for which we want to dump the graph. - Each pass defines its own stages (before, after, maybe in-between). - - dump_graph_dir: directory to dump the graph. Default is . - - enable_fusion: whether to enable the custom fusion pass. - TODO better pass enabling system. + - custom inductor passes: see PassConfig for more details Why we have different sizes for cudagraph and inductor: - cudagraph: a cudagraph captured for a specific size can only be used @@ -2157,9 +2154,43 @@ class CompilationConfig(BaseModel): cudagraph_capture_sizes: Optional[List[int]] = None cudagraph_copy_inputs: bool = False - dump_graph_stages: List[str] = Field(default_factory=list) - dump_graph_dir: Path = Field(default=Path(".")) - enable_fusion: bool = True + class PassConfig(BaseModel): + """ + Configuration for custom Inductor passes. + This is separate from general CompilationConfig so that inductor passes + don't all have access to full configuration - that would create a cycle + as the PassManager is set as a property of config. + - dump_graph_stages: list of stages for which we want to dump the graph. + Each pass defines its own stages (before, after, maybe in-between). + - dump_graph_dir: directory to dump the graphs. Default is . + - enable_fusion: whether to enable the custom fusion pass. + - enable_reshape: whether to enable the custom reshape elimination pass. + TODO better pass enabling system. + """ + dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_dir: Path = Field(default=Path(".")) + enable_fusion: bool = True + enable_reshape: bool = True + + def uuid(self): + """ + Produces a hash unique to the pass configuration. + Any new fields that affect compilation should be added to the hash. + Do not include dump_graph_* in the hash - they don't affect + compilation. + """ + dict_ = self.model_dump( + include={"enable_fusion", "enable_reshape"}) + encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") + return hashlib.sha256(encoded).digest() + + def model_post_init(self, __context: Any) -> None: + if not self.enable_reshape and self.enable_fusion: + print_warning_once( + "Fusion enabled but reshape elimination disabled." + "RMSNorm + quant (fp8) fusion might not work") + + pass_config: PassConfig = Field(default_factory=PassConfig) # not configurable, computed after init compile_sizes: List[int] = PrivateAttr @@ -2185,8 +2216,9 @@ def model_post_init(self, __context: Any) -> None: for k, v in self.inductor_passes.items(): if not isinstance(v, str): assert callable(v), ( - f"pass {k} should be a function or a qualified name") - self.inductor_compile_config[k] = v + f"pass {k} should be callable or a qualified name") + self.inductor_compile_config[k] = v if isinstance( + v, InductorPass) else CallableInductorPass(v) continue # resolve function from qualified name @@ -2194,7 +2226,8 @@ def model_post_init(self, __context: Any) -> None: module = ".".join(names[:-1]) func_name = names[-1] func = __import__(module).__dict__[func_name] - self.inductor_compile_config[k] = func + self.inductor_compile_config[k] = func if isinstance( + func, InductorPass) else CallableInductorPass(func) self.enabled_custom_ops = Counter() self.disabled_custom_ops = Counter() @@ -2344,7 +2377,8 @@ def __post_init__(self): self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True self.compilation_config.use_inductor = True - self.compilation_config.enable_fusion = False + self.compilation_config.pass_config.enable_fusion = False + self.compilation_config.pass_config.enable_reshape = False current_platform.check_and_update_config(self) diff --git a/vllm/utils.py b/vllm/utils.py index 2bbdc8d1ebde8..cb2ad43a2ae8d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1501,15 +1501,6 @@ def __len__(self): return len(self._factory) -def combine_fx_passes(passes: List[Callable]) -> Callable: - - def combined_fx(graph) -> None: - for fx in passes: - fx(graph) - - return combined_fx - - def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: """ Create a weak reference to a tensor. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1f9b544637bf7..5f66293cbe8e4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -548,7 +548,7 @@ def capture_model(self) -> None: if not self.use_cuda_graph: logger.warning( "Skipping CUDA graph capture. Please add " - "-O 3 to use CUDA graphs.", CompilationLevel.PIECEWISE) + "-O %s to use CUDA graphs.", CompilationLevel.PIECEWISE) return start_time = time.perf_counter() From 3430857b641131ffabf215ab569c41696b57b953 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 21 Nov 2024 15:06:42 +0800 Subject: [PATCH 086/255] [Misc] Increase default video fetch timeout (#10495) Signed-off-by: DarkLight1337 --- vllm/envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index 853c49bc4dbc1..14c1617f1be19 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -49,7 +49,7 @@ VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 - VLLM_VIDEO_FETCH_TIMEOUT: int = 15 + VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None From aaddce5d268d2c82d49b0240d6c112ba4941f69e Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 20 Nov 2024 23:07:56 -0800 Subject: [PATCH 087/255] [platforms] improve error message for unspecified platforms (#10520) Signed-off-by: youkaichao --- vllm/config.py | 3 ++- vllm/platforms/interface.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index b2785e1ce2d5f..ed09f8ae31863 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1191,12 +1191,13 @@ def is_multi_step(self) -> bool: class DeviceConfig: device: Optional[torch.device] + device_type: str def __init__(self, device: str = "auto") -> None: if device == "auto": # Automated device type detection self.device_type = current_platform.device_type - if self.device_type is None: + if not self.device_type: raise RuntimeError("Failed to infer device type") else: # Device type is assigned explicitly diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 68abec28ad71e..07f23167d509a 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -170,3 +170,4 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED + device_type = "" From f0e02380169b99a20cc5a4cd1848bbe085b50d5c Mon Sep 17 00:00:00 2001 From: Zhong Qishuai Date: Thu, 21 Nov 2024 17:05:23 +0800 Subject: [PATCH 088/255] [Doc] fix a small typo in docstring of llama_tool_parser (#10513) --- vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index a5f44d69e5fd2..1856308b88cfa 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -29,7 +29,8 @@ class Llama3JsonToolParser(ToolParser): Tool call parser for Llama 3.1 models intended for use with the examples/tool_chat_template_llama.jinja template. - Used when --enable-auto-tool-choice --tool-call-parser mistral are all set + Used when --enable-auto-tool-choice --tool-call-parser llama3_json + are all set """ def __init__(self, tokenizer: PreTrainedTokenizerBase): From 1cfde82ffd6edfca6029a7e312c848386ea322c1 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Thu, 21 Nov 2024 03:46:20 -0700 Subject: [PATCH 089/255] [Model] Add Support for Multimodal Granite Models (#10291) Signed-off-by: Alex-Brooks Co-authored-by: Cyrus Leung --- vllm/model_executor/models/clip.py | 47 ++++++++++++++++++------ vllm/model_executor/models/llava.py | 45 +++++++++++++++++++---- vllm/model_executor/models/llava_next.py | 20 +++++++++- vllm/model_executor/models/pixtral.py | 28 ++++++++++++-- vllm/model_executor/models/siglip.py | 42 ++++++++++++++++----- vllm/multimodal/utils.py | 44 ++++++++++++++++++++++ 6 files changed, 191 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 7f638506f9fb2..cd89519e95986 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -21,7 +21,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) + repeat_and_pad_placeholder_tokens, + resolve_visual_encoder_outputs) from vllm.sequence import SequenceData from .utils import get_vit_attn_backend @@ -389,12 +390,20 @@ def __init__( for layer_idx in range(num_hidden_layers) ]) - def forward(self, inputs_embeds: torch.Tensor): - + def forward( + self, inputs_embeds: torch.Tensor, return_all_hidden_states: bool + ) -> Union[torch.Tensor, list[torch.Tensor]]: + hidden_states_pool = [] hidden_states = inputs_embeds + for encoder_layer in self.layers: hidden_states = encoder_layer(hidden_states) - + if return_all_hidden_states: + hidden_states_pool.append(hidden_states) + # If we have multiple feature sample layers, we return all hidden + # states in order and grab the ones we need by index. + if return_all_hidden_states: + return hidden_states_pool return hidden_states @@ -419,6 +428,7 @@ def __init__( # NOTE: This typo of "layrnorm" is not fixed on purpose to match # the original transformers code and name of the model weights. self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = CLIPEncoder( config=config, quant_config=quant_config, @@ -446,16 +456,26 @@ def __init__( def forward( self, pixel_values: torch.Tensor, + feature_sample_layers: Optional[list[int]] = None, ) -> torch.Tensor: hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layrnorm(hidden_states) - hidden_states = self.encoder(inputs_embeds=hidden_states) - if self.post_layernorm is None: - return hidden_states + return_all_hidden_states = feature_sample_layers is not None + + # Produces either the last layer output or all of the hidden states, + # depending on if we have feature_sample_layers or not + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + return_all_hidden_states=return_all_hidden_states) + + # Handle post-norm (if applicable) and stacks feature layers if needed + encoder_outputs = resolve_visual_encoder_outputs( + encoder_outputs, feature_sample_layers, self.post_layernorm, + self.config.num_hidden_layers) - return self.post_layernorm(hidden_states) + return encoder_outputs class CLIPVisionModel(nn.Module): @@ -478,11 +498,14 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, - prefix=f"{prefix}.vision_model", - ) + prefix=f"{prefix}.vision_model") - def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: - return self.vision_model(pixel_values) + def forward( + self, + pixel_values: torch.Tensor, + feature_sample_layers: Optional[list[int]] = None, + ) -> torch.Tensor: + return self.vision_model(pixel_values, feature_sample_layers) @property def device(self): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index e7d3161a7cb2d..05c6cc62efcd7 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -204,7 +204,41 @@ def input_processor_for_llava(ctx: InputContext, inputs: DecoderOnlyInputs): class LlavaLikeConfig(Protocol): vision_config: PretrainedConfig - vision_feature_layer: int + vision_feature_layer: Union[int, List[int]] + + +def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: + """Determine the number of hidden layers to initialize up to in the + visual encoder. + + Args: + hf_config: Model config with vision feature layer(s). + """ + feature_layers = hf_config.vision_feature_layer + num_hidden_layers = hf_config.vision_config.num_hidden_layers + # If we have one feature layer, initialize up to that layer + if isinstance(feature_layers, int): + return _get_layer_index(feature_layers, num_hidden_layers) + # If we have multiple feature layers, initialize up to the deepest one + elif isinstance(feature_layers, (list, tuple)): + return max( + _get_layer_index(idx, num_hidden_layers) for idx in feature_layers) + raise TypeError(f"vision_layer_feature type: {type(feature_layers)}" + " is not supported") + + +def _get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int: + """Given an signed vision feature layer, get the number of hidden layers + needed to leverage it. + + Args: + feature_layer_index: Index of a required layer in the visual encoder. + num_hidden_layers: The total number of hidden layers in the visual + encoder. + """ + if feature_layer_index < 0: + return num_hidden_layers + feature_layer_index + 1 + return feature_layer_index + 1 def init_vision_tower_for_llava( @@ -216,13 +250,8 @@ def init_vision_tower_for_llava( ): vision_config = hf_config.vision_config - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 + # Initialize the vision tower only up to the deepest required feature layer + num_hidden_layers = _get_num_hidden_layers(hf_config) if isinstance(vision_config, CLIPVisionConfig): return CLIPVisionModel( diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 37e2227a52dcd..abeebb45fc4a7 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -288,6 +288,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config + vision_feature_layer = config.vision_feature_layer + # Determine the layer up to which we will initialize the vision tower + if isinstance(vision_feature_layer, int): + vision_hidden_size = config.vision_config.hidden_size + self.feature_sample_layers = None + # Used for multimodal granite models to control encoder outputs + elif isinstance(vision_feature_layer, (list, tuple)): + vision_hidden_size = config.vision_config.hidden_size * len( + vision_feature_layer) + self.feature_sample_layers = vision_feature_layer + else: + raise TypeError( + f"vision_layer_feature type: {type(vision_feature_layer)}" + " is not supported") + self.config = config self.multimodal_config = multimodal_config @@ -300,7 +315,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( - vision_hidden_size=config.vision_config.hidden_size, + vision_hidden_size=vision_hidden_size, text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act) @@ -419,7 +434,8 @@ def _image_pixels_to_features( # NOTE: we skip the step to select the vision feature layer since # this is already done inside the vision tower - image_features = vision_tower(pixel_values) + image_features = vision_tower( + pixel_values, feature_sample_layers=self.feature_sample_layers) return self._select_image_features( image_features, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d14b89d6b3f85..6711cbf5694b9 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -33,7 +33,8 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) + consecutive_placeholder_ranges, + resolve_visual_encoder_outputs) from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of @@ -970,9 +971,18 @@ def forward( x: torch.Tensor, attention_mask: torch.Tensor, position_embeddings: torch.Tensor, + return_all_hidden_states: bool, ) -> torch.Tensor: + hidden_states_pool = [] + for layer in self.layers: x = layer(x, attention_mask, position_embeddings) + if return_all_hidden_states: + hidden_states_pool.append(x) + # If we have multiple feature sample layers, we return all hidden + # states in order and grab the ones we need by index. + if return_all_hidden_states: + return hidden_states_pool return x @@ -990,6 +1000,7 @@ def __init__( super().__init__() self.config = config + self.patch_conv = nn.Conv2d( in_channels=config.num_channels, out_channels=config.hidden_size, @@ -1024,6 +1035,7 @@ def __init__( def forward( self, pixel_values: List[torch.Tensor], + feature_sample_layers: Optional[list[int]] = None, ) -> torch.Tensor: """ Args: @@ -1031,6 +1043,9 @@ def forward( in pixel_values. This means it will be a list of tensors because multiple requests batched can have multiple images, each with their own shape potentially + feature_sample_layers: Layer indices whose features should be + concatenated and used as the visual encoder output. If none + are provided, the last layer is used. Returns: image_features: tensor of token features for @@ -1065,8 +1080,15 @@ def forward( [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds) - out = self.transformer(patch_embeds, attention_mask, - position_embedding) + return_all_hidden_states = feature_sample_layers is not None + out = self.transformer( + patch_embeds, + attention_mask, + position_embedding, + return_all_hidden_states=return_all_hidden_states) + + out = resolve_visual_encoder_outputs(out, feature_sample_layers, None, + self.config.num_hidden_layers) return out diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index c58ad99692900..deaed0ba7e4ce 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -25,7 +25,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) + repeat_and_pad_placeholder_tokens, + resolve_visual_encoder_outputs) from vllm.sequence import SequenceData from .utils import get_vit_attn_backend @@ -450,11 +451,19 @@ def __init__( def forward( self, inputs_embeds: torch.Tensor, - ) -> torch.Tensor: + return_all_hidden_states: bool, + ) -> Union[torch.Tensor, list[torch.Tensor]]: + hidden_states_pool = [] hidden_states = inputs_embeds + for encoder_layer in self.layers: hidden_states, _ = encoder_layer(hidden_states) - + if return_all_hidden_states: + hidden_states_pool.append(hidden_states) + # If we have multiple feature sample layers, we return all hidden + # states in order and grab the ones we need by index. + if return_all_hidden_states: + return hidden_states_pool return hidden_states @@ -509,6 +518,7 @@ def __init__( embed_dim = config.hidden_size self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder( config, quant_config=quant_config, @@ -546,23 +556,33 @@ def forward( self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = True, + feature_sample_layers: Optional[list[int]] = None, ) -> torch.Tensor: + hidden_states = self.embeddings( pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, ) - encoder_outputs = self.encoder(inputs_embeds=hidden_states) + return_all_hidden_states = feature_sample_layers is not None + + # Produces either the last layer output or all of the hidden states, + # depending on if we have feature_sample_layers or not + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + return_all_hidden_states=return_all_hidden_states, + ) - if self.post_layernorm is None: - return encoder_outputs + # Handle post-norm (if applicable) and stacks feature layers if needed + encoder_outputs = resolve_visual_encoder_outputs( + encoder_outputs, feature_sample_layers, self.post_layernorm, + self.config.num_hidden_layers) - last_hidden_state = self.post_layernorm(encoder_outputs) - # TODO: add this back when pooled_output is used in inference + # TODO: add this back when pooled_output is used in inference. # if self.use_head: - # pooled_output = self.head(last_hidden_state) + # pooled_output = self.head(encoder_outputs) - return last_hidden_state + return encoder_outputs class SiglipVisionModel(nn.Module): @@ -595,10 +615,12 @@ def forward( self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False, + feature_sample_layers: Optional[list[int]] = None, ) -> torch.Tensor: return self.vision_model( pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, + feature_sample_layers=feature_sample_layers, ) def load_weights(self, weights: Iterable[Tuple[str, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 40194716bbf94..d4333b7519b47 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -6,6 +6,7 @@ import numpy as np import numpy.typing as npt +import torch from PIL import Image import vllm.envs as envs @@ -392,6 +393,49 @@ def encode_video_base64(frames: npt.NDArray): return ",".join(base64_frames) +def resolve_visual_encoder_outputs( + encoder_outputs: Union[torch.Tensor, list[torch.Tensor]], + feature_sample_layers: Optional[list[int]], + post_layer_norm: Optional[torch.nn.LayerNorm], + max_possible_layers: int, +) -> torch.Tensor: + """Given the outputs a visual encoder module that may correspond to the + output of the last layer, or a list of hidden states to be stacked, + handle post normalization and resolve it into a single output tensor. + + Args: + encoder_outputs: Output of encoder's last layer or all hidden states. + feature_sample_layers: Optional layer indices to grab from the encoder + outputs; if provided, encoder outputs must be a list. + post_layer_norm: Post norm to apply to the output of the encoder. + max_possible_layers: Total layers in the fully loaded visual encoder. + + """ + if feature_sample_layers is None: + if post_layer_norm is not None: + return post_layer_norm(encoder_outputs) + return encoder_outputs + + # Get the hidden states corresponding to the layer indices. + # Negative values are relative to the full visual encoder, + # so offset them depending on how many layers were loaded. + # NOTE: this assumes that encoder_outputs contains a list + # of hidden states in the same order as the encoder layers + # that produced them. + offset = max_possible_layers - len(encoder_outputs) + hs_pool = [ + encoder_outputs[layer_idx] + if layer_idx >= 0 else encoder_outputs[layer_idx + offset] + for layer_idx in feature_sample_layers + ] + + # Apply post-norm on the final hidden state if we are using it + uses_last_layer = feature_sample_layers[-1] in (len(hs_pool) - 1, -1) + if post_layer_norm is not None and uses_last_layer: + hs_pool[-1] = post_layer_norm(encoder_outputs) + return torch.cat(hs_pool, dim=-1) + + # Utilities for input processors _T = TypeVar("_T", str, int) From 8a93a598d9ac265882e55432e7aef55c8bff23f4 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Thu, 21 Nov 2024 19:15:36 +0800 Subject: [PATCH 090/255] fix the issue that len(tokenizer(prompt)["input_ids"]) > prompt_len (#10524) Signed-off-by: Wang, Yi A --- benchmarks/backend_request_func.py | 1 + 1 file changed, 1 insertion(+) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 25c8b1bbf3e22..c3fed56e8a956 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -54,6 +54,7 @@ async def async_request_tgi( "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. + "truncate": request_func_input.prompt_len, # TGI does not accept ignore_eos flag. } payload = { From d5ec121f95f51184acce4e2c27ad8fc01904d3d9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 21 Nov 2024 22:20:08 +0800 Subject: [PATCH 091/255] [Model] Expose `dynamic_image_size` as mm_processor_kwargs for InternVL2 models (#10518) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../mm_processor_kwargs/test_internvl.py | 206 ++++++++++++++++++ vllm/model_executor/models/internvl.py | 63 ++++-- 2 files changed, 255 insertions(+), 14 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py new file mode 100644 index 0000000000000..af0c2aa211998 --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py @@ -0,0 +1,206 @@ +"""Tests for InternVL's multimodal preprocessing kwargs.""" +from typing import Callable, Optional + +import pytest +from transformers import AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["OpenGVLab/InternVL2-2B"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_internvl(): + from vllm.model_executor.models.internvl import InternVLInputPipeline + + pipeline = InternVLInputPipeline('', '', '') + return pipeline.input_processor + + +@pytest.fixture() +def dummy_data_for_internvl(): + from vllm.model_executor.models.internvl import InternVLInputPipeline + + pipeline = InternVLInputPipeline('', '', '') + return pipeline.dummy_data + + +@pytest.fixture() +def get_max_internvl_image_tokens(): + from vllm.model_executor.models.internvl import ( + get_max_internvl_image_tokens) + return get_max_internvl_image_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +def test_input_mapper_override( + model: str, + image_assets: _ImageAssets, + max_dynamic_patch: int, + dynamic_image_size: Optional[bool], +): + mm_processor_kwargs = { + "max_dynamic_patch": max_dynamic_patch, + } + if dynamic_image_size is not None: + mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + assert vllm_result["pixel_values"].size(1) == expected_num_patches + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +def test_max_tokens_override( + get_max_internvl_image_tokens: Callable, + model: str, + max_dynamic_patch: Optional[int], + dynamic_image_size: Optional[bool], +): + """Ensure get_max_internvl_image_tokens handles mm_processor_kwargs.""" + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + if max_dynamic_patch is None: + max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + expected_max_tokens = 256 * expected_num_patches + + actual_max_tokens = get_max_internvl_image_tokens( + ctx=InputContext(ctx.model_config), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +def test_dummy_data_override( + dummy_data_for_internvl: Callable, + model: str, + num_imgs: int, + max_dynamic_patch: Optional[int], + dynamic_image_size: Optional[bool], +): + """Ensure dummy_data_for_internvl handles kwargs properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + if max_dynamic_patch is None: + max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + expected_max_tokens = 256 * expected_num_patches + + dummy_data = dummy_data_for_internvl( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + sequence_data = dummy_data.seq_data + + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + image_token_id = tokenizer.encode('', + add_special_tokens=False)[0] + + # Ensure we have the right number of placeholders per size + img_tok_count = sequence_data.get_token_ids().count(image_token_id) + assert img_tok_count == expected_max_tokens * num_imgs + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_input_processor_override( + input_processor_for_internvl: Callable, + image_assets: _ImageAssets, + model: str, + num_imgs: int, + max_dynamic_patch: int, + dynamic_image_size: Optional[bool], +): + """Ensure input_processor_for_internvl handles kwargs properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + expected_toks_per_img = 256 * expected_num_patches + + # Build the image str / prompt based on the number of images we pass + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + placeholders = "" if num_imgs == 1 else "\n".join( + f"Image-{i}: \n" for i in range(1, num_imgs + 1)) + prompt = placeholders + images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_internvl( + ctx, + inputs, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = tokenizer.encode('', + add_special_tokens=False)[0] + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 5d38b4b1ef14b..47ac00b6afe9b 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -123,8 +123,15 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, return blocks, target_width, target_height -def calculate_num_blocks_wrapper(hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None): +def calculate_num_blocks_wrapper( + hf_config: PretrainedConfig, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, +): + if dynamic_image_size is None: + dynamic_image_size = hf_config.dynamic_image_size + + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 if max_dynamic_patch is None: max_dynamic_patch = hf_config.max_dynamic_patch min_num = hf_config.min_dynamic_patch @@ -183,10 +190,17 @@ def image_to_pixel_values(image: Image.Image, input_size: int, min_num: int, return pixel_values -def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, - max_dynamic_patch: Optional[int] = None): +def image_to_pixel_values_wrapper( + hf_config: PretrainedConfig, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, +): image_size = hf_config.vision_config.image_size min_num = hf_config.min_dynamic_patch + if dynamic_image_size is None: + dynamic_image_size = hf_config.dynamic_image_size + + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 if max_dynamic_patch is None: max_dynamic_patch = hf_config.max_dynamic_patch use_thumbnail = hf_config.use_thumbnail @@ -207,11 +221,17 @@ def get_internvl_num_patches(hf_config: PretrainedConfig): (downsample_ratio**2)) -def get_max_internvl_image_tokens(ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None): +def get_max_internvl_image_tokens( + ctx: InputContext, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, +): hf_config = ctx.get_hf_config() + if dynamic_image_size is None: + dynamic_image_size = hf_config.dynamic_image_size + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 if max_dynamic_patch is None: max_dynamic_patch = hf_config.max_dynamic_patch use_thumbnail = hf_config.use_thumbnail @@ -222,12 +242,18 @@ def get_max_internvl_image_tokens(ctx: InputContext, return num_patches * max_dynamic_patch -def get_max_internvl_image_size(ctx: InputContext, - *, - max_dynamic_patch: Optional[int] = None): +def get_max_internvl_image_size( + ctx: InputContext, + *, + max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, +): hf_config = ctx.get_hf_config() image_size = hf_config.vision_config.image_size + if dynamic_image_size is None: + dynamic_image_size = hf_config.dynamic_image_size + max_dynamic_patch = max_dynamic_patch if dynamic_image_size else 1 if max_dynamic_patch is None: max_dynamic_patch = hf_config.max_dynamic_patch use_thumbnail = hf_config.use_thumbnail @@ -281,6 +307,7 @@ def input_processor( inputs: DecoderOnlyInputs, *, max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, ) -> DecoderOnlyInputs: multi_modal_data = inputs.get("multi_modal_data") if multi_modal_data is None or "image" not in multi_modal_data: @@ -292,7 +319,7 @@ def input_processor( image_data = multi_modal_data["image"] num_patches = get_internvl_num_patches(hf_config) num_blocks_calculator = calculate_num_blocks_wrapper( - hf_config, max_dynamic_patch) + hf_config, max_dynamic_patch, dynamic_image_size) if isinstance(image_data, Image.Image): width, height = image_data.size num_blocks, _, _ = num_blocks_calculator(width, height) @@ -332,11 +359,12 @@ def input_mapper( data: object, *, max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, ): hf_config = ctx.get_hf_config() image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch) + hf_config, max_dynamic_patch, dynamic_image_size) if isinstance(data, Image.Image): data = image_pixel_values_mapper(data) # Add an N dimension for number of images per prompt (currently 1). @@ -366,13 +394,17 @@ def dummy_data( mm_counts: Mapping[str, int], *, max_dynamic_patch: Optional[int] = None, + dynamic_image_size: Optional[bool] = None, ): num_images = mm_counts["image"] hf_config = ctx.get_hf_config() image_feature_size = get_max_internvl_image_tokens( - ctx, max_dynamic_patch=max_dynamic_patch) + ctx, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -388,7 +420,10 @@ def dummy_data( ) max_image_width, max_image_height = get_max_internvl_image_size( - ctx, max_dynamic_patch=max_dynamic_patch) + ctx, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) mm_data = dummy_image_for_clip( hf_config.vision_config, From 4d676f085295d92a9248c4944433b4ade52a8ff3 Mon Sep 17 00:00:00 2001 From: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com> Date: Thu, 21 Nov 2024 22:40:02 +0800 Subject: [PATCH 092/255] [Bugfix] Embedding model pooling_type equals ALL and multi input's bug (#10494) --- vllm/model_executor/layers/pooler.py | 29 ++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index bfe2d7d0f382e..df1978241340b 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -94,14 +94,10 @@ def forward( pooled_data = hidden_states[last_token_flat_indices] elif self.pooling_type == PoolingType.ALL: offset = 0 - pooled_data_lst = [] + pooled_data = [] for prompt_len in prompt_lens: - pooled_data_i = hidden_states[offset:offset + prompt_len] - - pooled_data_lst.append(pooled_data_i) + pooled_data.append(hidden_states[offset:offset + prompt_len]) offset += prompt_len - - pooled_data = torch.stack(pooled_data_lst) elif self.pooling_type == PoolingType.MEAN: # Calculate mean pooling cumsum = torch.cumsum(hidden_states, dim=0) @@ -121,7 +117,7 @@ def forward( step_tag_id = self.step_tag_id offset = 0 - pooled_data_lst = [] + pooled_data = [] for prompt_len, seq_data_i in zip( prompt_lens, pooling_metadata.seq_data.values()): pooled_data_i = hidden_states[offset:offset + prompt_len] @@ -130,17 +126,26 @@ def forward( pooled_data_i = pooled_data_i[token_ids == step_tag_id] offset += prompt_len - pooled_data_lst.append(pooled_data_i) - - pooled_data = torch.stack(pooled_data_lst) + pooled_data.append(pooled_data_i) else: raise ValueError(f"Invalid pooling type: {self.pooling_type}") if self.normalize: - pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1) + if isinstance(pooled_data, list): + pooled_data = [ + nn.functional.normalize(data, p=2, dim=1) + for data in pooled_data + ] + else: + pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1) if self.softmax: - pooled_data = nn.functional.softmax(pooled_data, dim=-1) + if isinstance(pooled_data, list): + pooled_data = [ + nn.functional.softmax(data, dim=-1) for data in pooled_data + ] + else: + pooled_data = nn.functional.softmax(pooled_data, dim=-1) pooled_outputs = [ EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data From da7e702c6fae521bf8633affb8fe7b834f5cb94b Mon Sep 17 00:00:00 2001 From: Chauncey Date: Fri, 22 Nov 2024 00:24:32 +0800 Subject: [PATCH 093/255] [Bug]: When apply continue_final_message for OpenAI server, the "echo":false is ignored (#10180) Signed-off-by: chaunceyjiang --- tests/entrypoints/openai/test_chat_echo.py | 79 ++++++++++++++++++++++ vllm/entrypoints/openai/serving_chat.py | 4 +- 2 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 tests/entrypoints/openai/test_chat_echo.py diff --git a/tests/entrypoints/openai/test_chat_echo.py b/tests/entrypoints/openai/test_chat_echo.py new file mode 100644 index 0000000000000..223ac5b41aa83 --- /dev/null +++ b/tests/entrypoints/openai/test_chat_echo.py @@ -0,0 +1,79 @@ +from typing import NamedTuple + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio + +from ...utils import RemoteOpenAIServer + +# # any model with a chat template should work here +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 + + +@pytest.fixture(scope="module") +def server(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--enforce-eager", + "--max-model-len", + "4080", + "--chat-template", + DUMMY_CHAT_TEMPLATE, + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +class TestCase(NamedTuple): + model_name: str + echo: bool + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "test_case", + [ + TestCase(model_name=MODEL_NAME, echo=True), + TestCase(model_name=MODEL_NAME, echo=False) + ], +) +async def test_chat_session_with_echo_and_continue_final_message( + client: openai.AsyncOpenAI, test_case: TestCase): + saying: str = "Here is a common saying about apple. An apple a day, keeps" + # test echo with continue_final_message parameter + chat_completion = await client.chat.completions.create( + model=test_case.model_name, + messages=[{ + "role": "user", + "content": "tell me a common saying" + }, { + "role": "assistant", + "content": saying + }], + extra_body={ + "echo": test_case.echo, + "continue_final_message": True, + "add_generation_prompt": False + }) + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "stop" + + message = choice.message + if test_case.echo: + assert message.content is not None and saying in message.content + else: + assert message.content is not None and saying not in message.content + assert message.role == "assistant" diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 2eef909eb9319..54ca0463bcab1 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -361,7 +361,7 @@ async def chat_completion_stream_generator( # Send response to echo the input portion of the # last message - if request.echo or request.continue_final_message: + if request.echo: last_msg_content: Union[str, List[Dict[str, str]]] = "" if conversation and "content" in conversation[ -1] and conversation[-1].get("role") == role: @@ -706,7 +706,7 @@ async def chat_completion_full_generator( stop_reason=output.stop_reason) choices.append(choice_data) - if request.echo or request.continue_final_message: + if request.echo: last_msg_content: Union[str, List[Dict[str, str]]] = "" if conversation and "content" in conversation[-1] and conversation[ -1].get("role") == role: From 2385b60d8300ce730ae67d9ea945f06de9ec4e21 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 22 Nov 2024 01:18:11 +0800 Subject: [PATCH 094/255] [Kernel] Register punica ops directly (#10522) Signed-off-by: Jee Jee Li --- tests/lora/test_punica_variation.py | 23 ++++++++++++++++------ vllm/lora/ops/bgmv_expand.py | 23 +++++++++++++++++++--- vllm/lora/ops/bgmv_expand_slice.py | 25 +++++++++++++++++++++--- vllm/lora/ops/bgmv_shrink.py | 23 +++++++++++++++++++--- vllm/lora/ops/sgmv_expand.py | 29 +++++++++++++++++++++++++--- vllm/lora/ops/sgmv_expand_slice.py | 30 ++++++++++++++++++++++++++--- vllm/lora/ops/sgmv_shrink.py | 28 ++++++++++++++++++++++++--- 7 files changed, 157 insertions(+), 24 deletions(-) diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index 52b82f25d23e1..3b20033271d26 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -6,12 +6,13 @@ import pytest import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.sgmv_shrink import sgmv_shrink +# Enable custom op register +import vllm.lora.ops.bgmv_expand +import vllm.lora.ops.bgmv_expand_slice +import vllm.lora.ops.bgmv_shrink +import vllm.lora.ops.sgmv_expand +import vllm.lora.ops.sgmv_expand_slice +import vllm.lora.ops.sgmv_shrink # noqa: F401 from vllm.platforms import current_platform from .utils import (generate_data, generate_data_for_expand_nslices, @@ -37,6 +38,16 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) +# Unlike test_punica_sizes.py, we directly utilize custom op for +# testing, which verifies the correct registration of these ops. +bgmv_expand = torch.ops.vllm.bgmv_expand +bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice +bgmv_shrink = torch.ops.vllm.bgmv_shrink +sgmv_expand = torch.ops.vllm.sgmv_expand +sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice +sgmv_shrink = torch.ops.vllm.sgmv_shrink + + @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) diff --git a/vllm/lora/ops/bgmv_expand.py b/vllm/lora/ops/bgmv_expand.py index f176259fddc78..42adb191b8ead 100644 --- a/vllm/lora/ops/bgmv_expand.py +++ b/vllm/lora/ops/bgmv_expand.py @@ -9,6 +9,8 @@ import triton import triton.language as tl +from vllm.utils import direct_register_custom_op + from .utils import get_lora_op_configs @@ -162,9 +164,24 @@ def _bgmv_expand( return +def bgmv_expand_fake( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + add_inputs: bool = True, +) -> None: + return + + try: - bgmv_expand = torch.library.custom_op("lora::bgmv_expand", - _bgmv_expand, - mutates_args=["output_tensor"]) + direct_register_custom_op( + op_name="bgmv_expand", + op_func=_bgmv_expand, + mutates_args=["output_tensor"], + fake_impl=bgmv_expand_fake, + ) + bgmv_expand = torch.ops.vllm.bgmv_expand + except AttributeError: bgmv_expand = _bgmv_expand diff --git a/vllm/lora/ops/bgmv_expand_slice.py b/vllm/lora/ops/bgmv_expand_slice.py index 2c6ed96c253f0..f397d752a3ea9 100644 --- a/vllm/lora/ops/bgmv_expand_slice.py +++ b/vllm/lora/ops/bgmv_expand_slice.py @@ -9,6 +9,8 @@ import triton import triton.language as tl +from vllm.utils import direct_register_custom_op + from .utils import get_lora_op_configs @@ -179,9 +181,26 @@ def _bgmv_expand_slice( return +def bgmv_expand_slice_fake( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + slice_offset: int, + slice_size: int, + add_inputs: bool = True, +) -> None: + return + + try: - bgmv_expand_slice = torch.library.custom_op("lora::bgmv_expand_slice", - _bgmv_expand_slice, - mutates_args=["output_tensor"]) + direct_register_custom_op( + op_name="bgmv_expand_slice", + op_func=_bgmv_expand_slice, + mutates_args=["output_tensor"], + fake_impl=bgmv_expand_slice_fake, + ) + bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice + except AttributeError: bgmv_expand_slice = _bgmv_expand_slice diff --git a/vllm/lora/ops/bgmv_shrink.py b/vllm/lora/ops/bgmv_shrink.py index 0846ff36b1692..f3ef01d39e776 100644 --- a/vllm/lora/ops/bgmv_shrink.py +++ b/vllm/lora/ops/bgmv_shrink.py @@ -9,6 +9,8 @@ import triton import triton.language as tl +from vllm.utils import direct_register_custom_op + from .utils import get_lora_op_configs @@ -142,9 +144,24 @@ def _bgmv_shrink( return +def bgmv_shrink_fake( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + scaling: float = 1.0, +) -> None: + return + + try: - bgmv_shrink = torch.library.custom_op("lora::bgmv_shrink", - _bgmv_shrink, - mutates_args=["output_tensor"]) + direct_register_custom_op( + op_name="bgmv_shrink", + op_func=_bgmv_shrink, + mutates_args=["output_tensor"], + fake_impl=bgmv_shrink_fake, + ) + bgmv_shrink = torch.ops.vllm.bgmv_shrink + except AttributeError: bgmv_shrink = _bgmv_shrink diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index ee2cd2e05e2ee..77c5178493c44 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,6 +9,8 @@ import triton import triton.language as tl +from vllm.utils import direct_register_custom_op + @triton.jit def _sgmv_expand_kernel( @@ -196,9 +198,30 @@ def _sgmv_expand( return +def sgmv_expand_fake( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + add_inputs: bool = False, +) -> None: + return + + try: - sgmv_expand = torch.library.custom_op("lora::sgmv_expand", - _sgmv_expand, - mutates_args=["output_tensor"]) + + direct_register_custom_op( + op_name="sgmv_expand", + op_func=_sgmv_expand, + mutates_args=["output_tensor"], + fake_impl=sgmv_expand_fake, + ) + sgmv_expand = torch.ops.vllm.sgmv_expand + except AttributeError: sgmv_expand = _sgmv_expand diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index 5244fa14913a4..55c4fb68ed128 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,6 +9,8 @@ import triton import triton.language as tl +from vllm.utils import direct_register_custom_op + @triton.jit def _sgmv_expand_slice_kernel( @@ -209,9 +211,31 @@ def _sgmv_expand_slice( return +def sgmv_expand_slice_fake( + inputs: torch.Tensor, + lora_b_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + slice_offset: int, + slice_size: int, + add_inputs: bool = False, +) -> None: + return + + try: - sgmv_expand_slice = torch.library.custom_op("lora::sgmv_expand_slice", - _sgmv_expand_slice, - mutates_args=["output_tensor"]) + direct_register_custom_op( + op_name="sgmv_expand_slice", + op_func=_sgmv_expand_slice, + mutates_args=["output_tensor"], + fake_impl=sgmv_expand_slice_fake, + ) + sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice + except AttributeError: sgmv_expand_slice = _sgmv_expand_slice diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index b4d893047b06b..37d1dc84eebca 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,6 +9,8 @@ import triton import triton.language as tl +from vllm.utils import direct_register_custom_op + @triton.jit def _sgmv_shrink_kernel( @@ -190,9 +192,29 @@ def _sgmv_shrink( return +def sgmv_shrink_fake( + inputs: torch.Tensor, + lora_a_weights: torch.Tensor, + output_tensor: torch.Tensor, + b_seq_start_loc: torch.Tensor, + seq_len_tensor: torch.Tensor, + lora_indices_tensor: torch.Tensor, + batches: int, + max_seq_length: int, + token_nums: int, + scaling: float, +) -> None: + return + + try: - sgmv_shrink = torch.library.custom_op("lora::sgmv_shrink", - _sgmv_shrink, - mutates_args=["output_tensor"]) + direct_register_custom_op( + op_name="sgmv_shrink", + op_func=_sgmv_shrink, + mutates_args=["output_tensor"], + fake_impl=sgmv_shrink_fake, + ) + sgmv_shrink = torch.ops.vllm.sgmv_shrink + except AttributeError: sgmv_shrink = _sgmv_shrink From c51e397fe8db2ef0664814ef3f80e1237c7283da Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 21 Nov 2024 09:21:31 -0800 Subject: [PATCH 095/255] [Misc] Suppress duplicated logging regarding multimodal input pipeline (#10530) Signed-off-by: Roger Wang --- vllm/inputs/preprocess.py | 4 ++-- vllm/utils.py | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index aacff87df6d79..853257c5ad71f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -10,7 +10,7 @@ from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.utils import print_warning_once +from vllm.utils import print_info_once, print_warning_once from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, PromptType, SingletonInputs, SingletonPrompt, token_inputs) @@ -212,7 +212,7 @@ def _can_process_multimodal(self) -> bool: # updated to use the new multi-modal processor can_process_multimodal = self.mm_registry.has_processor(model_config) if not can_process_multimodal: - logger.info( + print_info_once( "Your model uses the legacy input pipeline instead of the new " "multi-modal processor. Please note that the legacy pipeline " "will be removed in a future release. For more details, see: " diff --git a/vllm/utils.py b/vllm/utils.py index cb2ad43a2ae8d..424e7d0947790 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -705,6 +705,12 @@ def create_kv_caches_with_random( return key_caches, value_caches +@lru_cache +def print_info_once(msg: str) -> None: + # Set the stacklevel to 2 to print the caller's line info + logger.info(msg, stacklevel=2) + + @lru_cache def print_warning_once(msg: str) -> None: # Set the stacklevel to 2 to print the caller's line info From e7a8341c7c7481a0c797d50ead7a698255ac8a9f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 22 Nov 2024 02:09:43 +0800 Subject: [PATCH 096/255] [Bugfix] Allow token ID-only inputs in Qwen2-Audio (#10536) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/qwen2_audio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index a4965f34b1ca8..0c2374c3c3fc9 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -212,7 +212,7 @@ def input_processor_for_qwen2_audio( return token_inputs( prompt_token_ids=new_input_ids, - prompt=inputs['prompt'], + prompt=inputs.get("prompt"), multi_modal_data=multi_modal_data, ) From 7560ae5cafbae3af9967ac7dc979cb31a40fc572 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Nov 2024 12:30:42 -0800 Subject: [PATCH 097/255] [8/N] enable cli flag without a space (#10529) Signed-off-by: youkaichao --- tests/compile/test_basic_correctness.py | 4 ++-- tests/engine/test_arg_utils.py | 28 +++++++++++++++++++++++++ tests/tpu/test_custom_dispatcher.py | 9 ++++---- vllm/engine/arg_utils.py | 5 ++++- vllm/utils.py | 4 ++++ 5 files changed, 43 insertions(+), 7 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index c0db2e78824be..b7170886d2556 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -103,7 +103,7 @@ def test_compile_correctness(test_setting: TestSetting): CompilationLevel.NO_COMPILATION, CompilationLevel.PIECEWISE, ]: - all_args.append(final_args + ["-O", str(level)]) + all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) # inductor will change the output, so we only compare if the output @@ -121,7 +121,7 @@ def test_compile_correctness(test_setting: TestSetting): CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE, ]: - all_args.append(final_args + ["-O", str(level)]) + all_args.append(final_args + [f"-O{level}"]) all_envs.append({}) if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: # "DYNAMO_ONCE" will always use fullgraph diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 7b1be5a9802fd..5b0e76fe53685 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -31,6 +31,34 @@ def test_limit_mm_per_prompt_parser(arg, expected): assert args.limit_mm_per_prompt == expected +def test_compilation_config(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + + # default value + args = parser.parse_args([]) + assert args.compilation_config is None + + # set to O3 + args = parser.parse_args(["-O3"]) + assert args.compilation_config.level == 3 + + # set to O 3 (space) + args = parser.parse_args(["-O", "3"]) + assert args.compilation_config.level == 3 + + # set to O 3 (equals) + args = parser.parse_args(["-O=3"]) + assert args.compilation_config.level == 3 + + # set to json + args = parser.parse_args(["--compilation-config", '{"level": 3}']) + assert args.compilation_config.level == 3 + + # set to json + args = parser.parse_args(['--compilation-config={"level": 3}']) + assert args.compilation_config.level == 3 + + def test_valid_pooling_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([ diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index df348258efcba..bb1379deba3fc 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -13,9 +13,10 @@ def test_custom_dispatcher(): compare_two_settings( "google/gemma-2b", - arg1=["--enforce-eager", "-O", - str(CompilationLevel.DYNAMO_ONCE)], - arg2=["--enforce-eager", "-O", - str(CompilationLevel.DYNAMO_AS_IS)], + arg1=[ + "--enforce-eager", + f"-O{CompilationLevel.DYNAMO_ONCE}", + ], + arg2=["--enforce-eager", f"-O{CompilationLevel.DYNAMO_AS_IS}"], env1={}, env2={}) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 9288cd22c0036..88862a185ac75 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -882,7 +882,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'testing only. level 3 is the recommended level ' 'for production.\n' 'To specify the full compilation config, ' - 'use a JSON string.') + 'use a JSON string.\n' + 'Following the convention of traditional ' + 'compilers, using -O without space is also ' + 'supported. -O3 is equivalent to -O 3.') return parser diff --git a/vllm/utils.py b/vllm/utils.py index 424e7d0947790..67b2629ecc933 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1192,6 +1192,10 @@ def parse_args(self, args=None, namespace=None): else: processed_args.append('--' + arg[len('--'):].replace('_', '-')) + elif arg.startswith('-O') and arg != '-O' and len(arg) == 2: + # allow -O flag to be used without space, e.g. -O3 + processed_args.append('-O') + processed_args.append(arg[2:]) else: processed_args.append(arg) From f9310cbd0c1109c4f22cf9f1dc615b2d08f06408 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 21 Nov 2024 12:53:39 -0800 Subject: [PATCH 098/255] [V1] Fix Compilation config & Enable CUDA graph by default (#10528) Signed-off-by: Woosuk Kwon --- vllm/config.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 62 ++++++++++++++++-------------- vllm/v1/worker/gpu_worker.py | 39 ++++++++++++------- 3 files changed, 62 insertions(+), 42 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index ed09f8ae31863..d1c6a850cb78c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2370,7 +2370,7 @@ def __post_init__(self): if self.compilation_config is None: self.compilation_config = CompilationConfig() - if envs.VLLM_USE_V1: + if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: # NOTE(woosuk): Currently, we use inductor because the piecewise # CUDA graphs do not work properly with the custom CUDA kernels. # FIXME(woosuk): Disable inductor to reduce the compilation time @@ -2380,6 +2380,7 @@ def __post_init__(self): self.compilation_config.use_inductor = True self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_reshape = False + self.compilation_config.level = CompilationLevel.PIECEWISE current_platform.check_and_update_config(self) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5f66293cbe8e4..2cf55cd497659 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,3 +1,4 @@ +import gc import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple @@ -515,7 +516,25 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) - def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: + @torch.inference_mode() + def _dummy_run( + self, + model: nn.Module, + num_tokens: int, + kv_caches: List[torch.Tensor], + ) -> torch.Tensor: + with set_forward_context(None): + hidden_states = model( + input_ids=None, + positions=self.positions[:num_tokens], + kv_caches=kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_tokens]) + return hidden_states + + def profile_run(self) -> None: + # TODO(woosuk): Profile the max memory usage of the encoder and + # the encoder cache. # use an empty tensor instead of `None`` to force Dynamo to pass # it by reference, rather by specializing on the value `None`. # the `dtype` argument does not matter, and we use `float32` as @@ -527,23 +546,17 @@ def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: torch.tensor([], dtype=torch.float32, device=self.device) for _ in range(self.num_attn_layers) ] - with set_forward_context(None): # noqa: SIM117 - with set_compile_context(self.cudagraph_batch_sizes): - # Trigger compilation for general shape. - model(input_ids=None, - positions=self.positions, - kv_caches=dummy_kv_caches, - attn_metadata=None, - inputs_embeds=self.inputs_embeds) - - @torch.inference_mode() - def profile_run(self) -> None: - # TODO(woosuk): Profile the max memory usage of the encoder and - # the encoder cache. - self._dummy_run(self.model, self.max_num_tokens) + with set_compile_context(self.cudagraph_batch_sizes): + # Trigger compilation for general shape. + hidden_states = self._dummy_run(self.model, self.max_num_tokens, + dummy_kv_caches) + logits = self.model.compute_logits(hidden_states, None) + logits = logits[:self.max_num_tokens] + # TODO(woosuk): Consider the memory usage of the sampler. torch.cuda.synchronize() + del hidden_states, logits + gc.collect() - @torch.inference_mode() def capture_model(self) -> None: if not self.use_cuda_graph: logger.warning( @@ -554,18 +567,11 @@ def capture_model(self) -> None: start_time = time.perf_counter() start_free_gpu_memory = torch.cuda.mem_get_info()[0] - with set_forward_context(None): - # Trigger CUDA graph capture for specific shapes. - # Capture the large shapes first so that the smaller shapes - # can reuse the memory pool allocated for the large shapes. - for num_tokens in reversed(self.cudagraph_batch_sizes): - self.model( - input_ids=None, - positions=self.positions[:num_tokens], - kv_caches=self.kv_caches, - attn_metadata=None, - inputs_embeds=self.inputs_embeds[:num_tokens], - ) + # Trigger CUDA graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. + for num_tokens in reversed(self.cudagraph_batch_sizes): + self._dummy_run(self.model, num_tokens, self.kv_caches) end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index c8192b7f86eb0..7973349f14a5d 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -105,35 +105,48 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + _, total_gpu_memory = torch.cuda.mem_get_info() # Execute a forward pass with dummy inputs to profile the memory usage # of the model. self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. torch.cuda.synchronize() - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() + + free_gpu_memory, _ = torch.cuda.mem_get_info() # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( + assert self.init_gpu_memory > free_gpu_memory, ( "Error in memory profiling. " f"Initial free memory {self.init_gpu_memory}, current free memory" f" {free_gpu_memory}. This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") + # Get the peak memory allocation recorded by torch + peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"] + + # Check for any memory left around that may have been allocated on the + # gpu outside of `torch`. NCCL operations, for example, can use a few + # GB during a forward pass + torch.cuda.empty_cache() + torch_allocated_bytes = torch.cuda.memory_stats( + )["allocated_bytes.all.current"] + total_allocated_bytes = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + non_torch_allocations = total_allocated_bytes - torch_allocated_bytes + if non_torch_allocations > 0: + peak_memory += non_torch_allocations + available_kv_cache_memory = ( + total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. cache_block_size = _get_cache_block_size(self.cache_config, self.model_config, self.parallel_config) - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) + num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) num_gpu_blocks = max(num_gpu_blocks, 0) - # if self.model_runner.lora_manager: - # self.model_runner.remove_all_loras() - gc.collect() - torch.cuda.empty_cache() return num_gpu_blocks, 0 def initialize_cache(self, num_gpu_blocks: int) -> None: From edec3385b641afb22739a6ec0fd0145f8f1141c5 Mon Sep 17 00:00:00 2001 From: Yunmeng Date: Fri, 22 Nov 2024 05:03:58 +0800 Subject: [PATCH 099/255] [CI][Installation] Avoid uploading CUDA 11.8 wheel (#10535) Signed-off-by: simon-mo Co-authored-by: simon-mo --- .buildkite/upload-wheels.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh index 541b395eddbe7..7345dd4e66b29 100644 --- a/.buildkite/upload-wheels.sh +++ b/.buildkite/upload-wheels.sh @@ -25,7 +25,12 @@ echo "Version: $version" # If the version contains "dev", rename it to v1.0.0.dev for consistency if [[ $version == *dev* ]]; then - new_version="1.0.0.dev" + suffix="${version##*.}" + if [[ $suffix == cu* ]]; then + new_version="1.0.0.dev+${suffix}" + else + new_version="1.0.0.dev" + fi new_wheel="${wheel/$version/$new_version}" mv -- "$wheel" "$new_wheel" wheel="$new_wheel" From cf656f5a022c1ef6f0513c53c5106c8eeff7fdaa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Nov 2024 13:13:17 -0800 Subject: [PATCH 100/255] [misc] improve error message (#10553) Signed-off-by: youkaichao --- vllm/platforms/cuda.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 07562a8c3d71e..b38dd7c936896 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -99,8 +99,14 @@ def device_id_to_physical_device_id(device_id: int) -> int: if "CUDA_VISIBLE_DEVICES" in os.environ: device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") if device_ids == [""]: - raise RuntimeError("CUDA_VISIBLE_DEVICES is set to empty string," - " which means GPU support is disabled.") + msg = ( + "CUDA_VISIBLE_DEVICES is set to empty string, which means" + " GPU support is disabled. If you are using ray, please unset" + " the environment variable `CUDA_VISIBLE_DEVICES` inside the" + " worker/actor. " + "Check https://github.com/vllm-project/vllm/issues/8402 for" + " more information.") + raise RuntimeError(msg) physical_device_id = device_ids[device_id] return int(physical_device_id) else: From 46fe9b46d83e733130ce952eb3967a9c96713583 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 21 Nov 2024 13:28:16 -0800 Subject: [PATCH 101/255] [Minor] Revert change in offline inference example (#10545) Signed-off-by: Woosuk Kwon --- examples/offline_inference.py | 98 +++++++------------------------ examples/offline_inference_cli.py | 80 +++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 78 deletions(-) create mode 100644 examples/offline_inference_cli.py diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 391ac6b9b6b03..9b758fa2479f6 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -1,80 +1,22 @@ -from dataclasses import asdict - from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import EngineArgs -from vllm.utils import FlexibleArgumentParser - - -def get_prompts(num_prompts: int): - # The default sample prompts. - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - if num_prompts != len(prompts): - prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts] - - return prompts - - -def main(args): - # Create prompts - prompts = get_prompts(args.num_prompts) - - # Create a sampling params object. - sampling_params = SamplingParams(n=args.n, - temperature=args.temperature, - top_p=args.top_p, - top_k=args.top_k, - max_tokens=args.max_tokens) - - # Create an LLM. - # The default model is 'facebook/opt-125m' - engine_args = EngineArgs.from_cli_args(args) - llm = LLM(**asdict(engine_args)) - - # Generate texts from the prompts. - # The output is a list of RequestOutput objects - # that contain the prompt, generated text, and other information. - outputs = llm.generate(prompts, sampling_params) - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - -if __name__ == '__main__': - parser = FlexibleArgumentParser() - parser = EngineArgs.add_cli_args(parser) - group = parser.add_argument_group("SamplingParams options") - group.add_argument("--num-prompts", - type=int, - default=4, - help="Number of prompts used for inference") - group.add_argument("--max-tokens", - type=int, - default=16, - help="Generated output length for sampling") - group.add_argument('--n', - type=int, - default=1, - help='Number of generated sequences per prompt') - group.add_argument('--temperature', - type=float, - default=0.8, - help='Temperature for text generation') - group.add_argument('--top-p', - type=float, - default=0.95, - help='top_p for text generation') - group.add_argument('--top-k', - type=int, - default=-1, - help='top_k for text generation') - args = parser.parse_args() - main(args) +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM(model="facebook/opt-125m") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference_cli.py b/examples/offline_inference_cli.py new file mode 100644 index 0000000000000..391ac6b9b6b03 --- /dev/null +++ b/examples/offline_inference_cli.py @@ -0,0 +1,80 @@ +from dataclasses import asdict + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def get_prompts(num_prompts: int): + # The default sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + if num_prompts != len(prompts): + prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts] + + return prompts + + +def main(args): + # Create prompts + prompts = get_prompts(args.num_prompts) + + # Create a sampling params object. + sampling_params = SamplingParams(n=args.n, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + max_tokens=args.max_tokens) + + # Create an LLM. + # The default model is 'facebook/opt-125m' + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**asdict(engine_args)) + + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == '__main__': + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + group = parser.add_argument_group("SamplingParams options") + group.add_argument("--num-prompts", + type=int, + default=4, + help="Number of prompts used for inference") + group.add_argument("--max-tokens", + type=int, + default=16, + help="Generated output length for sampling") + group.add_argument('--n', + type=int, + default=1, + help='Number of generated sequences per prompt') + group.add_argument('--temperature', + type=float, + default=0.8, + help='Temperature for text generation') + group.add_argument('--top-p', + type=float, + default=0.95, + help='top_p for text generation') + group.add_argument('--top-k', + type=int, + default=-1, + help='top_k for text generation') + + args = parser.parse_args() + main(args) From 9afa01455237892c878bb2810912c487d66149a9 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 21 Nov 2024 18:43:43 -0500 Subject: [PATCH 102/255] Add small example to metrics.rst (#10550) --- docs/source/serving/metrics.rst | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/source/serving/metrics.rst b/docs/source/serving/metrics.rst index 15e57bd3fec65..231111cd7b738 100644 --- a/docs/source/serving/metrics.rst +++ b/docs/source/serving/metrics.rst @@ -2,9 +2,34 @@ Production Metrics ================== vLLM exposes a number of metrics that can be used to monitor the health of the -system. These metrics are exposed via the `/metrics` endpoint on the vLLM +system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM OpenAI compatible API server. +You can start the server using Python, or using [Docker](deploying_with_docker.rst): + +.. code-block:: console + + $ vllm serve unsloth/Llama-3.2-1B-Instruct + +Then query the endpoint to get the latest metrics from the server: + +.. code-block:: console + + $ curl http://0.0.0.0:8000/metrics + + # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. + # TYPE vllm:iteration_tokens_total histogram + vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 + vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + ... + The following metrics are exposed: .. literalinclude:: ../../../vllm/engine/metrics.py From aed074860a46536faf77bacd76d02efccbaf4a5d Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 21 Nov 2024 18:27:20 -0800 Subject: [PATCH 103/255] [Benchmark] Add new H100 machine (#10547) --- .../benchmark-pipeline.yaml | 39 ++++++++++--------- .../convert-results-json-to-markdown.py | 13 +++++-- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 5c069b38b2d7d..3db77d5f16022 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -13,6 +13,7 @@ steps: - wait - label: "A100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: A100 plugins: @@ -45,6 +46,7 @@ steps: medium: Memory - label: "H200" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H200 plugins: @@ -63,21 +65,22 @@ steps: - VLLM_USAGE_SOURCE - HF_TOKEN - - # - label: "H100" - # agents: - # queue: H100 - # plugins: - # - docker#v5.11.0: - # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - # command: - # - bash - # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh - # mount-buildkite-agent: true - # propagate-environment: true - # ipc: host - # gpus: all - # environment: - # - VLLM_USAGE_SOURCE - # - HF_TOKEN - + - label: "H100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H100 + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index d640563252a0c..9d3646e2f6a15 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -157,10 +157,17 @@ def results_to_json(latency, throughput, serving): throughput_results, serving_results) - # Sort all dataframes by their respective "Test name" columns for df in [latency_results, serving_results, throughput_results]: - if not df.empty: - df.sort_values(by="Test name", inplace=True) + if df.empty: + continue + + # Sort all dataframes by their respective "Test name" columns + df.sort_values(by="Test name", inplace=True) + + # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", + # we want to turn it into "8xGPUTYPE" + df["GPU"] = df["GPU"].apply( + lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}") # get markdown tables latency_md_table = tabulate(latency_results, From 33e0a2540a6bff23cbc6a4b8f7a6784a2bc87d47 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Nov 2024 19:13:31 -0800 Subject: [PATCH 104/255] [9/N] torch.compile LLM usage (#10552) Signed-off-by: youkaichao --- tests/tpu/test_compilation.py | 5 ++--- vllm/entrypoints/llm.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index 65bee85e7a1ea..b7124ebc1b0f3 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -4,7 +4,7 @@ import depyf -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationLevel temp_dir = tempfile.mkdtemp() with depyf.prepare_debug(temp_dir): @@ -34,8 +34,7 @@ # all the control llm = LLM(model="google/gemma-2b", enforce_eager=True, - compilation_config=CompilationConfig( - level=CompilationLevel.DYNAMO_AS_IS)) + compilation_config={"level": CompilationLevel.DYNAMO_AS_IS}) outputs = llm.generate(prompts, sampling_params) for output, answer in zip(outputs, answers): prompt = output.prompt diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 86b0b6893f1d9..2446a64a02eb2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,4 +1,5 @@ import itertools +import json import warnings from contextlib import contextmanager from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, @@ -9,6 +10,7 @@ from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) +from vllm.config import CompilationConfig from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, TaskOption) from vllm.engine.llm_engine import LLMEngine @@ -107,13 +109,16 @@ class LLM: hf_overrides: If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the HuggingFace config. + compilation_config: Either an integer or a dictionary. If it is an integer, + it is used as the level of compilation optimization. If it is a dictionary, + it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) Note: This class is intended to be used for offline inference. For online serving, use the :class:`~vllm.AsyncLLMEngine` class instead. - """ + """ # noqa DEPRECATE_LEGACY: ClassVar[bool] = False """A flag to toggle whether to deprecate the legacy generate/encode API.""" @@ -166,6 +171,7 @@ def __init__( # After positional args are removed, move this right below `model` task: TaskOption = "auto", override_pooler_config: Optional[PoolerConfig] = None, + compilation_config: Optional[Union[int, Dict[str, Any]]] = None, **kwargs, ) -> None: ''' @@ -178,6 +184,12 @@ def __init__( if "disable_log_stats" not in kwargs: kwargs["disable_log_stats"] = True + if compilation_config is not None: + compilation_config_instance = CompilationConfig.from_cli( + json.dumps(compilation_config)) + else: + compilation_config_instance = None + engine_args = EngineArgs( model=model, task=task, @@ -202,6 +214,7 @@ def __init__( hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, override_pooler_config=override_pooler_config, + compilation_config=compilation_config_instance, **kwargs, ) # Logic to switch between engines is done at runtime instead of import From 446c7806b21d810b90604097487cc87393542aad Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 21 Nov 2024 19:40:40 -0800 Subject: [PATCH 105/255] [Minor] Fix line-too-long (#10563) Signed-off-by: Woosuk Kwon --- vllm/entrypoints/llm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 2446a64a02eb2..c211ec5aee080 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -109,16 +109,16 @@ class LLM: hf_overrides: If a dictionary, contains arguments to be forwarded to the HuggingFace config. If a callable, it is called to update the HuggingFace config. - compilation_config: Either an integer or a dictionary. If it is an integer, - it is used as the level of compilation optimization. If it is a dictionary, - it can specify the full compilation configuration. + compilation_config: Either an integer or a dictionary. If it is an + integer, it is used as the level of compilation optimization. If it + is a dictionary, it can specify the full compilation configuration. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) Note: This class is intended to be used for offline inference. For online serving, use the :class:`~vllm.AsyncLLMEngine` class instead. - """ # noqa + """ DEPRECATE_LEGACY: ClassVar[bool] = False """A flag to toggle whether to deprecate the legacy generate/encode API.""" From a111d0151ffed94582bec65635979e04e5b63676 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 21 Nov 2024 21:00:32 -0800 Subject: [PATCH 106/255] [platforms] absorb worker cls difference into platforms folder (#10555) Signed-off-by: youkaichao Co-authored-by: Nick Hill --- vllm/config.py | 238 ++++++++++++------------ vllm/engine/arg_utils.py | 11 +- vllm/executor/cpu_executor.py | 7 +- vllm/executor/gpu_executor.py | 49 +---- vllm/executor/hpu_executor.py | 5 +- vllm/executor/multiproc_gpu_executor.py | 2 +- vllm/executor/neuron_executor.py | 5 +- vllm/executor/openvino_executor.py | 8 +- vllm/executor/ray_gpu_executor.py | 16 +- vllm/executor/ray_hpu_executor.py | 36 +--- vllm/executor/ray_tpu_executor.py | 19 +- vllm/executor/xpu_executor.py | 14 +- vllm/platforms/cpu.py | 2 + vllm/platforms/cuda.py | 21 ++- vllm/platforms/hpu.py | 23 +++ vllm/platforms/neuron.py | 14 ++ vllm/platforms/openvino.py | 18 ++ vllm/platforms/rocm.py | 20 ++ vllm/platforms/tpu.py | 12 ++ vllm/platforms/xpu.py | 6 + vllm/worker/worker_base.py | 30 +-- 21 files changed, 273 insertions(+), 283 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index d1c6a850cb78c..b5f2116e3557b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -926,56 +926,56 @@ def _verify_load_format(self) -> None: f"{rocm_supported_load_format}") +@dataclass class ParallelConfig: - """Configuration for the distributed execution. + """Configuration for the distributed execution.""" - Args: - pipeline_parallel_size: Number of pipeline parallel groups. - tensor_parallel_size: Number of tensor parallel groups. - worker_use_ray: Deprecated, use distributed_executor_backend instead. - max_parallel_loading_workers: Maximum number of multiple batches - when load model sequentially. To avoid RAM OOM when using tensor - parallel and large models. - disable_custom_all_reduce: Disable the custom all-reduce kernel and - fall back to NCCL. - tokenizer_pool_config: Config for the tokenizer pool. - If None, will use synchronous tokenization. - ray_workers_use_nsight: Whether to profile Ray workers with nsight, see - https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. - placement_group: ray distributed model workers placement group. - distributed_executor_backend: Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). If the product - of pipeline_parallel_size and tensor_parallel_size is less than - or equal to the number of GPUs available, "mp" will be used to - keep processing on a single host. Otherwise, this will default - to "ray" if Ray is installed and fail otherwise. Note that tpu - and hpu only support Ray for distributed inference. - """ + pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. + tensor_parallel_size: int = 1 # Number of tensor parallel groups. - def __init__( - self, - pipeline_parallel_size: int, - tensor_parallel_size: int, - worker_use_ray: Optional[bool] = None, - max_parallel_loading_workers: Optional[int] = None, - disable_custom_all_reduce: bool = False, - tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, - ray_workers_use_nsight: bool = False, - placement_group: Optional["PlacementGroup"] = None, - distributed_executor_backend: Optional[Union[ - str, Type["ExecutorBase"]]] = None, - ) -> None: - self.pipeline_parallel_size = pipeline_parallel_size - self.tensor_parallel_size = tensor_parallel_size - self.distributed_executor_backend = distributed_executor_backend - self.max_parallel_loading_workers = max_parallel_loading_workers - self.disable_custom_all_reduce = disable_custom_all_reduce - self.tokenizer_pool_config = tokenizer_pool_config - self.ray_workers_use_nsight = ray_workers_use_nsight - self.placement_group = placement_group - self.world_size = pipeline_parallel_size * self.tensor_parallel_size - - if worker_use_ray: + # Deprecated, use distributed_executor_backend instead. + worker_use_ray: Optional[bool] = None + + # Maximum number of multiple batches + # when load model sequentially. To avoid RAM OOM when using tensor + # parallel and large models. + max_parallel_loading_workers: Optional[int] = None + + # Disable the custom all-reduce kernel and fall back to NCCL. + disable_custom_all_reduce: bool = False + + # Config for the tokenizer pool. If None, will use synchronous tokenization. + tokenizer_pool_config: Optional[TokenizerPoolConfig] = None + + # Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. + ray_workers_use_nsight: bool = False + + # ray distributed model workers placement group. + placement_group: Optional["PlacementGroup"] = None + + # Backend to use for distributed model + # workers, either "ray" or "mp" (multiprocessing). If the product + # of pipeline_parallel_size and tensor_parallel_size is less than + # or equal to the number of GPUs available, "mp" will be used to + # keep processing on a single host. Otherwise, this will default + # to "ray" if Ray is installed and fail otherwise. Note that tpu + # and hpu only support Ray for distributed inference. + distributed_executor_backend: Optional[Union[str, + Type["ExecutorBase"]]] = None + + # the full name of the worker class to use. If "auto", the worker class + # will be determined based on the platform. + worker_cls: str = "auto" + + world_size: int = field(init=False) + + rank: int = 0 + + def __post_init__(self) -> None: + self.world_size = self.pipeline_parallel_size * \ + self.tensor_parallel_size + + if self.worker_use_ray: if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" elif not self.use_ray: @@ -1026,7 +1026,6 @@ def __init__( backend) self._verify_args() - self.rank: int = 0 @property def use_ray(self) -> bool: @@ -1059,100 +1058,97 @@ def _verify_args(self) -> None: "run with Ray.") +@dataclass class SchedulerConfig: - """Scheduler configuration. + """Scheduler configuration.""" - Args: - task: The task to use the model for. - max_num_batched_tokens: Maximum number of tokens to be processed in - a single iteration. - max_num_seqs: Maximum number of sequences to be processed in a single - iteration. - max_model_len: Maximum length of a sequence (including prompt - and generated text). - num_lookahead_slots: The number of slots to allocate per sequence per - step, beyond the known token ids. This is used in speculative - decoding to store KV activations of tokens which may or may not be - accepted. - delay_factor: Apply a delay (of delay factor multiplied by previous - prompt latency) before scheduling next prompt. - enable_chunked_prefill: If True, prefill requests can be chunked based - on the remaining max_num_batched_tokens. - preemption_mode: Whether to perform preemption by swapping or - recomputation. If not specified, we determine the mode as follows: - We use recomputation by default since it incurs lower overhead than - swapping. However, when the sequence group has multiple sequences - (e.g., beam search), recomputation is not currently supported. In - such a case, we use swapping instead. - send_delta_data: Private API. If used, scheduler sends delta data to - workers instead of an entire data. It should be enabled only - when SPMD worker architecture is enabled. I.e., - VLLM_USE_RAY_SPMD_WORKER=1 - policy: The scheduling policy to use. "fcfs" (default) or "priority". - """ + task: str = "generate" # The task to use the model for. + + # Maximum number of tokens to be processed in a single iteration. + max_num_batched_tokens: int = field(default=None) # type: ignore + + # Maximum number of sequences to be processed in a single iteration. + max_num_seqs: int = 128 + + # Maximum length of a sequence (including prompt and generated text). + max_model_len: int = 8192 + + # The number of slots to allocate per sequence per + # step, beyond the known token ids. This is used in speculative + # decoding to store KV activations of tokens which may or may not be + # accepted. + num_lookahead_slots: int = 0 + + # Apply a delay (of delay factor multiplied by previous + # prompt latency) before scheduling next prompt. + delay_factor: float = 0.0 + + # If True, prefill requests can be chunked based + # on the remaining max_num_batched_tokens. + enable_chunked_prefill: bool = False + + is_multimodal_model: bool = False - def __init__(self, - task: _Task, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - is_multimodal_model: bool = False, - preemption_mode: Optional[str] = None, - num_scheduler_steps: int = 1, - multi_step_stream_outputs: bool = False, - send_delta_data: bool = False, - policy: str = "fcfs") -> None: - if max_num_batched_tokens is None: - if enable_chunked_prefill: - if num_scheduler_steps > 1: + # Whether to perform preemption by swapping or + # recomputation. If not specified, we determine the mode as follows: + # We use recomputation by default since it incurs lower overhead than + # swapping. However, when the sequence group has multiple sequences + # (e.g., beam search), recomputation is not currently supported. In + # such a case, we use swapping instead. + preemption_mode: Optional[str] = None + + num_scheduler_steps: int = 1 + + multi_step_stream_outputs: bool = False + + # Private API. If used, scheduler sends delta data to + # workers instead of an entire data. It should be enabled only + # when SPMD worker architecture is enabled. I.e., + # VLLM_USE_RAY_SPMD_WORKER=1 + send_delta_data: bool = False + + # The scheduling policy to use. "fcfs" (default) or "priority". + policy: str = "fcfs" + + chunked_prefill_enabled: bool = field(init=False) + + def __post_init__(self) -> None: + if self.max_num_batched_tokens is None: + if self.enable_chunked_prefill: + if self.num_scheduler_steps > 1: # Multi-step Chunked-Prefill doesn't allow prompt-chunking # for now. Have max_num_batched_tokens set to max_model_len # so we don't reject sequences on account of a short # max_num_batched_tokens. - max_num_batched_tokens = max(max_model_len, 2048) + self.max_num_batched_tokens = max(self.max_model_len, 2048) else: # It is the values that have the best balance between ITL # and TTFT on A100. Note it is not optimized for throughput. - max_num_batched_tokens = 512 + self.max_num_batched_tokens = 512 else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. - max_num_batched_tokens = max(max_model_len, 2048) + self.max_num_batched_tokens = max(self.max_model_len, 2048) - if task == "embedding": + if self.task == "embedding": # For embedding, choose specific value for higher throughput - max_num_batched_tokens = max( - max_num_batched_tokens, + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS, ) - if is_multimodal_model: + if self.is_multimodal_model: # The value needs to be at least the number of multimodal tokens - max_num_batched_tokens = max( - max_num_batched_tokens, + self.max_num_batched_tokens = max( + self.max_num_batched_tokens, _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS, ) - self.max_num_batched_tokens = max_num_batched_tokens - - if enable_chunked_prefill: + if self.enable_chunked_prefill: logger.info( "Chunked prefill is enabled with max_num_batched_tokens=%d.", self.max_num_batched_tokens) - self.task: Final = task - self.max_num_seqs = max_num_seqs - self.max_model_len = max_model_len - self.num_lookahead_slots = num_lookahead_slots - self.delay_factor = delay_factor - self.chunked_prefill_enabled = enable_chunked_prefill - self.preemption_mode = preemption_mode - self.num_scheduler_steps = num_scheduler_steps - self.multi_step_stream_outputs = multi_step_stream_outputs - self.send_delta_data = send_delta_data - self.policy = policy + self.chunked_prefill_enabled = self.enable_chunked_prefill self._verify_args() def _verify_args(self) -> None: @@ -2293,10 +2289,10 @@ class VllmConfig: model_config: ModelConfig = field(default=None, init=True) # type: ignore cache_config: CacheConfig = field(default=None, init=True) # type: ignore - parallel_config: ParallelConfig = field(default=None, - init=True) # type: ignore - scheduler_config: SchedulerConfig = field(default=None, - init=True) # type: ignore + parallel_config: ParallelConfig = field(default_factory=ParallelConfig, + init=True) + scheduler_config: SchedulerConfig = field(default_factory=SchedulerConfig, + init=True) device_config: DeviceConfig = field(default=None, init=True) # type: ignore load_config: LoadConfig = field(default=None, init=True) # type: ignore diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 88862a185ac75..82f1ef51255e9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -191,6 +191,7 @@ class EngineArgs: override_neuron_config: Optional[Dict[str, Any]] = None override_pooler_config: Optional[PoolerConfig] = None compilation_config: Optional[CompilationConfig] = None + worker_cls: str = "auto" def __post_init__(self): if not self.tokenizer: @@ -887,6 +888,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'compilers, using -O without space is also ' 'supported. -O3 is equivalent to -O 3.') + parser.add_argument( + '--worker-cls', + type=str, + default="auto", + help='The worker class to use for distributed execution.') + return parser @classmethod @@ -999,7 +1006,9 @@ def create_engine_config(self) -> VllmConfig: self.tokenizer_pool_extra_config, ), ray_workers_use_nsight=self.ray_workers_use_nsight, - distributed_executor_backend=self.distributed_executor_backend) + distributed_executor_backend=self.distributed_executor_backend, + worker_cls=self.worker_cls, + ) max_model_len = model_config.max_model_len use_long_context = max_model_len > 32768 diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 1542a2ae367eb..336f9bc8efb20 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -115,13 +115,8 @@ def _create_worker( local_rank: int = 0, rank: int = 0, ): - worker_module_name = "vllm.worker.cpu_worker" - worker_class_name = "CPUWorker" - wrapper = WorkerWrapperBase( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - ) + wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) assert self.distributed_init_method is not None diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index c65d0836e5ff7..7fa34456028dd 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger @@ -8,19 +8,14 @@ from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) -from vllm.worker.worker_base import WorkerBase, WorkerWrapperBase +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) -def create_worker(worker_module_name: str, worker_class_name: str, - worker_class_fn: Optional[Callable[[], Type[WorkerBase]]], - **kwargs): - wrapper = WorkerWrapperBase( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - worker_class_fn=worker_class_fn, - ) +def create_worker(**kwargs): + vllm_config = kwargs.get("vllm_config") + wrapper = WorkerWrapperBase(vllm_config=vllm_config) wrapper.init_worker(**kwargs) return wrapper.worker @@ -57,43 +52,11 @@ def _get_worker_kwargs( or (rank % self.parallel_config.tensor_parallel_size == 0), ) - def _get_worker_module_and_class( - self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: - worker_class_fn = None - if self.scheduler_config.is_multi_step: - worker_module_name = "vllm.worker.multi_step_worker" - worker_class_name = "MultiStepWorker" - elif self.speculative_config: - worker_module_name = "vllm.spec_decode.spec_decode_worker" - worker_class_name = "create_spec_worker" - else: - worker_module_name = "vllm.worker.worker" - worker_class_name = "Worker" - return (worker_module_name, worker_class_name, worker_class_fn) - - def _get_create_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict: - worker_kwargs = self._get_worker_kwargs(local_rank, rank, - distributed_init_method) - - (worker_module_name, worker_class_name, - worker_class_fn) = self._get_worker_module_and_class() - worker_kwargs.update( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - worker_class_fn=worker_class_fn, - ) - - return worker_kwargs - def _create_worker(self, local_rank: int = 0, rank: int = 0, distributed_init_method: Optional[str] = None): - return create_worker(**self._get_create_worker_kwargs( + return create_worker(**self._get_worker_kwargs( local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method)) diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py index 220e9eee87bb3..c9b7bfa71edfa 100644 --- a/vllm/executor/hpu_executor.py +++ b/vllm/executor/hpu_executor.py @@ -48,10 +48,7 @@ def _create_worker(self, local_rank: int = 0, rank: int = 0, distributed_init_method: Optional[str] = None): - wrapper = WorkerWrapperBase( - worker_module_name="vllm.worker.hpu_worker", - worker_class_name="HPUWorker", - ) + wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) return wrapper.worker diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 3eb14fb931925..a6c05a71d2b6f 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -90,7 +90,7 @@ def _init_executor(self) -> None: result_handler, partial( create_worker, - **self._get_create_worker_kwargs( + **self._get_worker_kwargs( rank=rank, local_rank=rank, distributed_init_method=distributed_init_method, diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 02d37cd7fbf23..31e6fdc3ab1bb 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -7,6 +7,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async) +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -25,10 +26,10 @@ def _init_executor(self) -> None: self._init_worker() def _init_worker(self): - from vllm.worker.neuron_worker import NeuronWorker + wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.driver_worker = NeuronWorker( + self.driver_worker = wrapper.init_worker( vllm_config=self.vllm_config, local_rank=0, rank=0, diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index d06b0ccb7906e..dcd4b7621381d 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -14,6 +14,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip, get_open_port, make_async) +from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -38,15 +39,12 @@ def _init_executor(self) -> None: self._init_worker() def _init_worker(self): - from vllm.worker.openvino_worker import OpenVINOWorker - assert ( - self.parallel_config.world_size == 1 - ), "OpenVINOExecutor only supports single CPU socket currently." + wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.driver_worker = OpenVINOWorker( + self.driver_worker = wrapper.init_worker( ov_core=self.ov_core, vllm_config=self.vllm_config, local_rank=0, diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 66bab2c686c67..810b0f06ff7b2 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -91,17 +91,6 @@ def _configure_ray_workers_use_nsight(self, return ray_remote_kwargs - def _get_worker_wrapper_args(self) -> Dict[str, Any]: - (worker_module_name, worker_class_name, - worker_class_fn) = self._get_worker_module_and_class() - - return dict( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - worker_class_fn=worker_class_fn, - trust_remote_code=self.model_config.trust_remote_code, - ) - # child class could overwrite this to return actual env vars. def _get_env_vars_to_be_updated(self): return self._env_vars_for_all_workers @@ -135,7 +124,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Create the workers. driver_ip = get_ip() - worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): if not bundle.get("GPU", 0): continue @@ -150,7 +138,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", num_gpus=num_gpus, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) if self.use_ray_spmd_worker: self.workers.append(worker) @@ -161,7 +149,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # as the resource holder for the driver process. self.driver_dummy_worker = worker self.driver_worker = RayWorkerWrapper( - **worker_wrapper_kwargs) + vllm_config=self.vllm_config) else: # Else, added to the list of workers. self.workers.append(worker) diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index a24bab6df370e..6fe8c6c403358 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -2,8 +2,7 @@ import os from collections import defaultdict from itertools import islice, repeat -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, - Type) +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import msgspec @@ -18,7 +17,6 @@ from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) -from vllm.worker.worker_base import WorkerBase if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -81,33 +79,6 @@ def shutdown(self) -> None: def finish_measurements(self): self._run_workers("finish_measurements") - def _get_worker_module_and_class( - self - ) -> Tuple[str, str, Optional[Callable[[], - Type[WorkerBase]]]]: # noqa: F821 - worker_class_fn = None - if self.scheduler_config.is_multi_step: - raise NotImplementedError( - "Multi-step execution is not implemented for HPU") - elif self.speculative_config: - raise NotImplementedError( - "Speculative decoding is not implemented for HPU") - else: - worker_module_name = "vllm.worker.hpu_worker" - worker_class_name = "HPUWorker" - return (worker_module_name, worker_class_name, worker_class_fn) - - def _get_worker_wrapper_args(self) -> Dict[str, Any]: - (worker_module_name, worker_class_name, - worker_class_fn) = self._get_worker_module_and_class() - - return dict( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - worker_class_fn=worker_class_fn, - trust_remote_code=self.model_config.trust_remote_code, - ) - def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): # Otherwise, the ray workers are allocated with a full GPU. @@ -128,7 +99,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Create the workers. driver_ip = get_ip() - worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): if not bundle.get("HPU", 0): continue @@ -144,7 +114,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", resources={'HPU': num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) if self.use_ray_spmd_worker: self.workers.append(worker) @@ -155,7 +125,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # as the resource holder for the driver process. self.driver_dummy_worker = worker self.driver_worker = RayWorkerWrapper( - **worker_wrapper_kwargs) + vllm_config=self.vllm_config) else: # Else, added to the list of workers. self.workers.append(worker) diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py index d02fecb46f007..c227b5e283c68 100644 --- a/vllm/executor/ray_tpu_executor.py +++ b/vllm/executor/ray_tpu_executor.py @@ -69,14 +69,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", placement_group_bundle_index=bundle_id, ) - assert self.speculative_config is None - if self.scheduler_config.is_multi_step: - worker_module_name = "vllm.worker.multi_step_tpu_worker" - worker_class_name = "MultiStepTPUWorker" - else: - worker_module_name = "vllm.worker.tpu_worker" - worker_class_name = "TPUWorker" - # GKE does not fetch environment information from metadata server # and instead sets these from within the Ray process. Therefore we # need to override the Ray environment variables manually. @@ -95,11 +87,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", resources={"TPU": 1}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - trust_remote_code=self.model_config.trust_remote_code, - ) + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) if override_env: worker.override_env_vars.remote(override_env) @@ -109,10 +97,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # as the resource holder for the driver process. self.driver_dummy_worker = worker self.driver_worker = RayWorkerWrapper( - worker_module_name=worker_module_name, - worker_class_name=worker_class_name, - trust_remote_code=self.model_config.trust_remote_code, - ) + vllm_config=self.vllm_config) else: # Else, added to the list of workers. self.workers.append(worker) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index ba6177e51a453..722b86a95ff8a 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional, Tuple, Type, Union +from typing import List, Optional, Union from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor @@ -6,7 +6,6 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async -from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) @@ -22,17 +21,6 @@ def _init_executor(self) -> None: GPUExecutor._init_executor(self) - def _get_worker_module_and_class( - self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: - worker_class_fn = None - if self.speculative_config is not None: - raise NotImplementedError( - "XPU does not support speculative decoding") - else: - worker_module_name = "vllm.worker.xpu_worker" - worker_class_name = "XPUWorker" - return (worker_module_name, worker_class_name, worker_class_fn) - def execute_model( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 0c4c916406223..9be9031dc3baf 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -84,3 +84,5 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "distributed executor backend."), parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "mp" + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index b38dd7c936896..cf0d41081a5aa 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,7 +4,7 @@ import os from functools import lru_cache, wraps -from typing import Callable, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar import pynvml import torch @@ -16,6 +16,11 @@ from .interface import DeviceCapability, Platform, PlatformEnum +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + logger = init_logger(__name__) _P = ParamSpec("_P") @@ -157,3 +162,17 @@ def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: " machine has no NVLink equipped.") return False return True + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 36d944b3f24b8..a8f568d31d5a7 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,7 +1,14 @@ +from typing import TYPE_CHECKING + import torch from .interface import Platform, PlatformEnum, _Backend +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + class HpuPlatform(Platform): _enum = PlatformEnum.HPU @@ -14,3 +21,19 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: @staticmethod def inference_mode(): return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + + scheduler_config = vllm_config.scheduler_config + if scheduler_config.is_multi_step: + raise NotImplementedError( + "Multi-step execution is not implemented for HPU") + + if vllm_config.speculative_config is not None: + raise NotImplementedError( + "Speculative decoding is not implemented for HPU") + + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker" diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 57e3c0dfae84c..4c4d778ed3dd4 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,5 +1,12 @@ +from typing import TYPE_CHECKING + from .interface import Platform, PlatformEnum +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON @@ -8,3 +15,10 @@ class NeuronPlatform(Platform): @classmethod def get_device_name(cls, device_id: int = 0) -> str: return "neuron" + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = \ + "vllm.worker.neuron_worker.NeuronWorker" diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 130b8eec1b386..33a41933e9fff 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import torch import vllm.envs as envs @@ -5,6 +7,11 @@ from .interface import Platform, PlatformEnum, _Backend +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + logger = init_logger(__name__) @@ -38,3 +45,14 @@ def is_openvino_gpu(self) -> bool: def is_pin_memory_available(self) -> bool: logger.warning("Pin memory is not supported on OpenViNO.") return False + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + assert ( + parallel_config.world_size == 1 + ), "OpenVINOExecutor only supports single CPU socket currently." + + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = \ + "vllm.worker.openvino_worker.OpenVINOWorker" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index c62241d8bb47b..3fe8c01c15787 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,5 +1,6 @@ import os from functools import lru_cache +from typing import TYPE_CHECKING import torch @@ -7,6 +8,11 @@ from .interface import DeviceCapability, Platform, PlatformEnum, _Backend +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + logger = init_logger(__name__) try: @@ -58,3 +64,17 @@ def get_device_name(cls, device_id: int = 0) -> str: def get_device_total_memory(cls, device_id: int = 0) -> int: device_props = torch.cuda.get_device_properties(device_id) return device_props.total_memory + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 863875ef5c2d6..513cfa54687dc 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -48,3 +48,15 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if compilation_config.backend == "": compilation_config.backend = "openxla" + + assert vllm_config.speculative_config is None, \ + "TPU does not support speculative decoding" + + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_tpu_worker.MultiStepTPUWorker" + else: + parallel_config.worker_cls = "vllm.worker.tpu_worker.TPUWorker" diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 536e17a5f93e8..b2ee0ef2f71cd 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -57,6 +57,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "mode.") model_config.enforce_eager = True + if vllm_config.speculative_config is not None: + raise NotImplementedError( + "XPU does not support speculative decoding") + # check and update parallel config parallel_config = vllm_config.parallel_config if (parallel_config.distributed_executor_backend is not None @@ -66,3 +70,5 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: " executor backend.", parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "ray" + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker" diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index cf8a4946a71c4..e7fec6d17eecd 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -1,9 +1,8 @@ import dataclasses -import importlib import os import time from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Type, Union import torch @@ -15,7 +14,7 @@ from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, - update_environment_variables) + resolve_obj_by_qualname, update_environment_variables) from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) @@ -411,23 +410,14 @@ class WorkerWrapperBase: We first instantiate the WorkerWrapper, which remembers the worker module and class name. Then, when we call `update_environment_variables`, and the real initialization happens in `init_worker`. - - If worker_class_fn is specified, it will be executed to get the worker - class. - Otherwise, the worker class will be obtained by dynamically importing it - using worker_module_name and worker_class_name. """ def __init__( self, - worker_module_name: str, - worker_class_name: str, - trust_remote_code: bool = False, - worker_class_fn: Optional[Callable[[], - Type[WorkerBase]]] = None) -> None: - self.worker_module_name = worker_module_name - self.worker_class_name = worker_class_name - self.worker_class_fn = worker_class_fn + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + trust_remote_code = vllm_config.model_config.trust_remote_code self.worker: Optional[WorkerBase] = None if trust_remote_code: # note: lazy import to avoid importing torch before initializing @@ -456,12 +446,8 @@ def init_worker(self, *args, **kwargs): from vllm.plugins import load_general_plugins load_general_plugins() - if self.worker_class_fn: - worker_class = self.worker_class_fn() - else: - mod = importlib.import_module(self.worker_module_name) - worker_class = getattr(mod, self.worker_class_name) - + worker_class = resolve_obj_by_qualname( + self.vllm_config.parallel_config.worker_cls) self.worker = worker_class(*args, **kwargs) assert self.worker is not None From b6374e09b0af4f8fa4c0b911b3cd1bd45342ead6 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 22 Nov 2024 15:01:56 +0800 Subject: [PATCH 107/255] [Bugfix] Fix Phi-3 BNB quantization with tensor parallel (#9948) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/layers/linear.py | 19 +++++++--- vllm/model_executor/model_loader/loader.py | 43 +++++++++++++++++++++- 2 files changed, 56 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 2471c160d66b7..46ef11e7d02c6 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -1,3 +1,4 @@ +import itertools from abc import abstractmethod from typing import Dict, List, Optional, Tuple @@ -41,12 +42,12 @@ def adjust_marlin_shard(param, shard_size, shard_offset): def adjust_bitsandbytes_4bit_shard(param: Parameter, - qkv_offsets: Dict[str, Tuple[int, int]], + shard_offsets: Dict[str, Tuple[int, int]], loaded_shard_id: str) -> Tuple[int, int]: """Adjust the quantization offsets and sizes for BitsAndBytes sharding.""" - total, _ = qkv_offsets["total"] - orig_offset, orig_size = qkv_offsets[loaded_shard_id] + total, _ = shard_offsets["total"] + orig_offset, orig_size = shard_offsets[loaded_shard_id] quantized_total = param.data.shape[0] quantized_offset = orig_offset * quantized_total // total @@ -499,9 +500,17 @@ def weight_loader(self, # Special case for Marlin. shard_size, shard_offset = adjust_marlin_shard( param, shard_size, shard_offset) + if use_bitsandbytes_4bit: - shard_size = loaded_weight.shape[output_dim] // 2 - shard_offset = shard_size * shard_id + index = list(itertools.accumulate([0] + self.output_sizes)) + orig_offsets = { + str(i): (index[i], size) + for i, size in enumerate(self.output_sizes) + } + orig_offsets["total"] = (self.output_size, 0) + shard_size, shard_offset = adjust_bitsandbytes_4bit_shard( + param, orig_offsets, str(shard_id)) + loaded_weight_shard = loaded_weight.narrow( output_dim, shard_offset, shard_size) self.weight_loader(param, loaded_weight_shard, shard_id) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 936c2fe415375..34e0860162260 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -5,6 +5,7 @@ import fnmatch import glob import inspect +import itertools import json import math import os @@ -27,7 +28,9 @@ get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (ReplicatedLinear, +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.quantization.base_config import ( QuantizeMethodBase) @@ -936,6 +939,34 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, end_index = total_size // tp_size * (tp_rank + 1) weight_sub_tensor = weight_tensor[..., start_index:end_index] + # Weights have fused on disk. In this case, we assume that the + # weight and module use same name. + elif any( + weight_name.startswith(module) + for module in self.maybe_fused_weights_modules): + # special case for fused weights + # get the size of each shard weight tensor + total_shard_sizes = next( + (sizes for module, sizes in + self.maybe_fused_weights_modules.items() + if weight_name.startswith(module))) + total_size = weight_tensor.size(0) + assert total_size == sum(total_shard_sizes) + # get the start/end index of each shard weight tensor + total_start_index = list( + itertools.accumulate([0] + total_shard_sizes))[:-1] + shard_weights_index = [ + (idx + size // tp_size * tp_rank, + idx + size // tp_size * (tp_rank + 1)) + for idx, size in zip(total_start_index, + total_shard_sizes) + ] + # slice and reorder the weight tensor + weight_tensor = [ + weight_tensor[start_index:end_index, ...] + for start_index, end_index in shard_weights_index + ] + weight_sub_tensor = torch.cat(weight_tensor, dim=0) # Shard by row else: total_size = weight_tensor.size(0) @@ -985,12 +1016,22 @@ def _load_weights(self, model_config: ModelConfig, else: self.target_modules = self.default_target_modules + # Modules whose weights might have fused on disk + # we need their output_sizes to make shard in flight correctly with TP + self.maybe_fused_weights_modules: Dict[str, List[int]] = {} + for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights # sharded. The reason for implementing it this way is to avoid new # static variable in the model implementation. if isinstance(module, (ReplicatedLinear, )): self.unsharded_weights_modules.append(name) + # `QKVParallelLinear` and `MergedColumnParallelLinear` might have + # fused weights on disk. We need to use the output sizes of these + # modules to shard the weights correctly. + elif isinstance(module, + (QKVParallelLinear, MergedColumnParallelLinear)): + self.maybe_fused_weights_modules[name] = module.output_sizes # In TP, these weights are partitioned along the column # dimension (dim=-1) elif isinstance(module, (RowParallelLinear, )): From 11fcf0e0661365f24bfff9591434a0cec640df6c Mon Sep 17 00:00:00 2001 From: Noam Gat Date: Fri, 22 Nov 2024 09:59:47 +0200 Subject: [PATCH 108/255] Remove token-adding chat embedding params (#10551) Signed-off-by: Noam Gat --- vllm/entrypoints/openai/protocol.py | 16 ---------------- vllm/entrypoints/openai/serving_embedding.py | 6 ++++-- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index a82212677f63a..9db5951e5fe5b 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -760,22 +760,6 @@ class EmbeddingChatRequest(OpenAIBaseModel): # doc: end-chat-embedding-pooling-params # doc: begin-chat-embedding-extra-params - add_generation_prompt: bool = Field( - default=True, - description= - ("If true, the generation prompt will be added to the chat template. " - "This is a parameter used by chat template in tokenizer config of the " - "model."), - ) - continue_final_message: bool = Field( - default=False, - description= - ("If this is set, the chat will be formatted so that the final " - "message in the chat is open-ended, without any EOS tokens. The " - "model will continue this message rather than starting a new one. " - "This allows you to \"prefill\" part of the model's response for it. " - "Cannot be used at the same time as `add_generation_prompt`."), - ) add_special_tokens: bool = Field( default=False, description=( diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 74ad7389784fc..c84a7d2d8e13e 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -148,8 +148,10 @@ async def create_embedding( chat_template=request.chat_template or self.chat_template, chat_template_content_format=self. chat_template_content_format, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, + # In embedding requests, we are not generating tokens, + # so there is no need to append extra tokens to the input + add_generation_prompt=False, + continue_final_message=False, truncate_prompt_tokens=truncate_prompt_tokens, add_special_tokens=request.add_special_tokens, ) From db100c5cdebc7140b57cbb40b20b5a28d7bff386 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Nov 2024 10:02:14 -0800 Subject: [PATCH 109/255] [bugfix] fix full graph tests (#10581) Signed-off-by: youkaichao --- tests/compile/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 078c6bf9ea1df..7c92d165d05f7 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -4,7 +4,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationLevel from vllm.platforms import current_platform TEST_MODELS = [ @@ -85,7 +85,7 @@ def check_full_graph_support(model, enforce_eager=True, tensor_parallel_size=tp_size, disable_custom_all_reduce=True, - compilation_config=CompilationConfig(level=optimization_level), + compilation_config=optimization_level, **model_kwargs) outputs = llm.generate(prompts, sampling_params) From eebad39f265606cfe35af4d1e0bea678516648a3 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Nov 2024 14:04:42 -0800 Subject: [PATCH 110/255] [torch.compile] support all attention backends (#10558) Signed-off-by: youkaichao --- tests/kernels/test_encoder_decoder_attn.py | 37 +- vllm/attention/backends/abstract.py | 23 +- vllm/attention/backends/blocksparse_attn.py | 2 +- vllm/attention/backends/flash_attn.py | 412 ++++++++---------- vllm/attention/backends/flashinfer.py | 280 +++++------- vllm/attention/backends/hpu_attn.py | 2 +- vllm/attention/backends/ipex_attn.py | 2 +- vllm/attention/backends/pallas.py | 2 +- vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/attention/backends/torch_sdpa.py | 12 +- vllm/attention/backends/utils.py | 4 +- vllm/attention/backends/xformers.py | 8 +- vllm/attention/layer.py | 81 +++- vllm/config.py | 9 +- vllm/forward_context.py | 27 +- vllm/model_executor/models/arctic.py | 15 +- vllm/model_executor/models/baichuan.py | 18 +- vllm/model_executor/models/bart.py | 48 +- vllm/model_executor/models/bloom.py | 14 +- vllm/model_executor/models/chameleon.py | 11 +- vllm/model_executor/models/chatglm.py | 25 +- vllm/model_executor/models/commandr.py | 14 +- vllm/model_executor/models/dbrx.py | 21 +- vllm/model_executor/models/deepseek.py | 9 +- vllm/model_executor/models/deepseek_v2.py | 3 +- vllm/model_executor/models/exaone.py | 3 +- vllm/model_executor/models/falcon.py | 22 +- vllm/model_executor/models/florence2.py | 10 +- vllm/model_executor/models/gemma.py | 3 +- vllm/model_executor/models/gemma2.py | 15 +- .../models/glm4_vision_encoder.py | 17 +- vllm/model_executor/models/gpt2.py | 3 +- vllm/model_executor/models/gpt_bigcode.py | 13 +- vllm/model_executor/models/gpt_j.py | 13 +- vllm/model_executor/models/gpt_neox.py | 13 +- vllm/model_executor/models/granite.py | 3 +- vllm/model_executor/models/granitemoe.py | 3 +- vllm/model_executor/models/internlm2.py | 21 +- vllm/model_executor/models/internlm2_ve.py | 23 +- vllm/model_executor/models/jais.py | 13 +- vllm/model_executor/models/jamba.py | 8 +- vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/minicpm.py | 11 +- vllm/model_executor/models/minicpm3.py | 9 +- vllm/model_executor/models/mixtral.py | 3 +- vllm/model_executor/models/mixtral_quant.py | 12 +- vllm/model_executor/models/molmo.py | 13 +- vllm/model_executor/models/mpt.py | 13 +- vllm/model_executor/models/nemotron.py | 3 +- vllm/model_executor/models/olmo.py | 16 +- vllm/model_executor/models/olmoe.py | 13 +- vllm/model_executor/models/orion.py | 11 +- vllm/model_executor/models/persimmon.py | 16 +- vllm/model_executor/models/phi.py | 17 +- vllm/model_executor/models/phi3_small.py | 26 +- vllm/model_executor/models/phimoe.py | 8 +- vllm/model_executor/models/qwen.py | 11 +- vllm/model_executor/models/qwen2_moe.py | 9 +- vllm/model_executor/models/solar.py | 1 + vllm/model_executor/models/stablelm.py | 16 +- vllm/model_executor/models/starcoder2.py | 15 +- vllm/model_executor/models/xverse.py | 10 +- vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 4 + vllm/platforms/openvino.py | 1 + vllm/platforms/rocm.py | 1 + vllm/platforms/tpu.py | 1 + vllm/platforms/xpu.py | 1 + vllm/spec_decode/draft_model_runner.py | 3 +- vllm/utils.py | 3 +- vllm/v1/attention/backends/flash_attn.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 4 +- vllm/worker/embedding_model_runner.py | 2 +- vllm/worker/enc_dec_model_runner.py | 2 +- vllm/worker/model_runner.py | 4 +- 77 files changed, 879 insertions(+), 651 deletions(-) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 3d3724c50421d..c4b72ba6bf4ee 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -18,8 +18,10 @@ from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.platforms import current_platform +from vllm.plugins import set_current_vllm_config # List of support backends for encoder/decoder models LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] @@ -594,6 +596,7 @@ def _run_encoder_attention_test( encoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, test_pt: TestPoint, + vllm_config: VllmConfig, ) -> torch.Tensor: ''' Run encoder attention. @@ -623,7 +626,7 @@ def _run_encoder_attention_test( attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - with set_forward_context(attn_metadata): + with set_forward_context(attn_metadata, vllm_config): # In the test setup the shape of the query is # [batch_size, seq_len, num_heads, head_size]. However # the attention backend expect the shape to be @@ -648,6 +651,7 @@ def _run_decoder_self_attention_test( decoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, test_pt: TestPoint, + vllm_config: VllmConfig, ) -> torch.Tensor: ''' Run decoder self-attention test. @@ -677,7 +681,7 @@ def _run_decoder_self_attention_test( kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - with set_forward_context(attn_metadata): + with set_forward_context(attn_metadata, vllm_config): # In the test setup the shape of the query is # [batch_size, seq_len, num_heads, head_size]. However # the attention backend expect the shape to be @@ -701,6 +705,7 @@ def _run_encoder_decoder_cross_attention_test( cross_test_params: Optional[PhaseTestParameters], attn_metadata: AttentionMetadata, test_pt: TestPoint, + vllm_config: VllmConfig, ) -> torch.Tensor: ''' Run encoder/decoder cross-attention test. @@ -748,7 +753,7 @@ def _run_encoder_decoder_cross_attention_test( cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) - with set_forward_context(attn_metadata): + with set_forward_context(attn_metadata, vllm_config): # In the test setup the shape of the query is # [batch_size, seq_len, num_heads, head_size]. However # the attention backend expect the shape to be @@ -839,7 +844,9 @@ def test_encoder_only( # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init - test_rsrcs = _make_test_resources(test_pt) + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + test_rsrcs = _make_test_resources(test_pt) # Construct encoder attention test params (only used # during prefill) @@ -863,7 +870,8 @@ def test_encoder_only( test_rsrcs.attn, enc_test_params, prephase_attn_metadata, - test_pt=test_pt)) + test_pt=test_pt, + vllm_config=vllm_config)) # - Is encoder attention result correct? assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, @@ -960,7 +968,9 @@ def test_e2e_enc_dec_attn( # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init - test_rsrcs = _make_test_resources(test_pt) + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + test_rsrcs = _make_test_resources(test_pt) # Construct encoder attention test params (only used # during prefill) @@ -1011,7 +1021,8 @@ def test_e2e_enc_dec_attn( enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, enc_test_params, prephase_attn_metadata, - test_pt=test_pt) + test_pt=test_pt, + vllm_config=vllm_config) # - Is encoder attention result correct? assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, @@ -1023,7 +1034,8 @@ def test_e2e_enc_dec_attn( test_rsrcs, prephase_dec_test_params, prephase_attn_metadata, - test_pt=test_pt) + test_pt=test_pt, + vllm_config=vllm_config) # - Is prefill decoder self-attention correct? assert_actual_matches_ideal(prephase_dec_test_params, @@ -1037,7 +1049,8 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, prephase_cross_test_params, prephase_attn_metadata, - test_pt=test_pt) + test_pt=test_pt, + vllm_config=vllm_config) # - Is prefill encoder/decoder cross-attention correct? assert_actual_matches_ideal(prephase_cross_test_params, @@ -1061,7 +1074,8 @@ def test_e2e_enc_dec_attn( test_rsrcs, decphase_dec_test_params, decphase_attn_metadata, - test_pt=test_pt) + test_pt=test_pt, + vllm_config=vllm_config) # - Is decode-phase decoder self-attention correct? assert_actual_matches_ideal(decphase_dec_test_params, @@ -1075,7 +1089,8 @@ def test_e2e_enc_dec_attn( decphase_dec_test_params, None, decphase_attn_metadata, - test_pt=test_pt) + test_pt=test_pt, + vllm_config=vllm_config) # - Is decode-phase encoder/decoder cross-attention correct? assert_actual_matches_ideal(decphase_cross_test_params, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index a504cb1f7e318..5be2d83346d00 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,7 +1,6 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from enum import Enum, auto from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Set, Tuple, Type, TypeVar) @@ -15,13 +14,19 @@ ModelRunnerInputBuilderBase) -class AttentionType(Enum): - DECODER = auto() # Decoder attention between previous layer Q/K/V - ENCODER = auto( - ) # Encoder attention between previous layer Q/K/V for encoder-decoder - ENCODER_ONLY = auto() # Encoder attention between previous layer Q/K/V - ENCODER_DECODER = auto( - ) # Attention between dec. Q and enc. K/V for encoder-decoder +class AttentionType: + """ + Attention type. + Use string to be compatible with `torch.compile`. + """ + # Decoder attention between previous layer Q/K/V + DECODER = "decoder" + # Encoder attention between previous layer Q/K/V for encoder-decoder + ENCODER = "encoder" + # Encoder attention between previous layer Q/K/V + ENCODER_ONLY = "encoder_only" + # Attention between dec. Q and enc. K/V for encoder-decoder + ENCODER_DECODER = "encoder_decoder" class AttentionBackend(ABC): @@ -241,6 +246,6 @@ def forward( attn_metadata: T, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 409a42187f46c..94002e36db2bb 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -354,7 +354,7 @@ def forward( attn_metadata: BlocksparseFlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 314822b695722..32738d1043b1d 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -16,10 +16,8 @@ compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set, is_block_tables_empty) -from vllm.forward_context import get_forward_context from vllm.multimodal import MultiModalPlaceholderMap -from vllm.utils import (async_tensor_h2d, direct_register_custom_op, - make_tensor_with_pad) +from vllm.utils import async_tensor_h2d, make_tensor_with_pad if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -639,7 +637,7 @@ def forward( attn_metadata: FlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -668,23 +666,174 @@ def forward( "requires setting cross-attention " "metadata attributes.") - output = torch.ops.vllm.unified_flash_attention( - query, - key, - value, - self.num_heads, - self.head_size, - self.num_kv_heads, - kv_cache, - self.kv_cache_dtype, - k_scale, - v_scale, - self.scale, - attn_type.value, - self.sliding_window, - self.alibi_slopes, - self.logits_soft_cap, - ) + num_heads: int = self.num_heads + head_size: int = self.head_size + num_kv_heads: int = self.num_kv_heads + kv_cache_dtype: str = self.kv_cache_dtype + softmax_scale: float = self.scale + window_size = self.sliding_window + alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes + logits_soft_cap: Optional[float] = self.logits_soft_cap + + num_tokens, hidden_size = query.shape + + # Reshape the query, key, and value tensors. + query = query.view(-1, num_heads, head_size) + if (key is not None) and (value is not None): + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + if kv_cache.numel() > 0: + key_cache = kv_cache[0] + value_cache = kv_cache[1] + # We skip updating the KV cache under two conditions: + # a. When the Attention Type is ENCODER. In this phase, we compute + # only the encoder attention without updating the cache. + # b. When both Key and Value are None. This occurs during + # cross-attention computation in the decoding phase, where the + # KV cache is already populated with the cross-attention + # tensor. Thus, we skip cache updates during this time. + if (attn_type != AttentionType.ENCODER) and (key is not None) and ( + value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory + # profiling run. + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + kv_cache[0], + kv_cache[1], + updated_slot_mapping.flatten(), # type: ignore[union-attr] + kv_cache_dtype, + k_scale, + v_scale, + ) + + (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) = \ + get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) + decode_query = query[num_prefill_query_tokens:] + # QKV for prefill. + query = query[:num_prefill_query_tokens] + assert query.shape[0] == num_prefill_query_tokens + assert decode_query.shape[0] == num_decode_query_tokens + + prefill_output: Optional[torch.Tensor] = None + decode_output: Optional[torch.Tensor] = None + if prefill_meta := attn_metadata.prefill_metadata: + # Prompt run. + if (kv_cache.numel() == 0 or prefill_meta.block_tables is None + or prefill_meta.block_tables.numel() == 0): + # normal attention + # When block_tables are not filled, it means q and k are the + # prompt, and they have the same length. + q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ + _get_query_key_seq_metadata(prefill_meta, True, attn_type) + + key = key[:num_prefill_kv_tokens] + value = value[:num_prefill_kv_tokens] + + prefill_output = flash_attn_varlen_func( + q=query, + k=key, + v=value, + cu_seqlens_q=q_seq_start_loc, + cu_seqlens_k=k_seq_start_loc, + max_seqlen_q=q_seq_len, + max_seqlen_k=k_seq_len, + softmax_scale=softmax_scale, + causal=_get_causal_option(attn_type), + window_size=window_size, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + ) + else: + # prefix-enabled attention + assert attn_type == AttentionType.DECODER, ( + "Only decoder-only models support prefix caching") + assert prefill_meta.seq_lens is not None + max_seq_len = max(prefill_meta.seq_lens) + prefill_output = flash_attn_varlen_func( # noqa + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + cu_seqlens_k=prefill_meta.seq_start_loc, + max_seqlen_k=max_seq_len, + softmax_scale=softmax_scale, + causal=True, + window_size=window_size, + alibi_slopes=alibi_slopes, + block_table=prefill_meta.block_tables, + softcap=logits_soft_cap, + ) + + if decode_meta := attn_metadata.decode_metadata: + # Decoding run. + # Use flash_attn_varlen_func kernel for speculative decoding + # because different queries might have different lengths. + + assert decode_meta.max_decode_query_len is not None + # use only for actual varlen decoding + if decode_meta.max_decode_query_len > 1: + assert attn_type == AttentionType.DECODER, ( + "Only decoder-only models support max_decode_query_len > 1" + ) + decode_output = flash_attn_varlen_func( + q=decode_query, + k=key_cache, + v=value_cache, + cu_seqlens_q=decode_meta.query_start_loc, + max_seqlen_q=decode_meta.max_decode_query_len, + cu_seqlens_k=decode_meta.seq_start_loc, + max_seqlen_k=decode_meta.max_decode_seq_len, + softmax_scale=softmax_scale, + causal=True, + window_size=window_size, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + block_table=decode_meta.block_tables, + ) + else: + # Use flash_attn_with_kvcache for normal decoding. + ( + seq_lens_arg, + _, + block_tables_arg, + ) = get_seq_len_block_table_args(decode_meta, False, attn_type) + decode_output = flash_attn_with_kvcache( + q=decode_query.unsqueeze(1), + k_cache=key_cache, + v_cache=value_cache, + block_table=block_tables_arg, + cache_seqlens=seq_lens_arg, + softmax_scale=softmax_scale, + causal=True, + window_size=window_size, + alibi_slopes=alibi_slopes, + softcap=logits_soft_cap, + ).squeeze(1) + + if prefill_output is None: + assert decode_output is not None + return decode_output.view(num_decode_query_tokens, hidden_size) + if decode_output is None: + assert prefill_output is not None + return prefill_output.view(num_prefill_query_tokens, hidden_size) + + assert decode_meta is not None + decode_output = decode_output.squeeze(1) + output = torch.cat([prefill_output, decode_output], dim=0) + return output.view(num_tokens, hidden_size) return output @@ -692,7 +841,7 @@ def forward( def _get_query_key_seq_metadata( attn_metadata, is_prompt: bool, - attn_type: AttentionType, + attn_type: str, ) -> tuple: """ Returns sequence metadata for key and query based on the specified @@ -754,7 +903,7 @@ def _get_query_key_seq_metadata( raise AttributeError(f"Invalid attention type {str(attn_type)}") -def _get_causal_option(attn_type: AttentionType) -> bool: +def _get_causal_option(attn_type: str) -> bool: """ Determine whether the given attention type is suitable for causal attention mechanisms. @@ -770,220 +919,3 @@ def _get_causal_option(attn_type: AttentionType) -> bool: return not (attn_type == AttentionType.ENCODER or attn_type == AttentionType.ENCODER_ONLY or attn_type == AttentionType.ENCODER_DECODER) - - -def unified_flash_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - attn_type_int_val: int, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - - # Convert integer attn_type to enum - try: - attn_type = AttentionType(attn_type_int_val) - except ValueError as err: - raise AttributeError( - f"Invalid attention type {str(attn_type_int_val)}") from err - - current_metadata = get_forward_context() - assert current_metadata is not None - assert isinstance(current_metadata, FlashAttentionMetadata) - attn_metadata: FlashAttentionMetadata = current_metadata - - num_tokens, hidden_size = query.shape - - # Reshape the query, key, and value tensors. - query = query.view(-1, num_heads, head_size) - if (key is not None) and (value is not None): - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - - if kv_cache.numel() > 0: - key_cache = kv_cache[0] - value_cache = kv_cache[1] - # We skip updating the KV cache under two conditions: - # a. When the Attention Type is ENCODER. In this phase, we compute - # only the encoder attention without updating the cache. - # b. When both Key and Value are None. This occurs during - # cross-attention computation in the decoding phase, where the KV - # cache is already populated with the cross-attention tensor. - # Thus, we skip cache updates during this time. - if (attn_type != AttentionType.ENCODER) and (key is not None) and ( - value is not None): - if attn_type == AttentionType.ENCODER_DECODER: - # Update cross-attention KV cache (prefill-only) - updated_slot_mapping = attn_metadata.cross_slot_mapping - else: - # Update self-attention KV cache (prefill/decode) - updated_slot_mapping = attn_metadata.slot_mapping - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - updated_slot_mapping.flatten(), # type: ignore[union-attr] - kv_cache_dtype, - k_scale, - v_scale, - ) - - (num_prefill_query_tokens, num_prefill_kv_tokens, - num_decode_query_tokens) = \ - get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) - decode_query = query[num_prefill_query_tokens:] - # QKV for prefill. - query = query[:num_prefill_query_tokens] - assert query.shape[0] == num_prefill_query_tokens - assert decode_query.shape[0] == num_decode_query_tokens - - prefill_output: Optional[torch.Tensor] = None - decode_output: Optional[torch.Tensor] = None - if prefill_meta := attn_metadata.prefill_metadata: - # Prompt run. - if (kv_cache.numel() == 0 or prefill_meta.block_tables is None - or prefill_meta.block_tables.numel() == 0): - # normal attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. - q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ - _get_query_key_seq_metadata(prefill_meta, True, attn_type) - - key = key[:num_prefill_kv_tokens] - value = value[:num_prefill_kv_tokens] - - prefill_output = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=q_seq_start_loc, - cu_seqlens_k=k_seq_start_loc, - max_seqlen_q=q_seq_len, - max_seqlen_k=k_seq_len, - softmax_scale=softmax_scale, - causal=_get_causal_option(attn_type), - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - ) - else: - # prefix-enabled attention - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support prefix caching") - assert prefill_meta.seq_lens is not None - max_seq_len = max(prefill_meta.seq_lens) - prefill_output = flash_attn_varlen_func( # noqa - q=query, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_k=max_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - block_table=prefill_meta.block_tables, - softcap=logits_soft_cap, - ) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - # Use flash_attn_varlen_func kernel for speculative decoding - # because different queries might have different lengths. - - assert decode_meta.max_decode_query_len is not None - # use only for actual varlen decoding - if decode_meta.max_decode_query_len > 1: - assert attn_type == AttentionType.DECODER, ( - "Only decoder-only models support max_decode_query_len > 1") - decode_output = flash_attn_varlen_func( - q=decode_query, - k=key_cache, - v=value_cache, - cu_seqlens_q=decode_meta.query_start_loc, - max_seqlen_q=decode_meta.max_decode_query_len, - cu_seqlens_k=decode_meta.seq_start_loc, - max_seqlen_k=decode_meta.max_decode_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - block_table=decode_meta.block_tables, - ) - else: - # Use flash_attn_with_kvcache for normal decoding. - ( - seq_lens_arg, - _, - block_tables_arg, - ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - decode_output = flash_attn_with_kvcache( - q=decode_query.unsqueeze(1), - k_cache=key_cache, - v_cache=value_cache, - block_table=block_tables_arg, - cache_seqlens=seq_lens_arg, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, - softcap=logits_soft_cap, - ).squeeze(1) - - if prefill_output is None: - assert decode_output is not None - return decode_output.view(num_decode_query_tokens, hidden_size) - if decode_output is None: - assert prefill_output is not None - return prefill_output.view(num_prefill_query_tokens, hidden_size) - - assert decode_meta is not None - decode_output = decode_output.squeeze(1) - output = torch.cat([prefill_output, decode_output], dim=0) - return output.view(num_tokens, hidden_size) - - -def unified_flash_attention_fake( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - attn_type_int_val: int, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - return torch.empty_like(query) - - -direct_register_custom_op( - op_name="unified_flash_attention", - op_func=unified_flash_attention, - mutates_args=["kv_cache"], - fake_impl=unified_flash_attention_fake, -) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index b61c660e3e280..1a2024705eb04 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -30,9 +30,8 @@ compute_slot_mapping_start_idx, is_block_tables_empty) from vllm.attention.ops.paged_attn import PagedAttention -from vllm.forward_context import get_forward_context -from vllm.utils import (async_tensor_h2d, direct_register_custom_op, - get_kv_cache_torch_dtype, make_tensor_with_pad) +from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, + make_tensor_with_pad) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -774,7 +773,7 @@ def forward( attn_metadata: FlashInferMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " @@ -782,174 +781,117 @@ def forward( "are not implemented for " "FlashInferImpl") - return torch.ops.vllm.unified_flash_infer( - query, - key, - value, - self.num_heads, - self.head_size, - self.num_kv_heads, - kv_cache, - self.kv_cache_dtype, - k_scale, - v_scale, - self.scale, - self.sliding_window, - self.alibi_slopes, - self.logits_soft_cap, - ) - - -def unified_flash_infer( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - - current_metadata = get_forward_context() - assert current_metadata is not None - assert isinstance(current_metadata, FlashInferMetadata) - attn_metadata: FlashInferMetadata = current_metadata - - num_tokens, hidden_size = query.shape - query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - - if kv_cache.numel() > 0: - # Use the same reshape and cache kernel as flash attention. - ops.reshape_and_cache_flash( - key, - value, - kv_cache[:, 0], - kv_cache[:, 1], - attn_metadata.slot_mapping.flatten(), - kv_cache_dtype, - k_scale, - v_scale, - ) - # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 - # to process the cache when the kv_cache_dtype is fp8 - if kv_cache_dtype.startswith("fp8"): - torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( - kv_cache_dtype) - kv_cache = kv_cache.view(torch_dtype) - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa - assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa - query = query.contiguous() # Flashinfer requires query to be contiguous - # Query for decode. KV is not needed because it is already cached. - # QKV for prefill. - decode_query = query[num_prefill_tokens:] - query = query[:num_prefill_tokens] - - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens - - window_left = window_size[0] if window_size is not None else -1 - - prefill_output: Optional[torch.Tensor] = None - decode_output: Optional[torch.Tensor] = None - if prefill_meta := attn_metadata.prefill_metadata: - # We will use flash attention for prefill - # when kv_cache is not provided. - # This happens when vllm runs the profiling to - # determine the number of blocks. - if kv_cache.numel() == 0: - prefill_output = flash_attn_varlen_func( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, - softmax_scale=softmax_scale, - causal=True, - window_size=window_size, - alibi_slopes=alibi_slopes, + num_heads: int = self.num_heads + head_size: int = self.head_size + num_kv_heads: int = self.num_kv_heads + kv_cache_dtype: str = self.kv_cache_dtype + softmax_scale: float = self.scale + window_size = self.sliding_window + alibi_slopes = self.alibi_slopes + logits_soft_cap = self.logits_soft_cap + + num_tokens, hidden_size = query.shape + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + if kv_cache.numel() > 0: + # Use the same reshape and cache kernel as flash attention. + ops.reshape_and_cache_flash( + key, + value, + kv_cache[:, 0], + kv_cache[:, 1], + attn_metadata.slot_mapping.flatten(), + kv_cache_dtype, + k_scale, + v_scale, ) - else: - assert prefill_meta is not None - assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( - query, + # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 + # to process the cache when the kv_cache_dtype is fp8 + if kv_cache_dtype.startswith("fp8"): + torch_dtype = FlashInferBackend.get_fp8_dtype_for_flashinfer( + kv_cache_dtype) + kv_cache = kv_cache.view(torch_dtype) + + num_prefill_tokens = attn_metadata.num_prefill_tokens + num_decode_tokens = attn_metadata.num_decode_tokens + assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \ + f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa + assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \ + f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa + query = query.contiguous( + ) # Flashinfer requires query to be contiguous + # Query for decode. KV is not needed because it is already cached. + # QKV for prefill. + decode_query = query[num_prefill_tokens:] + query = query[:num_prefill_tokens] + + key = key[:num_prefill_tokens] + value = value[:num_prefill_tokens] + + assert query.shape[0] == num_prefill_tokens + assert decode_query.shape[0] == num_decode_tokens + + window_left = window_size[0] if window_size is not None else -1 + + prefill_output: Optional[torch.Tensor] = None + decode_output: Optional[torch.Tensor] = None + if prefill_meta := attn_metadata.prefill_metadata: + # We will use flash attention for prefill + # when kv_cache is not provided. + # This happens when vllm runs the profiling to + # determine the number of blocks. + if kv_cache.numel() == 0: + prefill_output = flash_attn_varlen_func( + q=query, + k=key, + v=value, + cu_seqlens_q=prefill_meta.seq_start_loc, + cu_seqlens_k=prefill_meta.seq_start_loc, + max_seqlen_q=prefill_meta.max_prefill_seq_len, + max_seqlen_k=prefill_meta.max_prefill_seq_len, + softmax_scale=softmax_scale, + causal=True, + window_size=window_size, + alibi_slopes=alibi_slopes, + ) + else: + assert prefill_meta is not None + assert prefill_meta.prefill_wrapper is not None + prefill_output = prefill_meta.prefill_wrapper.forward( + query, + kv_cache, + logits_soft_cap=logits_soft_cap, + causal=True, + k_scale=k_scale, + v_scale=v_scale, + window_left=window_left) + if decode_meta := attn_metadata.decode_metadata: + assert decode_meta is not None + assert decode_meta.decode_wrapper is not None + decode_output = decode_meta.decode_wrapper.forward( + decode_query, kv_cache, + sm_scale=softmax_scale, logits_soft_cap=logits_soft_cap, - causal=True, k_scale=k_scale, v_scale=v_scale, window_left=window_left) - if decode_meta := attn_metadata.decode_metadata: - assert attn_metadata.decode_metadata is not None - assert attn_metadata.decode_metadata.decode_wrapper is not None - decode_output = attn_metadata.decode_metadata.decode_wrapper.forward( - decode_query, - kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, - k_scale=k_scale, - v_scale=v_scale, - window_left=window_left) - - if prefill_output is None and decode_output is not None: - # Decode only batch. - output, num_tokens = decode_output, num_decode_tokens - elif decode_output is None and prefill_output is not None: - # Prefill only batch. - output, num_tokens = prefill_output, num_prefill_tokens - else: - # Chunked prefill batch does not work with speculative decoding in - # FlashInfer backend, so the query length for decode should be 1. - assert prefill_output is not None - assert decode_output is not None - assert decode_meta is not None - assert decode_meta.decode_query_len == 1 - decode_output = decode_output.squeeze(1) - output = torch.cat([prefill_output, decode_output], dim=0) - return output.view(num_tokens, hidden_size) - - -def unified_flash_infer_fake( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - return torch.empty_like(query).contiguous() - - -direct_register_custom_op( - op_name="unified_flash_infer", - op_func=unified_flash_infer, - mutates_args=["kv_cache"], - fake_impl=unified_flash_infer_fake, -) + + if prefill_output is None and decode_output is not None: + # Decode only batch. + output, num_tokens = decode_output, num_decode_tokens + elif decode_output is None and prefill_output is not None: + # Prefill only batch. + output, num_tokens = prefill_output, num_prefill_tokens + else: + # Chunked prefill batch does not work with speculative decoding in + # FlashInfer backend, so the query length for decode should be 1. + assert prefill_output is not None + assert decode_output is not None + assert decode_meta is not None + assert decode_meta.decode_query_len == 1 + decode_output = decode_output.squeeze(1) + output = torch.cat([prefill_output, decode_output], dim=0) + return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index a8f4b09b67274..4a3ddd5db94e5 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -140,7 +140,7 @@ def forward( attn_metadata: HPUAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 87bdb1e0e6565..3b0d51ea4a3d8 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -172,7 +172,7 @@ def forward( attn_metadata: IpexAttnMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index eeab8731a2c39..5988be0e6b687 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -150,7 +150,7 @@ def forward( attn_metadata: PallasMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with Pallas attention. diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 2bae370eaa90f..6a494f4e73cb4 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -414,7 +414,7 @@ def forward( attn_metadata: ROCmFlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 3d025df26a7a1..16e044b618c40 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -141,7 +141,7 @@ def decode_metadata(self) -> Optional["TorchSDPAMetadata"]: def get_seq_lens( self, - attn_type: AttentionType, + attn_type: str, ): ''' Extract appropriate sequence lengths from attention metadata @@ -174,7 +174,7 @@ def get_seq_lens( def get_attn_bias( self, - attn_type: AttentionType, + attn_type: str, ) -> Optional[List[torch.Tensor]]: ''' Extract appropriate attention bias from attention metadata @@ -203,7 +203,7 @@ def get_attn_bias( def set_attn_bias( self, attn_bias: List[torch.Tensor], - attn_type: AttentionType, + attn_type: str, ) -> None: ''' Update appropriate attention bias field of attention metadata, @@ -229,7 +229,7 @@ def set_attn_bias( def get_seq_len_block_table_args( self, - attn_type: AttentionType, + attn_type: str, ) -> tuple: ''' The particular choice of sequence-length- and block-table-related @@ -426,7 +426,7 @@ def forward( attn_metadata: TorchSDPAMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. @@ -574,7 +574,7 @@ def _run_sdpa_forward( key: torch.Tensor, value: torch.Tensor, attn_metadata: TorchSDPAMetadata, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> None: if self.num_kv_heads != self.num_heads: key = key.repeat_interleave(self.num_queries_per_kv, dim=1) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 12800668af223..56cc43430301f 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -478,7 +478,7 @@ def is_all_cross_attn_metadata_set(attn_metadata): def get_seq_len_block_table_args( attn_metadata, is_prompt: bool, - attn_type: AttentionType, + attn_type: str, ) -> tuple: ''' The particular choice of sequence-length- and block-table-related @@ -529,7 +529,7 @@ def get_seq_len_block_table_args( def get_num_prefill_decode_query_kv_tokens( attn_metadata, - attn_type: AttentionType, + attn_type: str, ) -> Tuple[int, int, int]: """ Calculate the number of prefill and decode tokens for query, key/value diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 83d03606524dc..292575a8736bc 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -284,7 +284,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]: def _get_attn_bias( attn_metadata: XFormersMetadata, - attn_type: AttentionType, + attn_type: str, ) -> Optional[AttentionBias]: ''' Extract appropriate attention bias from attention metadata @@ -314,7 +314,7 @@ def _get_attn_bias( def _set_attn_bias( attn_metadata: XFormersMetadata, attn_bias: List[Optional[AttentionBias]], - attn_type: AttentionType, + attn_type: str, ) -> None: ''' Update appropriate attention bias field of attention metadata, @@ -416,7 +416,7 @@ def forward( attn_metadata: "XFormersMetadata", k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -617,7 +617,7 @@ def _run_memory_efficient_xformers_forward( key: torch.Tensor, value: torch.Tensor, attn_metadata: XFormersMetadata, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: """Attention for 1D query of multiple prompts. Multiple prompt tokens are flattened in to `query` input. diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 33d05cbd3fe01..8acbeaf12b0cf 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -4,12 +4,17 @@ import torch import torch.nn as nn +import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import get_attn_backend from vllm.config import CacheConfig +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod +from vllm.platforms import current_platform +from vllm.plugins import get_current_vllm_config +from vllm.utils import direct_register_custom_op class Attention(nn.Module): @@ -86,6 +91,18 @@ def __init__( alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap) + # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how + # torch.compile works by registering the attention as one giant + # opaque custom op. For other platforms, we directly call them + # and let torch.compile handle them. + self.use_direct_call = envs.VLLM_USE_V1 or not ( + current_platform.is_cuda_alike() or current_platform.is_cpu()) + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + compilation_config.static_forward_context[prefix] = self + self.layer_name = prefix + def forward( self, query: torch.Tensor, @@ -93,17 +110,22 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, - attn_type: AttentionType = AttentionType.DECODER, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: - return self.impl.forward(query, - key, - value, - kv_cache, - attn_metadata, - self._k_scale, - self._v_scale, - attn_type=attn_type) + if self.use_direct_call: + return self.impl.forward(query, + key, + value, + kv_cache, + attn_metadata, + self._k_scale, + self._v_scale, + attn_type=attn_type) + else: + return torch.ops.vllm.unified_attention(query, key, value, + kv_cache, attn_type, + self.layer_name) def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore @@ -112,3 +134,44 @@ def extra_repr(self) -> str: s += f", scale={self.impl.scale}" # type: ignore s += f", backend={self.impl.__class__.__name__}" return s + + +def unified_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_type: str, + layer_name: str, +) -> torch.Tensor: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.dynamic_forward_context + self = forward_context.static_forward_context[layer_name] + return self.impl.forward(query, + key, + value, + kv_cache, + attn_metadata, + self._k_scale, + self._v_scale, + attn_type=attn_type) + + +def unified_attention_fake( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_type: str, + layer_name: str, +) -> torch.Tensor: + return torch.empty_like(query).contiguous() + + +direct_register_custom_op( + op_name="unified_attention", + op_func=unified_attention, + mutates_args=["kv_cache"], + fake_impl=unified_attention_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/config.py b/vllm/config.py index b5f2116e3557b..bb02c2ad4c7d4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2135,8 +2135,7 @@ class CompilationConfig(BaseModel): backend: str = "" custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ - "vllm.unified_flash_attention", - "vllm.unified_flash_infer", + "vllm.unified_attention", "vllm.unified_v1_flash_attention", ]) @@ -2197,6 +2196,11 @@ def model_post_init(self, __context: Any) -> None: enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr + # Per-model forward context + # Mainly used to store attention cls + # Map from layer name to the attention cls + static_forward_context: Dict[str, Any] = PrivateAttr + @classmethod def from_cli(cls, cli_value: str) -> "CompilationConfig": """Parse the CLI value for the compilation config.""" @@ -2228,6 +2232,7 @@ def model_post_init(self, __context: Any) -> None: self.enabled_custom_ops = Counter() self.disabled_custom_ops = Counter() + self.static_forward_context = {} def init_backend(self) -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 777747505e14a..aaa3e4bb3a1e8 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -1,21 +1,38 @@ from contextlib import contextmanager -from typing import Any +from dataclasses import dataclass +from typing import Any, Dict, Optional -_forward_context: Any = None +from vllm.config import VllmConfig -def get_forward_context() -> Any: +@dataclass +class ForwardContext: + static_forward_context: Dict[str, Any] + # TODO: extend to support per-layer dynamic forward context + dynamic_forward_context: Any + + +_forward_context: Optional[ForwardContext] = None + + +def get_forward_context() -> ForwardContext: """Get the current forward context.""" + assert _forward_context is not None, ( + "Forward context is not set. " + "Please use `set_forward_context` to set the forward context.") return _forward_context @contextmanager -def set_forward_context(context: Any): +def set_forward_context(context: Any, vllm_config: VllmConfig): """A context manager that stores the current forward context, can be attention metadata, etc.""" global _forward_context prev_context = _forward_context - _forward_context = context + _forward_context = ForwardContext( + static_forward_context=vllm_config.compilation_config. + static_forward_context, + dynamic_forward_context=context) try: yield finally: diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index e58ad19cab54c..ac4c464aa10ac 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -223,6 +223,7 @@ def __init__( layer_idx: Optional[int] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -274,7 +275,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -299,6 +301,7 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.layer_idx = layer_idx @@ -308,7 +311,8 @@ def __init__( self.self_attn = ArcticAttention(config, layer_idx, cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.block_sparse_moe = ArcticMoE( config, layer_id=layer_idx, @@ -380,8 +384,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=self.vocab_size) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: ArcticDecoderLayer(config, int( - prefix.split(".")[-1]), cache_config, quant_config), + lambda prefix: ArcticDecoderLayer(config, + int(prefix.split(".")[-1]), + cache_config, + quant_config, + prefix=prefix), prefix=f"{prefix}.layers") self._attn_implementation = config._attn_implementation self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 3749a16a38994..a923ed36a9db2 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -116,6 +116,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = hidden_size @@ -158,7 +159,8 @@ def __init__( self.head_dim, scaling, alibi_slopes=alibi_slopes, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") else: self.rotary_emb = get_rope( self.head_dim, @@ -171,7 +173,8 @@ def __init__( self.head_dim, self.scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -195,7 +198,8 @@ def __init__(self, config: PretrainedConfig, position_embedding: str, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) @@ -209,6 +213,7 @@ def __init__(self, max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) self.mlp = BaiChuanMLP( hidden_size=self.hidden_size, @@ -275,8 +280,11 @@ def __init__( ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: BaiChuanDecoderLayer(config, position_embedding, - cache_config, quant_config), + lambda prefix: BaiChuanDecoderLayer(config, + position_embedding, + cache_config, + quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index a50a5a5b018e1..3776490cb3465 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -126,6 +126,7 @@ def __init__( config: Optional[BartConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.d_model = config.d_model @@ -178,7 +179,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata) -> torch.Tensor: @@ -208,6 +210,7 @@ def __init__( config: Optional[BartConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.d_model = config.d_model @@ -260,7 +263,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata) -> torch.Tensor: @@ -290,6 +294,7 @@ def __init__( config: Optional[BartConfig] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.d_model = config.d_model @@ -342,7 +347,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -384,6 +390,7 @@ def __init__( config: BartConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.embed_dim = config.d_model @@ -393,7 +400,9 @@ def __init__( num_heads=config.encoder_attention_heads, config=config, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.activation_fn = get_act_fn(config.activation_function) @@ -464,6 +473,7 @@ def __init__( config: BartConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.embed_dim = config.d_model @@ -473,7 +483,9 @@ def __init__( num_heads=config.decoder_attention_heads, config=config, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) self.activation_fn = get_act_fn(config.activation_function) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -486,6 +498,7 @@ def __init__( self.embed_dim, config.decoder_attention_heads, config=config, + prefix=f"{prefix}.encoder_attn", ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) @@ -578,7 +591,8 @@ def __init__(self, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, - embed_tokens: Optional[nn.Embedding] = None): + embed_tokens: Optional[nn.Embedding] = None, + prefix: str = ""): super().__init__() self.cache_config = cache_config @@ -599,9 +613,13 @@ def __init__(self, config.max_position_embeddings, embed_dim, ) - self.layers = nn.ModuleList( - [BartEncoderLayer(config,cache_config,quant_config) \ - for _ in range(config.encoder_layers)]) + self.layers = nn.ModuleList([ + BartEncoderLayer(config, + cache_config, + quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.encoder_layers) + ]) self.layernorm_embedding = nn.LayerNorm(embed_dim) @@ -661,6 +679,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, lora_config: Optional[LoRAConfig] = None, embed_tokens: Optional[nn.Embedding] = None, + prefix: str = "", ): super().__init__() self.cache_config = cache_config @@ -683,8 +702,9 @@ def __init__( ) self.layers = nn.ModuleList( - [BartDecoderLayer(config,cache_config,quant_config) \ - for _ in range(config.decoder_layers)]) + [BartDecoderLayer(config,cache_config,quant_config, + prefix=f"{prefix}.layers.{layer_idx}") \ + for layer_idx in range(config.decoder_layers)]) self.layernorm_embedding = nn.LayerNorm(config.d_model) @@ -759,10 +779,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.encoder = BartEncoder(config, cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.encoder") self.decoder = BartDecoder(config, cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.decoder") def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, encoder_input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 1060d418474ef..fee74f491acc1 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -78,6 +78,7 @@ def __init__( config: BloomConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size @@ -116,7 +117,8 @@ def __init__( scaling, alibi_slopes=alibi_slopes, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -168,14 +170,17 @@ def __init__( config: BloomConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size self.input_layernorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.self_attention = BloomAttention(config, cache_config, - quant_config) + self.self_attention = BloomAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attention") self.post_attention_layernorm = nn.LayerNorm( hidden_size, eps=config.layer_norm_epsilon) self.mlp = BloomMLP(config, quant_config) @@ -242,7 +247,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Transformer blocks self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: BloomBlock(config, cache_config, quant_config), + lambda prefix: BloomBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.h") # Final Layer Norm diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8f91abffaea90..5a6d6432112f0 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -223,6 +223,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, bias: bool = False, cache_config: Optional[CacheConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -276,7 +277,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: @@ -313,6 +315,7 @@ def __init__( config: ChameleonConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -336,6 +339,7 @@ def __init__( quant_config=quant_config, bias=False, cache_config=cache_config, + prefix=f"{prefix}.self_attn", ) self.mlp = ChameleonMLP( hidden_size=self.hidden_size, @@ -386,6 +390,7 @@ def __init__( config: ChameleonConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -409,6 +414,7 @@ def __init__( quant_config=quant_config, bias=False, cache_config=cache_config, + prefix=f"{prefix}.self_attn", ) self.mlp = ChameleonMLP( hidden_size=self.hidden_size, @@ -855,7 +861,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.num_hidden_layers, lambda prefix: decoder_layer(config=config, cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 2ea592aaba9f9..e3a068908b7f3 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -230,6 +230,7 @@ def __init__( config: ChatGLMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size @@ -285,7 +286,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -364,6 +366,7 @@ def __init__( config: ChatGLMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.apply_residual_connection_post_layernorm = ( @@ -377,7 +380,10 @@ def __init__( eps=config.layernorm_epsilon) # Self attention. - self.self_attention = GLMAttention(config, cache_config, quant_config) + self.self_attention = GLMAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attention") self.hidden_dropout = config.hidden_dropout # Layernorm on the attention output @@ -446,7 +452,8 @@ def __init__( # Transformer layers. self.start_layer, self.end_layer, self.layers = make_layers( self.num_layers, - lambda prefix: GLMBlock(config, cache_config, quant_config), + lambda prefix: GLMBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers", ) @@ -500,16 +507,22 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_layers = config.num_layers self.multi_query_group_num = config.multi_query_group_num self.kv_channels = config.kv_channels - self.encoder = GLMTransformer(config, cache_config, quant_config) + self.encoder = GLMTransformer(config, + cache_config, + quant_config, + prefix=f"{prefix}.encoder") self.output_layer = ParallelLMHead(config.padded_vocab_size, config.hidden_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.output_layer") vision_config_flag = getattr(config, 'vision_config', None) if vision_config_flag is not None: self.vision_config = Namespace(**config.vision_config) - self.vision = EVA2CLIPModel(self.config, quant_config) + self.vision = EVA2CLIPModel(self.config, + quant_config, + prefix=f"{prefix}.vision") else: self.vision = None diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 9fd083e5a02a9..85e24ca660686 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -120,6 +120,7 @@ def __init__( config: CohereConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -175,7 +176,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") if self.use_qk_norm: self.q_norm = LayerNorm(param_shape=(self.num_heads, self.head_dim), @@ -215,13 +217,15 @@ class CohereDecoderLayer(nn.Module): def __init__(self, config: CohereConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.hidden_size = config.hidden_size self.self_attn = CohereAttention(config, cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.mlp = CohereMLP(config, quant_config=quant_config) self.input_layernorm = LayerNorm(param_shape=(config.hidden_size), @@ -271,8 +275,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: CohereDecoderLayer(config, cache_config, - quant_config), + lambda prefix: CohereDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = LayerNorm(param_shape=(config.hidden_size), eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index eab338800249e..3932d8b52a9d1 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -154,6 +154,7 @@ def __init__( config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.d_model = config.d_model @@ -208,7 +209,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -234,10 +236,14 @@ def __init__( config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.d_model = config.d_model - self.attn = DbrxAttention(config, cache_config, quant_config) + self.attn = DbrxAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.norm_1 = nn.LayerNorm(self.d_model) self.norm_2 = nn.LayerNorm(self.d_model) @@ -269,10 +275,14 @@ def __init__( config: DbrxConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() - self.norm_attn_norm = DbrxFusedNormAttention(config, cache_config, - quant_config) + self.norm_attn_norm = DbrxFusedNormAttention( + config, + cache_config, + quant_config, + prefix=f"{prefix}.norm_attn_norm") self.ffn = DbrxMoE(config, quant_config) def forward( @@ -308,7 +318,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.blocks = make_layers( config.n_layers, - lambda prefix: DbrxBlock(config, cache_config, quant_config), + lambda prefix: DbrxBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.blocks", ) self.norm_f = nn.LayerNorm(config.d_model, eps=1e-5) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 8c5ad9904e925..32488d931ea1c 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -184,6 +184,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -236,7 +237,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -261,6 +263,7 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -277,6 +280,7 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace @@ -346,7 +350,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): lambda prefix: DeepseekDecoderLayer(config, int(prefix.split(".")[-1]), cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d2c4ca0bf85e9..4cf4e6c358bf2 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -268,7 +268,8 @@ def __init__( self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 9d739d0479548..5ca26d53a17e7 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -174,6 +174,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.attn", ) def forward( @@ -219,7 +220,7 @@ def __init__( quant_config=quant_config, bias=bias, cache_config=cache_config, - prefix=prefix, + prefix=f"{prefix}.attention", ) def forward( diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 2aa4b67d99894..096ad32b38e86 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -84,6 +84,7 @@ def __init__( config: FalconConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() @@ -158,7 +159,8 @@ def __init__( self.head_dim, self.inv_norm_factor, num_kv_heads=self.num_kv_heads, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") elif self.use_alibi: tp_rank = get_tensor_model_parallel_rank() head_start = tp_rank * self.num_heads @@ -171,14 +173,16 @@ def __init__( self.inv_norm_factor, num_kv_heads=self.num_kv_heads, alibi_slopes=alibi_slopes, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") else: self.attn = Attention(self.num_heads, self.head_dim, scale=self.inv_norm_factor, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -241,12 +245,16 @@ def __init__( config: FalconConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size self.num_heads = config.num_attention_heads - self.self_attention = FalconAttention(config, cache_config, - quant_config) + self.self_attention = FalconAttention( + config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attention") self.mlp = FalconMLP(config, quant_config) self.config = config @@ -357,8 +365,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Transformer blocks self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: FalconDecoderLayer(config, cache_config, - quant_config), + lambda prefix: FalconDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.h") # Final Layer Norm diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index d3a9ff6915b84..3a5fe8e1f4144 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -35,10 +35,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model) self.encoder = BartEncoder(config, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.encoder") self.decoder = BartDecoder(config, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.decoder") if self.config.tie_word_embeddings: self.encoder.embed_tokens.weight = self.shared.weight @@ -99,7 +101,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.model = Florence2LanguageModel(vllm_config=vllm_config, - prefix=prefix) + prefix=f"{prefix}.model") embed_scale = math.sqrt( config.d_model) if config.scale_embedding else 1.0 @@ -198,7 +200,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # TODO(Isotr0py): Add vision backbone self.language_model = Florence2LanguageForConditionalGeneration( vllm_config=vllm_config.with_hf_config(config.text_config), - prefix=prefix, + prefix=f"{prefix}.language_model", ) @property diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 64e03b30bf2f1..131e9af139c2a 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -174,7 +174,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 4ba39223cc07f..839130364ef4d 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -95,7 +95,8 @@ def __init__(self, rope_theta: float, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - attn_logits_soft_cap: Optional[float] = None) -> None: + attn_logits_soft_cap: Optional[float] = None, + prefix: str = "") -> None: super().__init__() self.layer_idx = layer_idx self.config = config @@ -154,7 +155,8 @@ def __init__(self, num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - logits_soft_cap=attn_logits_soft_cap) + logits_soft_cap=attn_logits_soft_cap, + prefix=f"{prefix}.attn") def forward( self, @@ -179,6 +181,7 @@ def __init__( config: Gemma2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -194,6 +197,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, attn_logits_soft_cap=config.attn_logit_softcapping, + prefix=f"{prefix}.self_attn", ) self.hidden_size = config.hidden_size self.mlp = Gemma2MLP( @@ -257,8 +261,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: Gemma2DecoderLayer(int(prefix.split(".")[ - -1]), config, cache_config, quant_config), + lambda prefix: Gemma2DecoderLayer(int(prefix.split(".")[-1]), + config, + cache_config, + quant_config, + prefix=prefix), prefix=f"{prefix}.layers") self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 025615b0920fd..f37ab0f82d52a 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -56,6 +56,7 @@ def __init__( self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() self.hidden_size = config.hidden_size @@ -135,11 +136,14 @@ def __init__( self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() self.input_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.attention = Attention(config, quant_config=quant_config) + self.attention = Attention(config, + quant_config=quant_config, + prefix=f"{prefix}.attention") self.mlp = MLP(config, quant_config=quant_config) self.post_attention_layernorm = LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -161,11 +165,14 @@ def __init__( self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() self.layers = nn.ModuleList([ - TransformerLayer(config, quant_config=quant_config) - for _ in range(config.num_hidden_layers) + TransformerLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layer.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) ]) def forward(self, hidden_states): @@ -252,12 +259,14 @@ def __init__( self, config, quant_config: Optional[QuantizationConfig] = None, + prefix: str = '', ): super().__init__() vision_config = Namespace(**config.vision_config) self.patch_embedding = PatchEmbedding(vision_config) self.transformer = Transformer(vision_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.transformer") self.linear_proj = GLU(config, in_features=config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1c61408ae1dd9..fd926ff0254d4 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -84,7 +84,8 @@ def __init__( self.head_dim, scale=self.scale, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 50a143cb1b600..c64bc70688806 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -52,6 +52,7 @@ def __init__( config: GPTBigCodeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size @@ -92,7 +93,8 @@ def __init__( scale=self.scale, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -151,6 +153,7 @@ def __init__( config: GPTBigCodeConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -158,7 +161,10 @@ def __init__( hidden_size) self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = GPTBigCodeAttention(config, cache_config, quant_config) + self.attn = GPTBigCodeAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = GPTBigMLP(inner_dim, config, quant_config) @@ -210,7 +216,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: GPTBigCodeBlock(config, cache_config, quant_config), + lambda prefix: GPTBigCodeBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.h", ) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index d5defc60764e6..4829578a56959 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -53,6 +53,7 @@ def __init__( config: GPTJConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.total_num_heads = config.num_attention_heads @@ -94,7 +95,8 @@ def __init__( self.head_size, scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -147,12 +149,16 @@ def __init__( config: GPTJConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() inner_dim = (4 * config.n_embd if config.n_inner is None else config.n_inner) self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) - self.attn = GPTJAttention(config, cache_config, quant_config) + self.attn = GPTJAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.mlp = GPTJMLP(inner_dim, config, quant_config) def forward( @@ -193,7 +199,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.h = make_layers( config.n_layer, - lambda prefix: GPTJBlock(config, cache_config, quant_config), + lambda prefix: GPTJBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.h", ) self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 0bb5e2f9b95f9..731642772011c 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -52,6 +52,7 @@ def __init__( config: GPTNeoXConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.total_num_heads = config.num_attention_heads @@ -94,7 +95,8 @@ def __init__( self.head_size, scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -145,6 +147,7 @@ def __init__( config: GPTNeoXConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.use_parallel_residual = config.use_parallel_residual @@ -152,7 +155,10 @@ def __init__( eps=config.layer_norm_eps) self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.attention = GPTNeoXAttention(config, cache_config, quant_config) + self.attention = GPTNeoXAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attention") self.mlp = GPTNeoXMLP(config, quant_config) def forward( @@ -205,7 +211,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: GPTNeoXLayer(config, cache_config, quant_config), + lambda prefix: GPTNeoXLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers", ) self.final_layer_norm = nn.LayerNorm(config.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c1e2e87f08ec3..bd2394e71c973 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -161,7 +161,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index a91a18816995f..51296ef0cc08e 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -164,7 +164,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 94b819b5d9366..906128940ff76 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union import torch from torch import nn @@ -250,7 +250,12 @@ def forward( @support_torch_compile class InternLM2Model(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -266,7 +271,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: InternLMDecoderLayer( + lambda prefix: layer_type( config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -316,14 +321,18 @@ def forward( class InternLM2ForCausalLM(nn.Module, SupportsPP): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + model_type: Type[InternLM2Model] = InternLM2Model): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = InternLM2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + self.model = model_type(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index f1b7c896cadfe..93ac2dcf8d587 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -14,8 +14,6 @@ InternLM2MLP, InternLM2Model) from vllm.sequence import IntermediateTensors -from .utils import make_layers, maybe_prefix - class InternLM2VEDecoderLayer(nn.Module): @@ -105,17 +103,9 @@ def forward( class InternLM2VEModel(InternLM2Model): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: InternLM2VEDecoderLayer( - config, cache_config, quant_config, prefix=prefix), - prefix=f"{prefix}.layers") + super().__init__(vllm_config=vllm_config, + prefix=prefix, + layer_type=InternLM2VEDecoderLayer) def forward( self, @@ -159,7 +149,6 @@ def forward( class InternLM2VEForCausalLM(InternLM2ForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - self.model = InternLM2VEModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + super().__init__(vllm_config=vllm_config, + prefix=prefix, + model_type=InternLM2VEModel) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 41db85b678456..8c81dff6b5768 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -76,6 +76,7 @@ def __init__( config: JAISConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size @@ -114,7 +115,8 @@ def __init__( scale=self.scale, alibi_slopes=alibi_slopes, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -178,6 +180,7 @@ def __init__( config: JAISConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.hidden_size @@ -185,7 +188,10 @@ def __init__( hidden_size) self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) - self.attn = JAISAttention(config, cache_config, quant_config) + self.attn = JAISAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) self.mlp = JAISMLP(inner_dim, config, quant_config) @@ -241,7 +247,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.num_hidden_layers, lambda prefix: JAISBlock(config=config, cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.h", ) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index f83f0fce7275f..099ca7e12b288 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -102,7 +102,8 @@ def __init__(self, config: JambaConfig, layer_idx: int, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: super().__init__() self.config = config self.mamba = MambaMixer(hidden_size= config.hidden_size, @@ -157,6 +158,7 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -198,6 +200,7 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, + prefix=f"{prefix}.attn", ) num_experts = config.layers_num_experts[layer_idx] @@ -287,7 +290,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer_class(config, layer_idx=i, cache_config=cache_config, - quant_config=quant_config)) + quant_config=quant_config, + prefix=f"{prefix}.layers.{i}")) self.layers = nn.ModuleList(decoder_layers) self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2b40e9ec73fad..66b29e72cfa89 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -174,6 +174,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.attn", ) def forward( diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index b92bff4d7c28c..c9a573278a136 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -192,6 +192,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -246,7 +247,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -273,6 +275,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.config = config @@ -283,6 +286,7 @@ def __init__( self.rope_scaling = getattr(config, "rope_scaling", None) self.max_position_embeddings = getattr(config, "max_position_embeddings", 8192) + self.prefix = prefix self._init_attn_block() self._init_ffn_block() @@ -298,6 +302,7 @@ def _init_attn_block(self): max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, + prefix=f"{self.prefix}.self_attn", ) def _init_ffn_block(self): @@ -388,8 +393,8 @@ def _init_layers( ): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: MiniCPMDecoderLayer(config, cache_config, - quant_config), + lambda prefix: MiniCPMDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 278c4bbe6e563..c38c31a0d4953 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -60,6 +60,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -119,7 +120,8 @@ def __init__( self.scaling, num_kv_heads=self.num_local_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -195,6 +197,7 @@ def _init_attn_block(self): max_position_embeddings=self.max_position_embeddings, cache_config=self.cache_config, quant_config=self.quant_config, + prefix=f"{self.prefix}.self_attn", ) @@ -209,8 +212,8 @@ def _init_layers( ): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: MiniCPM3DecoderLayer(config, cache_config, - quant_config), + lambda prefix: MiniCPM3DecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0faffb4f1b00c..a5b364fe5ec85 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -166,7 +166,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index ddd6afcf6a1b6..7a9b8cd88cfd0 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -170,6 +170,7 @@ def __init__( rope_theta: float = 10000, quant_config: Optional[QuantizationConfig] = None, cache_config: Optional[CacheConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -219,7 +220,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -243,6 +245,7 @@ def __init__( config: MixtralConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -255,7 +258,9 @@ def __init__( num_kv_heads=config.num_key_value_heads, rope_theta=rope_theta, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) self.block_sparse_moe = MixtralMoE(config=config, quant_config=quant_config) self.input_layernorm = RMSNorm(config.hidden_size, @@ -311,7 +316,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: MixtralDecoderLayer( - config, cache_config, quant_config=quant_config), + config, cache_config, quant_config=quant_config, prefix=prefix + ), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2528f741864b3..ee7b560fe1ee4 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -370,6 +370,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -427,7 +428,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") # Attention output projection. self.o_proj = RowParallelLinear( @@ -517,10 +519,14 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() # Attention block. - self.self_attn = MolmoAttention(config, cache_config, quant_config) + self.self_attn = MolmoAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attn") # MLP block. self.mlp = MolmoMLP(config, quant_config=quant_config) @@ -738,7 +744,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else MolmoDecoderLayer self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: decoder_layer(config, cache_config, quant_config), + lambda prefix: decoder_layer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers", ) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 8716e92b0f1c2..1235816413a44 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -50,6 +50,7 @@ def __init__( config: MPTConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.d_model = config.d_model @@ -115,7 +116,8 @@ def __init__( alibi_slopes=alibi_slopes, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -176,11 +178,15 @@ def __init__( config: MPTConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() hidden_size = config.d_model self.norm_1 = nn.LayerNorm(hidden_size) - self.attn = MPTAttention(config, cache_config, quant_config) + self.attn = MPTAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.attn") self.norm_2 = nn.LayerNorm(hidden_size) self.ffn = MPTMLP(config, quant_config) @@ -224,7 +230,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.blocks = make_layers( config.n_layers, - lambda prefix: MPTBlock(config, cache_config, quant_config), + lambda prefix: MPTBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.blocks") self.norm_f = nn.LayerNorm(config.d_model) if config.no_bias: diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index ceab299a7950a..c7b4c22b6896b 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -195,7 +195,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index dc138e2e636ad..538e31ec91699 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -62,6 +62,7 @@ def __init__( config: OlmoConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -101,7 +102,8 @@ def __init__( self.head_dim, scale=self.scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") # Attention output projection. self.o_proj = RowParallelLinear( @@ -184,10 +186,14 @@ class OlmoDecoderLayer(nn.Module): def __init__(self, config: OlmoConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() # Attention block. - self.self_attn = OlmoAttention(config, cache_config, quant_config) + self.self_attn = OlmoAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attn") # MLP block. self.mlp = OlmoMLP(config, quant_config) @@ -238,8 +244,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: OlmoDecoderLayer(config, cache_config, quant_config - ), + lambda prefix: OlmoDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = nn.LayerNorm(config.hidden_size, elementwise_affine=False, diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index ab87695d8e650..5b5b3ef48b035 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -102,6 +102,7 @@ def __init__( max_position_embeddings: int = 4096, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -156,7 +157,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -182,6 +184,7 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -199,6 +202,7 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) self.mlp = OlmoeMoE( @@ -260,8 +264,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: OlmoeDecoderLayer(config, int( - prefix.split(".")[-1]), cache_config, quant_config), + lambda prefix: OlmoeDecoderLayer(config, + int(prefix.split(".")[-1]), + cache_config, + quant_config, + prefix=prefix), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index b01734af8ddd8..a3757b5c8808e 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -75,6 +75,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -126,7 +127,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -150,6 +152,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -166,6 +169,7 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) self.mlp = OrionMLP( hidden_size=self.hidden_size, @@ -226,10 +230,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: OrionDecoderLayer( - config, - cache_config, - quant_config, - ), + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 3b8199f4f1661..14dd4b5b1b4da 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -75,7 +75,8 @@ class PersimmonAttention(nn.Module): def __init__(self, config: PersimmonConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.config = config tensor_parallel_world_size = get_tensor_model_parallel_world_size() @@ -122,7 +123,8 @@ def __init__(self, self.head_dim, scale=self.scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def _split_heads(self, x: torch.Tensor) -> torch.Tensor: # [seq_length, hidden_size] -> [seq_length, num_heads, head_dim] @@ -167,12 +169,14 @@ class PersimmonDecoderLayer(nn.Module): def __init__(self, config: PersimmonConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.hidden_size = config.hidden_size self.self_attn = PersimmonAttention(config=config, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.mlp = PersimmonMLP(config, quant_config=quant_config) self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -226,8 +230,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: PersimmonDecoderLayer(config, cache_config, - quant_config), + lambda prefix: PersimmonDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 0a117bf16c9b3..998d3723a0d7d 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -69,7 +69,8 @@ class PhiAttention(nn.Module): def __init__(self, config: PhiConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.total_num_heads = config.num_attention_heads self.hidden_size = config.hidden_size @@ -116,7 +117,8 @@ def __init__(self, self.head_size, scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -167,11 +169,15 @@ class PhiLayer(nn.Module): def __init__(self, config: PhiConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.self_attn = PhiAttention(config, cache_config, quant_config) + self.self_attn = PhiAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attn") self.mlp = PhiMLP(config, quant_config) def forward( @@ -210,7 +216,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: PhiLayer(config, cache_config, quant_config), + lambda prefix: PhiLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index f71cbd1264c45..da7e4cdbc6940 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -117,6 +117,7 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.layer_idx = layer_idx @@ -214,15 +215,14 @@ def __init__( "homo_head": self.homo_heads } - self.attn = Attention( - self.num_heads_per_partition, - self.head_dim, - self.scale, - num_kv_heads=self.num_kv_heads_per_partion, - cache_config=cache_config, - quant_config=quant_config, - blocksparse_params=bs_params, - ) + self.attn = Attention(self.num_heads_per_partition, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads_per_partion, + cache_config=cache_config, + quant_config=quant_config, + blocksparse_params=bs_params, + prefix=f"{prefix}.attn") def forward( self, @@ -259,13 +259,15 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = config.hidden_size self.self_attn = Phi3SmallSelfAttention(config, layer_idx, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.mlp = Phi3SmallMLP(config, quant_config) self.input_layernorm = nn.LayerNorm(config.hidden_size, @@ -315,7 +317,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.num_hidden_layers, lambda prefix: Phi3SmallDecoderLayer(config, int(prefix.split('.')[-1]), - cache_config, quant_config), + cache_config, + quant_config, + prefix=prefix), prefix=f"{prefix}.layers") self.final_layernorm = nn.LayerNorm(config.hidden_size, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index e475d286bd7ea..1febd62f2f705 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -294,6 +294,7 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, rope_scaling: Optional[dict] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -347,6 +348,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.attn", ) def forward( @@ -371,6 +373,7 @@ def __init__( config: PhiMoEConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -385,6 +388,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config, rope_scaling=config.rope_scaling, + prefix=f"{prefix}.self_attn", ) self.block_sparse_moe = PhiMoE( num_experts=config.num_local_experts, @@ -454,8 +458,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: PhiMoEDecoderLayer(config, cache_config, - quant_config), + lambda prefix: PhiMoEDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = nn.LayerNorm(config.hidden_size, eps=config.rms_norm_eps, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 44ce6eda42943..d3a776f665c74 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -442,6 +442,7 @@ def __init__( rope_scaling: Optional[Dict[str, Any]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.hidden_size = hidden_size @@ -478,7 +479,8 @@ def __init__( self.head_dim, self.scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -502,6 +504,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.ln_1 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -514,7 +517,8 @@ def __init__( rope_theta=rope_theta, rope_scaling=rope_scaling, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -568,7 +572,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.h = make_layers( config.num_hidden_layers, - lambda prefix: QWenBlock(config, cache_config, quant_config), + lambda prefix: QWenBlock( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.h") self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 96a9bc451f4df..1091f88ab2534 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -168,6 +168,7 @@ def __init__( max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -220,7 +221,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -245,6 +247,7 @@ def __init__( layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -261,6 +264,7 @@ def __init__( max_position_embeddings=max_position_embeddings, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.self_attn", ) # Note: Qwen/Qwen2-57B-A14B-Instruct does not have @@ -336,7 +340,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer_idx=int( prefix.split(".")[-1]), cache_config=cache_config, - quant_config=quant_config), + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 6d6fafc5ab0eb..f58710d215056 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -167,6 +167,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + prefix=f"{prefix}.attn", ) def forward( diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index e11d2e916730a..6b2107bef0a66 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -77,7 +77,8 @@ class StablelmAttention(nn.Module): def __init__(self, config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -131,7 +132,8 @@ def __init__(self, self.scaling, num_kv_heads=self.num_key_value_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -155,9 +157,13 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() - self.self_attn = StablelmAttention(config, cache_config, quant_config) + self.self_attn = StablelmAttention(config, + cache_config, + quant_config, + prefix=f"{prefix}.self_attn") self.mlp = StablelmMLP(config, quant_config) norm_eps = getattr(config, "norm_eps", getattr(config, "layer_norm_eps", 1e-05)) @@ -207,8 +213,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: StablelmDecoderLayer(config, cache_config, - quant_config), + lambda prefix: StablelmDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers", ) norm_eps = getattr(config, "norm_eps", diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 74c66042226de..15e8f2af52cda 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -52,7 +52,8 @@ class Starcoder2Attention(nn.Module): def __init__(self, config: Starcoder2Config, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.config = config @@ -105,7 +106,8 @@ def __init__(self, self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -154,12 +156,14 @@ class Starcoder2DecoderLayer(nn.Module): def __init__(self, config: Starcoder2Config, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() self.hidden_size = config.hidden_size self.self_attn = Starcoder2Attention(config, cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.mlp = Starcoder2MLP(config, quant_config=quant_config) self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) @@ -213,7 +217,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Starcoder2DecoderLayer( - config, cache_config, quant_config=quant_config), + config, cache_config, quant_config=quant_config, prefix=prefix + ), prefix=f"{prefix}.layers", ) self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index bc37a997eabb5..25a0d474e2863 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -93,6 +93,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, bias: bool = False, cache_config: Optional[CacheConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = hidden_size @@ -138,7 +139,8 @@ def __init__( self.scaling, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") def forward( self, @@ -162,6 +164,7 @@ def __init__( config: PretrainedConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -180,6 +183,7 @@ def __init__( quant_config=quant_config, bias=getattr(config, "bias", False), cache_config=cache_config, + prefix=f"{prefix}.self_attn", ) self.mlp = XverseMLP( hidden_size=self.hidden_size, @@ -243,8 +247,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: XverseDecoderLayer(config, cache_config, - quant_config), + lambda prefix: XverseDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers", ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 9be9031dc3baf..cbc982752c6b4 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -20,6 +20,7 @@ class CpuPlatform(Platform): _enum = PlatformEnum.CPU device_type: str = "cpu" + dispatch_key: str = "CPU" @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index cf0d41081a5aa..70724b8be4c45 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -121,6 +121,7 @@ def device_id_to_physical_device_id(device_id: int) -> int: class CudaPlatform(Platform): _enum = PlatformEnum.CUDA device_type: str = "cuda" + dispatch_key: str = "CUDA" @classmethod def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index a8f568d31d5a7..3071136e43b85 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -13,6 +13,7 @@ class HpuPlatform(Platform): _enum = PlatformEnum.HPU device_type: str = "hpu" + dispatch_key: str = "HPU" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 07f23167d509a..3328665029039 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -57,6 +57,10 @@ def to_int(self) -> int: class Platform: _enum: PlatformEnum device_type: str + # available dispatch keys: + # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa + # use "CPU" as a fallback for platforms not registered in PyTorch + dispatch_key: str = "CPU" def is_cuda(self) -> bool: return self._enum == PlatformEnum.CUDA diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 33a41933e9fff..694de836e1517 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -18,6 +18,7 @@ class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO device_type: str = "openvino" + dispatch_key: str = "CPU" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 3fe8c01c15787..d2f44c3e423e3 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -36,6 +36,7 @@ class RocmPlatform(Platform): _enum = PlatformEnum.ROCM device_type: str = "cuda" + dispatch_key: str = "CUDA" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 513cfa54687dc..137af57023ea9 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -17,6 +17,7 @@ class TpuPlatform(Platform): _enum = PlatformEnum.TPU device_type: str = "tpu" + dispatch_key: str = "XLA" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index b2ee0ef2f71cd..69388a8e0f27c 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -17,6 +17,7 @@ class XPUPlatform(Platform): _enum = PlatformEnum.XPU device_type: str = "xpu" + dispatch_key: str = "XPU" @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index cd4d7eb0e6e4e..cf166e3eb5bad 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -273,7 +273,8 @@ def execute_model( if previous_hidden_states is not None else {} # Run model - with set_forward_context(model_input.attn_metadata): + with set_forward_context(model_input.attn_metadata, + self.vllm_config): hidden_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/utils.py b/vllm/utils.py index 67b2629ecc933..30c371b0e3591 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1573,6 +1573,7 @@ def direct_register_custom_op( mutates_args: List[str], fake_impl: Optional[Callable] = None, target_lib: Optional[Library] = None, + dispatch_key: str = "CUDA", ): """ `torch.library.custom_op` can have significant overhead because it @@ -1601,7 +1602,7 @@ def direct_register_custom_op( schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) my_lib = target_lib or vllm_lib my_lib.define(op_name + schema_str) - my_lib.impl(op_name, op_func, "CUDA") + my_lib.impl(op_name, op_func, dispatch_key=dispatch_key) if fake_impl is not None: my_lib._register_fake(op_name, fake_impl) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e73a1e60b2730..d98bb5a716e97 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -173,7 +173,8 @@ def unified_v1_flash_attention( alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, ) -> None: - current_metadata = get_forward_context() + context = get_forward_context() + current_metadata = context.dynamic_forward_context if current_metadata is None: # Profiling run. return diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2cf55cd497659..02f9498142bb7 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -447,7 +447,7 @@ def execute_model( # Run the decoder. # Use persistent buffers for CUDA graphs. - with set_forward_context(attn_metadata): + with set_forward_context(attn_metadata, self.vllm_config): hidden_states = self.model( input_ids=None, positions=self.positions[:num_input_tokens], @@ -523,7 +523,7 @@ def _dummy_run( num_tokens: int, kv_caches: List[torch.Tensor], ) -> torch.Tensor: - with set_forward_context(None): + with set_forward_context(None, self.vllm_config): hidden_states = model( input_ids=None, positions=self.positions[:num_tokens], diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 37cfcbf13d7a3..4a55d91e71484 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -97,7 +97,7 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - with set_forward_context(model_input.attn_metadata): + with set_forward_context(model_input.attn_metadata, self.vllm_config): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 687d2cc79360f..ae18c79c980c8 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -176,7 +176,7 @@ def execute_model( } if self.has_inner_state else {} multi_modal_kwargs = model_input.multi_modal_kwargs or {} - with set_forward_context(model_input.attn_metadata): + with set_forward_context(model_input.attn_metadata, self.vllm_config): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ed0360fb7f727..13301b876217d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1503,7 +1503,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self._update_inputs_to_capture_for_enc_dec_model( capture_inputs) - with set_forward_context(attn_metadata): + with set_forward_context(attn_metadata, self.vllm_config): graph_runner.capture(**capture_inputs) self.graph_memory_pool = graph_runner.graph.pool() self.graph_runners[virtual_engine][batch_size] = ( @@ -1649,7 +1649,7 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - with set_forward_context(model_input.attn_metadata): + with set_forward_context(model_input.attn_metadata, self.vllm_config): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, From 97814fbf0f847a11d2e0eb339e3e7572ca69379d Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Fri, 22 Nov 2024 15:27:25 -0800 Subject: [PATCH 111/255] [v1] Refactor KVCacheManager for more hash input than token ids (#10507) Signed-off-by: rickyx Signed-off-by: Cody Yu Co-authored-by: Cody Yu --- tests/v1/core/test_prefix_caching.py | 225 +++++++++++++++++++-- vllm/v1/core/kv_cache_manager.py | 289 +++++++++++++-------------- vllm/v1/core/kv_cache_utils.py | 37 ++-- 3 files changed, 365 insertions(+), 186 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index d614d3e67460f..83bfbb6ade8d7 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,8 +1,11 @@ """Compare the with and without prefix caching.""" +import pytest + from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams +from vllm.utils import cdiv from vllm.v1.core.kv_cache_manager import KVCacheManager, Request -from vllm.v1.core.kv_cache_utils import hash_block_tokens +from vllm.v1.core.kv_cache_utils import KVCacheBlock, hash_block_tokens def make_request(request_id, prompt_token_ids): @@ -31,7 +34,8 @@ def test_prefill(): # Fully cache miss # Incomplete 1 block (7 tokens) unique_token_ids = [3] * 7 - req0 = make_request("0", common_token_ids + unique_token_ids) + all_token_ids = common_token_ids + unique_token_ids + req0 = make_request("0", all_token_ids) computed_blocks = manager.get_computed_blocks(req0) assert not computed_blocks blocks = manager.allocate_slots(req0, 55, computed_blocks) @@ -40,24 +44,16 @@ def test_prefill(): # Check full block metadata parent_block_hash = None for block_id in (0, 1, 2): - block_hash = hash_block_tokens(parent_block_hash, - manager.block_pool[block_id].token_ids) + block_tokens = tuple(all_token_ids[block_id * 16:(block_id + 1) * 16]) + block_hash = hash_block_tokens(parent_block_hash, block_tokens) assert manager.block_pool[block_id].block_hash == block_hash assert manager.block_pool[block_id].ref_cnt == 1 - assert manager.block_pool[block_id].num_hashed_tokens == 16 * ( - block_id + 1) - assert manager.block_pool[block_id].token_ids == tuple([block_id] * 16) parent_block_hash = block_hash # Check partial/preallocated block metadata for block_id in (3, 4): assert manager.block_pool[block_id].block_hash is None assert manager.block_pool[block_id].ref_cnt == 1 - assert manager.block_pool[block_id].num_hashed_tokens == 0 - if block_id == 3: - assert manager.block_pool[block_id].token_ids == [3] * 7 - else: - assert not manager.block_pool[block_id].token_ids # Cache hit in the common prefix when the original block is still in use. # Incomplete 1 block (5 tokens) @@ -113,7 +109,7 @@ def test_prefill(): req3 = make_request("3", [99] * (16 * 9)) computed_blocks = manager.get_computed_blocks(req3) assert not computed_blocks - blocks = manager.allocate_slots(req2, 16 * 9, computed_blocks) + blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks) # This block ID order also checks the eviction order. assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0] assert manager.free_block_queue.num_free_blocks == 0 @@ -148,7 +144,7 @@ def test_decode(): req0.append_output_token_ids(8) new_blocks = manager.append_slots(req0, 4) assert new_blocks is not None and len(new_blocks) == 0 - assert len(manager.block_pool[3].token_ids) == 11 + assert manager.req_to_blocks[req0.request_id][-2].block_hash is None # Append slots without allocating a new block, but start using the # preallocated block. @@ -159,8 +155,7 @@ def test_decode(): req0.append_output_token_ids(7) new_blocks = manager.append_slots(req0, 15) assert new_blocks is not None and len(new_blocks) == 0 - assert len(manager.block_pool[3].token_ids) == 16 - assert len(manager.block_pool[4].token_ids) == 10 + assert manager.req_to_blocks[req0.request_id][-2].block_hash is not None # Append slots with allocating a new block. req0.num_computed_tokens = 74 @@ -171,9 +166,6 @@ def test_decode(): new_blocks = manager.append_slots(req0, 17) # Plus one preallocated block. assert new_blocks is not None and len(new_blocks) == 2 - assert len(manager.block_pool[4].token_ids) == 16 - assert len(manager.block_pool[5].token_ids) == 11 - assert len(manager.block_pool[6].token_ids) == 0 def test_evict(): @@ -217,3 +209,198 @@ def test_evict(): blocks = manager.allocate_slots(req2, 3, computed_blocks) assert [b.block_id for b in blocks] == [6, 5] assert manager.free_block_queue.num_free_blocks == 6 + + +def test_hash_block_correct_reuse(): + """ + This tests when a previously cached block is reused as a new block, + its hash metadata should be correctly reset. + """ + block_size = 16 + manager = KVCacheManager( + block_size=block_size, + num_gpu_blocks=1, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=0, + ) + + # Allocate 1 block and cache it. + num_tokens = block_size * 1 + req = make_request("0", list(range(num_tokens))) + computed_blocks = manager.get_computed_blocks(req) + assert not computed_blocks + blocks = manager.allocate_slots(req, num_tokens, computed_blocks) + assert len(blocks) == 1 + + # Deallocate the block. + manager.free(req) + + # Allocate a new block that's not full, make sure hash info on the + # block is cleared. + req = make_request("1", list(range(num_tokens - 1))) + computed_blocks = manager.get_computed_blocks(req) + assert not computed_blocks + blocks = manager.allocate_slots(req, num_tokens - 1, computed_blocks) + assert len(blocks) == 1 + + assert manager.block_pool[blocks[0].block_id].block_hash is None + + +def test_computed_blocks_not_evicted(): + """ + Test that the computed blocks are not evicted when getting new blocks + for a request if there are any other free blocks. + """ + block_size = 16 + manager = KVCacheManager( + block_size=block_size, + num_gpu_blocks=2, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=0, + ) + + # Allocate a block and cache it. + num_tokens = block_size * 1 + req0 = make_request("0", list(range(num_tokens))) + computed_blocks = manager.get_computed_blocks(req0) + assert not computed_blocks + blocks = manager.allocate_slots(req0, num_tokens, computed_blocks) + assert len(blocks) == 1 + assert blocks[0].block_id == 0 + + # Allocate another block. + req1 = make_request("1", list(range(num_tokens, num_tokens * 2))) + computed_blocks = manager.get_computed_blocks(req1) + assert not computed_blocks + blocks = manager.allocate_slots(req1, num_tokens, computed_blocks) + assert len(blocks) == 1 + assert blocks[0].block_id == 1 + + # Free the blocks. + manager.free(req0) + manager.free(req1) + + # Now if we have a cache hit on the first block, we should evict the second + # cached block rather than the first one. + req2 = make_request("2", list(range(num_tokens * 2))) + computed_blocks = manager.get_computed_blocks(req2) + assert len(computed_blocks) == 1 + assert computed_blocks[0].block_id == 0 + + blocks = manager.allocate_slots(req2, num_tokens * 2 - num_tokens, + computed_blocks) + assert len(blocks) == 1 + assert blocks[0].block_id == 1 + + +def test_basic_prefix_caching_disabled(): + """ + This tests that the prefix caching is disabled. + """ + block_size = 4 + manager = KVCacheManager( + block_size=block_size, + num_gpu_blocks=4, + sliding_window=False, + enable_caching=False, + num_preallocate_tokens=0, + ) + + req1 = make_request("1", list(range(10))) # 2 blocks and some more + + computed_blocks = manager.get_computed_blocks(req1) + assert not computed_blocks + blocks = manager.allocate_slots(req1, 10, computed_blocks) + assert len(blocks) == 3 + + # Free the blocks. + manager.free(req1) + + # No caching. + req2 = make_request("2", list(range(16))) # shared prefix + computed_blocks = manager.get_computed_blocks(req2) + assert not computed_blocks + blocks = manager.allocate_slots(req2, 16, computed_blocks) + assert len(blocks) == 4 + + # New requests should not have any blocks. + req3 = make_request("3", list(range(4))) + computed_blocks = manager.get_computed_blocks(req3) + assert not computed_blocks + blocks = manager.allocate_slots(req3, 4, computed_blocks) + assert not blocks + + +@pytest.mark.parametrize("num_preallocate_tokens", list(range(0, 8))) +@pytest.mark.parametrize("block_size", [4]) +def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): + """ + This tests that the preallocated blocks are correctly added. + """ + manager = KVCacheManager( + block_size=block_size, + num_gpu_blocks=10, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=num_preallocate_tokens, + ) + num_preallocated_blocks = cdiv(num_preallocate_tokens, block_size) + + req = make_request("0", list(range(block_size * 30))) + computed_blocks = manager.get_computed_blocks(req) + assert not computed_blocks + # Just ask for 1 block. + blocks = manager.allocate_slots(req, block_size, computed_blocks) + assert len(blocks) == 1 + num_preallocated_blocks + + # Append slots to the block. + req.num_computed_tokens = block_size * len(blocks) # Assume all used. + blocks = manager.append_slots(req, block_size) # Append 1 block. + assert len(blocks) == 1 + num_preallocated_blocks + + +def test_cache_blocks(): + """ + This is a unit test that tests the correctness of the _cache_full_blocks + function of KVCacheManager. + """ + block_size = 4 + manager = KVCacheManager( + block_size=block_size, + num_gpu_blocks=5, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=0, + ) + # Req: + # Block 0: [0, 1, 2, 3] + # Block 1: [4, 5, 6, 7] + # Block 2: [8, 9, 10, 11] + # Block 3: [12, 13] + req = make_request("0", list(range(14))) + + # Test that blocks are cached correctly for 2 full blocks from the start. + blocks = [KVCacheBlock(block_id=i) for i in range(2)] + + manager._cache_full_blocks( + request=req, + blk_start_idx=0, + full_blocks=blocks, + prev_block=None, + ) + + assert len(manager.cached_block_hash_to_block) == 2 + assert all([block.block_hash is not None for block in blocks]) + + # Test that blocks that don't start from the beginning are cached correctly. + blocks = [KVCacheBlock(block_id=2)] + manager._cache_full_blocks( + request=req, + blk_start_idx=2, + full_blocks=blocks, + prev_block=None, + ) + assert len(manager.cached_block_hash_to_block) == 3 + assert blocks[0].block_hash is not None diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 38f1c03a4d3ac..8eb3fb976eb87 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -79,6 +79,9 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]: return [] computed_blocks = [] + + # TODO(rickyx): potentially we could cache this so we don't have to + # recompute it every time. block_hashes = hash_request_tokens(self.block_size, request.all_token_ids) @@ -120,47 +123,45 @@ def append_slots( # slots, but we cannot allocate new blocks due to the limit. return None - # When caching is enabled, assign token IDs to already allocated blocks. - new_token_ids = None - parent_block = None - if self.enable_caching: - # Figure out the token IDs to add to the blocks. - new_token_ids = request.all_token_ids[ - request.num_computed_tokens:request.num_computed_tokens + - num_tokens] - - # Find the last full block index. - # TODO: This may be optimized by calculating the computed tokens. - last_full_block_idx = len(req_blocks) - 1 - while (last_full_block_idx >= 0 - and req_blocks[last_full_block_idx].block_hash is None): - last_full_block_idx -= 1 - - parent_block = (req_blocks[last_full_block_idx] - if last_full_block_idx >= 0 else None) - token_id_idx = self._add_token_ids_to_blocks( - blocks=req_blocks[last_full_block_idx + 1:], - token_ids=new_token_ids, - parent_block=parent_block) - - new_token_ids = new_token_ids[token_id_idx:] - parent_block = req_blocks[-1] - - # No new block is needed. When caching is enabled, we make sure - # token_id_idx is equal to len(new_token_ids), meaning that all tokens - # are added to allocated blocks. - if num_required_blocks <= len(req_blocks): - assert not self.enable_caching or token_id_idx == num_tokens, \ - f"{token_id_idx=} != {num_tokens=}" - return [] + if num_new_blocks <= 0: + # No new block is needed. + new_blocks = [] + else: + # Get new blocks from the free block pool considering + # preallocated blocks. + num_new_blocks = min( + num_new_blocks + self.num_preallocate_blocks, + self.free_block_queue.num_free_blocks, + ) + + new_blocks = self._get_new_blocks(num_new_blocks) + req_blocks.extend(new_blocks) + + if not self.enable_caching: + return new_blocks + + num_computed_full_blocks = (request.num_computed_tokens // + self.block_size) + + # NOTE(rickyx): We are assuming the `num_tokens` are actual + # tokens rather than lookahead slots (e.g. for speculative decoding). + # TODO(rickyx): When supporting speculative decoding, we will need to + # differentiate between them so that we can know how many blocks are + # full after appending the actual tokens. + num_full_blocks_after_append = (request.num_computed_tokens + + num_tokens) // self.block_size + assert num_full_blocks_after_append <= len(req_blocks) + + new_full_blocks = req_blocks[ + num_computed_full_blocks:num_full_blocks_after_append] + self._cache_full_blocks( + request=request, + blk_start_idx=num_computed_full_blocks, + full_blocks=new_full_blocks, + prev_block=req_blocks[num_computed_full_blocks - 1] + if num_computed_full_blocks >= 1 else None, + ) - # Allocate new blocks considering preallocated blocks, and - # add token IDs to them if caching is enabled. - num_new_blocks = min(num_new_blocks + self.num_preallocate_blocks, - self.free_block_queue.num_free_blocks) - new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids, - parent_block) - req_blocks.extend(new_blocks) return new_blocks def allocate_slots( @@ -184,11 +185,20 @@ def allocate_slots( raise ValueError( f"num_tokens must be greater than 0, got {num_tokens}") - # If a computed block of a request is an eviction candidate (in the - # free queue and ref_cnt == 0), it cannot be counted as a free block - # when allocating this request. - num_evictable_computed_blocks = len( - [blk for blk in computed_blocks if blk.ref_cnt == 0]) + # Touch the computed blocks to make sure they won't be evicted. + num_evictable_computed_blocks = 0 + if self.enable_caching: + self._touch(computed_blocks) + + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = len( + [blk for blk in computed_blocks if blk.ref_cnt == 0]) + else: + assert not computed_blocks, ( + "Computed blocks should be empty when " + "prefix caching is disabled") num_required_blocks = cdiv(num_tokens, self.block_size) if (num_required_blocks > self.free_block_queue.num_free_blocks - @@ -201,35 +211,28 @@ def allocate_slots( num_new_blocks = min( num_required_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks - - num_evictable_computed_blocks) - - num_computed_tokens = len(computed_blocks) * self.block_size + num_evictable_computed_blocks, + ) - # When caching is enabled, get the new token IDs and the parent block - # ID to generate cache keys. - new_token_ids = None - parent_block = None - if self.enable_caching: - # Touch the computed blocks to make sure they won't be evicted. - self._touch(computed_blocks) + # Concatenate the computed block IDs and the new block IDs. + new_blocks = self._get_new_blocks(num_new_blocks) + self.req_to_blocks[request.request_id] = computed_blocks + new_blocks - # Get the token IDs for the blocks being allocated for hashing. - new_token_ids = request.all_token_ids[ - num_computed_tokens:num_computed_tokens + num_tokens] - if not new_token_ids: - raise RuntimeError( - "Failed to infer the token IDs for allocation. " - f"#all_tokens={len(request.all_token_ids)} < " - f"#computed_tokens={num_computed_tokens}") + if not self.enable_caching: + return new_blocks - # Get the parent block ID to construct the block chain. - parent_block = computed_blocks[-1] if computed_blocks else None + num_computed_tokens = len(computed_blocks) * self.block_size + num_full_blocks = (num_computed_tokens + num_tokens) // self.block_size - new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids, - parent_block) + self._cache_full_blocks( + request=request, + blk_start_idx=len(computed_blocks), + # The new full blocks are the full blocks that are not computed. + full_blocks=self.req_to_blocks[request.request_id] + [len(computed_blocks):num_full_blocks], + prev_block=computed_blocks[-1] if computed_blocks else None, + ) - # Concatenate the computed block IDs and the new block IDs. - self.req_to_blocks[request.request_id] = computed_blocks + new_blocks return new_blocks def free(self, request: Request) -> None: @@ -248,24 +251,17 @@ def free(self, request: Request) -> None: blocks = reversed(blocks) for block in blocks: - block.ref_cnt -= 1 + block.decr_ref() if block.ref_cnt == 0: self.free_block_queue.append(block) - def _get_new_blocks( - self, - num_blocks: int, - token_ids: Optional[List[int]] = None, - parent_block: Optional[int] = None) -> List[KVCacheBlock]: - """Get new blocks from the free block pool, and add token IDs to - allocated blocks if caching is enabled. + def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: + """Get new blocks from the free block pool. + Note that we do not check block cache in this function. Args: num_blocks: The number of blocks to allocate. - token_ids: The token IDs in the blocks. None if caching is disabled. - parent_block: The parent block. Used to include block chain - in the block hash. Returns: A list of new block. @@ -274,56 +270,38 @@ def _get_new_blocks( raise ValueError( f"Cannot get {num_blocks} free blocks from the pool") - # First allocate blocks. ret: List[KVCacheBlock] = [] idx = 0 while idx < num_blocks: + # First allocate blocks. curr_block = self.free_block_queue.popleft() assert curr_block.ref_cnt == 0 - # Evict blocks from the cache. + # If the block is cached, evict it. if self.enable_caching: - block_hash = curr_block.block_hash - if (block_hash is not None - and block_hash in self.cached_block_hash_to_block): - if len(self.cached_block_hash_to_block[block_hash]) == 1: - del self.cached_block_hash_to_block[block_hash] - else: - del self.cached_block_hash_to_block[block_hash][ - curr_block.block_id] - curr_block.reset() - - curr_block.ref_cnt = 1 + self._evict_cached_block(curr_block) + + curr_block.incr_ref() ret.append(curr_block) idx += 1 - # Then assign token IDs to the allocated blocks. - if self.enable_caching: - assert token_ids is not None - token_id_idx = self._add_token_ids_to_blocks( - blocks=ret, token_ids=token_ids, parent_block=parent_block) - assert token_id_idx == len(token_ids) - return ret - def _cache_full_block(self, - block: KVCacheBlock, - parent_block: Optional[KVCacheBlock] = None) -> None: - """Cache a full block for prefix caching. + def _evict_cached_block(self, block: KVCacheBlock) -> None: + """ + If a block is cached in `cached_block_hash_to_block`, we reset its hash + metadata and evict it from the cache. Args: - block: The block to cache. - parent_block: The parent block. None if this is the first block. + block: The block to evict. """ - parent_block_hash = (parent_block.block_hash - if parent_block is not None else None) - assert len(block.token_ids) == self.block_size - block.token_ids = tuple(block.token_ids) - block_hash = hash_block_tokens(parent_block_hash, block.token_ids) - block.block_hash = block_hash - block.num_hashed_tokens = self.block_size + ( - parent_block.num_hashed_tokens if parent_block is not None else 0) - self.cached_block_hash_to_block[block_hash][block.block_id] = block + block_hash = block.block_hash + if block_hash and block_hash in self.cached_block_hash_to_block: + block.reset_hash() + del self.cached_block_hash_to_block[block_hash][block.block_id] + + if len(self.cached_block_hash_to_block[block_hash]) == 0: + del self.cached_block_hash_to_block[block_hash] def _get_cached_block(self, block_hash: BlockHashType) -> Optional[KVCacheBlock]: @@ -355,43 +333,50 @@ def _touch(self, blocks: List[KVCacheBlock]) -> None: # candidate), so remove it. if block.ref_cnt == 0: self.free_block_queue.remove(block) - block.ref_cnt += 1 - - def _add_token_ids_to_blocks( - self, - blocks: List[KVCacheBlock], - token_ids: List[int], - parent_block: Optional[KVCacheBlock] = None) -> int: - """Add token IDs to a list of allocated blocks. - If a block becomes full after adding token IDs, cache it. - Return the token ID index that has not been added to the blocks - if the blocks are not enough to hold all the token IDs. + block.incr_ref() - Args: - blocks: A list of blocks to add token IDs. - token_ids: A list of token IDs to add. - parent_block: The parent block. None if this is the - first block. + def _cache_full_blocks( + self, + request: Request, + blk_start_idx: int, + full_blocks: List[KVCacheBlock], + prev_block: Optional[KVCacheBlock], + ) -> None: + """Cache a list of full blocks for prefix caching. - Returns: - The starting token ID index that has not been added to the blocks - due to insufficient given blocks. + This function takes a list of blocks that will have their block hash + metadata to be updated and cached. Given a request, it computes the + block hashes for the blocks starting from `blk_start_idx` to the end + of the request's full blocks, updating the metadata for each block + and caching them in the `cached_block_hash_to_block`. + + Args: + request: The request to cache the blocks. + blk_start_idx: The index of the first block in the request's blocks + to cache. + full_blocks: The list of blocks to update hash metadata. + prev_block: The previous block in the chain. """ - token_id_start = 0 - for curr_block in blocks: - # If all token IDs are added, then the rest of the blocks are - # preallocated blocks, so we only need to update the - # parent_block_id. FIXME - if token_id_start == len(token_ids): - continue - - # Add token IDs to the empty slots in the block. - empty_slots = self.block_size - len(curr_block.token_ids) - token_id_end = min(token_id_start + empty_slots, len(token_ids)) - curr_block.token_ids.extend(token_ids[token_id_start:token_id_end]) - # Cache the block if it becomes full. - if len(curr_block.token_ids) == self.block_size: - self._cache_full_block(curr_block, parent_block) - parent_block = curr_block - token_id_start = token_id_end - return token_id_start + # Update the new blocks with the block hashes through the chain. + prev_block_hash = (prev_block.block_hash + if prev_block is not None else None) + for i, blk in enumerate(full_blocks): + blk_idx = blk_start_idx + i + + block_tokens = request.all_token_ids[blk_idx * + self.block_size:(blk_idx + + 1) * + self.block_size] + assert len(block_tokens) == self.block_size, ( + f"Expected {self.block_size} tokens, got {len(block_tokens)} " + f"at {blk_idx}th block for request " + f"{request.request_id}({request})") + + # Compute the hash of the current block. + block_hash = hash_block_tokens(prev_block_hash, + tuple(block_tokens)) + + # Update and added the full block to the cache. + blk.block_hash = block_hash + self.cached_block_hash_to_block[block_hash][blk.block_id] = blk + prev_block_hash = block_hash diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 33dbfb7377bfd..fb666c364bfb2 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -1,6 +1,6 @@ """KV-Cache Utilities.""" -from dataclasses import dataclass, field -from typing import List, Optional, Tuple, Union +from dataclasses import dataclass +from typing import List, Optional, Tuple from vllm.logger import init_logger @@ -16,27 +16,34 @@ class KVCacheBlock: block_id: int # Reference count. ref_cnt: int = 0 - # Token IDs in the block. When the block is full, the type of token_ids - # should be Tuple[int] for fast matching. - token_ids: Union[List[int], Tuple[int]] = field(default_factory=list) # The hash of the block composed of (block hash, tuple of token IDs). # It is only available when the block is full. - block_hash: Optional[BlockHashType] = None - # The number of hashed tokens. More hashed tokens means the block - # is closer to the end of a prompt and more likely to be evicted. - num_hashed_tokens: int = 0 + _block_hash: Optional[BlockHashType] = None # Used to construct a doubly linked list for free blocks. # These two attributes should only be manipulated by FreeKVCacheBlockQueue. prev_free_block: Optional["KVCacheBlock"] = None next_free_block: Optional["KVCacheBlock"] = None - def reset(self): - """Reset the block metadata.""" - self.ref_cnt = 0 - self.token_ids = [] - self.block_hash = None - self.num_hashed_tokens = 0 + def incr_ref(self): + self.ref_cnt += 1 + + def decr_ref(self): + self.ref_cnt -= 1 + + @property + def block_hash(self) -> Optional[BlockHashType]: + return self._block_hash + + @block_hash.setter + def block_hash(self, block_hash: BlockHashType): + assert self.block_hash is None, ( + "The block already has a hash. This should not happen.") + self._block_hash = block_hash + + def reset_hash(self): + """Reset the block hash when the block is evicted.""" + self._block_hash = None class FreeKVCacheBlockQueue: From 948c859571af9588e344079cc0e79bbf8597cb18 Mon Sep 17 00:00:00 2001 From: zixuanzhang226 Date: Fri, 22 Nov 2024 16:16:14 -0800 Subject: [PATCH 112/255] support bitsandbytes quantization with qwen model (#10549) Signed-off-by: Ubuntu --- vllm/model_executor/models/qwen.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d3a776f665c74..8f001200308fe 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1028,6 +1028,18 @@ class QWenLLM(QWenBaseModel): embedding_modules = {} embedding_padding_modules = [] + default_bitsandbytes_target_modules = [ + ".c_attn.", + ".c_proj.", + ".w1.", + ".w2.", + ] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "w2": ("gate_up_proj", 0), + "w1": ("gate_up_proj", 1), + } + class QWenVL(QWenBaseModel, SupportsMultiModal): packed_modules_mapping = { From 28598f3939f9a04800f514e7fe62ab9bb8f617ec Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 22 Nov 2024 19:22:53 -0500 Subject: [PATCH 113/255] [Core] remove temporary local variables in LLMEngine.__init__ (#10577) Signed-off-by: Russell Bryant --- vllm/engine/llm_engine.py | 143 ++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 77 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2a5eaf1340762..fb21b2dedeb74 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -231,19 +231,18 @@ def __init__( use_cached_outputs: bool = False, ) -> None: - # TODO: remove the local variables and use self.* throughout the class. - model_config = self.model_config = vllm_config.model_config - cache_config = self.cache_config = vllm_config.cache_config - lora_config = self.lora_config = vllm_config.lora_config - parallel_config = self.parallel_config = vllm_config.parallel_config - scheduler_config = self.scheduler_config = vllm_config.scheduler_config - device_config = self.device_config = vllm_config.device_config - speculative_config = self.speculative_config = vllm_config.speculative_config # noqa - load_config = self.load_config = vllm_config.load_config - decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config # noqa + self.load_config = vllm_config.load_config + self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa ) - prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa - observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa + self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa + self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa ) logger.info( @@ -265,54 +264,43 @@ def __init__( "mm_processor_kwargs=%s, pooler_config=%r," "compilation_config=%r", VLLM_VERSION, - model_config.model, - speculative_config, - model_config.tokenizer, - model_config.skip_tokenizer_init, - model_config.tokenizer_mode, - model_config.revision, - model_config.override_neuron_config, - model_config.tokenizer_revision, - model_config.trust_remote_code, - model_config.dtype, - model_config.max_model_len, - load_config.download_dir, - load_config.load_format, - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, - scheduler_config.num_scheduler_steps, - scheduler_config.chunked_prefill_enabled, - scheduler_config.multi_step_stream_outputs, - cache_config.enable_prefix_caching, - model_config.use_async_output_proc, + self.model_config.model, + self.speculative_config, + self.model_config.tokenizer, + self.model_config.skip_tokenizer_init, + self.model_config.tokenizer_mode, + self.model_config.revision, + self.model_config.override_neuron_config, + self.model_config.tokenizer_revision, + self.model_config.trust_remote_code, + self.model_config.dtype, + self.model_config.max_model_len, + self.load_config.download_dir, + self.load_config.load_format, + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size, + self.parallel_config.disable_custom_all_reduce, + self.model_config.quantization, + self.model_config.enforce_eager, + self.cache_config.cache_dtype, + self.model_config.quantization_param_path, + self.device_config.device, + self.decoding_config, + self.observability_config, + self.model_config.seed, + self.model_config.served_model_name, + self.scheduler_config.num_scheduler_steps, + self.scheduler_config.chunked_prefill_enabled, + self.scheduler_config.multi_step_stream_outputs, + self.cache_config.enable_prefix_caching, + self.model_config.use_async_output_proc, use_cached_outputs, - model_config.mm_processor_kwargs, - model_config.pooler_config, + self.model_config.mm_processor_kwargs, + self.model_config.pooler_config, vllm_config.compilation_config, ) # TODO(woosuk): Print more configs in debug mode. - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.load_config = load_config - self.decoding_config = decoding_config or DecodingConfig() - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config or ObservabilityConfig( - ) + self.log_stats = log_stats self.use_cached_outputs = use_cached_outputs @@ -334,15 +322,15 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.seq_counter = Counter() self.generation_config_fields = _load_generation_config_dict( - model_config) + self.model_config) - self.input_preprocessor = InputPreprocessor(model_config, + self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer, mm_registry) self.input_registry = input_registry self.input_processor = input_registry.create_input_processor( - model_config) + self.model_config) self.model_executor = executor_class(vllm_config=vllm_config, ) @@ -354,36 +342,36 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: from vllm.model_executor.model_loader import ( get_architecture_class_name) usage_message.report_usage( - get_architecture_class_name(model_config), + get_architecture_class_name(self.model_config), usage_context, extra_kvs={ # Common configuration "dtype": - str(model_config.dtype), + str(self.model_config.dtype), "tensor_parallel_size": - parallel_config.tensor_parallel_size, + self.parallel_config.tensor_parallel_size, "block_size": - cache_config.block_size, + self.cache_config.block_size, "gpu_memory_utilization": - cache_config.gpu_memory_utilization, + self.cache_config.gpu_memory_utilization, # Quantization "quantization": - model_config.quantization, + self.model_config.quantization, "kv_cache_dtype": - str(cache_config.cache_dtype), + str(self.cache_config.cache_dtype), # Feature flags "enable_lora": - bool(lora_config), + bool(self.lora_config), "enable_prompt_adapter": - bool(prompt_adapter_config), + bool(self.prompt_adapter_config), "enable_prefix_caching": - cache_config.enable_prefix_caching, + self.cache_config.enable_prefix_caching, "enforce_eager": - model_config.enforce_eager, + self.model_config.enforce_eager, "disable_custom_all_reduce": - parallel_config.disable_custom_all_reduce, + self.parallel_config.disable_custom_all_reduce, }) if self.tokenizer: @@ -402,7 +390,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: for _ in range(self.parallel_config.pipeline_parallel_size) ] - if model_config.use_async_output_proc: + if self.model_config.use_async_output_proc: process_model_outputs = weak_bind(self._process_model_outputs) self.async_callbacks = [ @@ -422,11 +410,11 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: # GPU and CPU blocks, which are profiled in the distributed executor. self.scheduler = [ Scheduler( - scheduler_config, cache_config, lora_config, - parallel_config.pipeline_parallel_size, + self.scheduler_config, self.cache_config, self.lora_config, + self.parallel_config.pipeline_parallel_size, self.async_callbacks[v_id] - if model_config.use_async_output_proc else None) - for v_id in range(parallel_config.pipeline_parallel_size) + if self.model_config.use_async_output_proc else None) + for v_id in range(self.parallel_config.pipeline_parallel_size) ] # Metric Logging. @@ -448,7 +436,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: "prometheus": PrometheusStatLogger( local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - labels=dict(model_name=model_config.served_model_name), + labels=dict( + model_name=self.model_config.served_model_name), max_model_len=self.model_config.max_model_len), } self.stat_loggers["prometheus"].info("cache_config", From d345f409b7478c0e547b238916ec9e90b6156bbc Mon Sep 17 00:00:00 2001 From: Zhonghua Deng Date: Sat, 23 Nov 2024 09:16:15 +0800 Subject: [PATCH 114/255] [V1] EngineCore supports profiling (#10564) Signed-off-by: Abatom --- vllm/v1/engine/__init__.py | 6 ++++++ vllm/v1/engine/async_llm.py | 4 ++-- vllm/v1/engine/core.py | 14 ++++++++++++-- vllm/v1/engine/core_client.py | 28 +++++++++++++++++++++++----- vllm/v1/worker/gpu_worker.py | 25 +++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 9 deletions(-) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index edfb8bd7c2fc1..967124fd850ea 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -68,6 +68,11 @@ class EngineCoreOutputs(msgspec.Struct, outputs: List[EngineCoreOutput] +@dataclass +class EngineCoreProfile: + is_start: bool + + class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets @@ -75,3 +80,4 @@ class EngineCoreRequestType(enum.Enum): """ ADD = b'\x00' ABORT = b'\x01' + PROFILE = b'\x02' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 09bff9655a882..c44ebb2a85ba0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -346,10 +346,10 @@ async def check_health(self) -> None: logger.debug("Called check_health.") async def start_profile(self) -> None: - raise ValueError("Not supported on V1 yet.") + await self.engine_core.profile(True) async def stop_profile(self) -> None: - raise ValueError("Not supported on V1 yet.") + await self.engine_core.profile(False) @property def is_running(self) -> bool: diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 35ed131d50de9..1a978fbe7355f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -1,4 +1,5 @@ import multiprocessing +import pickle import queue import threading import time @@ -16,7 +17,8 @@ from vllm.usage.usage_lib import UsageContext from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreRequest, EngineCoreRequestType) + EngineCoreProfile, EngineCoreRequest, + EngineCoreRequestType) from vllm.v1.engine.mm_input_mapper import MMInputMapper from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.request import Request, RequestStatus @@ -126,6 +128,9 @@ def step(self) -> List[EngineCoreOutput]: scheduler_output, output) return engine_core_outputs + def profile(self, is_start=True): + self.model_executor.worker.profile(is_start) + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -312,11 +317,14 @@ def _log_stats(self): self._last_logging_time = now def _handle_client_request( - self, request: Union[EngineCoreRequest, List[str]]) -> None: + self, request: Union[EngineCoreRequest, EngineCoreProfile, + List[str]]) -> None: """Handle EngineCoreRequest or EngineCoreABORT from Client.""" if isinstance(request, EngineCoreRequest): self.add_request(request) + elif isinstance(request, EngineCoreProfile): + self.model_executor.worker.profile(request.is_start) else: # TODO: make an EngineCoreAbort wrapper assert isinstance(request, list) @@ -341,6 +349,8 @@ def process_input_socket(self, input_path: str): request = decoder_add_req.decode(request_data) elif request_type == EngineCoreRequestType.ABORT.value: request = decoder_abort_req.decode(request_data) + elif request_type == EngineCoreRequestType.PROFILE.value: + request = pickle.loads(request_data) else: raise ValueError(f"Unknown RequestType: {request_type}") diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 09801e20e16ca..835963f7ee86c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -9,7 +9,8 @@ from vllm.logger import init_logger from vllm.utils import get_open_zmq_ipc_path from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, - EngineCoreRequest, EngineCoreRequestType) + EngineCoreProfile, EngineCoreRequest, + EngineCoreRequestType) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.serial_utils import PickleEncoder @@ -58,6 +59,9 @@ def get_output(self) -> List[EngineCoreOutput]: def add_request(self, request: EngineCoreRequest) -> None: raise NotImplementedError + async def profile(self, is_start=True) -> None: + raise NotImplementedError + def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -95,6 +99,9 @@ def add_request(self, request: EngineCoreRequest) -> None: def abort_requests(self, request_ids: List[str]) -> None: self.engine_core.abort_requests(request_ids) + async def profile(self, is_start=True) -> None: + self.engine_core.profile(is_start) + class MPClient(EngineCoreClient): """ @@ -177,8 +184,10 @@ def get_output(self) -> List[EngineCoreOutput]: engine_core_outputs = self.decoder.decode(frame.buffer).outputs return engine_core_outputs - def _send_input(self, request_type: EngineCoreRequestType, - request: Union[EngineCoreRequest, List[str]]) -> None: + def _send_input( + self, request_type: EngineCoreRequestType, + request: Union[EngineCoreRequest, EngineCoreProfile, + List[str]]) -> None: # (RequestType, SerializedRequest) msg = (request_type.value, self.encoder.encode(request)) @@ -190,6 +199,10 @@ def add_request(self, request: EngineCoreRequest) -> None: def abort_requests(self, request_ids: List[str]) -> None: self._send_input(EngineCoreRequestType.ABORT, request_ids) + async def profile(self, is_start=True) -> None: + self._send_input(EngineCoreRequestType.PROFILE, + EngineCoreProfile(is_start)) + class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" @@ -205,8 +218,9 @@ async def get_output_async(self) -> List[EngineCoreOutput]: return engine_core_outputs async def _send_input( - self, request_type: EngineCoreRequestType, - request: Union[EngineCoreRequest, List[str]]) -> None: + self, request_type: EngineCoreRequestType, + request: Union[EngineCoreRequest, EngineCoreProfile, + List[str]]) -> None: msg = (request_type.value, self.encoder.encode(request)) await self.input_socket.send_multipart(msg, copy=False) @@ -217,3 +231,7 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: async def abort_requests_async(self, request_ids: List[str]) -> None: if len(request_ids) > 0: await self._send_input(EngineCoreRequestType.ABORT, request_ids) + + async def profile(self, is_start=True) -> None: + await self._send_input(EngineCoreRequestType.PROFILE, + EngineCoreProfile(is_start)) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 7973349f14a5d..d33b55a8a9f9a 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,6 +6,7 @@ import torch import torch.distributed +import vllm.envs as envs from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, @@ -56,6 +57,22 @@ def __init__( init_cached_hf_modules() self.model_runner = GPUModelRunner(vllm_config) + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None def initialize(self): if self.device_config.device.type == "cuda": @@ -184,6 +201,14 @@ def execute_model( # TODO(woosuk): Send the output to the engine process. return output + def profile(self, is_start=True): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + if is_start: + self.profiler.start() + else: + self.profiler.stop() + def init_worker_distributed_environment( parallel_config: ParallelConfig, From d559979c548c4bee6eca089d5e6dc318630bf465 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Nov 2024 17:34:03 -0800 Subject: [PATCH 115/255] [bugfix] fix cpu tests (#10585) Signed-off-by: youkaichao --- vllm/worker/cpu_embedding_model_runner.py | 4 +++- vllm/worker/cpu_enc_dec_model_runner.py | 4 +++- vllm/worker/cpu_model_runner.py | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index d0b8fec48d74f..978de73df6b70 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -3,6 +3,7 @@ import torch +from vllm.forward_context import set_forward_context from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import MultiModalKwargs from vllm.pooling_params import PoolingParams @@ -64,7 +65,8 @@ def execute_model( intermediate_tensors, } - hidden_states = model_executable(**execute_model_kwargs) + with set_forward_context(model_input.attn_metadata, self.vllm_config): + hidden_states = model_executable(**execute_model_kwargs) # Only perform pooling in the driver worker. if not self.is_driver_worker: diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index d040831870bd8..1f8e2d2d88a23 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -4,6 +4,7 @@ import torch from vllm.attention import AttentionMetadata +from vllm.forward_context import set_forward_context from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MultiModalKwargs @@ -303,7 +304,8 @@ def execute_model( intermediate_tensors, } - hidden_states = model_executable(**execute_model_kwargs) + with set_forward_context(model_input.attn_metadata, self.vllm_config): + hidden_states = model_executable(**execute_model_kwargs) # Compute the logits. logits = self.model.compute_logits(hidden_states, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 66bd844c94901..2cf573625401a 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -10,6 +10,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import VllmConfig +from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -487,14 +488,15 @@ def execute_model( multimodal_kwargs = MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs, device=self.device) - hidden_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **multimodal_kwargs, - ) + with set_forward_context(model_input.attn_metadata, self.vllm_config): + hidden_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **multimodal_kwargs, + ) # Compute the logits. logits = self.model.compute_logits(hidden_states, From 9195dbdbcadb681db67181a664521bd6ef98deee Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 22 Nov 2024 19:17:38 -0700 Subject: [PATCH 116/255] [Bugfix][Frontend] Update Llama Chat Templates to also support Non-Tool use (#10164) Signed-off-by: Travis Johnson --- .../tool_chat_template_llama3.1_json.jinja | 46 +++++++-- .../tool_chat_template_llama3.2_json.jinja | 96 ++++++++++++++----- tests/entrypoints/test_chat_utils.py | 4 +- 3 files changed, 110 insertions(+), 36 deletions(-) diff --git a/examples/tool_chat_template_llama3.1_json.jinja b/examples/tool_chat_template_llama3.1_json.jinja index c24a7e51335ef..033830936a56b 100644 --- a/examples/tool_chat_template_llama3.1_json.jinja +++ b/examples/tool_chat_template_llama3.1_json.jinja @@ -19,10 +19,18 @@ {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} - {%- set system_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} - {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- if tools is not none %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} {%- endif %} {#- System message #} @@ -33,8 +41,8 @@ {{- "Cutting Knowledge Date: December 2023\n" }} {{- "Today Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} - {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call. " }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} @@ -48,7 +56,11 @@ {%- if tools_in_user_message and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if messages | length != 0 %} - {%- set first_user_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- else %} + {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} @@ -56,7 +68,7 @@ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} @@ -67,7 +79,17 @@ {%- for message in messages %} {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] | trim}} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- content['text'] | trim }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '<|eot_id|>' }} {%- elif 'tool_calls' in message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only supports single tool-calls at once!") }} @@ -81,10 +103,14 @@ {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is mapping %} - {{- message.content | tojson }} - {%- else %} + {%- if message.content is string %} {{- { "output": message.content } | tojson }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- { "output": content['text'] } | tojson }} + {%- endif %} + {%- endfor %} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} diff --git a/examples/tool_chat_template_llama3.2_json.jinja b/examples/tool_chat_template_llama3.2_json.jinja index 7e24777726a35..39f902c1c3c40 100644 --- a/examples/tool_chat_template_llama3.2_json.jinja +++ b/examples/tool_chat_template_llama3.2_json.jinja @@ -16,38 +16,70 @@ {%- set tools = none %} {%- endif %} +{#- Find out if there are any images #} +{% set image_ns = namespace(has_images=false) %} +{%- for message in messages %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {%- set image_ns.has_images = true %} + {%- endif %} + {%- endfor %} +{%- endfor %} + + {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} - {%- set system_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {#- Support vLLM's transforming of a content string to JSON. #} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} - {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- if tools is not none %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} {%- endif %} -{#- System message #} -{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{%- if tools is not none %} - {{- "Environment: ipython\n" }} +{#- Including an image is not compatible with a system message #} +{%- if image_ns.has_images and not system_message == "" %} + {{- raise_exception("Prompting with images is incompatible with system messages and tool use.") }} {%- endif %} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} -{%- if tools is not none and not tools_in_user_message %} - {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} - {{- "Do not use variables.\n\n" }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} + + +{#- System message, if there are no images #} +{%- if not image_ns.has_images %} + {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} + {%- if tools is not none %} + {{- "Environment: ipython\n" }} + {%- endif %} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call. " }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- system_message }} + {{- "<|eot_id|>" }} {%- endif %} -{{- system_message }} -{{- "<|eot_id|>" }} {#- Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if messages | length != 0 %} - {%- set first_user_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- else %} + {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} @@ -55,7 +87,7 @@ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} @@ -66,7 +98,19 @@ {%- for message in messages %} {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] | trim}} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] | trim }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '<|eot_id|>' }} {%- elif 'tool_calls' in message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only supports single tool-calls at once!") }} @@ -80,10 +124,14 @@ {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is mapping %} - {{- message.content | tojson }} - {%- else %} + {%- if message.content is string %} {{- { "output": message.content } | tojson }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- { "output": content['text'] } | tojson }} + {%- endif %} + {%- endfor %} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 72477e048eafa..996e60bfee592 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -766,8 +766,8 @@ def test_resolve_content_format_hf_defined(model, expected_format): ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), ("tool_chat_template_internlm2_tool.jinja", "string"), - ("tool_chat_template_llama3.1_json.jinja", "string"), - ("tool_chat_template_llama3.2_json.jinja", "string"), + ("tool_chat_template_llama3.1_json.jinja", "openai"), + ("tool_chat_template_llama3.2_json.jinja", "openai"), ("tool_chat_template_mistral_parallel.jinja", "string"), ("tool_chat_template_mistral.jinja", "string")], ) From ebda51968b12b85c8b5b82727b2b7713dfc44f88 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 22 Nov 2024 21:23:51 -0500 Subject: [PATCH 117/255] [Core] Fix broken log configuration (#10458) Signed-off-by: Russell Bryant --- examples/logging_configuration.md | 2 +- vllm/logger.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/logging_configuration.md b/examples/logging_configuration.md index 0d278b0392403..9ac8b13cd5eaf 100644 --- a/examples/logging_configuration.md +++ b/examples/logging_configuration.md @@ -118,7 +118,7 @@ configuration for the root vLLM logger and for the logger you wish to silence: { "formatters": { "vllm": { - "class": "vllm.logging.NewLineFormatter", + "class": "vllm.logging_utils.NewLineFormatter", "datefmt": "%m-%d %H:%M:%S", "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" } diff --git a/vllm/logger.py b/vllm/logger.py index 9e16e591315ba..538db0dcf19aa 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -50,7 +50,7 @@ def _configure_vllm_root_logger() -> None: - logging_config: Optional[Dict] = None + logging_config: Dict = {} if not VLLM_CONFIGURE_LOGGING and VLLM_LOGGING_CONFIG_PATH: raise RuntimeError( @@ -75,6 +75,11 @@ def _configure_vllm_root_logger() -> None: type(custom_config).__name__) logging_config = custom_config + for formatter in logging_config.get("formatters", {}).values(): + # This provides backwards compatibility after #10134. + if formatter.get("class") == "vllm.logging.NewLineFormatter": + formatter["class"] = "vllm.logging_utils.NewLineFormatter" + if logging_config: dictConfig(logging_config) From 978b39744b22e90d49a0f5367c3d933ed26d66c8 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 22 Nov 2024 22:14:03 -0500 Subject: [PATCH 118/255] [Misc] Add pynccl wrappers for all_gather and reduce_scatter (#9432) --- tests/distributed/test_pynccl.py | 69 +++++++++++++++++++ .../device_communicators/pynccl.py | 42 +++++++++++ .../device_communicators/pynccl_wrapper.py | 44 ++++++++++++ 3 files changed, 155 insertions(+) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index e0e424439e3a5..f702d7c46ea73 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -150,6 +150,75 @@ def worker_fn_with_cudagraph(): assert a.mean().cpu().item() == pynccl_comm.world_size**1 +@worker_fn_wrapper +def all_gather_worker_fn(): + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + + rank = pynccl_comm.rank + world_size = pynccl_comm.world_size + device = f'cuda:{pynccl_comm.rank}' + + num_elems = 1000 + tensor = torch.arange(num_elems, dtype=torch.float32, + device=device) + rank * num_elems + result = torch.zeros(num_elems * world_size, + dtype=torch.float32, + device=device) + + expected = torch.cat([ + torch.arange(num_elems, dtype=torch.float32) + r * num_elems + for r in range(world_size) + ]).to(device) + + with pynccl_comm.change_state(enable=True): + pynccl_comm.all_gather(result, tensor) + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_all_gather(): + distributed_run(all_gather_worker_fn, 2) + + +@worker_fn_wrapper +def reduce_scatter_worker_fn(): + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + + rank = pynccl_comm.rank + world_size = pynccl_comm.world_size + device = f'cuda:{pynccl_comm.rank}' + + num_elems = 1000 + tensor = torch.arange(num_elems, dtype=torch.float32, + device=device) + rank * num_elems + assert (num_elems % world_size == 0) + result = torch.zeros(num_elems // world_size, + dtype=torch.float32, + device=device) + + # Calculate expected result for this rank's chunk + scattered_size = num_elems // world_size + all_tensors = [ + torch.arange(num_elems, dtype=torch.float32) + r * num_elems + for r in range(world_size) + ] + expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] + for tensor in all_tensors).to(device) + + with pynccl_comm.change_state(enable=True): + pynccl_comm.reduce_scatter(result, tensor) + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_reduce_scatter(): + distributed_run(reduce_scatter_worker_fn, 2) + + @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl_with_cudagraph(): diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 7c6f48e88637b..7411304eb18fa 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -131,6 +131,48 @@ def all_reduce(self, ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) + def all_gather(self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + stream=None): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}") + if stream is None: + stream = self.stream + self.nccl.ncclAllGather( + buffer_type(input_tensor.data_ptr()), + buffer_type(output_tensor.data_ptr()), input_tensor.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), self.comm, + cudaStream_t(stream.cuda_stream)) + + def reduce_scatter(self, + output_tensor: torch.Tensor, + input_tensor: torch.Tensor, + op: ReduceOp = ReduceOp.SUM, + stream=None): + if self.disabled: + return + # nccl communicator created on a specific device + # will only work on tensors on the same device + # otherwise it will cause "illegal memory access" + assert input_tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {input_tensor.device}") + if stream is None: + stream = self.stream + self.nccl.ncclReduceScatter( + buffer_type(input_tensor.data_ptr()), + buffer_type(output_tensor.data_ptr()), output_tensor.numel(), + ncclDataTypeEnum.from_torch(input_tensor.dtype), + ncclRedOpTypeEnum.from_torch(op), self.comm, + cudaStream_t(stream.cuda_stream)) + def send(self, tensor: torch.Tensor, dst: int, stream=None): if self.disabled: return diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 7619c98f22148..ff88f72470b27 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -151,6 +151,28 @@ class NCCLLibrary: ncclRedOp_t, ncclComm_t, cudaStream_t ]), + # ncclResult_t ncclAllGather( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function("ncclAllGather", ncclResult_t, [ + buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t, + ncclComm_t, cudaStream_t + ]), + + # ncclResult_t ncclReduceScatter( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, + # cudaStream_t stream); + # note that cudaStream_t is a pointer type, so the last argument + # is a pointer + Function("ncclReduceScatter", ncclResult_t, [ + buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t, + ncclRedOp_t, ncclComm_t, cudaStream_t + ]), + # ncclResult_t ncclSend( # const void* sendbuff, size_t count, ncclDataType_t datatype, # int dest, ncclComm_t comm, cudaStream_t stream); @@ -258,6 +280,28 @@ def ncclAllReduce(self, sendbuff: buffer_type, recvbuff: buffer_type, datatype, op, comm, stream)) + def ncclReduceScatter(self, sendbuff: buffer_type, recvbuff: buffer_type, + count: int, datatype: int, op: int, comm: ncclComm_t, + stream: cudaStream_t) -> None: + # `datatype` actually should be `ncclDataType_t` + # and `op` should be `ncclRedOp_t` + # both are aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK(self._funcs["ncclReduceScatter"](sendbuff, recvbuff, + count, datatype, op, + comm, stream)) + + def ncclAllGather(self, sendbuff: buffer_type, recvbuff: buffer_type, + count: int, datatype: int, comm: ncclComm_t, + stream: cudaStream_t) -> None: + # `datatype` actually should be `ncclDataType_t` + # which is an aliases of `ctypes.c_int` + # when we pass int to a function, it will be converted to `ctypes.c_int` + # by ctypes automatically + self.NCCL_CHECK(self._funcs["ncclAllGather"](sendbuff, recvbuff, count, + datatype, comm, stream)) + def ncclSend(self, sendbuff: buffer_type, count: int, datatype: int, dest: int, comm: ncclComm_t, stream: cudaStream_t) -> None: self.NCCL_CHECK(self._funcs["ncclSend"](sendbuff, count, datatype, From 4aba6e3d1a0cc5cec45efdee0adeaa09278f7518 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 22 Nov 2024 20:13:54 -0800 Subject: [PATCH 119/255] [core] gemma2 full context length support (#10584) Signed-off-by: youkaichao --- .../test_basic_correctness.py | 25 +++++++++++----- vllm/attention/layer.py | 12 ++++++-- vllm/config.py | 29 +++++++++++++------ vllm/model_executor/models/gemma2.py | 13 +++++---- 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 7f16baa65a644..fcba253d159f3 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -14,11 +14,12 @@ from vllm.platforms import current_platform from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +from ..conftest import VllmRunner from ..models.utils import check_outputs_equal from ..utils import multi_gpu_test MODELS = [ - "facebook/opt-125m", + "google/gemma-2-2b-it", "meta-llama/Llama-3.2-1B", ] @@ -42,8 +43,6 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("enforce_eager", [False, True]) def test_models( hf_runner, - vllm_runner, - example_prompts, model: str, backend: str, dtype: str, @@ -54,15 +53,27 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") + if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + pytest.skip( + "XFORMERS does not support gemma2 with full context length.") + os.environ["VLLM_ATTENTION_BACKEND"] = backend + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 tokens to test the sliding window + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) as vllm_model: + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) check_outputs_equal( diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8acbeaf12b0cf..cb4dedf481c77 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -40,18 +40,26 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + per_layer_sliding_window: Optional[int] = None, prefix: str = "", ) -> None: super().__init__() + if per_layer_sliding_window is not None: + # per-layer sliding window + sliding_window = per_layer_sliding_window + elif cache_config is not None: + # model-level sliding window + sliding_window = cache_config.sliding_window + else: + sliding_window = None + if cache_config is not None: kv_cache_dtype = cache_config.cache_dtype block_size = cache_config.block_size - sliding_window = cache_config.sliding_window is_attention_free = cache_config.is_attention_free else: kv_cache_dtype = "auto" block_size = 16 - sliding_window = None is_attention_free = False if num_kv_heads is None: num_kv_heads = num_heads diff --git a/vllm/config.py b/vllm/config.py index bb02c2ad4c7d4..730b069e076fb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -233,15 +233,26 @@ def __init__( (self.hf_text_config.model_type in ["gemma2"])) if (not self.disable_sliding_window and has_interleaved_attention): - sliding_window_len_min = get_min_sliding_window( - self.hf_text_config.sliding_window) - - print_warning_once( - f"{self.hf_text_config.model_type} has interleaved attention, " - "which is currently not supported by vLLM. Disabling sliding " - "window and capping the max length to the sliding window size " - f"({sliding_window_len_min}).") - self.disable_sliding_window = True + if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + sliding_window_len_min = get_min_sliding_window( + self.hf_text_config.sliding_window) + + print_warning_once( + f"{self.hf_text_config.model_type} has interleaved " + "attention, which is currently not supported by the " + "XFORMERS backend. Disabling sliding window and capping " + "the max length to the sliding window size " + f"({sliding_window_len_min}).") + self.disable_sliding_window = True + else: + # for a model with interleaved attention, + # the scheduler and the model treat it as full attention + # (i.e., not dropping any tokens outside the window). + # only the attention layer itself is aware of the sliding + # window, and use the window size to compute the attention. + self.hf_text_config.interleaved_sliding_window = sliding_window + delattr(self.hf_text_config, "sliding_window") + sliding_window = None self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 839130364ef4d..9309cced61bb3 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -143,12 +143,12 @@ def __init__(self, is_neox_style=True, ) - # FIXME(woosuk): While Gemma 2 uses sliding window attention for every - # odd layer, vLLM currently ignores it and uses global attention for - # all layers. - use_sliding_window = (layer_idx % 2 == 1 - and config.sliding_window is not None) - del use_sliding_window # Unused. + # reference: + # https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa + use_sliding_window = (layer_idx % 2 == 0 and + config.interleaved_sliding_window is not None) + sliding_window = config.interleaved_sliding_window if \ + use_sliding_window else None self.attn = Attention(self.num_heads, self.head_dim, self.scaling, @@ -156,6 +156,7 @@ def __init__(self, cache_config=cache_config, quant_config=quant_config, logits_soft_cap=attn_logits_soft_cap, + per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn") def forward( From 7d8ffb344f3b9a571d94073644b829eb4baa0a65 Mon Sep 17 00:00:00 2001 From: Varun Vinayak Shenoy Date: Fri, 22 Nov 2024 21:13:29 -0800 Subject: [PATCH 120/255] [Bugfix] Internal Server Error when tool_choice is incorrect. (#10567) Signed-off-by: Varun Shenoy --- tests/entrypoints/openai/test_chat.py | 14 ++++++++++++++ vllm/entrypoints/openai/protocol.py | 12 ++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 843d15e768093..8d23a2be6f9bb 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -829,6 +829,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, "name": "nondefined_function_name" } }) + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_completion_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema + } + }], + tool_choice={}) @pytest.mark.asyncio diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 9db5951e5fe5b..f343732174014 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -478,17 +478,17 @@ def check_tool_usage(cls, data): # it matches a valid tool if isinstance(data["tool_choice"], dict): valid_tool = False - specified_function = data["tool_choice"]["function"] + specified_function = data["tool_choice"].get("function") if not specified_function: raise ValueError( - "Incorrectly formatted `tool_choice`. Should be like " - "`{\"type\": \"function\"," + "Expected field `function` in `tool_choice`." + " Correct usage: `{\"type\": \"function\"," " \"function\": {\"name\": \"my_function\"}}`") - specified_function_name = specified_function["name"] + specified_function_name = specified_function.get("name") if not specified_function_name: raise ValueError( - "Incorrectly formatted `tool_choice`. Should be like " - "`{\"type\": \"function\", " + "Expected field `name` in `function` in `tool_choice`." + "Correct usage: `{\"type\": \"function\", " "\"function\": {\"name\": \"my_function\"}}`") for tool in data["tools"]: if tool["function"]["name"] == specified_function_name: From cfea9c04ef43420be594f23fc1773009d1fe88c3 Mon Sep 17 00:00:00 2001 From: Chen Wu <72850361+CNTRYROA@users.noreply.github.com> Date: Sat, 23 Nov 2024 13:13:59 +0800 Subject: [PATCH 121/255] [Model] Fix Baichuan BNB online quantization (#10572) Signed-off-by: Chen Wu --- vllm/model_executor/models/baichuan.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index a923ed36a9db2..39cb5a8b2cbbe 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -350,6 +350,21 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".W_pack.", + ".o_proj.", + ".down_proj.", + ".up_proj.", + ".gate_proj.", + ".up_proj.", + ] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + def __init__( self, *, From 02a43f82a97e37581b48f1c177d3393aca4fe3f2 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sat, 23 Nov 2024 00:14:19 -0500 Subject: [PATCH 122/255] Update default max_num_batch_tokens for chunked prefill to 2048 (#10544) --- vllm/config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 730b069e076fb..42a44f5415e9f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1133,9 +1133,9 @@ def __post_init__(self) -> None: # max_num_batched_tokens. self.max_num_batched_tokens = max(self.max_model_len, 2048) else: - # It is the values that have the best balance between ITL - # and TTFT on A100. Note it is not optimized for throughput. - self.max_num_batched_tokens = 512 + # This value is chosen to have a balance between ITL + # and TTFT. Note it is not optimized for throughput. + self.max_num_batched_tokens = 2048 else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. From 7c25fe45a6ef4fb5be148217cc7110e88e186446 Mon Sep 17 00:00:00 2001 From: kliuae <17350011+kliuae@users.noreply.github.com> Date: Sat, 23 Nov 2024 13:14:49 +0800 Subject: [PATCH 123/255] [AMD] Add support for GGUF quantization on ROCm (#10254) --- .buildkite/run-amd-test.sh | 1 - CMakeLists.txt | 2 +- csrc/ops.h | 2 + csrc/quantization/gguf/ggml-common.h | 17 +- csrc/quantization/gguf/gguf_kernel.cu | 6 +- csrc/quantization/gguf/mmq.cuh | 70 +++---- csrc/quantization/gguf/mmvq.cuh | 4 +- csrc/quantization/gguf/vecdotq.cuh | 286 +++++++++++++------------- csrc/torch_bindings.cpp | 2 + vllm/_custom_ops.py | 53 ++--- vllm/config.py | 2 +- 11 files changed, 234 insertions(+), 211 deletions(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 902e162720b89..3515ccd65667e 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -85,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flashinfer.py \ - --ignore=kernels/test_gguf.py \ --ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_mamba_ssm.py \ diff --git a/CMakeLists.txt b/CMakeLists.txt index bfe435937e3bb..ff34225537cdd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -196,6 +196,7 @@ set(VLLM_EXT_SRC "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" + "csrc/quantization/gguf/gguf_kernel.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/torch_bindings.cpp") @@ -237,7 +238,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" - "csrc/quantization/gguf/gguf_kernel.cu" "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu") diff --git a/csrc/ops.h b/csrc/ops.h index 672e608e9c47e..ea001190bc202 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -128,6 +128,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel, int64_t thx, int64_t thy); torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm); +#endif torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, int64_t n); @@ -138,6 +139,7 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row); +#ifndef USE_ROCM bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, diff --git a/csrc/quantization/gguf/ggml-common.h b/csrc/quantization/gguf/ggml-common.h index fba94fd1d157b..d42205a6571db 100644 --- a/csrc/quantization/gguf/ggml-common.h +++ b/csrc/quantization/gguf/ggml-common.h @@ -1,7 +1,7 @@ // copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-common.h #define QK_K 256 #define K_QUANTS_PER_ITERATION 2 -#define WARP_SIZE 32 +#define WARP_SIZE_GGUF 32 #define K_SCALE_SIZE 12 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 #define CUDA_QUANTIZE_BLOCK_SIZE 256 @@ -1112,4 +1112,19 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #endif return c; } + +static __device__ __forceinline__ uint32_t __vcmpeq4(const uint32_t a, const uint32_t b) { + uint32_t neq = a^b; + return !(neq & 0xff000000) * 0xff000000 | + !(neq & 0x00ff0000) * 0x00ff0000 | + !(neq & 0x0000ff00) * 0x0000ff00 | + !(neq & 0x000000ff) * 0x000000ff; +} + +static __device__ __forceinline__ uint32_t __vsub4(const uint32_t a, const uint32_t b) { + return (static_cast(((a & 0xff000000) >> 24) - ((b & 0xff000000) >> 24)) << 24) + + (static_cast(((a & 0x00ff0000) >> 16) - ((b & 0x00ff0000) >> 16)) << 16) + + (static_cast(((a & 0x0000ff00) >> 8) - ((b & 0x0000ff00) >> 8)) << 8) + + (static_cast(((a & 0x000000ff) >> 0) - ((b & 0x000000ff) >> 0)) << 0); +} #endif // defined(USE_ROCM) diff --git a/csrc/quantization/gguf/gguf_kernel.cu b/csrc/quantization/gguf/gguf_kernel.cu index 37e4de4e14dd3..5f0eaf5a973fb 100644 --- a/csrc/quantization/gguf/gguf_kernel.cu +++ b/csrc/quantization/gguf/gguf_kernel.cu @@ -4,6 +4,8 @@ #include #include +#include "cuda_compat.h" + #include "ggml-common.h" #include "vecdotq.cuh" #include "dequantize.cuh" @@ -32,8 +34,8 @@ static __global__ void quantize_q8_1(const half* __restrict__ x, #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32)); - sum += __shfl_xor_sync(0xffffffff, sum, mask, 32); + amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32)); + sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32); } const float d = amax / 127; diff --git a/csrc/quantization/gguf/mmq.cuh b/csrc/quantization/gguf/mmq.cuh index d13efd5965313..c935faa07df0c 100644 --- a/csrc/quantization/gguf/mmq.cuh +++ b/csrc/quantization/gguf/mmq.cuh @@ -10,7 +10,7 @@ static __device__ __forceinline__ void mul_mat_q( const int blocks_per_row_x = ncols_x / qk; const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE / qi; + const int blocks_per_warp = WARP_SIZE_GGUF / qi; const int & ncols_dst = ncols_y; @@ -27,10 +27,10 @@ static __device__ __forceinline__ void mul_mat_q( allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; + __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF/QI8_1]; - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + float sum[mmq_y/WARP_SIZE_GGUF][mmq_x/nwarps] = {{0.0f}}; for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { @@ -39,26 +39,26 @@ static __device__ __forceinline__ void mul_mat_q( #pragma unroll for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir*WARP_SIZE + threadIdx.x; + const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x; const int kbxd = kqs / QI8_1; #pragma unroll for (int i = 0; i < mmq_x; i += nwarps) { const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + const int index_y = (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF; tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); } #pragma unroll for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1); const int col_y_eff = min(col_y_0 + ids, ncols_y-1); // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE_GGUF/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE_GGUF/QI8_1) + kby]; if (need_sum) { *dsi_dst = *dsi_src; } else { @@ -70,12 +70,12 @@ static __device__ __forceinline__ void mul_mat_q( __syncthreads(); // #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { + for (int k = ir*WARP_SIZE_GGUF/qr; k < (ir+1)*WARP_SIZE_GGUF/qr; k += vdr) { #pragma unroll for (int j = 0; j < mmq_x; j += nwarps) { #pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - sum[i/WARP_SIZE][j/nwarps] += vec_dot( + for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { + sum[i/WARP_SIZE_GGUF][j/nwarps] += vec_dot( tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, threadIdx.x + i, threadIdx.y + j, k); } @@ -93,12 +93,12 @@ static __device__ __forceinline__ void mul_mat_q( } #pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { + for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { const int row_dst = row_dst_0 + threadIdx.x + i; if (row_dst >= nrows_dst) { continue; } - dst[col_dst*nrows_dst + row_dst] = __float2half(sum[i/WARP_SIZE][j/nwarps]); + dst[col_dst*nrows_dst + row_dst] = __float2half(sum[i/WARP_SIZE_GGUF][j/nwarps]); } } } @@ -115,7 +115,7 @@ static __device__ __forceinline__ void mul_mat_q( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q4_0, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_0, 2) #endif mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -140,7 +140,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -165,7 +165,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q4_1, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_1, 2) #endif mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -190,7 +190,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -215,7 +215,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q5_0, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_0, 2) #endif mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -240,7 +240,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -265,7 +265,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q5_1, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_1, 2) #endif mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -289,7 +289,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -314,7 +314,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q8_0, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q8_0, 2) #endif mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -338,7 +338,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -363,7 +363,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q2_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q2_K, 2) #endif mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -387,7 +387,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -412,7 +412,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q3_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q3_K, 2) #endif mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -438,7 +438,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -463,7 +463,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q4_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_K, 2) #endif mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -487,7 +487,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -512,7 +512,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q5_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_K, 2) #endif mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -537,7 +537,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -562,7 +562,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q6_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q6_K, 2) #endif mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -586,7 +586,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; diff --git a/csrc/quantization/gguf/mmvq.cuh b/csrc/quantization/gguf/mmvq.cuh index b221ae7896138..b01e939808a3f 100644 --- a/csrc/quantization/gguf/mmvq.cuh +++ b/csrc/quantization/gguf/mmvq.cuh @@ -28,8 +28,8 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * // sum up partial sums and write back result #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { + tmp += VLLM_SHFL_XOR_SYNC(tmp, mask); } if (threadIdx.x == 0) { diff --git a/csrc/quantization/gguf/vecdotq.cuh b/csrc/quantization/gguf/vecdotq.cuh index d5af345a6b26f..e00422637c65b 100644 --- a/csrc/quantization/gguf/vecdotq.cuh +++ b/csrc/quantization/gguf/vecdotq.cuh @@ -43,7 +43,7 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( const int * v, const int * u, const float & d4, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -68,7 +68,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( const int * v, const int * u, const half2 & dm4, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -95,7 +95,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -128,7 +128,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -162,7 +162,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( const int * v, const int * u, const float & d8_0, const float & d8_1) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -176,7 +176,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( const int * v, const int * u, const half2 & dm8, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; @@ -202,7 +202,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, const half2 & dm2, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -230,7 +230,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, const half2 & dm2, const float & d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi_d = 0; int sumi_m = 0; @@ -267,7 +267,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, const int & scale_offset, const float & d3, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf = 0.0f; @@ -301,7 +301,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, const float & d3, const float & d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -326,7 +326,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -351,7 +351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -382,7 +382,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -413,7 +413,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -445,7 +445,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, const float & d, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf = 0.0f; #pragma unroll @@ -465,7 +465,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, const float & d6, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; #pragma unroll @@ -507,8 +507,8 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } @@ -529,11 +529,11 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbx] = bxi->d; } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_0; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -543,7 +543,7 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d); } } @@ -559,13 +559,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_0) % WARP_SIZE_GGUF]; } return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -587,8 +587,8 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } @@ -608,10 +608,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_1; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -621,7 +621,7 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; } } @@ -634,13 +634,13 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_1) % WARP_SIZE_GGUF]; } return vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -664,8 +664,8 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; @@ -697,7 +697,7 @@ template static __device__ __forceinlin qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0; int qs1 = (ql >> 4) & 0x0F0F0F0F; qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 @@ -706,10 +706,10 @@ template static __device__ __forceinlin qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1; } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_0; const int kbxd = k % blocks_per_tile_x_row; float * x_dmf = (float *) x_dm; @@ -722,7 +722,7 @@ template static __device__ __forceinlin } const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d); } } @@ -730,7 +730,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const int index_bx = i * (WARP_SIZE_GGUF/QI5_0) + i/QI5_0 + k/QI5_0; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -738,12 +738,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_0) % WARP_SIZE_GGUF]; } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -767,8 +767,8 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_1) + mmq_y/QI5_1]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -801,7 +801,7 @@ template static __device__ __forceinlin qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0; int qs1 = (ql >> 4) & 0x0F0F0F0F; qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 @@ -809,10 +809,10 @@ template static __device__ __forceinlin qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1; } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_1; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -825,7 +825,7 @@ template static __device__ __forceinlin const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; } } @@ -833,18 +833,18 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + const int index_bx = i * (WARP_SIZE_GGUF/QI5_1) + + i/QI5_1 + k/QI5_1; int u[2*VDR_Q5_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_1) % WARP_SIZE_GGUF]; } return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -865,8 +865,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI8_0) + mmq_y/QI8_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; @@ -889,10 +889,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_int8(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI8_0; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -903,7 +903,7 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d); } } @@ -914,8 +914,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const float * y_df = (const float *) y_ds; return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); + (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[j * WARP_SIZE_GGUF + k], x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE_GGUF/QI8_1) + k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q2_K_q8_1( @@ -942,9 +942,9 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -967,10 +967,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI2_K; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -981,18 +981,18 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i / QI2_K + kbxd] = bxi->dm; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4); if (need_check) { i = min(i, i_max); } - const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI2_K/4); + x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); } } @@ -1005,7 +1005,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); #pragma unroll @@ -1013,10 +1013,10 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; } - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; - return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); + const int index_y = j * WARP_SIZE_GGUF + (QR2_K*k) % WARP_SIZE_GGUF; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q3_K_q8_1( @@ -1047,10 +1047,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; - __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE_GGUF/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1073,10 +1073,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI3_K; const int kbxd = k % blocks_per_tile_x_row; float * x_dmf = (float *) x_dm; @@ -1087,27 +1087,27 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d); } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + int i = i0 + i_offset * 2 + k / (WARP_SIZE_GGUF/2); if (need_check) { i = min(i, i_max); } - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/2)) / (QI3_K/2); // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + x_qh[i * (WARP_SIZE_GGUF/2) + i / 2 + k % (WARP_SIZE_GGUF/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4); if (need_check) { i = min(i, i_max); } - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI3_K/4); const int ksc = k % (QI3_K/4); @@ -1121,7 +1121,7 @@ template static __device__ __forceinlin const int sc = __vsubss4(sc_low | sc_high, 0x20202020); - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = sc; } } @@ -1134,24 +1134,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4)) + ky/4; int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); const int shift = 2 * ((ky % 32) / 8); const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vh = x_qh[i * (WARP_SIZE_GGUF/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); const int vlh = (vh << 2) & 0x04040404; v[l] = __vsubss4(vll, vlh); } - const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; - return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); + const int index_y = j * WARP_SIZE_GGUF + (k*QR3_K) % WARP_SIZE_GGUF; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q4_K_q8_1( @@ -1200,9 +1200,9 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1225,10 +1225,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_K; // == 1 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll @@ -1238,27 +1238,27 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i / QI4_K + kbxd] = bxi->dm; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI4_K/8); const int * scales = (const int *) bxi->scales; - const int ksc = k % (WARP_SIZE/8); + const int ksc = k % (WARP_SIZE_GGUF/8); // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8; } } @@ -1267,11 +1267,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { (void)x_qh; - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2*((k % 16) / 8); - const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + const int index_y = j * WARP_SIZE_GGUF + (QR4_K*k) % WARP_SIZE_GGUF; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( @@ -1321,9 +1321,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1360,11 +1360,11 @@ template static __device__ __forceinlin const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); - x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; - x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = ql1 | qh1; } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_K; // == 1 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll @@ -1376,40 +1376,40 @@ template static __device__ __forceinlin } const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i / QI5_K + kbxd] = bxi->dm; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI5_K/8); const int * scales = (const int *) bxi->scales; - const int ksc = k % (WARP_SIZE/8); + const int ksc = k % (WARP_SIZE_GGUF/8); // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8; } } static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + const int index_x = i * (QR5_K*WARP_SIZE_GGUF + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE_GGUF + (QR5_K*k) % WARP_SIZE_GGUF; return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( @@ -1439,9 +1439,9 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1478,11 +1478,11 @@ template static __device__ __forceinlin const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); - x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); - x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); } - const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI6_K; // == 1 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 float * x_dmf = (float *) x_dm; @@ -1496,20 +1496,20 @@ template static __device__ __forceinlin const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d); } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / 4; - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + k % (WARP_SIZE_GGUF/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); } } @@ -1519,11 +1519,11 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/8]); - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; - return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); + const int index_x = i * (QR6_K*WARP_SIZE_GGUF + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE_GGUF + (QR6_K*k) % WARP_SIZE_GGUF; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( @@ -1582,7 +1582,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq2_s * bq2 = (const block_iq2_s *) vbq; const int ib32 = iqs; @@ -1619,7 +1619,7 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; const int ib32 = iqs; @@ -1646,7 +1646,7 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq3_s * bq2 = (const block_iq3_s *) vbq; const int ib32 = iqs; @@ -1671,7 +1671,7 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int qs_packed = get_int_b2(bq1->qs, iqs); @@ -1703,7 +1703,7 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq1_m * bq1 = (const block_iq1_m *) vbq; @@ -1763,7 +1763,7 @@ static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4 static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq4_nl * bq = (const block_iq4_nl *) vbq; @@ -1788,7 +1788,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 3dccdf61abf3b..4e64b9c92773a 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -258,6 +258,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, " "SymInt size_n, int num_bits) -> Tensor"); // conditionally compiled so impl registrations are in source file +#endif // Dequantization for GGML. ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor"); @@ -274,6 +275,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor"); ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8); +#ifndef USE_ROCM // fp8_marlin Optimized Quantized GEMM for FP8 weight-only. ops.def( "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 41892e4dddf7e..c192c9a7b0e4d 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -344,31 +344,6 @@ def _gptq_marlin_gemm_fake(a: torch.Tensor, is_zp_float: bool = False) -> torch.Tensor: return torch.empty((size_m, size_n), device=a.device, dtype=a.dtype) - @register_fake("_C::ggml_dequantize") - def _ggml_dequantize_fake(W: torch.Tensor, quant_type: int, - m: torch.SymInt, - n: torch.SymInt) -> torch.Tensor: - return torch.empty((m, n), dtype=torch.float16, device=W.device) - - @register_fake("_C::ggml_mul_mat_vec_a8") - def _ggml_mul_mat_vec_a8_fake( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: torch.SymInt, - ) -> torch.Tensor: - return torch.empty((1, row), dtype=torch.float16, device=W.device) - - @register_fake("_C::ggml_mul_mat_a8") - def _ggml_mul_mat_a8_fake( - W: torch.Tensor, - X: torch.Tensor, - quant_type: int, - row: torch.SymInt, - ) -> torch.Tensor: - batch = X.size(0) - return torch.empty((batch, row), dtype=torch.float16, device=W.device) - @register_fake("_C::marlin_qqq_gemm") def _marlin_qqq_gemm_fake(a: torch.Tensor, b_q_weight: torch.Tensor, s_tok: torch.Tensor, s_ch: torch.Tensor, @@ -468,6 +443,34 @@ def machete_prepack_B_fake( memory_format=torch.contiguous_format) +if hasattr(torch.ops._C, "ggml_dequantize"): + + @register_fake("_C::ggml_dequantize") + def _ggml_dequantize_fake(W: torch.Tensor, quant_type: int, + m: torch.SymInt, + n: torch.SymInt) -> torch.Tensor: + return torch.empty((m, n), dtype=torch.float16, device=W.device) + + @register_fake("_C::ggml_mul_mat_vec_a8") + def _ggml_mul_mat_vec_a8_fake( + W: torch.Tensor, + X: torch.Tensor, + quant_type: int, + row: torch.SymInt, + ) -> torch.Tensor: + return torch.empty((1, row), dtype=torch.float16, device=W.device) + + @register_fake("_C::ggml_mul_mat_a8") + def _ggml_mul_mat_a8_fake( + W: torch.Tensor, + X: torch.Tensor, + quant_type: int, + row: torch.SymInt, + ) -> torch.Tensor: + batch = X.size(0) + return torch.empty((batch, row), dtype=torch.float16, device=W.device) + + # cutlass def cutlass_scaled_mm_supports_fp8(cuda_device_capability: int) -> bool: return torch.ops._C.cutlass_scaled_mm_supports_fp8(cuda_device_capability) diff --git a/vllm/config.py b/vllm/config.py index 42a44f5415e9f..f163665e2c063 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -387,7 +387,7 @@ def _verify_quantization(self) -> None: supported_quantization = QUANTIZATION_METHODS rocm_supported_quantization = [ "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8" + "fbgemm_fp8", "gguf" ] optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", From 4634a89d18569ef0ee2d7dd2d535377a1f460188 Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Fri, 22 Nov 2024 21:15:55 -0800 Subject: [PATCH 124/255] Prefix Cache Aware Scheduling [1/n] (#10128) Signed-off-by: rickyx --- tests/core/block/test_prefix_caching_block.py | 181 +++++++++- tests/core/test_scheduler.py | 179 +++++++++- tests/core/utils.py | 51 ++- tests/prefix_caching/test_prefix_caching.py | 106 +++++- vllm/core/block/cpu_gpu_block_allocator.py | 15 +- vllm/core/block/interfaces.py | 36 +- vllm/core/block/naive_block.py | 11 +- vllm/core/block/prefix_caching_block.py | 258 ++++++++----- vllm/core/block_manager.py | 23 +- vllm/core/interfaces.py | 4 + vllm/core/placeholder_block_space_manager.py | 3 + vllm/core/scheduler.py | 338 +++++++++++++----- vllm/sequence.py | 3 + 13 files changed, 967 insertions(+), 241 deletions(-) diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index d325b9606843e..bbeb4b3a58f2a 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -5,9 +5,14 @@ import pytest +from tests.core.utils import create_dummy_sequence +from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.prefix_caching_block import (PrefixCachingBlock, +from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, + PrefixCachingBlock, PrefixCachingBlockAllocator) +from vllm.sequence import Logprob +from vllm.utils import Device class TestPrefixCachingBlock: @@ -726,18 +731,71 @@ def test_touch_block(): token_ids=common_token_ids, allocator=allocator, ) - block_ids = [block.block_id for block in blocks] + block_hashes = [block.content_hash for block in blocks] # The allocated blocks should be marked as touched # but not computed. - computed_block_ids = allocator.get_computed_block_ids( - [], block_ids, skip_last_block_id=False) + computed_block_ids = allocator.find_cached_blocks_prefix( + block_hashes) assert len(computed_block_ids) == 0 allocator.mark_blocks_as_computed([]) - computed_block_ids = allocator.get_computed_block_ids( - [], block_ids, skip_last_block_id=False) + computed_block_ids = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes) assert len(computed_block_ids) == common_blocks + @staticmethod + def test_find_cached_blocks_prefix(): + """ + This test verifies the behavior of find_cached_blocks_prefix. + """ + block_size = 4 + num_blocks = 8 + total_test_blocks = 12 + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + + token_ids = list(range(total_test_blocks * block_size)) + block_tokens_seq1 = token_ids[:num_blocks * block_size] + blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=block_tokens_seq1, + allocator=allocator, + ) + block_hashes_seq1 = [block.content_hash for block in blocks_seq1] + allocator.mark_blocks_as_computed([]) + + # All blocks should be cached. + cached_blocks_seq1 = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq1) + assert len(cached_blocks_seq1) == num_blocks + + # Free the first sequence. + for block in blocks_seq1: + allocator.free(block) + + # All blocks should be still be cached if not required to be allocated. + cached_blocks = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq1) + assert len(cached_blocks) == num_blocks + + block_tokens_seq2 = token_ids[num_blocks * block_size:] + blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=block_tokens_seq2, + allocator=allocator, + ) + block_hashes_seq2 = [block.content_hash for block in blocks_seq2] + allocator.mark_blocks_as_computed([]) + cached_blocks = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq2) + assert len(cached_blocks) == len(blocks_seq2) + + # Half of the blocks from seq1 should still be cached. + num_evicted_blocks = len(blocks_seq2) + cached_blocks = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq1) + assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks + @staticmethod def create_immutable_chain( block_size: int, @@ -762,3 +820,114 @@ def create_immutable_chain( blocks.append(prev_block) return blocks + + +class TestComputedBlocksTracker: + + @staticmethod + def _get_mock_allocator(): + return MagicMock(spec=PrefixCachingBlockAllocator) + + @staticmethod + def test_get_num_cached_tokens(): + """ + Test it correctly computes the number of cached tokens for a given + sequence: + + - The cache token count is derived from the number of cached blocks. + - The cache token count is updated when the allocator is updated. + - When a sequence is removed, the cache token count should be updated + accordingly. + + # TODO(rickyx): This behaviour for prefill sequence is a hack until + we fix the computed blocks tracking. + - The cache token count for prefill sequence doesn't change while + the sequence is in continuous prefill (chunked prefill). + """ + block_size = 4 + mock_allocator = TestComputedBlocksTracker._get_mock_allocator() + tracker = ComputedBlocksTracker( + allocator=mock_allocator, + block_size=block_size, + enable_caching=True, + ) + + # Not yet allocated. + tokens = [0, 1, 2, 3, 4, 5] + seq1 = create_dummy_sequence(request_id=0, + token_ids=tokens, + block_size=block_size) + mock_allocator.find_cached_blocks_prefix.return_value = [] + assert tracker.get_num_cached_tokens(seq1) == 0 + + mock_allocator.find_cached_blocks_prefix.return_value = [ + None + ] # 1 block cached. + # Result is cached for prefill sequence. + assert tracker.get_num_cached_tokens(seq1) == 0 + + # Mark the sequence as non-prefill. + seq1.data.update_num_computed_tokens(len(tokens)) # 6 tokens computed. + assert not seq1.is_prefill() + + # Recomputes for decoding sequence. + assert tracker.get_num_cached_tokens(seq1) == 4 + + # Append new tokens to the sequence. + num_new_tokens = 3 + for i in range(num_new_tokens): + seq1.append_token_id(i, {i: Logprob(logprob=0.0)}) + + assert tracker.get_num_cached_tokens(seq1) == 4 + + # Update the allocator. + mock_allocator.find_cached_blocks_prefix.return_value = [ + None + ] * 2 # 2 blocks cached. + assert tracker.get_num_cached_tokens(seq1) == 8 + + # Remove the sequence. + tracker.remove_seq(seq1.seq_id) + + # Re-create the sequence with the same request id to simulate recompute. + seq1 = create_dummy_sequence(request_id=0, + token_ids=tokens, + block_size=block_size) + mock_allocator.find_cached_blocks_prefix.return_value = [ + ] # no cached block + assert tracker.get_num_cached_tokens(seq1) == 0 + + @staticmethod + def test_correct_block_hash(): + """ + Test that the block hash is correctly computed for a sequence (should + match the underlying block allocator's block hash). So the number of + cached tokens is correctly retrieved. + """ + block_size = 4 + allocator = CpuGpuBlockAllocator.create( + allocator_type="prefix_caching", + num_gpu_blocks=16, + num_cpu_blocks=16, + block_size=block_size, + ) + gpu_allocator = allocator._allocators[Device.GPU] + + tracker = ComputedBlocksTracker( + allocator=allocator, + block_size=block_size, + enable_caching=True, + ) + + tokens = list(range(block_size * 4)) # 4 blocks. + seq = create_dummy_sequence(request_id=0, + token_ids=tokens, + block_size=block_size) + _ = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=tokens, + allocator=gpu_allocator, + ) + allocator.mark_blocks_as_computed([]) + + assert tracker.get_num_cached_tokens(seq) == len(tokens) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 5ff32be611592..8f6de84e566e7 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -12,9 +12,9 @@ from vllm.lora.request import LoRARequest from vllm.sequence import SequenceGroup -from .utils import (append_new_token, append_new_token_seq_group, - create_dummy_prompt, get_sequence_groups, - schedule_and_update_computed_tokens) +from .utils import (append_new_token, append_new_token_seq, + append_new_token_seq_group, create_dummy_prompt, + get_sequence_groups, schedule_and_update_computed_tokens) def test_scheduler_add_seq_group(): @@ -305,6 +305,8 @@ def initialize_scheduler( block_size=4, num_cpu_blocks=8, num_gpu_blocks=8, + enable_prefix_caching=False, + enable_chunked_prefill=False, ): block_size = block_size scheduler_config = SchedulerConfig( @@ -312,8 +314,15 @@ def initialize_scheduler( max_num_batched_tokens=max_token_budget, max_num_seqs=max_num_seqs, max_model_len=max_model_len, + enable_chunked_prefill=enable_chunked_prefill, + ) + cache_config = CacheConfig( + block_size, + 1.0, + 1, + "auto", + enable_prefix_caching=enable_prefix_caching, ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = num_cpu_blocks cache_config.num_gpu_blocks = num_gpu_blocks scheduler = Scheduler(scheduler_config, cache_config, lora_config) @@ -800,3 +809,165 @@ def test_scheduling_budget(): assert budget.num_curr_seqs == 0 budget.subtract_num_seqs(seq_group.request_id, 2) assert budget.num_curr_seqs == 0 + + +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) +def test_prefix_caching_aware_prefills(enable_prefix_caching): + """ + Test the below scenario: + + For 3 sequences, seqA, seqB, seqC, share the first block as prefix. + + The test verifies the below scenarios: + 1. SeqA is first scheduled. + 2. SeqB and SeqC can be prefilled together in a single schedule round + even though there are not enough token budgets to prefill both without + considering prefix caching. + """ + + block_size = 4 + max_num_batched_tokens = 12 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_token_budget=max_num_batched_tokens, + max_num_seqs=max_seq_group, + max_model_len=max_num_batched_tokens, + enable_prefix_caching=enable_prefix_caching, + ) + + seqA_tokens = list(range(8)) + num_shared_tokens = 4 + seqB_tokens = seqA_tokens[:num_shared_tokens] + list(range( + 12, 16)) # Shared prefix first 4. + seqC_tokens = seqA_tokens[:num_shared_tokens] + list(range( + 16, 20)) # Shared prefix first 4. + + seqA, seqA_group = create_dummy_prompt("0", + prompt_tokens=seqA_tokens, + block_size=block_size) + seqB, seqB_group = create_dummy_prompt("1", + prompt_tokens=seqB_tokens, + block_size=block_size) + seqC, seqC_group = create_dummy_prompt("2", + prompt_tokens=seqC_tokens, + block_size=block_size) + + # Schedule seqA prefill. + scheduler.add_seq_group(seqA_group) + metas, out, _ = scheduler.schedule() + assert (len(out.scheduled_seq_groups) == 1 + and out.scheduled_seq_groups[0].seq_group == seqA_group) + assert out.scheduled_seq_groups[0].token_chunk_size == len(seqA_tokens) + + # Schedule seqA decode. + append_new_token_seq_group(len(seqA_tokens), seqA_group, 999) + metas, out, _ = scheduler.schedule() + + assert len(out.scheduled_seq_groups) == 1 + assert out.scheduled_seq_groups[0].seq_group == seqA_group + assert out.scheduled_seq_groups[0].token_chunk_size == 1 + + # Schedule seqB and seqC prefills should work with prefix caching. + scheduler.add_seq_group(seqB_group) + scheduler.add_seq_group(seqC_group) + metas, out, _ = scheduler.schedule() + + if enable_prefix_caching: + assert len(out.scheduled_seq_groups) == 2 + assert set([ + out.scheduled_seq_groups[0].seq_group, + out.scheduled_seq_groups[1].seq_group, + ]) == set([seqB_group, seqC_group]) + assert len(metas) == 2 + for meta in metas: + assert meta.token_chunk_size == 8 + assert (len(meta.computed_block_nums) == num_shared_tokens // + block_size) # 1 Block for the 8 tokens. + else: + assert len(out.scheduled_seq_groups) == 1 + assert len(metas) == 1 + assert metas[0].token_chunk_size == 8 + assert len(metas[0].computed_block_nums) == 0 # No blocks computed. + + +def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( +): + """ + This test verifies that we don't schedule new prefills if there's already + a continuous prefill in progress even though the new prefills with shared + prefix can fit in the token budget: + + - SeqA is being chunked prefill. + - SeqB with the same prompt shouldn't be scheduled for prefill even though + there's enough token budget to prefill the cached tokens. + - Neither should seqC be scheduled. + + - When seqA is in decoding phase, seqB and seqC can be scheduled. + - Entire seqB should be prefilled since it's a full prefix cache hit. + - SeqC would be partially prefilled with the prefix shared, and the + remaining unique tokens would be prefilled (rounded down to be + block-size aligned). + """ + + block_size = 2 + max_num_batched_tokens = 4 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_token_budget=max_num_batched_tokens, + max_num_seqs=max_seq_group, + max_model_len=100, + enable_prefix_caching=True, + enable_chunked_prefill=True, + ) + + seqA_tokens = list(range(8)) + seqB_tokens = seqA_tokens + seqC_shared_prefix_len = 4 + seqC_tokens = seqA_tokens[:seqC_shared_prefix_len] + list(range(12, 20)) + + seqA, seqA_group = create_dummy_prompt("0", + prompt_tokens=seqA_tokens, + block_size=block_size) + seqB, seqB_group = create_dummy_prompt("1", + prompt_tokens=seqB_tokens, + block_size=block_size) + + # Chunked prefill seqA. + scheduler.add_seq_group(seqA_group) + metas, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) == 1 + assert out.scheduled_seq_groups[0].seq_group == seqA_group + assert out.scheduled_seq_groups[0].token_chunk_size == 4 + + # seqB should not be scheduled with ongoing prefills. + scheduler.add_seq_group(seqB_group) + metas, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) == 1 + assert out.scheduled_seq_groups[0].seq_group == seqA_group + assert out.scheduled_seq_groups[0].token_chunk_size == 4 + + # both seqB and seqC can now be scheduled with seqA is over. + # seqA is in decoding phase. + append_new_token_seq(seqA, 999) + seqC, seqC_group = create_dummy_prompt("2", + prompt_tokens=seqC_tokens, + block_size=block_size) + scheduler.add_seq_group(seqC_group) + metas, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) == 3 + + metas = {meta.request_id: meta for meta in metas} + assert metas[seqA_group.request_id].token_chunk_size == 1 # Decode + assert (metas[seqB_group.request_id].token_chunk_size == 8 + ) # Fully cached prefill + assert ( + metas[seqC_group.request_id].token_chunk_size == 6 + ), "A partial prefix of C (4 tokens) should be prefilled, with the " + "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " + "then be rounded down to 2 tokens on block size, thus 6 tokens in total." diff --git a/tests/core/utils.py b/tests/core/utils.py index cd0caa4704e11..277368b57b938 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -1,17 +1,20 @@ import time -from typing import List, Optional +from collections import defaultdict +from typing import Any, Dict, List, Optional from typing import Sequence as GenericSequence from typing import Tuple from vllm import SamplingParams +from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.inputs import EncoderDecoderInputs, token_inputs from vllm.lora.request import LoRARequest -from vllm.sequence import Logprob, Sequence, SequenceGroup +from vllm.sequence import (Logprob, Sequence, SequenceGroup, + SequenceGroupMetadata) def create_dummy_prompt( request_id: str, - prompt_length: int, + prompt_length: int = -1, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, best_of: int = 1, @@ -26,6 +29,7 @@ def create_dummy_prompt( # Create dummy prompt sequence with tokens 0...block_size-1 # and prompt "0 ... block_size". prompt_tokens = list(range(prompt_length)) + prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), inputs=token_inputs(prompt_tokens, prompt=prompt_str), @@ -42,6 +46,15 @@ def create_dummy_prompt( return prompt, seq_group +def create_dummy_sequence(request_id: int, token_ids: List[int], + block_size: int) -> Sequence: + return Sequence( + seq_id=request_id, + inputs=token_inputs(token_ids), + block_size=block_size, + ) + + def create_dummy_prompt_encoder_decoder( request_id: str, decoder_prompt_length: int, @@ -194,12 +207,40 @@ def append_new_token(out, token_id: int): def schedule_and_update_computed_tokens(scheduler): metas, out, _ = scheduler.schedule() - for s, meta in zip(out.scheduled_seq_groups, metas): - s.seq_group.update_num_computed_tokens(meta.token_chunk_size) + for s in out.scheduled_seq_groups: + s.seq_group.update_num_computed_tokens(s.token_chunk_size) return metas, out +def append_new_token_seq(seq: Sequence, token_id: int): + seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int): seq_group.update_num_computed_tokens(token_chunk_size) for seq in seq_group.get_seqs(): seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + +class SchedulerProxy: + """ + A proxy class to forward calls to the scheduler. + """ + + def __init__(self, scheduler: Scheduler): + self.scheduler_ = scheduler + self.call_history: Dict[str, List[Any]] = defaultdict(list) + + def __getattr__(self, name: str) -> Any: + + def wrapper(*args, **kwargs): + result = getattr(self.scheduler_, name)(*args, **kwargs) + self.call_history[name].append((args, kwargs, result)) + return result + + return wrapper + + def last_schedule_ret( + self, ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, Any]: + _, _, ret = self.call_history["schedule"][-1] + return ret diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 50723dbb610ac..8d16710f14585 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -2,10 +2,15 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. """ + import pytest +from tests.conftest import VllmRunner +from tests.core.utils import SchedulerProxy, create_dummy_prompt from tests.kernels.utils import override_backend_env_variable from vllm import SamplingParams, TokensPrompt +from vllm.core.scheduler import Scheduler +from vllm.engine.llm_engine import LLMEngine from ..models.utils import check_outputs_equal @@ -27,6 +32,7 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("cached_position", [0, 1]) +@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) @pytest.mark.parametrize("block_size", [16]) def test_mixed_requests( hf_runner, @@ -37,6 +43,7 @@ def test_mixed_requests( dtype: str, max_tokens: int, cached_position: int, + enable_chunked_prefill: bool, block_size: int, monkeypatch, ) -> None: @@ -55,6 +62,7 @@ def test_mixed_requests( model, dtype=dtype, enable_prefix_caching=True, + enable_chunked_prefill=enable_chunked_prefill, block_size=block_size, ) as vllm_model: # Run the first prompt so the cache is populated @@ -72,13 +80,13 @@ def test_mixed_requests( block_size) * block_size else: expected_num_cached_tokens = 0 - assert req_outputs[ - i].num_cached_tokens == expected_num_cached_tokens + assert ( + req_outputs[i].num_cached_tokens == expected_num_cached_tokens) - vllm_outputs = [ - (output.prompt_token_ids + list(output.outputs[0].token_ids), - output.prompt + output.outputs[0].text) for output in req_outputs - ] + vllm_outputs = [( + output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text, + ) for output in req_outputs] check_outputs_equal( outputs_0_lst=hf_outputs, @@ -105,3 +113,89 @@ def test_unstable_prompt_sequence( for prompt in UNSTABLE_PROMPT_SEQUENCE: vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), SamplingParams(max_tokens=1)) + + +@pytest.mark.parametrize("model", MODELS) +def test_fully_cached_prefill_needs_uncached_token(model): + block_size = 16 + max_num_batched_tokens = 16 + num_output_tokens = 5 + # Make a vllm engine + runner = VllmRunner( + model_name=model, + gpu_memory_utilization=0.7, + enable_chunked_prefill=True, + enforce_eager=True, + enable_prefix_caching=True, + block_size=block_size, + max_num_batched_tokens=max_num_batched_tokens, + max_num_seqs=max_num_batched_tokens, + ) + engine: LLMEngine = runner.model.llm_engine + + scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore + engine.scheduler[0] = scheduler + + # SeqA + seqA_tokens = list(range(2 * block_size)) + seqA, seq_groupA = create_dummy_prompt( + request_id="0", + prompt_tokens=seqA_tokens, + max_tokens=num_output_tokens, + block_size=block_size, + ) + + scheduler.add_seq_group(seq_groupA) + + assert seqA.data.get_num_computed_tokens() == 0 + + # Prefill seqA + while not seqA.is_finished(): + engine.step() + + # seqB + seqB_tokens = [t + 1 for t in seqA_tokens] # shift by 1 + seqB, seq_groupB = create_dummy_prompt( + request_id="1", + prompt_tokens=seqB_tokens, + max_tokens=num_output_tokens, + block_size=block_size, + ) + + # seqC is the same as seqA + seqC, seq_groupC = create_dummy_prompt( + request_id="2", + prompt_tokens=seqA_tokens, + max_tokens=num_output_tokens, + block_size=block_size, + ) + + scheduler.add_seq_group(seq_groupB) + scheduler.add_seq_group(seq_groupC) + + # Even seqC is fully cached, it should not be prefilled since we + # require at least 1 uncached token. + engine.step() + + sched_metas, sched_out, _ = scheduler.last_schedule_ret() + assert len(sched_out.scheduled_seq_groups) == 1 + assert (sched_out.scheduled_seq_groups[0].seq_group.request_id == + seq_groupB.request_id) + assert (sched_out.scheduled_seq_groups[0].token_chunk_size == + max_num_batched_tokens) + + # When seqB is finished, seqC could be prefilled. + while not seqB.is_finished(): + engine.step() + sched_metas, sched_out, _ = scheduler.last_schedule_ret() + assert len(sched_out.scheduled_seq_groups) == 1 + assert (sched_out.scheduled_seq_groups[0].seq_group.request_id == + seq_groupB.request_id) + + engine.step() + sched_metas, sched_out, _ = scheduler.last_schedule_ret() + assert len(sched_out.scheduled_seq_groups) == 1 + assert (sched_out.scheduled_seq_groups[0].seq_group.request_id == + seq_groupC.request_id) + assert sched_out.scheduled_seq_groups[0].token_chunk_size == len( + seqA_tokens) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 9727f6e19b84e..3197af3c2b7a4 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -306,14 +306,6 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None: device = Device.GPU return self._allocators[device].mark_blocks_as_computed(block_ids) - def get_computed_block_ids(self, prev_computed_block_ids: List[int], - block_ids: List[int], - skip_last_block_id: bool) -> List[int]: - # Prefix caching only supported on GPU. - device = Device.GPU - return self._allocators[device].get_computed_block_ids( - prev_computed_block_ids, block_ids, skip_last_block_id) - def get_common_computed_block_ids( self, computed_seq_block_ids: List[List[int]]) -> List[int]: # Prefix caching only supported on GPU. @@ -342,6 +334,13 @@ def get_and_reset_swaps(self) -> List[Tuple[int, int]]: self._swap_mapping.clear() return list(mapping.items()) + def find_cached_blocks_prefix( + self, + block_hashes: List[int], + device: Device = Device.GPU, + ) -> List[int]: + return self._allocators[device].find_cached_blocks_prefix(block_hashes) + class NullBlock(Block): """ diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 72bbab1dcea5d..06f4851af3466 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -159,12 +159,6 @@ def mark_blocks_as_accessed(self, block_ids: List[int], def mark_blocks_as_computed(self, block_ids: List[int]) -> None: pass - @abstractmethod - def get_computed_block_ids(self, prev_computed_block_ids: List[int], - block_ids: List[int], - skip_last_block_id: bool) -> List[int]: - pass - @abstractmethod def get_common_computed_block_ids( self, computed_seq_block_ids: List[List[int]]) -> List[int]: @@ -192,6 +186,13 @@ def get_prefix_cache_hit_rate(self) -> float: class NoFreeBlocksError(ValueError): pass + @abstractmethod + def find_cached_blocks_prefix( + self, + block_hashes: List[int], + ) -> List[int]: + pass + class DeviceAwareBlockAllocator(ABC): @@ -207,9 +208,12 @@ def allocate_immutable_block(self, prev_block: Optional[Block], pass @abstractmethod - def allocate_immutable_blocks(self, prev_block: Optional[Block], - block_token_ids: List[List[int]], - device: Device) -> List[Block]: + def allocate_immutable_blocks( + self, + prev_block: Optional[Block], + block_token_ids: List[List[int]], + device: Device, + ) -> List[Block]: pass @abstractmethod @@ -246,12 +250,6 @@ def mark_blocks_as_accessed(self, block_ids: List[int], def mark_blocks_as_computed(self, block_ids: List[int]) -> None: pass - @abstractmethod - def get_computed_block_ids(self, prev_computed_block_ids: List[int], - block_ids: List[int], - skip_last_block_id: bool) -> List[int]: - pass - @abstractmethod def get_common_computed_block_ids( self, computed_seq_block_ids: List[List[int]]) -> List[int]: @@ -284,3 +282,11 @@ def allocate_or_get_null_block(self) -> Block: def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. -1 means not supported or disabled.""" pass + + @abstractmethod + def find_cached_blocks_prefix( + self, + block_hashes: List[int], + device: Device = Device.GPU, + ) -> List[int]: + pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 9341a518d11c6..a2af5ad6362c1 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -262,13 +262,6 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None: """ pass - def get_computed_block_ids(self, prev_computed_block_ids: List[int], - block_ids: List[int], - skip_last_block_id: bool) -> List[int]: - """No prefix caching here => return empty list - """ - return [] - def get_common_computed_block_ids( self, computed_seq_block_ids: List[List[int]]) -> List[int]: """Determine blocks that can be skipped in prefill. @@ -329,6 +322,10 @@ def swap_in(self, blocks: List[Block]) -> None: def get_prefix_cache_hit_rate(self) -> float: return -1 + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: + # Not applicable for naive block allocator. + return [] + class NaiveBlock(Block): """An implementation of the Block class that does not support prefix diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 57527e39b9bdd..b736167f6ceb4 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,13 +1,18 @@ """Token blocks.""" +import sys +from bisect import bisect_left from os.path import commonprefix -from typing import Dict, FrozenSet, Iterable, List, Optional, Set, Tuple +from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set, + Tuple) from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker, get_all_blocks_recursively) -from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device +from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device, + DeviceAwareBlockAllocator) from vllm.core.block.naive_block import (BlockPool, NaiveBlock, NaiveBlockAllocator) from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor +from vllm.sequence import Sequence PrefixHash = int @@ -534,26 +539,6 @@ def block_is_computed(self, block_id: int) -> bool: else: return block_id in self.evictor - def get_computed_block_ids(self, - prev_computed_block_ids: List[int], - block_ids: List[int], - skip_last_block_id: bool = True) -> List[int]: - prev_prefix_size = len(prev_computed_block_ids) - cur_size = len(block_ids) - if skip_last_block_id: - cur_size -= 1 - - # Sanity checks - assert cur_size >= 0 - assert prev_prefix_size <= cur_size - - ret = prev_computed_block_ids - for i in range(prev_prefix_size, cur_size): - block_id = block_ids[i] - if self.block_is_computed(block_id): - ret.append(block_id) - return ret - def get_common_computed_block_ids( self, computed_seq_block_ids: List[List[int]]) -> List[int]: """Return the block ids that are common for a given sequence group. @@ -634,6 +619,47 @@ def swap_in(self, blocks: List[Block]) -> None: block.block_id = block_id # Assign block_id + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: + """ + Given a list of block hashes, return the prefix of the block hashes that + are all cached. + + Since a block's block hash includes the hashes of all previous blocks, + and we only allocate/deallocate blocks in the entire sequence, so if a + block is cached, then all previous blocks are also cached. With this + property, we can use binary search to find the prefix of cached blocks. + + Args: + block_hashes (List[int]): The list of block hashes. + + Returns: + List[int]: The prefix of the `block_hashes` that are cached. + """ + + def _block_is_cached(block_hash: PrefixHash) -> bool: + if block_hash not in self._cached_blocks: + return False + + cached_block_id = self._cached_blocks[block_hash] + # We only consider the blocks that are marked as computed. + return self.block_is_computed(cached_block_id) + + def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int: + + # python <= 3.10 don't have the key argument + if sys.version_info < (3, 10): + a = [key(e) for e in a] + return bisect_left(a, x) + else: + return bisect_left(a, x, key=key) + + # Look for the first block that's not cached, and returns the prefix + # i.e. blocks that are cached. + idx = _bisect_left(block_hashes, + True, + key=lambda x: not _block_is_cached(x)) + return block_hashes[:idx] + class PrefixCachingBlock(Block): """A block implementation that supports prefix caching. @@ -843,86 +869,126 @@ def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int], class ComputedBlocksTracker: - """Handles caching of per-sequence computed block ids. - When a sequence appears for the first time, it traverses all of the - blocks and detects the prefix of blocks that is computed. On the - subsequent times, it only traverses the new blocks that were added - and updates the already recorded prefix of blocks with the newly - computed blocks. - - To avoid redundant traversals, the algorithm also detects when there - is a "gap" in the computed prefix. For example, if we have blocks = - [1,2,3,4,5], and we have detected [1,2,3] as the computed prefix, then - we won't try to add more computed blocks to [1,2,3] in this sequence - iteration, and will add more computed blocks only after the sequence is - freed and reused again. - - Note that currently, for a given sequence, we also skip the last - block id for caching purposes, to avoid caching of a full sequence """ + Tracks the computed blocks for each sequence. - def __init__(self, allocator): - self._allocator = allocator - self._cached_computed_seq_blocks: Dict[int, Tuple[List[int], - bool]] = {} + Internally, it maintains a map from sequence id to the list of block hashes + for the sequence. We cache the hashes of the full blocks for each sequence, + and make sure the hash is calculated in the same way as the allocator. + When a sequence is being decoded, we also update the sequence's hash + accordingly and incrementally. - def add_seq(self, seq_id: int) -> None: - """Start tracking seq_id - """ - assert seq_id not in self._cached_computed_seq_blocks - self._cached_computed_seq_blocks[seq_id] = ([], False) - - def remove_seq(self, seq_id: int) -> None: - """Stop tracking seq_id - """ - assert seq_id in self._cached_computed_seq_blocks - del self._cached_computed_seq_blocks[seq_id] - - def get_cached_computed_blocks_and_update( - self, seq_id: int, block_ids: List[int]) -> List[int]: - """ Look at the class documentation for details - """ - # Ensure seq_id is already tracked - assert seq_id in self._cached_computed_seq_blocks - - # Get cached data (may be empty on the first time) - prev_computed_block_ids, has_gap = self._cached_computed_seq_blocks[ - seq_id] - - if has_gap: - # When gap is detected, we do not add more computed blocks at this - # sequence iteration - return prev_computed_block_ids - - # We do not consider the last block id for caching purposes. - num_cur_blocks = len(block_ids) - 1 - assert num_cur_blocks >= 0 - - if len(prev_computed_block_ids) >= num_cur_blocks: - # Cache HIT - assert len(prev_computed_block_ids) == num_cur_blocks - return prev_computed_block_ids - - # If here, then we may possibly add more computed blocks. As a result, - # traverse the additional blocks after prev_computed_block_ids to - # detect more computed blocks and add them. - - # Incremental init for seq_id => Look only at the new blocks - computed_block_ids = self._allocator.get_computed_block_ids( # noqa: E501 - prev_computed_block_ids, - block_ids, - skip_last_block_id= - True, # We skip last block id to avoid caching of full seq - ) + From the sequence hash, with prefix caching enabled, we could also calculate + the number of cached tokens for the sequence by looking up the number of + cached block hashes in the allocator. + """ - # Detect if there is a "gap" - has_gap = len(computed_block_ids) < num_cur_blocks + def __init__( + self, + allocator: DeviceAwareBlockAllocator, + block_size: int, + enable_caching: bool, + ): + self._allocator = allocator + self._block_size = block_size + self._enable_caching = enable_caching + + # A map from seq_id to the list of block hashes for the + # sequence. This is so that we don't have to recompute the block hashes + # for the sequence when we need to check if the sequence is cached. + # Note a block that's not full will not have its hash calculated and + # recorded. + self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {} + + # A map from seq_id to the number of tokens that are cached for the + # sequence. + # We need this so that a sequence in continuous prefill doesn't + # accidentally see its cached token count change. See comments in + # `get_num_cached_tokens` for more details. + self._seq_id_to_num_tokens_computed: Dict[int, int] = {} + + def _update_seq_hashes(self, seq: Sequence) -> None: + """Incrementally update the sequence's block hashes and record them.""" + assert self._enable_caching + + block_hashes_recorded = self._seq_id_to_blocks_hashes.get( + seq.seq_id, []) + cur_num_blocks_recorded = len(block_hashes_recorded) + token_ids = seq.get_token_ids() + assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, ( + f"The sequence has {len(token_ids)} tokens, but" + f" already recorded {cur_num_blocks_recorded} blocks. " + "This should not happen since we assume blocks are " + "only appended other than recomputation. When the sequence is " + "recomputed, we should have removed the info of the old blocks.") + # Update the computed block hashes for the sequence. Since only full + # blocks are considered as "computed", we take floor here. + num_computed_blocks = len(token_ids) // self._block_size + + # We need to know the hash of the previous block to compute the hash of + # the current block so that blocks could be uniquely identified across + # sequences of prefixes. + prev_block_hash = (None if cur_num_blocks_recorded == 0 else + block_hashes_recorded[-1]) + # Only update the computed block hashes for the new blocks + for i in range(cur_num_blocks_recorded, num_computed_blocks): + assert len(token_ids) >= (i + 1) * self._block_size + block_token_ids = token_ids[i * self._block_size:(i + 1) * + self._block_size] + # This has to be kept in sync with the allocator's hash + # calculation. + block_hash = PrefixCachingBlock.hash_block_tokens( + is_first_block=prev_block_hash is None, + prev_block_hash=prev_block_hash, + cur_block_token_ids=block_token_ids, + ) + block_hashes_recorded.append(block_hash) + prev_block_hash = block_hash + + self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded + + def get_num_cached_tokens(self, seq: Sequence) -> int: + if not self._enable_caching: + return 0 + + # We always try to update the sequence hashes on the fly. + # This is to ensure that we don't miss any cached tokens for the + # sequence during decode. + # This routine should only update hash for any new blocks too. + self._update_seq_hashes(seq) + + num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get( + seq.seq_id, None) + + # TODO(rickyx): This hack could be removed once we mark blocks as + # computed correctly with chunked prefills. + if num_computed_tokens_prev is not None and seq.is_prefill(): + # For a sequence that is still in prefill, we don't + # recompute the number of cached tokens. + # This also handles correctly chunked prefill since currently + # we mark blocks as computed even if the sequence is still partially + # prefilled. So a continuously prefilled sequence should not + # see its cached token count change while running. + return num_computed_tokens_prev + + block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id] + + # This is O(logN), where N is the number of blocks. + num_cached_blocks = len( + self._allocator.find_cached_blocks_prefix(block_hashes)) + num_cached_tokens = num_cached_blocks * self._block_size + self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens + return num_cached_tokens - # Record - self._cached_computed_seq_blocks[seq_id] = (computed_block_ids, - has_gap) + def remove_seq(self, seq_id: int) -> None: + """Stop tracking the sequence.""" + if not self._enable_caching: + return + assert seq_id in self._seq_id_to_blocks_hashes + del self._seq_id_to_blocks_hashes[seq_id] - return computed_block_ids + assert seq_id in self._seq_id_to_num_tokens_computed + del self._seq_id_to_num_tokens_computed[seq_id] class LastAccessBlocksTracker: diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 21f4c63b6572d..209487c6b4f9e 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -101,7 +101,7 @@ def __init__( self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} self._computed_blocks_tracker = ComputedBlocksTracker( - self.block_allocator) + self.block_allocator, self.block_size, self.enable_caching) self._last_access_blocks_tracker = LastAccessBlocksTracker( self.block_allocator) @@ -170,7 +170,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: self.block_tables[seq.seq_id] = block_table # Track seq - self._computed_blocks_tracker.add_seq(seq.seq_id) self._last_access_blocks_tracker.add_seq(seq.seq_id) # Assign the block table for each sequence. @@ -178,7 +177,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: self.block_tables[seq.seq_id] = block_table.fork() # Track seq - self._computed_blocks_tracker.add_seq(seq.seq_id) self._last_access_blocks_tracker.add_seq(seq.seq_id) # Allocate cross-attention block table for encoder sequence @@ -314,11 +312,13 @@ def get_common_computed_block_ids( """ computed_seq_block_ids = [] for seq in seqs: - computed_seq_block_ids.append( - self._computed_blocks_tracker. - get_cached_computed_blocks_and_update( - seq.seq_id, - self.block_tables[seq.seq_id].physical_block_ids)) + all_blocks = self.block_tables[seq.seq_id].physical_block_ids + num_cached_tokens = ( + self._computed_blocks_tracker.get_num_cached_tokens(seq)) + assert num_cached_tokens % self.block_size == 0 + num_cached_blocks = num_cached_tokens // self.block_size + computed_block_ids = all_blocks[:num_cached_blocks] + computed_seq_block_ids.append(computed_block_ids) # NOTE(sang): This assumes seq_block_ids doesn't contain any None. return self.block_allocator.get_common_computed_block_ids( @@ -332,7 +332,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.fork() # Track child seq - self._computed_blocks_tracker.add_seq(child_seq.seq_id) self._last_access_blocks_tracker.add_seq(child_seq.seq_id) def can_swap_in(self, seq_group: SequenceGroup, @@ -503,3 +502,9 @@ def _can_swap(self, return AllocStatus.OK else: return AllocStatus.LATER + + def get_num_cached_tokens(self, seq: Sequence) -> int: + """Get the number of tokens in blocks that are already computed and + cached in the block manager for the sequence. + """ + return self._computed_blocks_tracker.get_num_cached_tokens(seq) diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 9501a516bf020..b10b8d3f4a5bf 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -121,3 +121,7 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. -1 means not supported or disabled.""" pass + + @abstractmethod + def get_num_cached_tokens(self, seq: Sequence) -> int: + pass diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index a337392bbed53..26d42b7f1790e 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -89,3 +89,6 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, def get_prefix_cache_hit_rate(self, device: Device) -> float: return -1 + + def get_num_cached_tokens(self, seq: Sequence) -> int: + return 0 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index af4671ec29be9..841e65c488fc6 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -56,11 +56,16 @@ class SchedulingBudget: max_num_seqs: int _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) + # Number of cached tokens in the batch. + _num_cached_tokens: int = 0 + # Number of actual non-cached tokens in the batch. _num_batched_tokens: int = 0 _num_curr_seqs: int = 0 def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): - assert num_new_tokens != 0 + # We allow num_new_tokens to be 0 when the entire sequence has + # been cached. + assert num_new_tokens >= 0 assert num_new_seqs != 0 return (self.num_batched_tokens + num_new_tokens <= self.token_budget and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs) @@ -68,12 +73,18 @@ def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): def remaining_token_budget(self): return self.token_budget - self.num_batched_tokens - def add_num_batched_tokens(self, req_id: str, num_batched_tokens: int): + def add_num_batched_tokens(self, + req_id: str, + num_batched_tokens: int, + num_cached_tokens: int = 0): if req_id in self._request_ids_num_batched_tokens: return + assert num_cached_tokens >= 0 + assert num_batched_tokens >= 0 self._request_ids_num_batched_tokens.add(req_id) self._num_batched_tokens += num_batched_tokens + self._num_cached_tokens += num_cached_tokens def subtract_num_batched_tokens(self, req_id: str, num_batched_tokens: int): @@ -101,6 +112,10 @@ def num_batched_tokens(self): def num_curr_seqs(self): return self._num_curr_seqs + @property + def num_cached_tokens(self): + return self._num_cached_tokens + @dataclass class ScheduledSequenceGroup: @@ -541,9 +556,19 @@ def _schedule_running( assert len(self._async_stopped) == 0 while running_queue: seq_group = running_queue[0] - num_running_tokens = self._get_num_new_tokens( - seq_group, SequenceStatus.RUNNING, enable_chunking, budget) - + # We discard the cached tokens info here because we don't need it + # for running sequence: + # 1. If a sequence is running with chunked prefill, the cached + # tokens info was already used for the first prefill. + # 2. If a sequence is running with non-chunked prefill, then + # there it's a decoding sequence, and the cached tokens info is + # irrelevant. + num_uncached_new_tokens, _ = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.RUNNING, enable_chunking, + budget)) + + num_running_tokens = num_uncached_new_tokens if num_running_tokens == 0: # No budget => Stop break @@ -715,13 +740,15 @@ def _schedule_swapped( # The total number of sequences in the RUNNING state should not # exceed the maximum number of sequences. num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens = self._get_num_new_tokens(seq_group, - SequenceStatus.SWAPPED, - enable_chunking, budget) - - if (num_new_tokens == 0 - or not budget.can_schedule(num_new_tokens=num_new_tokens, - num_new_seqs=num_new_seqs)): + num_new_tokens_uncached, num_new_tokens_cached = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.SWAPPED, enable_chunking, + budget)) + + if num_new_tokens_uncached == 0 or not budget.can_schedule( + num_new_tokens=num_new_tokens_uncached, + num_new_seqs=num_new_seqs, + ): break if lora_int_id > 0 and curr_loras is not None: @@ -732,12 +759,19 @@ def _schedule_swapped( is_prefill = seq_group.is_prefill() if is_prefill: prefill_seq_groups.append( - ScheduledSequenceGroup(seq_group, - token_chunk_size=num_new_tokens)) + ScheduledSequenceGroup( + seq_group, + token_chunk_size=num_new_tokens_uncached + + num_new_tokens_cached, + )) else: decode_seq_groups.append( ScheduledSequenceGroup(seq_group, token_chunk_size=1)) - budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) + budget.add_num_batched_tokens( + seq_group.request_id, + num_batched_tokens=num_new_tokens_uncached, + num_cached_tokens=num_new_tokens_cached, + ) budget.add_num_seqs(seq_group.request_id, num_new_seqs) swapped_queue.extendleft(leftover_swapped) @@ -803,26 +837,30 @@ def _schedule_priority_preemption( if waiting_queue: seq_group = waiting_queue.popleft() num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens = self._get_num_new_tokens(seq_group, - SequenceStatus.WAITING, - False, budget) + num_new_tokens_uncached, _ = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.WAITING, False, budget)) #Only preempt if priority inversion exists while running_queue and self._get_priority( running_queue[-1]) > self._get_priority(seq_group): #Only preempt if waiting sequence cannot be allocated can_allocate = self.block_manager.can_allocate(seq_group) - if (num_new_tokens and can_allocate == AllocStatus.OK - and budget.can_schedule(num_new_tokens=num_new_tokens, - num_new_seqs=num_new_seqs)): + if (num_new_tokens_uncached > 0 + and can_allocate == AllocStatus.OK + and budget.can_schedule( + num_new_tokens=num_new_tokens_uncached, + num_new_seqs=num_new_seqs, + )): break #Adjust budget to remove the victim sequence group vseq_group = running_queue.pop() - num_running_tokens = self._get_num_new_tokens( - vseq_group, SequenceStatus.RUNNING, False, budget) - budget.subtract_num_batched_tokens(vseq_group.request_id, - num_running_tokens) + num_running_tokens_uncached, _ = ( + self._get_num_new_uncached_and_cached_tokens( + vseq_group, SequenceStatus.RUNNING, False, budget)) + budget.subtract_num_batched_tokens( + vseq_group.request_id, num_running_tokens_uncached) num_running_seqs = vseq_group.get_max_num_running_seqs() budget.subtract_num_seqs(vseq_group.request_id, num_running_seqs) @@ -882,9 +920,12 @@ def _schedule_prefills( assert len(waiting_seqs) == 1, ( "Waiting sequence group should have only one prompt " "sequence.") - num_new_tokens = self._get_num_new_tokens(seq_group, - SequenceStatus.WAITING, - enable_chunking, budget) + num_new_tokens_uncached, num_new_tokens_cached = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.WAITING, enable_chunking, + budget)) + num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached + if not enable_chunking: num_prompt_tokens = waiting_seqs[0].get_len() assert num_new_tokens == num_prompt_tokens @@ -935,10 +976,18 @@ def _schedule_prefills( waiting_queue.popleft() continue + if (budget.num_batched_tokens >= + self.scheduler_config.max_num_batched_tokens): + # We've reached the budget limit - since there might be + # continuous prefills in the running queue, we should break + # to avoid scheduling any new prefills. + break + num_new_seqs = seq_group.get_max_num_running_seqs() - if (num_new_tokens == 0 - or not budget.can_schedule(num_new_tokens=num_new_tokens, - num_new_seqs=num_new_seqs)): + if num_new_tokens_uncached == 0 or not budget.can_schedule( + num_new_tokens=num_new_tokens_uncached, + num_new_seqs=num_new_seqs, + ): break # Can schedule this request. @@ -967,7 +1016,11 @@ def _schedule_prefills( seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) - budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) + budget.add_num_batched_tokens( + seq_group.request_id, + num_batched_tokens=num_new_tokens_uncached, + num_cached_tokens=num_new_tokens_cached, + ) budget.add_num_seqs(seq_group.request_id, num_new_seqs) # Queue requests that couldn't be scheduled. @@ -1075,7 +1128,8 @@ def _schedule_default(self) -> SchedulerOutputs: return SchedulerOutputs( scheduled_seq_groups=scheduled_seq_groups, num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens, + num_batched_tokens=budget.num_batched_tokens + + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, blocks_to_swap_out=running_scheduled.blocks_to_swap_out, blocks_to_copy=blocks_to_copy, @@ -1119,7 +1173,6 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - # Schedule new prefills. prefills = self._schedule_prefills(budget, curr_loras, enable_chunking=True) @@ -1157,7 +1210,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: num_prefill_groups=(len(prefills.seq_groups) + len(swapped_in.prefill_seq_groups) + len(running_scheduled.prefill_seq_groups)), - num_batched_tokens=budget.num_batched_tokens, + num_batched_tokens=budget.num_batched_tokens + + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, blocks_to_swap_out=running_scheduled.blocks_to_swap_out, blocks_to_copy=running_scheduled.blocks_to_copy + @@ -1584,64 +1638,178 @@ def _get_num_lookahead_slots(self, is_prefill: bool, return self.scheduler_config.num_lookahead_slots - def _get_num_new_tokens(self, seq_group: SequenceGroup, - status: SequenceStatus, enable_chunking: bool, - budget: SchedulingBudget) -> int: - """Get the next new tokens to compute for a given sequence group - that's in a given `status`. + def _get_num_new_uncached_and_cached_tokens( + self, + seq_group: SequenceGroup, + status: SequenceStatus, + enable_chunking: bool, + budget: SchedulingBudget, + ) -> Tuple[int, int]: + """ + Returns the number of new uncached and cached tokens to schedule for a + given sequence group that's in a given `status`. The API could chunk the number of tokens to compute based on `budget` if `enable_chunking` is True. If a sequence group has multiple sequences (e.g., running beam search), it means it is in decoding phase, so chunking doesn't happen. - Returns 0 if the new token cannot be computed due to token budget. + Returns (0, 0) if the new token cannot be computed due to token budget. + + The cached tokens's blocks are already computed, and the attention + backend will reuse the cached blocks rather than recomputing them. So + the scheduler could schedule these cached tokens "for free". + + Args: + seq_group: The sequence group to get the number of new tokens to + schedule. + status: The status of the sequences to get the number of new tokens + to schedule. + enable_chunking: Whether to chunk the number of tokens to compute. + budget: The budget to chunk the number of tokens to compute. + + + Returns: + A tuple of two ints. The first int is the number of new uncached + tokens to schedule. The second int is the number of cached tokens. + If no more new tokens can be scheduled, returns (0, 0). """ - num_new_tokens = 0 + num_cached_new_tokens = 0 + num_uncached_new_tokens = 0 + seqs = seq_group.get_seqs(status=status) + # Compute the number of new uncached and cached tokens for + # each sequence. for seq in seqs: - num_new_tokens += seq.get_num_new_tokens() - assert num_new_tokens > 0 - # Chunk if a running request cannot fit in the given budget. - # If number of seq > 1, it means it is doing beam search - # in a decode phase. Do not chunk. + if not seq.is_prefill(): + # Decode sequences should always just have 1 uncached token + # TODO(rickyx): Actually is this still correct for multi-step? + num_uncached_new_tokens += 1 + continue + + num_computed_tokens_seq = seq.get_num_computed_tokens() + all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq + if not self.cache_config.enable_prefix_caching: + # If prefix caching is not enabled, all new tokens are uncached. + num_uncached_new_tokens += all_num_new_tokens_seq + continue + + # NOTE: the cache token might be currently in a block that's in an + # evictor meaning that it's not yet allocated. However, we don't + # exclude such tokens in the cache count because it will be + # guaranteed to be allocated later if the sequence can be allocated. + num_cached_tokens_seq = self.block_manager.get_num_cached_tokens( + seq) + + # Sanity check. + if num_cached_tokens_seq < num_computed_tokens_seq: + # This should only happen with chunked prefill, and + # the seq is still in prefill. The `num_cached_tokens_seq` + # is the value we calculated on scheduling the first prefill. + # For subsequent continuous prefill steps, we cached the + # number of cache tokens for the sequence so the cached token + # count could be less than the number of computed tokens. + # See comments on `ComputedBlocksTracker` for more details. + assert ( + seq.is_prefill() and seq.status == SequenceStatus.RUNNING + and self.scheduler_config.chunked_prefill_enabled + ), ("Number of cached tokens should not be less than the " + "number of computed tokens for a sequence that's still " + f"in prefill. But there are {num_cached_tokens_seq} cached " + f"tokens and {num_computed_tokens_seq} computed tokens " + f"for sequence {seq.seq_id}.") + + num_cached_new_tokens_seq = max( + 0, num_cached_tokens_seq - num_computed_tokens_seq) + num_uncached_new_tokens_seq = (all_num_new_tokens_seq - + num_cached_new_tokens_seq) + + num_uncached_new_tokens += num_uncached_new_tokens_seq + num_cached_new_tokens += num_cached_new_tokens_seq + + if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0: + # For a fully cached hit sequence, we actually need to recompute the + # last token. So we need at least 1 uncached token to schedule. + # See ModelRunner._compute_for_prefix_cache_hit for more details. + num_uncached_new_tokens = 1 + num_cached_new_tokens -= 1 + if enable_chunking and len(seqs) == 1: - remaining_token_budget = budget.remaining_token_budget() - if self.scheduler_config.is_multi_step: - # The current multi-step + chunked prefill capability does - # not actually support chunking prompts. - # - # Therefore, `num_new_tokens` is computed in the same fashion - # for both multi-step+chunked-prefill & - # multi-step+chunked-prefill+APC - # - # Prompts with more tokens than the current remaining budget - # are postponed to future scheduler steps - if num_new_tokens > self._get_prompt_limit(seq_group): - # If the seq_group is in prompt-stage, pass the - # num_new_tokens as-is so the caller can ignore - # the sequence. - pass - else: - num_new_tokens = 0 \ - if num_new_tokens > remaining_token_budget \ - else num_new_tokens - elif self.cache_config.enable_prefix_caching: - # When prefix caching is enabled, we always allocate - # the number of new tokens that is dividable by the block - # size to avoid partial block matching. - block_size = self.cache_config.block_size - remainder = budget.token_budget % block_size - if remainder != 0: - raise ValueError("When enabling chunked prefill and " - "prefix caching, max_num_batched_tokens " - "(chunk size) must be dividable by " - "block size, but got chunk_size " - f"({budget.token_budget}) % block_size " - f"({block_size}) = {remainder}") - if remaining_token_budget < num_new_tokens: - num_new_tokens = (remaining_token_budget // - block_size) * block_size - else: - num_new_tokens = min(num_new_tokens, remaining_token_budget) + # Chunk if a running request cannot fit in the given budget. + # If number of seq > 1, it means it is doing beam search + # in a decode phase. Do not chunk. + num_uncached_new_tokens = self._chunk_new_tokens_to_schedule( + self.scheduler_config, + self.cache_config, + budget, + self._get_prompt_limit(seq_group), + num_uncached_new_tokens, + ) + + return num_uncached_new_tokens, num_cached_new_tokens + + @staticmethod + def _chunk_new_tokens_to_schedule( + scheduler_config: SchedulerConfig, + cache_config: CacheConfig, + budget: SchedulingBudget, + prompt_limit: int, + num_new_tokens: int, + ) -> int: + """ + Chunks the number of new tokens to schedule based on the budget when + chunked prefill is enabled. + + Args: + scheduler_config: The scheduler config. + cache_config: The cache config. + budget: The budget to chunk the number of tokens to compute. + prompt_limit: The maximum number of tokens allowed in a prompt. + num_new_tokens: The number of new tokens to schedule. + + Returns: + The number of new tokens to schedule after chunking. + """ + remaining_token_budget = budget.remaining_token_budget() + if scheduler_config.is_multi_step: + # The current multi-step + chunked prefill capability does + # not actually support chunking prompts. + # + # Therefore, `num_new_tokens` is computed in the same fashion + # for both multi-step+chunked-prefill & + # multi-step+chunked-prefill+APC + # + # Prompts with more tokens than the current remaining budget + # are postponed to future scheduler steps + if num_new_tokens > prompt_limit: + # If the seq_group is in prompt-stage, pass the + # num_new_tokens as-is so the caller can ignore + # the sequence. + return num_new_tokens + + return (0 if num_new_tokens > remaining_token_budget else + num_new_tokens) + + if cache_config.enable_prefix_caching: + # Adjust the remaining token budget to be divisible by the block + # size when prefix caching is enabled. + + # When prefix caching is enabled, we always allocate + # the number of new tokens that is dividable by the block + # size to avoid partial block matching. + block_size = cache_config.block_size + remainder = budget.token_budget % block_size + if remainder != 0: + raise ValueError("When enabling chunked prefill and " + "prefix caching, max_num_batched_tokens " + "(chunk size) must be dividable by " + "block size, but got chunk_size " + f"({budget.token_budget}) % block_size " + f"({block_size}) = {remainder}") + # Round down to block size. + remaining_token_budget = (remaining_token_budget // block_size * + block_size) + + num_new_tokens = min(num_new_tokens, remaining_token_budget) + return num_new_tokens diff --git a/vllm/sequence.py b/vllm/sequence.py index 3b41d25a2fe42..a1cc8fc3b09de 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -579,6 +579,9 @@ def get_num_new_tokens(self) -> int: return 1 return self.data.get_num_uncomputed_tokens() + def get_num_computed_tokens(self) -> int: + return self.data.get_num_computed_tokens() + def is_prefill(self) -> bool: return self.data.stage == SequenceStage.PREFILL From c8acd80548c77bd5d6302353708dd16ea705f031 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 23 Nov 2024 13:25:09 +0800 Subject: [PATCH 125/255] [2/N] handling placeholders in merged multi-modal processor (#10485) Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 370 ++++++++++++++ tests/multimodal/test_utils.py | 3 +- vllm/multimodal/inputs.py | 9 +- vllm/multimodal/processing.py | 720 ++++++++++++++++++++++------ vllm/utils.py | 20 +- 5 files changed, 975 insertions(+), 147 deletions(-) create mode 100644 tests/multimodal/test_processing.py diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py new file mode 100644 index 0000000000000..b2367060c6c1b --- /dev/null +++ b/tests/multimodal/test_processing.py @@ -0,0 +1,370 @@ +from typing import cast + +import pytest +from transformers import BatchFeature + +from vllm.multimodal.processing import (PromptReplacement, find_text_matches, + find_token_matches, iter_token_matches, + iter_token_runs, replace_text_matches) +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import full_groupby + + +# yapf: disable +@pytest.mark.parametrize( + ("token_ids", "expected"), + [ + ([], []), + ( + [32000, 32000, 32000], + [{ "token_id": 32000, "start_idx": 0, "length": 3 }], + ), + ( + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + [ + { "token_id": 9833, "start_idx": 0, "length": 1 }, + { "token_id": 28747, "start_idx": 1, "length": 1 }, + { "token_id": 32000, "start_idx": 2, "length": 3 }, + { "token_id": 9833, "start_idx": 5, "length": 1 }, + { "token_id": 28747, "start_idx": 6, "length": 1 }, + { "token_id": 32000, "start_idx": 7, "length": 2 }, + { "token_id": 918, "start_idx": 9, "length": 1 }, + ], + ), + ], +) +# yapf: enable +def test_iter_token_runs(token_ids, expected): + result = list(iter_token_runs(token_ids)) + + # Only displayed on error + print("result:", result) + + # Manually constructed results + assert [item._asdict() for item in result] == expected + + # Invariants + assert sum(run_info.length for run_info in result) == len(token_ids) + + +# yapf: disable +@pytest.mark.parametrize( + ("token_ids", "match_ids", "expected"), + [ + ([], [], [{ "start_idx": 0, "end_idx": 0 }]), + ([], [32000], []), + ( + [32000, 32000, 32000], + [32000], + [ + { "start_idx": 0, "end_idx": 1 }, + { "start_idx": 1, "end_idx": 2 }, + { "start_idx": 2, "end_idx": 3 }, + ], + ), + ( + [32000, 32000, 32000], + [32000, 32000], + [{ "start_idx": 0, "end_idx": 2 }], + ), + ( + [32000, 32000, 32000], + [32000, 32000, 32000], + [{ "start_idx": 0, "end_idx": 3 }], + ), + ( + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + [28747, 32000], + [ + { "start_idx": 1, "end_idx": 3 }, + { "start_idx": 6, "end_idx": 8 }, + ], + ), + ( + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + [28747, 32000, 32000, 32000], + [ + { "start_idx": 1, "end_idx": 5 }, + ], + ), + ( + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + [28747, 0, 32000], + [], + ), + ], +) +# yapf: enable +def test_iter_token_matches(token_ids, match_ids, expected): + result = list(iter_token_matches(token_ids, match_ids)) + + # Manually constructed results + assert [item._asdict() for item in result] == expected + + # Invariants + match_lens = [end - start for start, end in result] + print("match_lens:", match_lens) # Only displayed on error + assert all(match_len == len(match_ids) for match_len in match_lens) + + +# yapf: disable +@pytest.mark.parametrize( + ("prompt", "target_by_key", "expected_by_key"), + [ + ( + [], + { + "pattern_1": [], + "pattern_2": [32000], + }, + { + "pattern_1": [{ "start_idx": 0, "end_idx": 0 }], + "pattern_2": [], + } + ), + ( + [32000, 32000, 32000, 32000], + { + "pattern_1": [32000], + "pattern_2": [32000, 32000], + "pattern_3": [32000, 32000, 32000], + }, + { + "pattern_1": [ + { "start_idx": 0, "end_idx": 1 }, + { "start_idx": 1, "end_idx": 2 }, + { "start_idx": 2, "end_idx": 3 }, + { "start_idx": 3, "end_idx": 4 }, + ], + "pattern_2": [ + { "start_idx": 0, "end_idx": 2 }, + { "start_idx": 2, "end_idx": 4 }, + ], + "pattern_3": [ + { "start_idx": 0, "end_idx": 3 }, + ], + }, + ), + ( + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + { + "pattern_1": [28747, 32000], + "pattern_2": [28747, 32000, 32000, 32000], + "pattern_3": [28747, 0, 32000], + }, + { + "pattern_1": [ + { "start_idx": 1, "end_idx": 3 }, + { "start_idx": 6, "end_idx": 8 }, + ], + "pattern_2": [ + { "start_idx": 1, "end_idx": 5 }, + ], + "pattern_3": [], + }, + ), + ], +) +# yapf: enable +def test_find_token_matches(prompt, target_by_key, expected_by_key): + # Should not be used since there is nothing to convert to token IDs + mock_tokenizer = cast(AnyTokenizer, object()) + + result = find_token_matches( + prompt, + [ + PromptReplacement(target, [], 0).bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ], + ) + + # Only displayed on error + print("result:", result) + + # Manually constructed results + result_groups = dict(full_groupby(result, key=lambda x: x.modality)) + assert { + key: [ + dict(start_idx=item.start_idx, end_idx=item.end_idx) + for item in result_groups.get(key, []) + ] + for key in expected_by_key + } == expected_by_key + + +# yapf: disable +@pytest.mark.parametrize( + ("prompt", "target_by_key", "expected_by_key"), + [ + # Detokenized test cases of `test_find_token_matches` + # using the vocab of llava-hf/llava-v1.6-mistral-7b-hf + ( + "", + { + "pattern_1": "", + "pattern_2": "", + }, + { + "pattern_1": [{ "start_idx": 0, "end_idx": 0 }], + "pattern_2": [], + } + ), + ( + "", + { + "pattern_1": "", + "pattern_2": "", + "pattern_3": "", + }, + { + "pattern_1": [ + { "start_idx": 0, "end_idx": 7 }, + { "start_idx": 7, "end_idx": 14 }, + { "start_idx": 14, "end_idx": 21 }, + { "start_idx": 21, "end_idx": 28 }, + ], + "pattern_2": [ + { "start_idx": 0, "end_idx": 14 }, + { "start_idx": 14, "end_idx": 28 }, + ], + "pattern_3": [ + { "start_idx": 0, "end_idx": 21 }, + ], + }, + ), + ( + "Image:Image:!", + { + "pattern_1": "Image:", + "pattern_2": "Image:", + "pattern_3": "Image:", + }, + { + "pattern_1": [ + { "start_idx": 0, "end_idx": 13 }, + { "start_idx": 27, "end_idx": 40 }, + ], + "pattern_2": [ + { "start_idx": 0, "end_idx": 27 }, + ], + "pattern_3": [], + }, + ), + # Test regex escape + ( + "<|image|><|image|>", + { + "pattern_1": "<|image|>", + "pattern_2": "<|image|>", + "pattern_3": "<|image|><|image|>", + }, + { + "pattern_1": [ + { "start_idx": 0, "end_idx": 9 }, + { "start_idx": 16, "end_idx": 25 }, + ], + "pattern_2": [ + { "start_idx": 0, "end_idx": 16 }, + { "start_idx": 16, "end_idx": 32 }, + ], + "pattern_3": [ + { "start_idx": 0, "end_idx": 25 }, + ], + }, + ), + ], +) +# yapf: enable +def test_find_text_matches(prompt, target_by_key, expected_by_key): + # Should not be used since there is nothing to convert to text + mock_tokenizer = cast(AnyTokenizer, object()) + + result = find_text_matches( + prompt, + [ + PromptReplacement(target, [], 0).bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ], + ) + + # Only displayed on error + print("result:", result) + + # Manually constructed results + result_groups = dict(full_groupby(result, key=lambda x: x.modality)) + assert { + key: [ + dict(start_idx=item.start_idx, end_idx=item.end_idx) + for item in result_groups.get(key, []) + ] + for key in expected_by_key + } == expected_by_key + + +# yapf: disable +@pytest.mark.parametrize( + ("prompt", "target_by_key", "repl_by_key", "expected_by_mm_count"), + [ + ( + "Image:Image:!", + { + # We use `` before `Image:` to test matches that + # occur out of order + "pattern_1": "", + "pattern_2": "Image:", + "pattern_3": "!", + }, + { + # Test whether target is confused with repl_unit + "pattern_1": ("", 1), + # Test empty repl_unit + "pattern_2": ("", 1), + # Test multiple repl_count + "pattern_3": ("?", 2), + }, + { + # Test no replacement + 0: "Image:Image:!", + # Test single replacement + 1: "Image:??", + # Test repeated replacement + 2: "??", + }, + ), + ] +) +# yapf: enable +def test_find_replace_text( + prompt, + target_by_key, + repl_by_key, + expected_by_mm_count, +): + # Should not be used since there is nothing to convert to text + mock_tokenizer = cast(AnyTokenizer, object()) + + matches = find_text_matches( + prompt, + [ + PromptReplacement(target, *repl_by_key[key]) \ + .bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ], + ) + result_by_mm_count = { + mm_count: replace_text_matches( + prompt, + matches, + {key: list(range(mm_count)) + for key in repl_by_key}, + BatchFeature(), + ) + for mm_count in expected_by_mm_count + } + + # Only displayed on error + print("matches:", matches) + print("result_by_mm_count:", result_by_mm_count) + + # Manually constructed results + assert result_by_mm_count == expected_by_mm_count diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 9869c8123f001..fd82fb0c55fd7 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -139,7 +139,8 @@ def test_repeat_and_pad_placeholder_tokens(model): 2, "", [32000, 32000, 32000], - [{ "offset": 0, "length": 2 }]), + [{ "offset": 0, "length": 2 }], + ), ( "", [3, 2], diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 64a4c58d5509c..8e67a552afe12 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -203,14 +203,7 @@ class MultiModalInputsV2(TypedDict): """The type of inputs.""" prompt: str - """ - The original, unprocessed prompt text. - - Note: - Since prompt text is not required by vLLM internals, we leave this - unprocessed to save CPU computation. You can still call - :code:`tokenizer.decode(prompt_token_ids)` to get the processed text. - """ + """The processed prompt text.""" prompt_token_ids: List[int] """The processed token IDs which includes placeholder tokens.""" diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 88a924da174a6..28c8dda581982 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,34 +1,91 @@ +import re +from abc import ABC, abstractmethod +from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass -from functools import lru_cache, partial -from typing import (Any, Callable, Collection, Generic, List, Mapping, - Optional, TypedDict, TypeVar, final) +from functools import lru_cache +from itertools import groupby +from typing import Any, Generic, NamedTuple, Optional, Protocol, TypeVar, Union +import numpy as np from transformers import BatchFeature -from typing_extensions import TypeAlias +from typing_extensions import TypeAlias, TypedDict from vllm.inputs import InputProcessingContext from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import is_list_of +from vllm.utils import flatten_2d_lists, full_groupby, is_list_of from .inputs import (AudioItem, ImageItem, MultiModalDataDict, MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, VideoItem) + +def bind_prompt_sequence( + seq: Union[str, list[int]], + tokenizer: AnyTokenizer, +) -> "_BoundPromptSequence": + """ + Bind a text or token sequence to a tokenizer so that it can be + lazily converted into the other format on demand. + """ + return _BoundPromptSequence( + tokenizer=tokenizer, + _text=seq if isinstance(seq, str) else None, + _token_ids=seq if isinstance(seq, list) else None, + ) + + _T = TypeVar("_T") +_S = TypeVar("_S", str, list[int]) -ReplacementFunc: TypeAlias = Callable[[_T, BatchFeature, int], List[int]] -""" -Given the original data item, HF-processed data, and index of the processed -item, output the replacement token IDs to be allocated in vLLM. -""" + +@dataclass +class PromptReplacement(Generic[_S, _T]): + target: _S + """The text or token sequence to find and replace.""" + + repl_unit: _S + """ + The unit making up the replacement text or token sequence. + + See :code:`repl_count` for more details. + """ + + repl_count: Union[Callable[[list[_T], BatchFeature, int], int], int] + """ + Given the original multi-modal items for this modality, HF-processed data, + and index of the processed item, output the number of repetitions of + :code:`repl_unit` to build up the replacement text or token sequence. + + For convenience, you can pass in an integer if the number of repetitions is + a constant. + """ + + def __repr__(self) -> str: + return (f"{type(self).__name__}(target={self.target!r}, " + f"repl_unit={self.repl_unit!r})") + + def bind( + self, + modality: str, + tokenizer: AnyTokenizer, + ) -> "_BoundPromptReplacement[_T]": + return _BoundPromptReplacement( + modality=modality, + target=bind_prompt_sequence(self.target, tokenizer), + repl_unit=bind_prompt_sequence(self.repl_unit, tokenizer), + repl_count=self.repl_count, + ) @dataclass class ModalityProcessingMetadata(Generic[_T]): - placeholder_replacements: Mapping[str, ReplacementFunc] + prompt_repls: Sequence[Union[PromptReplacement[str, _T], + PromptReplacement[list[int], _T]]] """ - A dictionary where each item represents the original placeholder in the - prompt text and the corresponding replacement. + Defines each text or token sequence to replace in the HF-processed prompt. + + This is skipped if the HF-processed prompt is found to already contain + the replacement prompts. """ @@ -52,46 +109,138 @@ class MultiModalProcessingMetadataBuiltins(TypedDict, total=False): Read more on that :ref:`here `. """ -MultiModalMultiData: TypeAlias = List[_T] -""" -A list of data items, where the number of data items allowed -per modality is restricted by :code:`--limit-mm-per-prompt`. -""" +def _encode( + tokenizer: AnyTokenizer, + text: str, + *, + add_special_tokens: bool = False, +) -> list[int]: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.encode(text, add_special_tokens=...)`. + """ + if isinstance(tokenizer, MistralTokenizer): + return tokenizer.tokenizer.encode(text, + bos=add_special_tokens, + eos=add_special_tokens) -@final -class MultiModalMultiDataBuiltins(TypedDict, total=False): - """Type annotations for modality types predefined by vLLM.""" + return tokenizer.encode(text, add_special_tokens=add_special_tokens) - image: MultiModalMultiData[ImageItem] - """The input images.""" - video: MultiModalMultiData[VideoItem] - """The input videos.""" +@lru_cache(maxsize=2048) +def _cached_encode( + tokenizer: AnyTokenizer, + text: str, + *, + add_special_tokens: bool = False, +) -> list[int]: + return _encode(tokenizer, text, add_special_tokens=add_special_tokens) - audio: MultiModalMultiData[AudioItem] - """The input audios.""" +def _decode( + tokenizer: AnyTokenizer, + token_ids: list[int], + *, + skip_special_tokens: bool = False, +) -> str: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. + """ + return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) -MultiModalMultiDataDict: TypeAlias = Mapping[str, MultiModalMultiData[Any]] -""" -A dictionary containing an entry for each modality type to input. -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalMultiDataBuiltins` as long as a customized plugin - is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. -""" +@lru_cache(maxsize=2048) +def _cached_decode( + tokenizer: AnyTokenizer, + token_ids: tuple[int, ...], + *, + skip_special_tokens: bool = False, +) -> str: + return _decode(tokenizer, + list(token_ids), + skip_special_tokens=skip_special_tokens) + + +class _HasModalityAttr(Protocol): + modality: str + +class _HasModalityProp(Protocol): -def to_multi_format(data: MultiModalDataDict) -> MultiModalMultiDataDict: + @property + def modality(self) -> str: + ... + + +_M = TypeVar("_M", bound=Union[_HasModalityAttr, _HasModalityProp]) + + +def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: + """Convenience function to apply :func:`full_groupby` based on modality.""" + return full_groupby(values, key=lambda x: x.modality) + + +@dataclass +class _BoundPromptSequence: + tokenizer: AnyTokenizer + _text: Optional[str] + _token_ids: Optional[list[int]] + + def __post_init__(self) -> None: + if self._text is None and self._token_ids is None: + raise ValueError("At least one of 'text' and 'token_ids' must be " + "specified") + + @property + def text(self) -> str: + if self._text is None: + assert self._token_ids is not None + self._text = _cached_decode(self.tokenizer, tuple(self._token_ids)) + + return self._text + + @property + def token_ids(self) -> list[int]: + if self._token_ids is None: + assert self._text is not None + self._token_ids = _cached_encode(self.tokenizer, self._text) + + return self._token_ids + + def __repr__(self) -> str: + return (f"{type(self).__name__}(_text={self._text!r}, " + f"_token_ids={self._token_ids!r})") + + +@dataclass +class _BoundPromptReplacement(Generic[_T]): + modality: str + target: _BoundPromptSequence + repl_unit: _BoundPromptSequence + repl_count: Union[Callable[[list[_T], BatchFeature, int], int], int] + + def get_count( + self, + mm_items: list[_T], + hf_inputs: BatchFeature, + item_idx: int, + ) -> int: + repl_count = self.repl_count + if isinstance(repl_count, int): + return repl_count + + return repl_count(mm_items, hf_inputs, item_idx) + + +def to_multi_format(data: MultiModalDataDict) -> dict[str, list[Any]]: """ Convert a :class:`MultiModalDataDict` containing single data items to a :class:`MultiModalMultiDataDict` containing multiple data items per entry. """ - multi_data: Mapping[str, MultiModalMultiData[Any]] = {} + multi_data = dict[str, list[Any]]() for k, v in data.items(): # yapf: disable @@ -107,86 +256,279 @@ def to_multi_format(data: MultiModalDataDict) -> MultiModalMultiDataDict: return multi_data -def encode_no_special_tokens( - tokenizer: AnyTokenizer, - text: str, -) -> List[int]: +class _TokenRun(NamedTuple): + token_id: int + + start_idx: int + length: int + + +def iter_token_runs(token_ids: list[int]) -> Iterable[_TokenRun]: """ - Backend-agnostic equivalent of HF's - :code:`tokenizer.encode(text, add_special_tokens=False)`. + Yield the starting index and length of each run of tokens that are the same. """ - if isinstance(tokenizer, MistralTokenizer): - return tokenizer.tokenizer.encode(text, bos=False, eos=False) + start_idx = 0 + + for token_id, it in groupby(token_ids): + length = sum(1 for _ in it) + yield _TokenRun(token_id=token_id, start_idx=start_idx, length=length) + + start_idx += length + + +class _PlaceholderInfo(NamedTuple): + modality: str + offset: int + length: int + + def to_range(self) -> PlaceholderRange: + return PlaceholderRange(offset=self.offset, length=self.length) + + +def iter_placeholders( + prompt_repls: Sequence[_BoundPromptReplacement[Any]], + token_ids: list[int], + *, + min_placeholder_count: int, +) -> Iterable[_PlaceholderInfo]: + """Yield each set of placeholder tokens found in :code:`token_ids`.""" + placeholder_ids_by_modality = { + modality: { + token_id + for prompt_repl in repls + for token_id in prompt_repl.repl_unit.token_ids + } + for modality, repls in full_groupby_modality(prompt_repls) + } - return tokenizer.encode(text, add_special_tokens=False) + for run_info in iter_token_runs(token_ids): + if run_info.length > min_placeholder_count: + for (modality, + placeholder_ids) in placeholder_ids_by_modality.items(): + if run_info.token_id in placeholder_ids: + yield _PlaceholderInfo( + modality=modality, + offset=run_info.start_idx, + length=run_info.length, + ) -@lru_cache -def candidate_placeholders( - tokenizer: AnyTokenizer, - placeholder_text: str, -) -> Collection[List[int]]: - """Generate token ID sequences that may represent a placeholder text.""" - # When the placeholder text is not mapped to a special token ID, - # it may be tokenized differently based on whether it is at the start/end - # of the string. So, we go through each combination of whether the text - # is at the start and end boundaries of the string - - # Matches the placeholder when it is in the middle of the string - start_id, = encode_no_special_tokens(tokenizer, "a") - end_id, = encode_no_special_tokens(tokenizer, "b") - - candidate_basic = encode_no_special_tokens(tokenizer, placeholder_text) - - start_id_, *candidate_a = encode_no_special_tokens( - tokenizer, - f"a{placeholder_text}", - ) - assert start_id == start_id_ +class _TokenMatch(NamedTuple): + start_idx: int + end_idx: int - start_id_, *candidate_ab, end_id_ = encode_no_special_tokens( - tokenizer, - f"a{placeholder_text}b", - ) - assert start_id == start_id_ and end_id == end_id_ - *candidate_b, end_id_ = encode_no_special_tokens( - tokenizer, - f"{placeholder_text}b", - ) - assert end_id == end_id_ +def iter_token_matches( + token_ids: list[int], + match_ids: list[int], +) -> Iterable[_TokenMatch]: + """Yield each occurrence of :code:`match_ids` in :code:`token_ids`.""" + match_len = len(match_ids) - # Remove duplicates (need to convert to tuple to be hashable) - unique_candidates = { - tuple(c) - for c in [candidate_basic, candidate_a, candidate_ab, candidate_b] - } + last_end_idx = 0 + for start_idx in range(len(token_ids) - match_len + 1): + if start_idx < last_end_idx: + continue # Exclude overlapping matches - # Convert back to list - return [list(c) for c in unique_candidates] + end_idx = start_idx + match_len + if token_ids[start_idx:end_idx] == match_ids: + yield _TokenMatch(start_idx=start_idx, end_idx=end_idx) + last_end_idx = end_idx -def apply_placeholders( - token_ids: List[int], - placeholder_ids: List[int], - get_replacement_ids: Callable[[], List[int]], -) -> Optional[PlaceholderRange]: - """ - Find the first occurrence of :code:`placeholder_ids`, - and replace it with the output of :code:`get_replacement_ids`. +class _PromptReplacementMatch(ABC, Generic[_T, _S]): + prompt_repl: _BoundPromptReplacement[_T] + + @property + def modality(self) -> str: + return self.prompt_repl.modality + + @property + @abstractmethod + def start_idx(self) -> int: + raise NotImplementedError + + @property + @abstractmethod + def end_idx(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_repl( + self, + mm_items: list[_T], + hf_inputs: BatchFeature, + item_idx: int, + ) -> _S: + raise NotImplementedError + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r}, " + f"start_idx={self.start_idx!r}, end_idx={self.end_idx!r})") + + +@dataclass(repr=False) +class _PromptReplacementTokenMatch(_PromptReplacementMatch[_T, list[int]]): + prompt_repl: _BoundPromptReplacement[_T] + match: _TokenMatch + + @property + def start_idx(self) -> int: + return self.match.start_idx + + @property + def end_idx(self) -> int: + return self.match.end_idx + + def get_repl( + self, + mm_items: list[_T], + hf_inputs: BatchFeature, + item_idx: int, + ) -> list[int]: + prompt_repl = self.prompt_repl + count = prompt_repl.get_count(mm_items, hf_inputs, item_idx) + return prompt_repl.repl_unit.token_ids * count - This function updates :code:`token_ids` in place. + +@dataclass(repr=False) +class _PromptReplacementTextMatch(_PromptReplacementMatch[_T, str]): + prompt_repl: _BoundPromptReplacement[_T] + match: re.Match[str] + + @property + def start_idx(self) -> int: + return self.match.start() + + @property + def end_idx(self) -> int: + return self.match.end() + + def get_repl( + self, + mm_items: list[_T], + hf_inputs: BatchFeature, + item_idx: int, + ) -> str: + prompt_repl = self.prompt_repl + count = prompt_repl.get_count(mm_items, hf_inputs, item_idx) + return prompt_repl.repl_unit.text * count + + +def find_token_matches( + prompt: list[int], + prompt_repls: Sequence[_BoundPromptReplacement[_T]], +) -> list[_PromptReplacementTokenMatch[_T]]: + """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" + return [ + _PromptReplacementTokenMatch(prompt_repl, match) + for prompt_repl in prompt_repls + for match in iter_token_matches(prompt, prompt_repl.target.token_ids) + ] + + +def find_text_matches( + prompt: str, + prompt_repls: Sequence[_BoundPromptReplacement[_T]], +) -> list[_PromptReplacementTextMatch[_T]]: + """Return each target of :code:`prompt_repls` found in :code:`prompt`.""" + return [ + _PromptReplacementTextMatch(prompt_repl, match) + for prompt_repl in prompt_repls + for match in re.finditer(re.escape(prompt_repl.target.text), prompt) + ] + + +def _resolve_matches( + prompt: _S, + matches: Sequence[_PromptReplacementMatch[_T, _S]], +) -> list[_PromptReplacementMatch[_T, _S]]: + """ + Resolve :code:`matches` to ensure that there are no overlapping matches, + and sort them such that earlier matches take priority over later ones. """ - placeholder_length = len(placeholder_ids) + num_matches_by_idx = np.zeros(len(prompt), dtype=int) + for match in matches: + num_matches_by_idx[match.start_idx:match.end_idx] += 1 + + duplicate_matches_idxs, = np.nonzero(num_matches_by_idx > 1) + if len(duplicate_matches_idxs) > 0: + raise ValueError("Unable to find a unique replacement " + f"at indices={duplicate_matches_idxs} " + f"of prompt={prompt}") + + return sorted(matches, key=lambda x: x.start_idx) + + +def _replace_matches( + prompt: _S, + matches: Sequence[_PromptReplacementMatch[_T, _S]], + mm_items_by_modality: Mapping[str, list[_T]], + hf_inputs: BatchFeature, +) -> list[_S]: + out_seqs = list[_S]() + prev_end_idx = 0 + next_idx_by_modality = {modality: 0 for modality in mm_items_by_modality} + + for match in _resolve_matches(prompt, matches): + modality = match.modality + mm_items = mm_items_by_modality[modality] + + item_idx = next_idx_by_modality[modality] + if item_idx >= len(mm_items): + continue + + start_idx = match.start_idx + end_idx = match.end_idx + repl_ids = match.get_repl(mm_items, hf_inputs, item_idx) + + out_seqs.append(prompt[prev_end_idx:start_idx] + repl_ids) + prev_end_idx = end_idx + next_idx_by_modality[modality] += 1 + + out_seqs.append(prompt[prev_end_idx:]) + + return out_seqs + + +def replace_token_matches( + prompt: list[int], + matches: Sequence[_PromptReplacementMatch[_T, list[int]]], + mm_items_by_modality: Mapping[str, list[_T]], + hf_inputs: BatchFeature, +) -> list[int]: + """Apply :code:`prompt_repls` to :code:`prompt`.""" + if not matches: + return prompt + + token_id_seqs = _replace_matches( + prompt, + matches, + mm_items_by_modality, + hf_inputs, + ) + + return flatten_2d_lists(token_id_seqs) - for start_idx in range(len(token_ids) - placeholder_length + 1): - if token_ids[start_idx:placeholder_length] == placeholder_ids: - token_ids[start_idx:placeholder_length] = get_replacement_ids() - return PlaceholderRange(offset=start_idx, - length=placeholder_length) +def replace_text_matches( + prompt: str, + matches: Sequence[_PromptReplacementMatch[_T, str]], + mm_items_by_modality: Mapping[str, list[_T]], + hf_inputs: BatchFeature, +) -> str: + """Apply :code:`prompt_repls` to :code:`prompt`.""" + if not matches: + return prompt - return None + texts = _replace_matches( + prompt, + matches, + mm_items_by_modality, + hf_inputs, + ) + + return "".join(texts) class MultiModalProcessor: @@ -212,62 +554,166 @@ def __call__( ) -> MultiModalInputsV2: return self.apply(prompt, mm_data, mm_processor_kwargs) - def apply( + def _find_placeholders( + self, + all_prompt_repls: Sequence[_BoundPromptReplacement[Any]], + new_token_ids: list[int], + *, + # To avoid false positives from multi-input when detecting + # whether placeholder tokens have been inserted, in case + # the target sequence is a subset of the replacement tokens + min_placeholder_count: int = 16, + ) -> list[_PlaceholderInfo]: + return list( + iter_placeholders( + all_prompt_repls, + new_token_ids, + min_placeholder_count=min_placeholder_count, + )) + + def _apply_hf_processor( self, prompt: str, mm_data: MultiModalDataDict, mm_processor_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - tokenizer = self.ctx.tokenizer + ) -> BatchFeature: hf_processor = self.ctx.get_hf_processor() - processed_inputs = hf_processor( + return hf_processor( text=prompt, # type: ignore **mm_data, **mm_processor_kwargs, ) - new_token_ids, = processed_inputs.pop("input_ids").tolist() - mm_kwargs = MultiModalKwargs(processed_inputs) - mm_placeholders: Mapping[str, List[PlaceholderRange]] = {} + def _bind_prompt_replacements( + self, + mm_data: MultiModalDataDict, + ) -> list[_BoundPromptReplacement[Any]]: + tokenizer = self.ctx.tokenizer - for modality, orig_inputs in to_multi_format(mm_data).items(): - assert isinstance(orig_inputs, list) + return [ + prompt_repl.bind(modality, tokenizer) + for modality, metadata in self.metadata.items() + if modality in mm_data for prompt_repl in metadata.prompt_repls + ] - metadata = self.metadata[modality] - placeholder_replacements = metadata.placeholder_replacements + def _apply_prompt_replacements( + self, + mm_data: MultiModalDataDict, + hf_inputs: BatchFeature, + token_ids: list[int], + prompt_repls: Sequence[_BoundPromptReplacement[Any]], + ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + tokenizer = self.ctx.tokenizer - modality_placeholders: List[PlaceholderRange] = [] + mm_items = to_multi_format(mm_data) + token_matches = find_token_matches(token_ids, prompt_repls) + + # If the search text does not represent a special token, + # it may have different token IDs in the prompt, because + # the tokens may go across the boundaries of the search text. + # ---- + # e.g. when searching for "foo" in "food", if "food" itself makes + # up a token, then the token ID of "foo" will not appear at all + # ---- + # Since it is inefficient to search for all possible tokenizations + # of the search text in the prompt, we instead perform string + # replacement on the decoded token IDs, then encode them back. + if all( + len(matches) >= len(mm_data[modality]) + for modality, matches in full_groupby_modality(token_matches) + ): # yapf: disable + token_ids = replace_token_matches( + token_ids, + token_matches, + mm_items, + hf_inputs, + ) + + text = _decode(tokenizer, token_ids) + matched_repls = [match.prompt_repl for match in token_matches] + else: + text = _decode(tokenizer, token_ids) + + text_matches = find_text_matches(text, prompt_repls) + text = replace_text_matches( + text, + text_matches, + mm_items, + hf_inputs, + ) + + token_ids = _encode(tokenizer, text) + matched_repls = [match.prompt_repl for match in text_matches] + + placeholders = self._find_placeholders(matched_repls, token_ids) + + # Sanity check + assert len(placeholders) == len(matched_repls), dict( + # Log this information for easier debugging + text=text, + token_ids=token_ids, + placeholders=placeholders, + matched_repls=matched_repls, + ) - for item_idx, orig_item in enumerate(orig_inputs): - for match_text, replace_fn in placeholder_replacements.items(): - candidates = candidate_placeholders(tokenizer, match_text) - get_replacement_ids = partial( - replace_fn, - orig_item, - processed_inputs, - item_idx, - ) + return token_ids, text, placeholders - for match_ids in candidates: - # TODO(youkaichao): Don't update new_token_ids - placeholders = apply_placeholders( - new_token_ids, - match_ids, - get_replacement_ids, - ) + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + """ + Process multi-modal inputs to be used in vLLM. + + The main steps are: + + 1. Apply HF Processor on prompt text and multi-modal data together, + outputting token IDs and processed tensors. + 2. Find and replace sequences in the token IDs with placeholder tokens. + The number of placeholder tokens equals the feature size of the + multi-modal data outputted by the multi-modal encoder. + 3. Extract information about the placeholder tokens from the + processed token IDs. + """ + tokenizer = self.ctx.tokenizer + + hf_inputs = self._apply_hf_processor(prompt_text, mm_data, + mm_processor_kwargs) + prompt_ids, = hf_inputs.pop("input_ids").tolist() + mm_kwargs = MultiModalKwargs(hf_inputs) - if placeholders is not None: - modality_placeholders.append(placeholders) + all_prompt_repls = self._bind_prompt_replacements(mm_data) - # yapf: disable - mm_placeholders[modality] = modality_placeholders # type: ignore[index] - # yapf: enable + # If HF processor already inserts placeholder tokens, + # there is no need for us to insert them + all_placeholders = self._find_placeholders(all_prompt_repls, + prompt_ids) + if all_placeholders: + prompt_text = _decode(tokenizer, prompt_ids) + else: + ( + prompt_ids, + prompt_text, + all_placeholders, + ) = self._apply_prompt_replacements( + mm_data, + hf_inputs, + prompt_ids, + all_prompt_repls, + ) + + mm_placeholders = { + modality: [item.to_range() for item in items] + for modality, items in full_groupby_modality(all_placeholders) + } return MultiModalInputsV2( type="multimodal", - prompt=prompt, - prompt_token_ids=new_token_ids, + prompt=prompt_text, + prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, mm_placeholders=mm_placeholders, ) diff --git a/vllm/utils.py b/vllm/utils.py index 30c371b0e3591..dd4283e3ac381 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -19,7 +19,8 @@ import warnings import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task -from collections.abc import Mapping +from collections import defaultdict +from collections.abc import Iterable, Mapping from functools import lru_cache, partial, wraps from platform import uname from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic, @@ -905,6 +906,23 @@ def flatten_2d_lists(lists: List[List[T]]) -> List[T]: return [item for sublist in lists for item in sublist] +_K = TypeVar("_K", bound=Hashable) +_V = TypeVar("_V") + + +def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]): + """ + Unlike :class:`itertools.groupby`, groups are not broken by + non-contiguous data. + """ + groups = defaultdict[_K, list[_V]](list) + + for value in values: + groups[key(value)].append(value) + + return groups.items() + + # TODO: This function can be removed if transformer_modules classes are # serialized by value when communicating between processes def init_cached_hf_modules() -> None: From 4cfe5d2bcafe1f47d1df046e6788ebbe038eaf3f Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 23 Nov 2024 13:25:46 +0800 Subject: [PATCH 126/255] [Bugfix] `multi_modal_kwargs` broadcast for CPU tensor parallel (#10541) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/worker/cpu_enc_dec_model_runner.py | 1 + vllm/worker/cpu_model_runner.py | 1 + 2 files changed, 2 insertions(+) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 1f8e2d2d88a23..cc24cfe04d2ba 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -35,6 +35,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "input_positions": self.input_positions, "encoder_input_tokens": self.encoder_input_tokens, "encoder_input_positions": self.encoder_input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 2cf573625401a..7cab476d7fca4 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -83,6 +83,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, From 86a44fb8967f757b0701aaa33aeaa8a431714a27 Mon Sep 17 00:00:00 2001 From: JiHuazhong Date: Sat, 23 Nov 2024 14:23:12 +0800 Subject: [PATCH 127/255] [Platforms] Refactor openvino code (#10573) Signed-off-by: statelesshz --- vllm/executor/openvino_executor.py | 81 ++---------------------------- vllm/platforms/openvino.py | 69 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 78 deletions(-) diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index dcd4b7621381d..db0070ce510ee 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -1,19 +1,16 @@ from typing import List, Set, Tuple import openvino as ov -import openvino.properties.hint as hints -import torch import vllm.envs as envs -from vllm.config import CacheConfig, ModelConfig from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest -from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip, - get_open_port, make_async) +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -30,11 +27,6 @@ def _init_executor(self) -> None: current_platform.is_openvino_gpu(), \ "OpenVINO backend supports only CPU and GPU devices" - self.ov_core = ov.Core() - self.model_config = _verify_and_get_model_config(self.model_config) - self.cache_config = _verify_and_get_cache_config( - self.ov_core, self.cache_config) - # Instantiate the worker and load the model to CPU. self._init_worker() @@ -45,7 +37,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = wrapper.init_worker( - ov_core=self.ov_core, + ov_core=ov.Core(), vllm_config=self.vllm_config, local_rank=0, rank=0, @@ -130,70 +122,3 @@ async def check_health_async(self) -> None: # OpenVINOExecutor will always be healthy as long as # it's running. return - - -def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - if config.dtype != torch.float32: - logger.warning( - f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501 - ) - config.dtype = torch.float32 - if not config.enforce_eager: - logger.warning( - "CUDA graph is not supported on OpenVINO backend, fallback to the " - "eager mode.") - config.enforce_eager = True - return config - - -def _verify_and_get_cache_config(ov_core: ov.Core, - config: CacheConfig) -> CacheConfig: - if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": - if not current_platform.is_openvino_cpu(): - logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is" - "ignored for GPU, f16 data type will be used.") - config.cache_dtype = ov.Type.f16 - else: - logger.info("KV cache type is overridden to u8 via " - "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.") - config.cache_dtype = ov.Type.u8 - else: - if current_platform.is_openvino_cpu(): - ov_device = envs.VLLM_OPENVINO_DEVICE - inference_precision = ov_core.get_property( - ov_device, hints.inference_precision) - if inference_precision == ov.Type.bf16: - config.cache_dtype = ov.Type.bf16 - else: - config.cache_dtype = ov.Type.f16 - else: - config.cache_dtype = ov.Type.f16 - - if current_platform.is_openvino_cpu(): - if config.block_size != 32: - logger.info( - f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501 - ) - config.block_size = 32 - else: - if config.block_size != 16: - logger.info( - f"OpenVINO GPU optimal block size is 16, overriding currently set {config.block_size}" # noqa: G004, E501 - ) - config.block_size = 16 - - kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE - if kv_cache_space >= 0: - if kv_cache_space == 0 and current_platform.is_openvino_cpu(): - config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore - logger.warning( - "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) " - "for OpenVINO backend is not set, using 4 by default.") - else: - config.openvino_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore - else: - raise RuntimeError( - "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE" - f" {kv_cache_space}, expect a positive integer value.") - - return config diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 694de836e1517..91e615481ff8e 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,5 +1,7 @@ from typing import TYPE_CHECKING +import openvino as ov +import openvino.properties.hint as hints import torch import vllm.envs as envs @@ -49,6 +51,8 @@ def is_pin_memory_available(self) -> bool: @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + from vllm.utils import GiB_bytes + parallel_config = vllm_config.parallel_config assert ( parallel_config.world_size == 1 @@ -57,3 +61,68 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": parallel_config.worker_cls = \ "vllm.worker.openvino_worker.OpenVINOWorker" + + # check and update model config + model_config = vllm_config.model_config + if model_config.dtype != torch.float32: + logger.warning( + f"Only float32 dtype is supported on OpenVINO, casting from {model_config.dtype}." # noqa: G004, E501 + ) + model_config.dtype = torch.float32 + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on OpenVINO backend, fallback to " + "the eager mode.") + model_config.enforce_eager = True + + # check and update cache config + ov_core = ov.Core() + cache_config = vllm_config.cache_config + if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": + if not OpenVinoPlatform.is_openvino_cpu(): + logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is" + "ignored for GPU, f16 data type will be used.") + cache_config.cache_dtype = ov.Type.f16 + else: + logger.info("KV cache type is overridden to u8 via " + "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.") + cache_config.cache_dtype = ov.Type.u8 + else: + if OpenVinoPlatform.is_openvino_cpu(): + ov_device = envs.VLLM_OPENVINO_DEVICE + inference_precision = ov_core.get_property( + ov_device, hints.inference_precision) + if inference_precision == ov.Type.bf16: + cache_config.cache_dtype = ov.Type.bf16 + else: + cache_config.cache_dtype = ov.Type.f16 + else: + cache_config.cache_dtype = ov.Type.f16 + + if OpenVinoPlatform.is_openvino_cpu(): + if cache_config.block_size != 32: + logger.info( + f"OpenVINO CPU optimal block size is 32, overriding currently set {cache_config.block_size}" # noqa: G004, E501 + ) + cache_config.block_size = 32 + else: + if cache_config.block_size != 16: + logger.info( + f"OpenVINO GPU optimal block size is 16, overriding currently set {cache_config.block_size}" # noqa: G004, E501 + ) + cache_config.block_size = 16 + + kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE + if kv_cache_space >= 0: + if kv_cache_space == 0 and OpenVinoPlatform.is_openvino_cpu(): + cache_config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore + logger.warning( + "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) " + "for OpenVINO backend is not set, using 4 by default.") + else: + cache_config.openvino_kvcache_space_bytes = ( # type: ignore + kv_cache_space * GiB_bytes) + else: + raise RuntimeError( + "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE" + f" {kv_cache_space}, expect a positive integer value.") From 651f6c31ac86f29aa72fa682ef6c34349bcc75db Mon Sep 17 00:00:00 2001 From: Nishidha Date: Sat, 23 Nov 2024 15:03:53 +0530 Subject: [PATCH 128/255] For ppc64le, disabled tests for now and addressed space issues (#10538) --- .buildkite/run-cpu-test-ppc64le.sh | 44 ++---------------------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 5d7a0bff90963..bc06838d804ff 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -4,49 +4,11 @@ # It serves a sanity check for compilation and basic model usage. set -ex -# Try building the docker image -docker build -t cpu-test -f Dockerfile.ppc64le . - # Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; } +remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } trap remove_docker_container EXIT remove_docker_container -# Run the image, setting --shm-size=4g for tensor parallel. -source /etc/environment -#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test - -function cpu_tests() { - set -e - - # Run basic model test - docker exec cpu-test bash -c " - set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/decoder_only/language -m cpu_model - pytest -v -s tests/models/embedding/language -m cpu_model - pytest -v -s tests/models/encoder_decoder/language -m cpu_model - pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" - - # online inference - docker exec cpu-test bash -c " - set -e - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model facebook/opt-125m \ - --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" -} +# Try building the docker image +docker build -t cpu-test -f Dockerfile.ppc64le . -# All of CPU tests are expected to be finished less than 25 mins. -export -f cpu_tests -timeout 25m bash -c "cpu_tests" From 04668ebe7a35b69f1d2f8b04ef255bb16c8d2a01 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sun, 24 Nov 2024 02:12:20 +0800 Subject: [PATCH 129/255] [Bugfix] Avoid import AttentionMetadata explicitly in Mllama (#10593) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/attention/backends/blocksparse_attn.py | 5 +++++ vllm/attention/layer.py | 3 ++- vllm/model_executor/models/mllama.py | 14 +++++++------- vllm/platforms/openvino.py | 8 ++++++-- vllm/v1/attention/backends/flash_attn.py | 2 +- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 94002e36db2bb..9e54c3b40c54e 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -87,6 +87,11 @@ def __post_init__(self): class BlocksparseFlashAttentionBackend(AttentionBackend): + @staticmethod + def get_name() -> str: + # For attention layer compatibility + return "FLASH_ATTN" + @staticmethod def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: return BlocksparseFlashAttentionImpl diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index cb4dedf481c77..1bb335909484b 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -6,7 +6,7 @@ import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType -from vllm.attention.selector import get_attn_backend +from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.quantization.base_config import ( @@ -98,6 +98,7 @@ def __init__( self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap) + self.backend = backend_name_to_enum(attn_backend.get_name()) # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 41f62b37f3bd9..9e6634a9a7579 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -32,9 +32,8 @@ import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.backends.flash_attn import FlashAttentionMetadata -from vllm.attention.backends.xformers import XFormersMetadata from vllm.attention.ops.paged_attn import PagedAttention +from vllm.attention.selector import _Backend from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs, @@ -828,7 +827,8 @@ def _attention_with_mask( ) -> torch.Tensor: # Skip writing kv-cache for the initial profiling run. if len(kv_cache.shape) > 1: - if isinstance(attn_metadata, FlashAttentionMetadata): + if self.attn.backend in (_Backend.FLASH_ATTN, + _Backend.FLASH_ATTN_VLLM_V1): cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) torch.ops._C_cache_ops.reshape_and_cache_flash( @@ -842,7 +842,7 @@ def _attention_with_mask( 1.0, 1.0, ) - elif isinstance(attn_metadata, XFormersMetadata): + elif self.attn.backend in (_Backend.XFORMERS, _Backend.TORCH_SDPA): key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_local_key_value_heads, self.head_dim) cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) @@ -852,9 +852,9 @@ def _attention_with_mask( attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) else: raise ValueError( - f"Unsupported AttentionMetadata {type(attn_metadata)} " - f"class found. Expected the AttentionMetadata to " - f"be either XFormersMetadata or FlashAttentionMetadata.") + f"Unsupported Attention backend {self.attn.backend} " + "enum found. Expected the Attention backend to be " + "FLASH_ATTN, FLASH_ATTN_VLLM_V1, XFORMERS or TORCH_SDPA.") # We have to call torch.sdpa for prefill when using a # custom cross-attention mask. Because the mask is not a diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 91e615481ff8e..ea5ec7b40b95c 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,7 +1,5 @@ from typing import TYPE_CHECKING -import openvino as ov -import openvino.properties.hint as hints import torch import vllm.envs as envs @@ -16,6 +14,12 @@ logger = init_logger(__name__) +try: + import openvino as ov + import openvino.properties.hint as hints +except ImportError as e: + logger.warning("Failed to import OpenVINO with %r", e) + class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index d98bb5a716e97..5f8535eaa303f 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -19,7 +19,7 @@ def get_supported_head_sizes() -> List[int]: @staticmethod def get_name() -> str: - return "flash-attn-vllm-v1" + return "FLASH_ATTN_VLLM_V1" @staticmethod def get_impl_cls() -> Type["FlashAttentionImpl"]: From 17d8fc1806c61e3f859a45b69be9f8dccf9a5fcc Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 24 Nov 2024 09:22:33 +0800 Subject: [PATCH 130/255] [bugfix] Fix example/tensorize_vllm_model tests (#10595) Signed-off-by: Jee Jee Li --- vllm/model_executor/model_loader/tensorizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index c48b287ed181a..3fd668765a1b1 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -19,6 +19,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.plugins import set_current_vllm_config from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None @@ -284,7 +285,8 @@ def _init_model(self): model_args = self.tensorizer_config.hf_config model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None - with no_init_or_tensor(): + # TODO: Do we need to consider old-style model class? + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) From 1700c543a556e669e559c369a36c0a0d36a8de19 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 24 Nov 2024 09:23:17 +0800 Subject: [PATCH 131/255] [Bugfix] Fix LoRA weight sharding (#10450) Signed-off-by: Jee Jee Li Co-authored-by: Cyrus Leung --- .buildkite/test-pipeline.yaml | 13 +- .../{test_chatglm3.py => test_chatglm3_tp.py} | 63 +++++-- tests/lora/test_llama.py | 146 ---------------- tests/lora/test_llama_tp.py | 161 ++++++++++++++++++ vllm/lora/fully_sharded_layers.py | 5 + vllm/lora/layers.py | 34 +++- vllm/model_executor/models/chatglm.py | 4 +- 7 files changed, 258 insertions(+), 168 deletions(-) rename tests/lora/{test_chatglm3.py => test_chatglm3_tp.py} (56%) delete mode 100644 tests/lora/test_llama.py create mode 100644 tests/lora/test_llama_tp.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c436d2b48d20f..bff33d35b423e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -230,7 +230,7 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py parallelism: 4 - label: "PyTorch Fullgraph Smoke Test" # 9min @@ -475,18 +475,23 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py -- label: LoRA Long Context (Distributed) # 11min - # This test runs llama 13B, so it is required to run on 4 GPUs. +- label: LoRA TP Test (Distributed) num_gpus: 4 soft_fail: true source_file_dependencies: - vllm/lora - - tests/lora/test_long_context + - tests/lora commands: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # This test runs llama 13B, so it is required to run on 4 GPUs. - pytest -v -s -x lora/test_long_context.py + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - label: Weight Loading Multiple GPU Test # 33min working_dir: "/vllm-workspace/tests" diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3_tp.py similarity index 56% rename from tests/lora/test_chatglm3.py rename to tests/lora/test_chatglm3_tp.py index de4cbea80924e..f17464573459f 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3_tp.py @@ -1,12 +1,21 @@ from typing import List import vllm +from tests.utils import fork_new_process_for_each_test from vllm.lora.request import LoRARequest +from ..utils import multi_gpu_test + MODEL_PATH = "THUDM/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 +EXPECTED_LORA_OUTPUT = [ + "SELECT count(*) FROM singer", + "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 + "SELECT name , country , age FROM singer ORDER BY age", +] + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ @@ -20,7 +29,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 ), ] - print(prompts) sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( prompts, @@ -37,23 +45,58 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@fork_new_process_for_each_test def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=64, + tensor_parallel_size=1, trust_remote_code=True) - expected_lora_output = [ - "SELECT count(*) FROM singer", - "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 - "SELECT name , country , age FROM singer ORDER BY age", - ] + output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] + output2 = do_sample(llm, chatglm3_lora_files, lora_id=2) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_chatglm3_lora_tp4(chatglm3_lora_files): + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=False) + + output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] + output2 = do_sample(llm, chatglm3_lora_files, lora_id=2) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i] == expected_lora_output[i] + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] output2 = do_sample(llm, chatglm3_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i] == expected_lora_output[i] + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py deleted file mode 100644 index e2a4f1ed0496a..0000000000000 --- a/tests/lora/test_llama.py +++ /dev/null @@ -1,146 +0,0 @@ -from typing import List - -import pytest -import ray - -import vllm -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_PATH = "meta-llama/Llama-2-7b-hf" - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=256, - stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. - generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.mark.parametrize("tp_size", [1, 2, 4]) -def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): - if num_gpus_available < tp_size: - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=tp_size) - - expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 - ] - expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 - ] - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output - - print("removing lora") - - -def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): - if num_gpus_available < 4: - pytest.skip("Not enough GPUs for tensor parallelism 4") - - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) - - del llm_tp1 - cleanup_dist_env_and_memory() - - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=2) - output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) - - del llm_tp2 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp2 - - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=4) - output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) - - del llm_tp4 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp4 - - -def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and - is more conservative""" - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_lora(): - llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16) - num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks - return num_gpu_blocks_lora_warmup - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_no_lora(): - llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = ( - llm.llm_engine.cache_config.num_gpu_blocks) - return num_gpu_blocks_no_lora_warmup - - num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) - num_gpu_blocks_no_lora_warmup = ray.get( - get_num_gpu_blocks_no_lora.remote()) - assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more " - "conservative than without lora, therefore the number of " - "memory blocks for the KV cache should be " - "less when using lora than when not using lora") diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py new file mode 100644 index 0000000000000..aae6310a2a213 --- /dev/null +++ b/tests/lora/test_llama_tp.py @@ -0,0 +1,161 @@ +from typing import List + +import ray + +import vllm +from tests.utils import fork_new_process_for_each_test +from vllm.lora.request import LoRARequest + +from ..utils import multi_gpu_test + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" + +EXPECTED_NO_LORA_OUTPUT = [ + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 +] +EXPECTED_LORA_OUTPUT = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +@fork_new_process_for_each_test +def test_llama_lora(sql_lora_files): + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1) + + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + + print("removing lora") + + +@fork_new_process_for_each_test +def test_llama_lora_warmup(sql_lora_files): + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" + + @ray.remote(num_gpus=1) + def get_num_gpu_blocks_lora(): + llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16) + num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + return num_gpu_blocks_lora_warmup + + @ray.remote(num_gpus=1) + def get_num_gpu_blocks_no_lora(): + llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) + return num_gpu_blocks_no_lora_warmup + + num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) + num_gpu_blocks_no_lora_warmup = ray.get( + get_num_gpu_blocks_no_lora.remote()) + assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " + "less when using lora than when not using lora") + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4(sql_lora_files): + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + ) + + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + + print("removing lora") + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + ) + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + + print("removing lora") diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 3443c3feb4d2a..f5c2eced9d2bb 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -44,6 +44,11 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): Based on S-LoRA, slicing happens along the rank dim. """ + # For all LoRA layers where the `base_layer` is `ColumnParallelLinear`, + # their `lora_a` and `lora_b` have different sharding patterns. After + # completing the `lora_a` GEMM , a gather operation is performed. + # Therefore, the sharding of `lora_a` only needs to correspond with the + # gather operation. def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() shard_size = self.lora_a_stacked.shape[2] diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 6afe80219fe07..3701988ff692f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -451,6 +451,12 @@ class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): def __init__(self, base_layer: ColumnParallelLinear) -> None: super().__init__() + # The base_layer type is ColumnParallelLinear or + # MergedColumnParallelLinear, their weight sharding logic is + # inconsistent when TP is greater than 1. + self.is_merged_col_linear = type( + base_layer) is MergedColumnParallelLinear + self.base_layer = base_layer self.tp_size = get_tensor_model_parallel_world_size() self.input_size = self.base_layer.input_size @@ -508,14 +514,30 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_dim - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size - lora_b = lora_b[:, start_idx:end_idx] + # Applicable to cases where the base_layer is + # MergedColumnParallelLinear. + if self.is_merged_col_linear: + tp_rank = get_tensor_model_parallel_rank() + shard_size = self.output_size // 2 + offset = lora_b.shape[-1] // 2 + + left_weight = lora_b[:, tp_rank * shard_size:(tp_rank + 1) * + shard_size] + right_weight = lora_b[:, offset + tp_rank * shard_size:offset + + (tp_rank + 1) * shard_size] + lora_b = torch.cat([left_weight, right_weight], dim=1) + # Applicable to cases where the base_layer is + # ColumnParallelLinear. + else: + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_dim + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + lora_b = lora_b[:, start_idx:end_idx] return lora_b def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + # TODO: Fix the slicing logic of bias. if bias is None: return bias tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -779,7 +801,7 @@ def can_replace_layer( class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): """ ColumnParallelLinear layer that is specifically designed for - qkv_proj. Certain models, such as chtglm3 and baichuan-7b, + qkv_proj. Certain models, such as chatglm3 and baichuan-7b, only contains a single LoRA within their qkv_proj layer. During inference with Tensor Parallel, the weights of lora_b diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index e3a068908b7f3..5bcbce7180ca4 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -760,7 +760,7 @@ def __new__( config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return ChatGLM(vllm_config=vllm_config, prefix=prefix) + return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: - return ChatGLMV(vllm_config=vllm_config, prefix=prefix) + return ChatGLM(vllm_config=vllm_config, prefix=prefix) From 1c445dca51a877ac6a5b7e03ecdb73e0e34d139e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 24 Nov 2024 11:57:13 +0800 Subject: [PATCH 132/255] [CI/Build] Print running script to enhance CI log readability (#10594) Signed-off-by: Jee Jee Li --- .buildkite/test-pipeline.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff33d35b423e..ed8c84ce9f5c0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,6 +52,7 @@ steps: - tests/worker - tests/test_lazy_torch_compile.py commands: + - echo 'Running test_lazy_torch_compile.py...' # print running script to enhance CI log readability - python3 test_lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine @@ -182,15 +183,25 @@ steps: - examples/ commands: - pip install awscli tensorizer # for llava example and tensorizer test + - echo 'Running offline_inference.py...' # print running script to enhance CI log readability - python3 offline_inference.py + - echo 'Running cpu_offload.py...' - python3 cpu_offload.py + - echo 'Running offline_inference_chat.py...' - python3 offline_inference_chat.py + - echo 'Running offline_inference_with_prefix.py...' - python3 offline_inference_with_prefix.py + - echo 'Running llm_engine_example.py...' - python3 llm_engine_example.py + - echo 'Running offline_inference_vision_language.py...' - python3 offline_inference_vision_language.py + - echo 'Running offline_inference_vision_language_multi_image.py...' - python3 offline_inference_vision_language_multi_image.py + - echo 'Running tensorize_vllm_model.py...' - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - echo 'Running offline_inference_encoder_decoder.py...' - python3 offline_inference_encoder_decoder.py + - echo 'Running offline_profile.py...' - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min From eda2b3589c8b27a9b8f8aea24afe1673890d19d2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Nov 2024 21:31:47 -0800 Subject: [PATCH 133/255] Revert "Print running script to enhance CI log readability" (#10601) --- .buildkite/test-pipeline.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ed8c84ce9f5c0..bff33d35b423e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,7 +52,6 @@ steps: - tests/worker - tests/test_lazy_torch_compile.py commands: - - echo 'Running test_lazy_torch_compile.py...' # print running script to enhance CI log readability - python3 test_lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine @@ -183,25 +182,15 @@ steps: - examples/ commands: - pip install awscli tensorizer # for llava example and tensorizer test - - echo 'Running offline_inference.py...' # print running script to enhance CI log readability - python3 offline_inference.py - - echo 'Running cpu_offload.py...' - python3 cpu_offload.py - - echo 'Running offline_inference_chat.py...' - python3 offline_inference_chat.py - - echo 'Running offline_inference_with_prefix.py...' - python3 offline_inference_with_prefix.py - - echo 'Running llm_engine_example.py...' - python3 llm_engine_example.py - - echo 'Running offline_inference_vision_language.py...' - python3 offline_inference_vision_language.py - - echo 'Running offline_inference_vision_language_multi_image.py...' - python3 offline_inference_vision_language_multi_image.py - - echo 'Running tensorize_vllm_model.py...' - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - echo 'Running offline_inference_encoder_decoder.py...' - python3 offline_inference_encoder_decoder.py - - echo 'Running offline_profile.py...' - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min From c055747867e771dbc791c9aa3c394c4d4489cd82 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 23 Nov 2024 22:22:54 -0800 Subject: [PATCH 134/255] [model][utils] add extract_layer_index utility function (#10599) Signed-off-by: youkaichao --- vllm/model_executor/models/arctic.py | 41 +++++++++++-------------- vllm/model_executor/models/deepseek.py | 19 +++++++----- vllm/model_executor/models/gemma2.py | 15 +++------ vllm/model_executor/models/olmoe.py | 8 ++--- vllm/model_executor/models/qwen2_moe.py | 6 ++-- vllm/model_executor/models/utils.py | 21 +++++++++++++ 6 files changed, 59 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index ac4c464aa10ac..fd6b5659df5d1 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -33,7 +33,7 @@ from vllm.transformers_utils.configs.arctic import ArcticConfig from .interfaces import SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -44,15 +44,14 @@ class ArcticMLP(nn.Module): def __init__(self, config: ArcticConfig, - layer_id: int, expert_id: int = -1, is_residual_mlp: bool = False, quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True): + reduce_results: bool = True, + prefix: str = ""): super().__init__() self.hidden_size = config.hidden_size self.expert_id = expert_id - self.layer_id = layer_id self.ffn_dim = config.intermediate_size if not is_residual_mlp \ else self.hidden_size @@ -85,13 +84,14 @@ class ArcticMoE(nn.Module): def __init__(self, config: ArcticConfig, - layer_id: int, tp_size: Optional[int] = None, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, - reduce_results: bool = True): + reduce_results: bool = True, + prefix: str = ""): super().__init__() + layer_id = extract_layer_index(prefix) self.tp_size = tp_size or get_tensor_model_parallel_world_size() self.hidden_size = config.hidden_size self.num_experts = config.num_local_experts @@ -109,15 +109,16 @@ def __init__(self, if not self.is_moe_layer: self.mlp = ArcticMLP(config, - layer_id=layer_id, quant_config=quant_config, - reduce_results=reduce_results) + reduce_results=reduce_results, + prefix=f"{prefix}.mlp") else: self.gate = ReplicatedLinear(self.hidden_size, self.num_experts, bias=False, params_dtype=self.params_dtype, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.gate") if self.is_quant: self.ws = DeepSpeedFPParameter( torch.Size((self.num_experts, 2 * self.intermediate_size, @@ -220,14 +221,12 @@ class ArcticAttention(nn.Module): def __init__( self, config: ArcticConfig, - layer_idx: Optional[int] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ): super().__init__() self.config = config - self.layer_idx = layer_idx self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -298,26 +297,25 @@ class ArcticDecoderLayer(nn.Module): def __init__( self, config: ArcticConfig, - layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__() - self.layer_idx = layer_idx self.hidden_size = config.hidden_size + layer_idx = extract_layer_index(prefix) is_moe_layer = (layer_idx + 1) % config.moe_layer_frequency == 0 self.use_residual = config.use_residual and is_moe_layer self.self_attn = ArcticAttention(config, - layer_idx, cache_config, quant_config=quant_config, prefix=f"{prefix}.self_attn") self.block_sparse_moe = ArcticMoE( config, - layer_id=layer_idx, quant_config=quant_config, - reduce_results=(not self.use_residual)) + reduce_results=(not self.use_residual), + prefix=f"{prefix}.block_sparse_moe", + ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -328,9 +326,9 @@ def __init__( self.residual_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.residual_mlp = ArcticMLP(config, - layer_id=layer_idx, is_residual_mlp=True, - reduce_results=False) + reduce_results=False, + prefix=f"{prefix}.residual_mlp") def forward( self, @@ -384,11 +382,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=self.vocab_size) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: ArcticDecoderLayer(config, - int(prefix.split(".")[-1]), - cache_config, - quant_config, - prefix=prefix), + lambda prefix: ArcticDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self._attn_implementation = config._attn_implementation self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 32488d931ea1c..74b6bfdf21909 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -49,7 +49,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -63,6 +63,7 @@ def __init__( hidden_act: str, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True, + prefix: str = "", ) -> None: super().__init__() self.gate_up_proj = MergedColumnParallelLinear( @@ -92,6 +93,7 @@ def __init__( self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -260,12 +262,12 @@ class DeepseekDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, - layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__() + layer_idx = extract_layer_index(prefix) self.hidden_size = config.hidden_size rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) @@ -285,13 +287,16 @@ def __init__( if (config.n_routed_experts is not None and layer_idx >= config.first_k_dense_replace and layer_idx % config.moe_layer_freq == 0): - self.mlp = DeepseekMoE(config=config, quant_config=quant_config) + self.mlp = DeepseekMoE(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") else: self.mlp = DeepseekMLP( hidden_size=config.hidden_size, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -347,11 +352,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: DeepseekDecoderLayer(config, - int(prefix.split(".")[-1]), - cache_config, - quant_config=quant_config, - prefix=prefix), + lambda prefix: DeepseekDecoderLayer( + config, cache_config, quant_config=quant_config, prefix=prefix + ), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 9309cced61bb3..fd8223dd9be1b 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -42,7 +42,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, extract_layer_index, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -85,7 +86,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Gemma2Attention(nn.Module): def __init__(self, - layer_idx: int, config: Gemma2Config, hidden_size: int, num_heads: int, @@ -98,7 +98,6 @@ def __init__(self, attn_logits_soft_cap: Optional[float] = None, prefix: str = "") -> None: super().__init__() - self.layer_idx = layer_idx self.config = config self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -145,6 +144,7 @@ def __init__(self, # reference: # https://github.com/huggingface/transformers/blob/54be2d7ae87e873482b984cc956e165ca4dc0ba3/src/transformers/models/gemma2/modeling_gemma2.py#L312 # noqa + layer_idx = extract_layer_index(prefix) use_sliding_window = (layer_idx % 2 == 0 and config.interleaved_sliding_window is not None) sliding_window = config.interleaved_sliding_window if \ @@ -178,7 +178,6 @@ class Gemma2DecoderLayer(nn.Module): def __init__( self, - layer_idx: int, config: Gemma2Config, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -187,7 +186,6 @@ def __init__( super().__init__() self.hidden_size = config.hidden_size self.self_attn = Gemma2Attention( - layer_idx=layer_idx, config=config, hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -262,11 +260,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: Gemma2DecoderLayer(int(prefix.split(".")[-1]), - config, - cache_config, - quant_config, - prefix=prefix), + lambda prefix: Gemma2DecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 5b5b3ef48b035..5d9091cfb9311 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -181,7 +181,6 @@ class OlmoeDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, - layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -264,11 +263,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: OlmoeDecoderLayer(config, - int(prefix.split(".")[-1]), - cache_config, - quant_config, - prefix=prefix), + lambda prefix: OlmoeDecoderLayer( + config, cache_config, quant_config, prefix=prefix), prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=1e-5) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 1091f88ab2534..ba70243c6533d 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -53,7 +53,7 @@ from vllm.utils import print_warning_once from .interfaces import SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -244,7 +244,6 @@ class Qwen2MoeDecoderLayer(nn.Module): def __init__( self, config: PretrainedConfig, - layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -269,6 +268,7 @@ def __init__( # Note: Qwen/Qwen2-57B-A14B-Instruct does not have # `mlp_only_layers` in the config. + layer_idx = extract_layer_index(prefix) mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers) if (layer_idx not in mlp_only_layers) and ( @@ -337,8 +337,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: Qwen2MoeDecoderLayer(config=config, - layer_idx=int( - prefix.split(".")[-1]), cache_config=cache_config, quant_config=quant_config, prefix=prefix), diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 2ab9b19e22068..dcfd2cb7d2622 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -629,3 +629,24 @@ def maybe_prefix(prefix: str, name: str) -> str: The string "prefix.name" if prefix was non-empty, otherwise just "name". """ return name if not prefix else f"{prefix}.{name}" + + +def extract_layer_index(layer_name: str) -> int: + """ + Extract the layer index from the module name. + Examples: + - "encoder.layers.0" -> 0 + - "encoder.layers.1.self_attn" -> 1 + - "2.self_attn" -> 2 + - "model.encoder.layers.0.sub.1" -> ValueError + """ + subnames = layer_name.split(".") + int_vals: List[int] = [] + for subname in subnames: + try: + int_vals.append(int(subname)) + except ValueError: + continue + assert len(int_vals) == 1, (f"layer name {layer_name} should" + " only contain one integer") + return int_vals[0] From e4fbb1441454847fdd871c9959b5cb05b5037aa2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 24 Nov 2024 11:21:40 -0800 Subject: [PATCH 135/255] [doc] update the code to add models (#10603) Signed-off-by: youkaichao Co-authored-by: Cyrus Leung --- docs/source/models/adding_model.rst | 85 +++++++++++++++++++---------- 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index a70ebf99c746f..df06d736ca86b 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -38,41 +38,70 @@ For instance, vLLM's `OPT model Union[Tuple, CausalLMOutputWithPast]: - + positions: torch.Tensor, - + kv_caches: List[torch.Tensor], - + attn_metadata: AttentionMetadata, - + ) -> Optional[SamplerOutput]: - -1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. -2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture. +To ensure compatibility with vLLM, your model must meet the following requirements: + +Initialization Code +^^^^^^^^^^^^^^^^^^^ + +All vLLM modules within the model must include a ``prefix`` argument in their constructor. This ``prefix`` is typically the full name of the module in the model's state dictionary and is crucial for: + +* Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +* Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the ``prefix`` during initialization, vLLM can match the current layer's ``prefix`` with the quantization configuration to determine if the layer should be initialized in quantized mode. + +The initialization code should look like this: + +.. code-block:: python + + from torch import nn + from vllm.config import VllmConfig + from vllm.attention import Attention + + class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + + class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + + class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + + class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") + +Computation Code +^^^^^^^^^^^^^^^^ + +Rewrite the :meth:`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat ``input_ids`` and ``positions`` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +.. code-block:: python + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + ... .. note:: Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +For reference, check out the `LLAMA model `__. vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the `vLLM models `__ directory for more examples. 3. (Optional) Implement tensor parallelism and quantization support ------------------------------------------------------------------- From 49628fe13e1021ce036bbae257242ab71e40aa25 Mon Sep 17 00:00:00 2001 From: Zhuohan Li Date: Sun, 24 Nov 2024 16:45:09 -0800 Subject: [PATCH 136/255] [Doc] Update README.md with Ray Summit talk links (#10610) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0ef073210d070..4e1353d98f1dc 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Easy, fast, and cheap LLM serving for everyone *Latest News* 🔥 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! -- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! +- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). From 214efc2c3cb568e8eb3f7d234f3bd8f5bbe24795 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sun, 24 Nov 2024 23:56:20 -0300 Subject: [PATCH 137/255] Support Cross encoder models (#10400) Signed-off-by: Max de Bayser Signed-off-by: Max de Bayser Signed-off-by: Flavia Beo Co-authored-by: Flavia Beo --- .../serving/openai_compatible_server.md | 142 ++++++++++++ examples/openai_cross_encoder_score.py | 58 +++++ tests/conftest.py | 20 ++ tests/entrypoints/openai/test_score.py | 93 ++++++++ .../models/embedding/language/test_scoring.py | 95 ++++++++ tests/models/registry.py | 9 + tests/models/test_registry.py | 23 +- vllm/config.py | 5 + vllm/core/scheduler.py | 1 + vllm/entrypoints/llm.py | 124 +++++++++- vllm/entrypoints/openai/api_server.py | 35 ++- vllm/entrypoints/openai/protocol.py | 36 +++ vllm/entrypoints/openai/serving_score.py | 215 ++++++++++++++++++ vllm/inputs/data.py | 18 ++ vllm/inputs/preprocess.py | 2 + vllm/model_executor/layers/pooler.py | 64 ++++++ vllm/model_executor/models/bert.py | 128 ++++++++++- vllm/model_executor/models/interfaces.py | 36 +++ vllm/model_executor/models/registry.py | 23 +- vllm/model_executor/models/roberta.py | 179 ++++++++++++--- vllm/multimodal/inputs.py | 5 +- vllm/outputs.py | 45 +++- vllm/sequence.py | 9 + vllm/transformers_utils/config.py | 15 ++ vllm/worker/cpu_embedding_model_runner.py | 4 + vllm/worker/cpu_model_runner.py | 13 ++ vllm/worker/embedding_model_runner.py | 7 +- vllm/worker/model_runner.py | 28 +++ 28 files changed, 1370 insertions(+), 62 deletions(-) create mode 100644 examples/openai_cross_encoder_score.py create mode 100644 tests/entrypoints/openai/test_score.py create mode 100644 tests/models/embedding/language/test_scoring.py create mode 100644 vllm/entrypoints/openai/serving_score.py diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 79d032bf8b211..c39cef85897ed 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -44,6 +44,148 @@ We currently support the following OpenAI APIs: - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* +## Score API for Cross Encoder Models + +vLLM supports *cross encoders models* at the **/v1/score** endpoint, which is not an OpenAI API standard endpoint. You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). + +A ***Cross Encoder*** takes exactly two sentences / texts as input and either predicts a score or label for this sentence pair. It can for example predict the similarity of the sentence pair on a scale of 0 … 1. + +### Example of usage for a pair of a string and a list of texts + +In this case, the model will compare the first given text to each of the texts containing the list. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "text_1": "What is the capital of France?", + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] +}' +``` + +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693570, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": [ + 0.001094818115234375 + ] + }, + { + "index": 1, + "object": "score", + "score": [ + 1 + ] + } + ], + "usage": {} +} +``` + +### Example of usage for a pair of two lists of texts + +In this case, the model will compare the one by one, making pairs by same index correspondent in each list. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": [ + "What is the capital of Brazil?", + "What is the capital of France?" + ], + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] +}' +``` + +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": [ + 1 + ] + }, + { + "index": 1, + "object": "score", + "score": [ + 1 + ] + } + ], + "usage": {} +} +``` + +### Example of usage for a pair of two strings + +In this case, the model will compare the strings of texts. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": "What is the capital of France?", + "text_2": "The capital of France is Paris." +}' +``` + +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": [ + 1 + ] + } + ], + "usage": {} +} +``` + ## Extra Parameters vLLM supports a set of parameters that are not part of the OpenAI API. diff --git a/examples/openai_cross_encoder_score.py b/examples/openai_cross_encoder_score.py new file mode 100644 index 0000000000000..8c32eea5dd252 --- /dev/null +++ b/examples/openai_cross_encoder_score.py @@ -0,0 +1,58 @@ +"""Examples Python client Score for Cross Encoder Models +""" + +import argparse +import json +import pprint + +import requests + + +def post_http_request(prompt: json, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + args = parser.parse_args() + api_url = f"http://{args.host}:{args.port}/v1/score" + + model_name = args.model + + text_1 = "What is the capital of France?" + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt for text_1 is string and text_2 is a list:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.data) + + text_1 = [ + "What is the capital of Brazil?", "What is the capital of France?" + ] + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt for text_1 and text_2 are lists:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.data) + + text_1 = "What is the capital of Brazil?" + text_2 = "The capital of Brazil is Brasilia." + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt for text_1 and text_2 are strings:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.data) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 0dc1cc6e83c18..29707f975e2a0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -265,6 +265,7 @@ def __init__( model_kwargs: Optional[Dict[str, Any]] = None, is_embedding_model: bool = False, is_sentence_transformer: bool = False, + is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, postprocess_inputs: Callable[..., BatchEncoding] = identity, @@ -282,6 +283,14 @@ def __init__( device="cpu", trust_remote_code=True, ).to(dtype=torch_dtype)) + elif is_cross_encoder: + # Lazy init required for AMD CI + from sentence_transformers import CrossEncoder + self.model = CrossEncoder(model_name, + device="cpu", + trust_remote_code=True) + self.model.model = self.wrap_device(self.model.model)\ + .to(dtype=torch_dtype) else: model_kwargs = model_kwargs if model_kwargs is not None else {} self.model = self.wrap_device( @@ -625,6 +634,9 @@ def generate_encoder_decoder_greedy_logprobs_limit( def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: return self.model.encode(prompts) + def predict(self, prompts: List[List[str]]) -> torch.Tensor: + return self.model.predict(prompts, convert_to_tensor=True) + def __enter__(self): return self @@ -898,6 +910,14 @@ def encode( req_outputs = self.model.encode(inputs) return [req_output.outputs.embedding for req_output in req_outputs] + def score( + self, + text_1: Union[str, List[str]], + text_2: Union[str, List[str]], + ) -> List[List[float]]: + req_outputs = self.model.score(text_1, text_2) + return [req_output.outputs.embedding for req_output in req_outputs] + def __enter__(self): return self diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py new file mode 100644 index 0000000000000..7565ff7192f67 --- /dev/null +++ b/tests/entrypoints/openai/test_score.py @@ -0,0 +1,93 @@ +import pytest +import requests + +from vllm.entrypoints.openai.protocol import ScoreResponse + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "BAAI/bge-reranker-v2-m3" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--enforce-eager", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, + model_name: str): + text_1 = "What is the capital of France?" + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("v1/score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + assert score.data[0].score[0] <= 0.01 + assert score.data[1].score[0] >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, + model_name: str): + text_1 = [ + "What is the capital of the United States?", + "What is the capital of France?" + ] + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("v1/score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + assert score.data[0].score[0] <= 0.01 + assert score.data[1].score[0] >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, + model_name: str): + text_1 = "What is the capital of France?" + text_2 = "The capital of France is Paris." + + score_response = requests.post(server.url_for("v1/score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 1 + assert score.data[0].score[0] >= 0.9 diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py new file mode 100644 index 0000000000000..30fa5ea7b36c0 --- /dev/null +++ b/tests/models/embedding/language/test_scoring.py @@ -0,0 +1,95 @@ +"""Compare the embedding outputs of HF and vLLM models. + +Run `pytest tests/models/embedding/language/test_embedding.py`. +""" +import math + +import pytest + +MODELS = [ + "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert + "BAAI/bge-reranker-v2-m3", # Roberta +] + +TEXTS_1 = [ + "What is the capital of France?", + "What is the capital of Germany?", +] + +TEXTS_2 = [ + "The capital of France is Paris.", + "The capital of Germany is Berlin.", +] + + +@pytest.fixture(scope="module", params=MODELS) +def model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + hf_outputs = hf_model.predict([text_pair]).tolist() + + with vllm_runner(model_name, + task="embedding", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + hf_outputs = hf_model.predict(text_pairs).tolist() + + with vllm_runner(model_name, + task="embedding", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + with hf_runner(model_name, dtype=dtype, is_cross_encoder=True) as hf_model: + hf_outputs = hf_model.predict(text_pairs).tolist() + + with vllm_runner(model_name, + task="embedding", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0][0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1][0], rel_tol=0.01) diff --git a/tests/models/registry.py b/tests/models/registry.py index 3848367b6126c..fa0818c4f0bd1 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -135,6 +135,7 @@ class _HfExamplesInfo: "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 + "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"), # [Multimodal] "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), @@ -143,6 +144,13 @@ class _HfExamplesInfo: "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 } +_CROSS_ENCODER_EXAMPLE_MODELS = { + # [Text-only] + "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501 + "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"), # noqa: E501 + "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"), # noqa: E501 +} + _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 @@ -195,6 +203,7 @@ class _HfExamplesInfo: _EXAMPLE_MODELS = { **_TEXT_GENERATION_EXAMPLE_MODELS, **_EMBEDDING_EXAMPLE_MODELS, + **_CROSS_ENCODER_EXAMPLE_MODELS, **_MULTIMODAL_EXAMPLE_MODELS, **_SPECULATIVE_DECODING_EXAMPLE_MODELS, } diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index e462dae3dc688..289ea66b5ebc5 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -6,7 +6,10 @@ from vllm.model_executor.models import (is_embedding_model, is_text_generation_model, supports_multimodal) -from vllm.model_executor.models.registry import (_EMBEDDING_MODELS, +# yapf conflicts with isort for this block +# yapf: disable +from vllm.model_executor.models.registry import (_CROSS_ENCODER_MODELS, + _EMBEDDING_MODELS, _MULTIMODAL_MODELS, _SPECULATIVE_DECODING_MODELS, _TEXT_GENERATION_MODELS, @@ -29,22 +32,28 @@ def test_registry_imports(model_arch): model_arch in _TEXT_GENERATION_MODELS or model_arch in _MULTIMODAL_MODELS) + embedding_models = {**_EMBEDDING_MODELS, **_CROSS_ENCODER_MODELS} assert is_embedding_model(model_cls) is (model_arch - in _EMBEDDING_MODELS) + in embedding_models) assert supports_multimodal(model_cls) is (model_arch in _MULTIMODAL_MODELS) @fork_new_process_for_each_test -@pytest.mark.parametrize("model_arch,is_mm,init_cuda", [ - ("LlamaForCausalLM", False, False), - ("MllamaForConditionalGeneration", True, False), - ("LlavaForConditionalGeneration", True, True), +@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [ + ("LlamaForCausalLM", False, False, False), + ("MllamaForConditionalGeneration", True, False, False), + ("LlavaForConditionalGeneration", True, True, False), + ("BertForSequenceClassification", False, False, True), + ("RobertaForSequenceClassification", False, False, True), + ("XLMRobertaForSequenceClassification", False, False, True), ]) -def test_registry_is_multimodal(model_arch, is_mm, init_cuda): +def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce): assert ModelRegistry.is_multimodal_model(model_arch) is is_mm + assert ModelRegistry.is_cross_encoder_model(model_arch) is is_ce + if init_cuda and current_platform.is_cuda_alike(): assert not torch.cuda.is_initialized() diff --git a/vllm/config.py b/vllm/config.py index f163665e2c063..4ea56a14cabba 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -712,6 +712,11 @@ def uses_mrope(self) -> bool: def is_multimodal_model(self) -> bool: return self.multimodal_config is not None + @property + def is_cross_encoder(self) -> bool: + architectures = getattr(self.hf_config, "architectures", []) + return ModelRegistry.is_cross_encoder_model(architectures) + class CacheConfig: """Configuration for the KV cache. diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 841e65c488fc6..530cbdc3a9190 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1357,6 +1357,7 @@ def schedule( encoder_seq_data=encoder_seq_data, cross_block_table=cross_block_table, state=seq_group.state, + token_type_ids=seq_group.token_type_ids, # `multi_modal_data` will only be present for the 1st comm # between engine and worker. # the subsequent comms can still use delta, but diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index c211ec5aee080..e07f4c04abd84 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -20,7 +20,7 @@ apply_mistral_chat_template, parse_chat_messages, resolve_chat_template_content_format) -from vllm.inputs import PromptType, TextPrompt, TokensPrompt +from vllm.inputs import PromptType, SingletonPrompt, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -817,6 +817,128 @@ def encode( return self.engine_class.validate_outputs(outputs, EmbeddingRequestOutput) + def score( + self, + text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], + text_2: Union[SingletonPrompt, Sequence[SingletonPrompt]], + /, + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[EmbeddingRequestOutput]: + """Generates similarity scores for all pairs . + + The inputs can be 1 -> 1, 1 -> N or N -> N. In the 1 - N case + the text_1 sentence will be replicated N times to pair with the text_2 + sentences. The input pairs are used to build a list of prompts for the + cross encoder model. This class automatically batches the prompts, + considering the memory constraint. For the best performance, put all + of your texts into a single list and pass it to this method. + + Args: + text_1: can be a single prompt or a list of prompts, in which + case it has to have the same length as the text_2 list + text_2: The texts to pair with the query to form the input + to the LLM. See :class:`~vllm.inputs.PromptType` for + more details about the format of each prompts. + use_tqdm: Whether to use tqdm to display the progress bar. + lora_request: LoRA request to use for generation, if any. + prompt_adapter_request: Prompt Adapter request to use for + generation, if any. + + Returns: + A list of ``EmbeddingRequestOutput`` objects containing the + generated scores in the same order as the input prompts. + """ + task = self.llm_engine.model_config.task + if task != "embedding": + messages = ["LLM.score() is only supported for embedding models."] + + supported_tasks = self.llm_engine.model_config.supported_tasks + if "embedding" in supported_tasks: + messages.append( + "Your model supports the 'embedding' task, but is " + f"currently initialized for the '{task}' task. Please " + "initialize the model using `--task embedding`.") + + raise ValueError(" ".join(messages)) + + if not self.llm_engine.model_config.is_cross_encoder: + raise ValueError("Your model does not support the cross encoding") + + tokenizer = self.llm_engine.get_tokenizer() + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "MistralTokenizer not supported for cross-encoding") + + # the tokenizer for models such as + # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing + # lists of tokens to the `text` and `text_pair` kwargs + def ensure_str(prompt: SingletonPrompt): + if isinstance(prompt, dict): + if "multi_modal_data" in prompt: + raise ValueError("Multi-modal prompt is not " + "supported for cross encoding") + elif "prompt_token_ids" in prompt: + prompt = tokenizer.decode( + cast(TokensPrompt, prompt)["prompt_token_ids"]) + elif "prompt" in prompt: + prompt = cast(TextPrompt, prompt)["prompt"] + assert type(prompt) is str + return prompt + + if isinstance(text_1, (str, dict)): + # Convert a single prompt to a list. + text_1 = [text_1] + text_1 = [ensure_str(t) for t in text_1] + + if isinstance(text_2, (str, dict)): + # Convert a single prompt to a list. + text_2 = [text_2] + text_2 = [ensure_str(t) for t in text_2] + + if len(text_1) > 1 and len(text_1) != len(text_2): + raise ValueError("Input lengths must be either 1:1, 1:N or N:N") + if len(text_1) == 0: + raise ValueError("At least one text element must be given") + if len(text_2) == 0: + raise ValueError("At least one text_pair element must be given") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + pooling_params = PoolingParams() + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + parsed_prompts = [] + + for q, t in input_pairs: + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=parsed_prompts, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + return self.engine_class.validate_outputs(outputs, + EmbeddingRequestOutput) + def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b0fe061f5db4a..2b1f14b89b1f2 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -45,6 +45,7 @@ EmbeddingRequest, EmbeddingResponse, ErrorResponse, LoadLoraAdapterRequest, + ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, UnloadLoraAdapterRequest) @@ -53,6 +54,7 @@ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -280,6 +282,10 @@ def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: return request.app.state.openai_serving_embedding +def score(request: Request) -> Optional[OpenAIServingScores]: + return request.app.state.openai_serving_scores + + def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization @@ -391,6 +397,23 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) +@router.post("/v1/score") +async def create_score(request: ScoreRequest, raw_request: Request): + handler = score(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Score API") + + generator = await handler.create_score(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, ScoreResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -466,8 +489,9 @@ def build_app(args: Namespace) -> FastAPI: @app.exception_handler(RequestValidationError) async def validation_exception_handler(_, exc): - chat = app.state.openai_serving_chat - err = chat.create_error_response(message=str(exc)) + err = ErrorResponse(message=str(exc), + type="BadRequestError", + code=HTTPStatus.BAD_REQUEST) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) @@ -565,6 +589,13 @@ def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) if model_config.task == "embedding" else None + state.openai_serving_scores = OpenAIServingScores( + engine_client, + model_config, + base_model_paths, + request_logger=request_logger + ) if (model_config.task == "embedding" \ + and model_config.is_cross_encoder) else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f343732174014..ee94a9413f098 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -806,6 +806,27 @@ def to_pooling_params(self): EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] +class ScoreRequest(OpenAIBaseModel): + model: str + text_1: Union[List[str], str] + text_2: Union[List[str], str] + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-chat-embedding-pooling-params + additional_data: Optional[Any] = None + # doc: end-chat-embedding-pooling-params + + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -876,6 +897,21 @@ class EmbeddingResponse(OpenAIBaseModel): usage: UsageInfo +class ScoreResponseData(OpenAIBaseModel): + index: int + object: str = "score" + score: Union[List[float], str] + + +class ScoreResponse(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"embd-{random_uuid()}") + object: str = "list" + created: int = Field(default_factory=lambda: int(time.time())) + model: str + data: List[ScoreResponseData] + usage: UsageInfo + + class FunctionCall(OpenAIBaseModel): name: str arguments: str diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py new file mode 100644 index 0000000000000..156fea6f47982 --- /dev/null +++ b/vllm/entrypoints/openai/serving_score.py @@ -0,0 +1,215 @@ +import asyncio +import time +from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest, + ScoreResponse, ScoreResponseData, + UsageInfo) +from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import EmbeddingRequestOutput +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.utils import merge_async_iterators, random_uuid + +logger = init_logger(__name__) + + +def request_output_to_score_response( + final_res_batch: List[EmbeddingRequestOutput], request_id: str, + created_time: int, model_name: str) -> ScoreResponse: + data: List[ScoreResponseData] = [] + score = None + num_prompt_tokens = 0 + for idx, final_res in enumerate(final_res_batch): + if final_res is not None: + score = final_res.outputs.embedding + score_data = ScoreResponseData(index=idx, score=score) + data.append(score_data) + + usage = UsageInfo( + prompt_tokens=num_prompt_tokens, + total_tokens=num_prompt_tokens, + ) + + return ScoreResponse( + id=request_id, + created=created_time, + model=model_name, + data=data, + usage=usage, + ) + + +def make_pairs(text_1: Union[List[str], str], text_2: Union[List[str], + str]) -> List: + if isinstance(text_1, (str, dict)): + # Convert a single prompt to a list. + text_1 = [text_1] + text_1 = [t for t in text_1] + + if isinstance(text_2, (str, dict)): + # Convert a single prompt to a list. + text_2 = [text_2] + text_2 = [t for t in text_2] + if len(text_1) > 1 and len(text_1) != len(text_2): + raise ValueError("Input lengths must be either 1:1, 1:N or N:N") + if len(text_1) == 0: + raise ValueError("At least one text element must be given") + if len(text_2) == 0: + raise ValueError("At least one text_pair element must be given") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + return [(t1, t2) for t1, t2 in zip(text_1, text_2)] + + +class OpenAIServingScores(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + *, + request_logger: Optional[RequestLogger], + ) -> None: + super().__init__(engine_client=engine_client, + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=None, + prompt_adapters=None, + request_logger=request_logger) + + async def create_score( + self, + request: ScoreRequest, + raw_request: Optional[Request] = None, + ) -> Union[ScoreResponse, ErrorResponse]: + """ + Score API similar to Sentence Transformers cross encoder + + See https://sbert.net/docs/package_reference/cross_encoder + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + model_name = request.model + request_id = f"score-{random_uuid()}" + created_time = int(time.monotonic()) + truncate_prompt_tokens = request.truncate_prompt_tokens + + request_prompts = [] + engine_prompts = [] + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for embedding models") + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "MistralTokenizer not supported for cross-encoding") + + if not self.model_config.is_cross_encoder: + raise ValueError("Model is not cross encoder.") + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + + input_pairs = make_pairs(request.text_1, request.text_2) + + for q, t in input_pairs: + request_prompt = f"{q}{tokenizer.sep_token}{t}" + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + + try: + pooling_params = request.to_pooling_params() + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs(request_id_item, + request_prompts[i], + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + result_generator = merge_async_iterators( + *generators, + is_cancelled=raw_request.is_disconnected if raw_request else None, + ) + + num_prompts = len(engine_prompts) + + # Non-streaming response + final_res_batch: List[Optional[EmbeddingRequestOutput]] + final_res_batch = [None] * num_prompts + + try: + async for i, res in result_generator: + final_res_batch[i] = res + + assert all(final_res is not None for final_res in final_res_batch) + + final_res_batch_checked = cast(List[EmbeddingRequestOutput], + final_res_batch) + + response = request_output_to_score_response( + final_res_batch_checked, request_id, created_time, model_name) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 07ff9faa50f13..fb7dbbebd7b90 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -38,6 +38,9 @@ class TokensPrompt(TypedDict): prompt_token_ids: List[int] """A list of token IDs to pass to the model.""" + token_type_ids: NotRequired[List[int]] + """A list of token type IDs to pass to the cross encoder model.""" + multi_modal_data: NotRequired["MultiModalDataDict"] """ DEPRECATED: Optional multi-modal data to pass to the model, @@ -133,6 +136,9 @@ class TokenInputs(TypedDict): prompt_token_ids: List[int] """The token IDs of the prompt.""" + token_type_ids: NotRequired[List[int]] + """The token type IDs of the prompt.""" + prompt: NotRequired[str] """ The original prompt text corresponding to the token IDs, if available. @@ -160,6 +166,7 @@ class TokenInputs(TypedDict): def token_inputs( prompt_token_ids: List[int], + token_type_ids: Optional[List[int]] = None, prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, @@ -170,6 +177,8 @@ def token_inputs( if prompt is not None: inputs["prompt"] = prompt + if token_type_ids is not None: + inputs["token_type_ids"] = token_type_ids if multi_modal_data is not None: inputs["multi_modal_data"] = multi_modal_data if multi_modal_placeholders is not None: @@ -234,6 +243,15 @@ def prompt_token_ids(self) -> List[int]: assert_never(inputs) + @cached_property + def token_type_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("token_type_ids", []) + + assert_never(inputs) + @cached_property def prompt_embeds(self) -> Optional[torch.Tensor]: inputs = self.inputs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 853257c5ad71f..3d606817e90aa 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -305,6 +305,7 @@ def _prompt_to_llm_inputs( tokens_content = parsed["content"] prompt_token_ids = tokens_content["prompt_token_ids"] + token_type_ids = tokens_content.get("token_type_ids") multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") @@ -318,6 +319,7 @@ def _prompt_to_llm_inputs( return token_inputs( prompt_token_ids=prompt_token_ids, + token_type_ids=token_type_ids, multi_modal_data=multi_modal_data, mm_processor_kwargs=mm_processor_kwargs, ) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index df1978241340b..f9437b4112ceb 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -3,11 +3,14 @@ import torch import torch.nn as nn +from transformers import PretrainedConfig from vllm.config import PoolerConfig from vllm.model_executor.pooling_metadata import (PoolingMetadata, PoolingTensors) from vllm.sequence import EmbeddingSequenceGroupOutput, PoolerOutput +from vllm.transformers_utils.config import ( + get_cross_encoder_activation_function) class PoolingType(IntEnum): @@ -152,3 +155,64 @@ def forward( ] return PoolerOutput(outputs=pooled_outputs) + + +class CrossEncodingPooler(nn.Module): + """A layer that pools specific information from hidden states. + + This layer does the following: + 1. Extracts specific tokens or aggregates data based on pooling method. + 2. Normalizes output if specified. + 3. Returns structured results as `PoolerOutput`. + + Attributes: + pooling_type: The type of pooling to use. + normalize: Whether to normalize the pooled data. + """ + + def __init__( + self, + config: PretrainedConfig, + classifier: nn.Module, + pooler: Optional[nn.Module] = None, + ): + super().__init__() + self.classifier = classifier + self.pooler = pooler + self.default_activation_function = \ + get_cross_encoder_activation_function(config) + + def forward( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + """Pools sentence pair scores from the hidden_states.""" + + prompt_lens = PoolingTensors.from_pooling_metadata( + pooling_metadata, hidden_states.device).prompt_lens + + offset = 0 + pooled_data_lst = [] + for prompt_len in prompt_lens: + pooled_data_i = hidden_states[offset:offset + prompt_len] + + if self.pooler is not None: + final_shape_tensor = self.pooler(pooled_data_i) + else: + final_shape_tensor = self.classifier(pooled_data_i) + + pooled_data_lst.append(final_shape_tensor) + offset += prompt_len + + pooled_output = torch.stack(pooled_data_lst) + + if self.pooler is not None: + # apply classifier once on the full batch if possible + pooled_output = self.classifier(pooled_output) + logits = self.default_activation_function(pooled_output) + + pooled_outputs = [ + EmbeddingSequenceGroupOutput(data.tolist()) for data in logits + ] + return PoolerOutput(outputs=pooled_outputs) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index d8301a36acb01..1fc87bc650d92 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -11,14 +11,18 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, + PoolingType) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.transformers_utils.config import ( + get_cross_encoder_activation_function) from .utils import maybe_prefix @@ -48,7 +52,9 @@ def __init__(self, config: BertConfig): def forward( self, input_ids: torch.Tensor, - position_ids: Optional[torch.Tensor] = None, + seq_lens: torch.Tensor, + position_ids: torch.Tensor, + token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: input_shape = input_ids.size() @@ -58,17 +64,34 @@ def forward( # Position embeddings. position_embeddings = self.position_embeddings(position_ids) - # Token type embeddings. (TODO: move off hotpath?) - token_type_embeddings = self.token_type_embeddings( - torch.zeros(input_shape, - dtype=torch.long, - device=inputs_embeds.device)) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, + dtype=torch.long, + device=inputs_embeds.device) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings + position_embeddings embeddings = self.LayerNorm(embeddings) return embeddings +class BertPooler(nn.Module): + + def __init__(self, config: BertConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[0, :] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + class BertEncoder(nn.Module): def __init__(self, @@ -309,7 +332,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - embedding_class: type = BertEmbedding): + embedding_class: type = BertEmbedding, + add_pooling_layer: bool = False): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -319,6 +343,7 @@ def __init__(self, cache_config, quant_config, prefix=f"{prefix}.encoder") + self.pooler = BertPooler(config) if add_pooling_layer else None def forward( self, @@ -328,13 +353,17 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embeddings(input_ids=input_ids, - position_ids=position_ids) - + assert hasattr(attn_metadata, "seq_lens_tensor") + hidden_states = self.embeddings( + input_ids=input_ids, + seq_lens=attn_metadata.seq_lens_tensor, + position_ids=position_ids, + token_type_ids=token_type_ids) return self.encoder(hidden_states, kv_caches, attn_metadata) def load_weights(self, weights: Iterable[Tuple[str, @@ -349,7 +378,7 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters()) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if "pooler" in name: + if self.pooler is None and "pooler" in name: continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -430,3 +459,78 @@ def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: pooling_type=PoolingType.CLS, normalize=True, softmax=False) + + +class BertForSequenceClassification(nn.Module, SupportsCrossEncoding): + """A model that uses Bert to provide embedding functionalities. + + This class encapsulates the BertModel and provides an interface for + embedding operations and customized pooling functions. + + Attributes: + model: An instance of BertModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + self.default_activation_function = \ + get_cross_encoder_activation_function(config) + + self.num_labels = config.num_labels + self.bert = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "bert"), + embedding_class=BertEmbedding, + add_pooling_layer=True) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self._pooler = CrossEncodingPooler(config, self.classifier, + self.bert.pooler) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + + self_weights = [] + + def weight_filter(): + for name, weight in weights: + if name.startswith("bert."): + yield (name[len("bert."):], weight) + else: + self_weights.append((name, weight)) + + self.bert.load_weights(weight_filter()) + + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in self_weights: + if name.startswith("classifier"): + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + return self.bert(input_ids=input_ids, + position_ids=positions, + kv_caches=kv_caches, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors, + attn_metadata=attn_metadata, + token_type_ids=token_type_ids) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index dcead65115132..4f0c75b2c6a57 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -7,6 +7,8 @@ from vllm.logger import init_logger from vllm.utils import supports_kw +from .interfaces_base import is_embedding_model + if TYPE_CHECKING: from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.sequence import IntermediateTensors @@ -350,3 +352,37 @@ def is_attention_free( return isinstance(model, _IsAttentionFreeType) return isinstance(model, IsAttentionFree) + + +@runtime_checkable +class SupportsCrossEncoding(Protocol): + """The interface required for all models that support cross encoding.""" + + supports_cross_encoding: ClassVar[Literal[True]] = True + + +@overload +def supports_cross_encoding( + model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]: + ... + + +@overload +def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: + ... + + +def _supports_cross_encoding( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + + if isinstance(model, type): + return isinstance(model, SupportsCrossEncoding) + + return isinstance(model, SupportsCrossEncoding) + + +def supports_cross_encoding( + model: Union[Type[object], object], +) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + return is_embedding_model(model) and _supports_cross_encoding(model) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 22c2e328bfb65..789ffb4d3bde0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -21,7 +21,8 @@ from vllm.platforms import current_platform from .interfaces import (has_inner_state, is_attention_free, - supports_multimodal, supports_pp) + supports_cross_encoding, supports_multimodal, + supports_pp) from .interfaces_base import is_embedding_model, is_text_generation_model logger = init_logger(__name__) @@ -100,6 +101,7 @@ # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), "RobertaModel": ("roberta", "RobertaEmbeddingModel"), + "RobertaForMaskedLM": ("roberta", "RobertaEmbeddingModel"), "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), @@ -121,6 +123,14 @@ "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, } +_CROSS_ENCODER_MODELS = { + "BertForSequenceClassification": ("bert", "BertForSequenceClassification"), + "RobertaForSequenceClassification": ("roberta", + "RobertaForSequenceClassification"), + "XLMRobertaForSequenceClassification": ("roberta", + "RobertaForSequenceClassification"), +} + _MULTIMODAL_MODELS = { # [Decoder-only] "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), @@ -159,6 +169,7 @@ _VLLM_MODELS = { **_TEXT_GENERATION_MODELS, **_EMBEDDING_MODELS, + **_CROSS_ENCODER_MODELS, **_MULTIMODAL_MODELS, **_SPECULATIVE_DECODING_MODELS, } @@ -193,6 +204,7 @@ class _ModelInfo: is_text_generation_model: bool is_embedding_model: bool + supports_cross_encoding: bool supports_multimodal: bool supports_pp: bool has_inner_state: bool @@ -203,6 +215,7 @@ def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": return _ModelInfo( is_text_generation_model=is_text_generation_model(model), is_embedding_model=is_embedding_model(model), + supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), has_inner_state=has_inner_state(model), @@ -415,6 +428,12 @@ def is_embedding_model( ) -> bool: return self.inspect_model_cls(architectures).is_embedding_model + def is_cross_encoder_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + return self.inspect_model_cls(architectures).supports_cross_encoding + def is_multimodal_model( self, architectures: Union[str, List[str]], @@ -489,4 +508,4 @@ def _run() -> None: if __name__ == "__main__": - _run() \ No newline at end of file + _run() diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index c1dcdd36ec3de..5a296e311f079 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import Iterable, List, Optional, Tuple import torch from torch import nn @@ -6,10 +6,17 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig +from vllm.model_executor.layers.pooler import CrossEncodingPooler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel -from vllm.sequence import IntermediateTensors +from vllm.model_executor.models.interfaces import SupportsCrossEncoding +from vllm.model_executor.models.utils import maybe_prefix +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.transformers_utils.config import ( + get_cross_encoder_activation_function) class RobertaEmbedding(nn.Module): @@ -39,34 +46,93 @@ def __init__(self, config: RobertaConfig): def forward( self, input_ids: torch.Tensor, - position_ids: Optional[torch.Tensor] = None, + seq_lens: torch.Tensor, + position_ids: torch.Tensor, + token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: input_shape = input_ids.size() - - # Input embeddings. inputs_embeds = self.word_embeddings(input_ids) - # TODO: figure out if there is a better way - # to make to make position ids start at padding_idx + 1 + # Replace position ids because in RoBERTa models + # they have to start at padding_idx + 1 and ignore + # existing padding tokens # References: # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - position_ids += self.padding_idx + 1 + pos_list = [] + token_list = [] + offset = 0 + for seq_len in seq_lens: + pos_list.append(position_ids[offset:offset + seq_len]) + token_list.append(input_ids[offset:offset + seq_len]) + offset += seq_len + + new_pos_list = [] + for positions, tokens in zip(pos_list, token_list): + # Verify assumption that incoming position are + # always a sequence from 0 to N. + expected_pos = torch.arange(positions.size()[0], + dtype=torch.long, + device=inputs_embeds.device) + assert torch.equal(positions, expected_pos) + new_pos_list.append( + create_position_ids_from_input_ids(tokens, self.padding_idx)) + position_ids = torch.cat(new_pos_list) # Position embeddings. position_embeddings = self.position_embeddings(position_ids) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, + dtype=torch.long, + device=inputs_embeds.device) - # Token type embeddings. (TODO: move off hotpath?) - token_type_embeddings = self.token_type_embeddings( - torch.zeros(input_shape, - dtype=torch.long, - device=inputs_embeds.device)) - + token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings + position_embeddings embeddings = self.LayerNorm(embeddings) return embeddings +# Adapted from transformers +def create_position_ids_from_input_ids(input_ids, + padding_idx, + past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. + Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully + # balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + + incremental_indices = (torch.cumsum(mask, dim=0).type_as(mask) + + past_key_values_length) * mask + + return incremental_indices.long() + padding_idx + + +# Adapted from transformers +class RobertaClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config: RobertaConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[0, :] # take token (equiv. to [CLS]) + x = self.dense(x) + x = torch.tanh(x) + x = self.out_proj(x) + return x + + class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. @@ -85,6 +151,62 @@ def _build_model(self, prefix=prefix, embedding_class=RobertaEmbedding) + +class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): + """A model that uses Roberta to provide embedding functionalities. + + This class encapsulates the BertModel and provides an interface for + embedding operations and customized pooling functions. + + Attributes: + roberta: An instance of BertModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + self.default_activation_function = \ + get_cross_encoder_activation_function(config) + + self.num_labels = config.num_labels + self.roberta = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "bert"), + embedding_class=RobertaEmbedding, + add_pooling_layer=False) + self.classifier = RobertaClassificationHead(config) + self._pooler = CrossEncodingPooler(config, self.classifier) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + + self_weights = [] + + def weight_filter(): + for name, weight in weights: + if name.startswith("roberta."): + yield (name[len("roberta."):], weight) + else: + self_weights.append((name, weight)) + + self.roberta.load_weights(weight_filter()) + + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in self_weights: + if name.startswith("classifier"): + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def forward( self, input_ids: Optional[torch.Tensor], @@ -93,25 +215,12 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: - - # Verify assumption that position are always a sequence from - # 0 to N. (Actually here we just check 0 and N to simplify). - # This is important to fix the position which are assumed to - # start from padding_idx + 1 instead of 0 in the Roberta models. - assert hasattr(attn_metadata, "seq_lens_tensor") - cumulative = attn_metadata.seq_lens_tensor.cumsum(dim=0) - start_pos = torch.cat( - (torch.tensor([0], device=attn_metadata.seq_lens_tensor.device), - cumulative[:-1])) - assert len(torch.nonzero(positions[start_pos])) == 0 - end_pos = cumulative - 1 - last_tokens = attn_metadata.seq_lens_tensor - 1 - assert len(torch.nonzero(positions[end_pos] - last_tokens)) == 0 - - return super().forward(input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) + return self.roberta(input_ids=input_ids, + position_ids=positions, + kv_caches=kv_caches, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors, + attn_metadata=attn_metadata, + token_type_ids=token_type_ids) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 8e67a552afe12..640c7c04b8817 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -6,7 +6,7 @@ import torch import torch.types from PIL.Image import Image -from typing_extensions import TypeAlias +from typing_extensions import NotRequired, TypeAlias from vllm.utils import JSONTree, is_list_of, json_map_leaves @@ -208,6 +208,9 @@ class MultiModalInputsV2(TypedDict): prompt_token_ids: List[int] """The processed token IDs which includes placeholder tokens.""" + token_type_ids: NotRequired[List[int]] + """The token type IDs of the prompt.""" + mm_kwargs: MultiModalKwargs """Keyword arguments to be directly passed to the model after batching.""" diff --git a/vllm/outputs.py b/vllm/outputs.py index 4ae9b377ae693..2d256803edfe8 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -60,7 +60,6 @@ class EmbeddingOutput: embedding: The embedding vector, which is a list of floats. The length of vector depends on the model as listed in the embedding guide. """ - embedding: List[float] def __repr__(self) -> str: @@ -363,6 +362,50 @@ def __repr__(self): f"finished={self.finished})") +@dataclass +class ScoreOutput: + """The output data of one completion output of a request. + + Args: + score: The score, which is a list of floats. + index: The correspondent text index of the score. + """ + index: int + score: List[float] + + def __repr__(self) -> str: + return (f"ScoreOutput(" + f"score={self.score}), " + f"index={self.index})") + + +class ScoreRequestOutput: + """ + The output data of an score request to the LLM. + + Args: + request_id (str): A unique identifier for the score request. + outputs (score): The embedding results for the given input. + """ + + def __init__(self, request_id: str, outputs: "ScoreOutput"): + self.request_id = request_id + self.outputs = outputs + + def __repr__(self): + """ + Returns a string representation of an ScoreRequestOutput instance. + + The representation includes the request_id and the number of outputs, + providing a quick overview of the embedding request's results. + + Returns: + str: A string representation of the ScoreRequestOutput instance. + """ + return (f"ScoreRequestOutput(request_id='{self.request_id}', " + f"outputs={repr(self.outputs)}") + + class RequestOutputFactory: @staticmethod diff --git a/vllm/sequence.py b/vllm/sequence.py index a1cc8fc3b09de..669124319c4f4 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -449,6 +449,10 @@ def prompt_token_ids(self) -> List[int]: def prompt_embeds(self) -> Optional[torch.Tensor]: return self.inputs.prompt_embeds + @property + def token_type_ids(self) -> List[int]: + return self.inputs.token_type_ids + @property def multi_modal_data(self) -> "MultiModalDataDict": return self.inputs.multi_modal_data @@ -687,6 +691,10 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]: return (self.encoder_seq.prompt_token_ids if self.encoder_seq is not None else None) + @property + def token_type_ids(self) -> Optional[List[int]]: + return self.first_seq.token_type_ids + @property def multi_modal_data(self) -> MultiModalDataDict: return self.first_seq.multi_modal_data @@ -909,6 +917,7 @@ class SequenceGroupMetadata( default_factory=lambda: SequenceGroupState()) # "MultiModalDataDict" types. We have to use Any due to msgspec # doesn't allow to have union of 2 different dicts. + token_type_ids: Optional[List[int]] = None multi_modal_data: Optional[Any] = None multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 59096753c395d..70d18d40b7aa7 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -9,6 +9,7 @@ from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError) +from torch import nn from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import ( get_image_processor_config) @@ -31,6 +32,7 @@ UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file +from vllm.utils import resolve_obj_by_qualname if VLLM_USE_MODELSCOPE: from modelscope import AutoConfig @@ -577,3 +579,16 @@ def try_get_generation_config( return GenerationConfig.from_model_config(config) except OSError: # Not found return None + + +def get_cross_encoder_activation_function(config: PretrainedConfig): + if (hasattr(config, "sbert_ce_default_activation_function") + and config.sbert_ce_default_activation_function is not None): + + function_name = config.sbert_ce_default_activation_function + assert function_name.startswith("torch.nn.modules."), \ + "Loading of activation functions is restricted to " \ + "torch.nn.modules for security reasons" + return resolve_obj_by_qualname(function_name)() + else: + return nn.Sigmoid() if config.num_labels == 1 else nn.Identity() diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index 978de73df6b70..3954e4c4c8a5b 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -50,6 +50,9 @@ def execute_model( ] model_executable = self.model + cross_enc_kwargs = {} + if model_input.token_type_ids is not None: + cross_enc_kwargs["token_type_ids"] = model_input.token_type_ids execute_model_kwargs = { "input_ids": model_input.input_tokens, @@ -61,6 +64,7 @@ def execute_model( model_input.attn_metadata, **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), + **cross_enc_kwargs, "intermediate_tensors": intermediate_tensors, } diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 7cab476d7fca4..b08171d79f002 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -43,6 +43,7 @@ class ModelInputForCPU(ModelRunnerInputBase): """ input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None + token_type_ids: Optional[torch.Tensor] = None attn_metadata: Optional["AttentionMetadata"] = None multi_modal_kwargs: Optional[BatchedTensorInputs] = None virtual_engine: Optional[int] = None @@ -54,6 +55,7 @@ def as_broadcastable_tensor_dict( tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "token_type_ids": self.token_type_ids, "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) @@ -83,6 +85,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "token_type_ids": self.token_type_ids, "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) @@ -112,6 +115,7 @@ def __init__(self, use_mrope: bool): self.input_tokens: List[int] = [] self.input_positions: Optional[ List[int]] = [] if not self.use_mrope else None + self.token_type_ids: Optional[List[int]] = [] self.seq_lens: List[int] = [] self.query_lens: List[int] = [] self.prefill_block_tables: List[List[int]] = [] @@ -165,6 +169,10 @@ def build(self) -> ModelInputForCPU: if not input_data.use_mrope else input_data.input_mrope_positions, dtype=torch.long, device="cpu") + token_type_ids = torch.tensor(input_data.token_type_ids, + dtype=torch.long, + device="cpu") \ + if input_data.token_type_ids else None # For multi-modal models multi_modal_kwargs = None @@ -178,6 +186,7 @@ def build(self) -> ModelInputForCPU: return self.model_input_cls( input_tokens=input_tokens, input_positions=input_positions, + token_type_ids=token_type_ids, seq_lens=input_data.seq_lens, query_lens=input_data.query_lens, attn_metadata=attn_metadata, @@ -285,6 +294,7 @@ def _compute_prompt_input_tokens(self, data: ModelInputData, tokens = seq_data.get_token_ids() tokens = tokens[context_len:seq_len] token_positions = range(context_len, seq_len) + token_types = seq_group_metadata.token_type_ids # For encoder-only models, the block_table is None, # and there is no need to initialize the slot_mapping. @@ -301,6 +311,9 @@ def _compute_prompt_input_tokens(self, data: ModelInputData, if data.input_positions is not None: data.input_positions.extend(token_positions) + if data.token_type_ids is not None: + data.token_type_ids.extend(token_types if token_types else []) + # Update fields data.input_tokens.extend(tokens) data.num_prefills += 1 diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 4a55d91e71484..f56805918fd15 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -97,6 +97,10 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() + cross_enc_kwargs = {} + if model_input.token_types is not None: + cross_enc_kwargs["token_type_ids"] = model_input.token_types + with set_forward_context(model_input.attn_metadata, self.vllm_config): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, @@ -105,7 +109,8 @@ def execute_model( attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device)) + device=self.device), + **cross_enc_kwargs) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 13301b876217d..1f654a9cce465 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -92,6 +92,7 @@ class ModelInputForGPU(ModelRunnerInputBase): """ input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None + token_types: Optional[torch.Tensor] = None seq_lens: Optional[List[int]] = None query_lens: Optional[List[int]] = None lora_mapping: Optional["LoRAMapping"] = None @@ -200,6 +201,7 @@ class InterDataForSeqGroup: def simple_reinit(self): self.input_tokens[0].clear() # type: ignore self.input_positions[0].clear() # type: ignore + self.token_types[0].clear() # type: ignore self.mrope_input_positions = None # type: ignore self.seq_lens[0] = 0 # type: ignore self.orig_seq_lens[0] = 0 # type: ignore @@ -226,6 +228,7 @@ def __init__( # Input tokens and positions. input_tokens: Optional[List[List[int]]] = None, input_positions: Optional[List[List[int]]] = None, + token_types: Optional[List[List[int]]] = None, mrope_input_positions: Optional[List[List[List[int]]]] = None, # The sequence length (may be capped to the sliding window). @@ -291,6 +294,12 @@ def __init__( for seq_id in range(len(self.seq_ids)): self.input_positions[seq_id].clear() + if token_types: + self.token_types = token_types + else: + for seq_id in range(len(self.seq_ids)): + self.token_types[seq_id].clear() + self.mrope_input_positions = None if seq_lens: @@ -354,6 +363,7 @@ def __init__( else: self.input_tokens = input_tokens or [] self.input_positions = input_positions or [] + self.token_types = token_types or [] self.mrope_input_positions = mrope_input_positions or None self.seq_lens = seq_lens or [] self.orig_seq_lens = orig_seq_lens or [] @@ -386,6 +396,7 @@ def __post_init__(self): self.input_tokens = [[] for _ in range(self.n_seqs)] self.input_positions = [[] for _ in range(self.n_seqs)] + self.token_types = [[] for _ in range(self.n_seqs)] self.mrope_input_positions = None self.seq_lens = [0] * self.n_seqs self.orig_seq_lens = [0] * self.n_seqs @@ -498,12 +509,15 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, # Compute tokens. tokens = seq_data.get_token_ids()[context_len:seq_len] + token_types = seq_group_metadata.token_type_ids inter_data.seq_lens[seq_idx] = seq_len inter_data.orig_seq_lens[seq_idx] = seq_len inter_data.context_lens[seq_idx] = context_len inter_data.input_tokens[seq_idx].extend(tokens) inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) + inter_data.token_types[seq_idx].extend( + token_types if token_types else []) inter_data.query_lens[seq_idx] = seq_len - context_len if seq_data.mrope_position_delta is not None: @@ -561,6 +575,8 @@ def _compute_for_prefix_cache_hit( seq_idx][uncomputed_start:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ seq_idx][uncomputed_start:] + inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ + uncomputed_start:] context_len = prefix_cache_len inter_data.context_lens[seq_idx] = context_len @@ -575,6 +591,8 @@ def _compute_for_prefix_cache_hit( seq_idx][-1:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ seq_idx][-1:] + inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ + -1:] inter_data.query_lens[seq_idx] = 1 inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 @@ -803,9 +821,12 @@ def build(self) -> ModelInputForGPU: """ # Combine and flatten intermediate data. input_tokens = [] + token_types = [] for inter_data in self.inter_data_list: for cur_input_tokens in inter_data.input_tokens: input_tokens.extend(cur_input_tokens) + for cur_token_types in inter_data.token_types: + token_types.extend(cur_token_types) if not input_tokens: # This may happen when all prefill requests hit @@ -874,6 +895,12 @@ def build(self) -> ModelInputForGPU: input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, self.runner.device, self.runner.pin_memory) + + token_types_tensor = async_tensor_h2d(token_types, torch.long, + self.runner.device, + self.runner.pin_memory) \ + if token_types else None + if mrope_input_positions is not None: for idx in range(3): mrope_input_positions[idx].extend( @@ -952,6 +979,7 @@ def build(self) -> ModelInputForGPU: return self.model_input_cls( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, + token_types=token_types_tensor, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, From 7ea3cd7c3e9fa1db06cdf8ad1973237b061b7d64 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Mon, 25 Nov 2024 13:14:56 +0800 Subject: [PATCH 138/255] [Refactor][MISC] del redundant code in ParallelConfig.postinit (#10614) Signed-off-by: MengqingCao --- vllm/config.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4ea56a14cabba..dcdaf58b5ccdb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -998,20 +998,15 @@ def __post_init__(self) -> None: raise ValueError(f"worker-use-ray can't be used with " f"distributed executor backend " f"'{self.distributed_executor_backend}'.") - - if current_platform.is_tpu() and self.world_size > 1: - if self.distributed_executor_backend is None: - self.distributed_executor_backend = "ray" - if self.distributed_executor_backend != "ray": - raise ValueError( - "TPU backend only supports Ray for distributed inference.") - - if current_platform.is_hpu() and self.world_size > 1: + ray_only_devices = ["tpu", "hpu"] + if (current_platform.device_type in ray_only_devices + and self.world_size > 1): if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" if self.distributed_executor_backend != "ray": raise ValueError( - "HPU backend only supports Ray for distributed inference.") + f"{current_platform.device_type.upper()} backend only " + "supports Ray for distributed inference.") if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the From 571841b7fcc67f8b1d171522f6249ed4224033e1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 24 Nov 2024 21:24:33 -0800 Subject: [PATCH 139/255] [torch.compile] support encoder based models (#10613) Signed-off-by: youkaichao --- tests/compile/test_basic_correctness.py | 10 ++++++++++ vllm/model_executor/models/bert.py | 17 +++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b7170886d2556..99781c55b672e 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -62,6 +62,16 @@ class TestSetting: method="encode", fullgraph=True, ), + # encoder-based embedding model (BERT) + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="XFORMERS", + method="encode", + fullgraph=True, + ), # vision language model TestSetting( model="microsoft/Phi-3.5-vision-instruct", diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 1fc87bc650d92..f570d6d3c12b3 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,6 +5,7 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -92,14 +93,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output +@support_torch_compile class BertEncoder(nn.Module): - def __init__(self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.layer = nn.ModuleList([ BertLayer(config=config, cache_config=cache_config, @@ -336,12 +337,8 @@ def __init__(self, add_pooling_layer: bool = False): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config self.embeddings = embedding_class(config) - self.encoder = BertEncoder(config, - cache_config, - quant_config, + self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") self.pooler = BertPooler(config) if add_pooling_layer else None From a30a605d214e051c31057f8c0cb948c841a2f743 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 25 Nov 2024 14:34:07 +0800 Subject: [PATCH 140/255] [Doc] Add encoder-based models to Supported Models page (#10616) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.rst | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index e902d393f2f70..ccd2d8de8ec0b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -325,6 +325,11 @@ Text Embedding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`BertModel` + - BERT-based + - :code:`BAAI/bge-base-en-v1.5`, etc. + - + - * - :code:`Gemma2Model` - Gemma2-based - :code:`BAAI/bge-multilingual-gemma2`, etc. @@ -340,6 +345,16 @@ Text Embedding - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` + - RoBERTa-based + - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. + - + - + * - :code:`XLMRobertaModel` + - XLM-RoBERTa-based + - :code:`intfloat/multilingual-e5-large`, etc. + - + - .. important:: Some model architectures support both generation and embedding tasks. @@ -390,6 +405,36 @@ Classification .. note:: As an interim measure, these models are supported in both offline and online inference via Embeddings API. +Sentence Pair Scoring +--------------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`BertForSequenceClassification` + - BERT-based + - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + - + - + * - :code:`RobertaForSequenceClassification` + - RoBERTa-based + - :code:`cross-encoder/quora-roberta-base`, etc. + - + - + * - :code:`XLMRobertaForSequenceClassification` + - XLM-RoBERTa-based + - :code:`BAAI/bge-reranker-v2-m3`, etc. + - + - + +.. note:: + These models are supported in both offline and online inference via Score API. Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^ From 7c2134beda9a4f72c71c4faffcca22cebd4e1c3c Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 25 Nov 2024 15:04:21 +0800 Subject: [PATCH 141/255] [torch.compile] force inductor threads (#10620) Signed-off-by: Jee Jee Li --- vllm/plugins/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index d5056b18fe968..bd4764c5cc79c 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -3,6 +3,8 @@ from contextlib import contextmanager from typing import TYPE_CHECKING, Optional +import torch + import vllm.envs as envs if TYPE_CHECKING: @@ -26,7 +28,8 @@ def load_general_plugins(): # see https://github.com/vllm-project/vllm/issues/10480 os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - + # see https://github.com/vllm-project/vllm/issues/10619 + torch._inductor.config.compile_threads = 1 global plugins_loaded if plugins_loaded: return From 65813781a2e2e76d18741601afe66b870a90a717 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 24 Nov 2024 23:27:51 -0800 Subject: [PATCH 142/255] [torch.compile] add warning for unsupported models (#10622) Signed-off-by: youkaichao --- vllm/compilation/counter.py | 1 + vllm/compilation/decorators.py | 2 ++ vllm/plugins/__init__.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/vllm/compilation/counter.py b/vllm/compilation/counter.py index 100a49aba74ac..6385f1c5dbf81 100644 --- a/vllm/compilation/counter.py +++ b/vllm/compilation/counter.py @@ -5,6 +5,7 @@ @dataclasses.dataclass class CompilationCounter: + num_models_seen: int = 0 num_graphs_seen: int = 0 # including the splitting ops num_piecewise_graphs_seen: int = 0 diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 4b78491bc5a48..8b81a29936989 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -3,6 +3,7 @@ import torch +from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger @@ -130,6 +131,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): ] or not supports_dynamo() if self.do_not_compile: return + compilation_counter.num_models_seen += 1 TorchCompileWrapperWithCustomDispatcher.__init__( self, compilation_level=vllm_config.compilation_config.level) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index bd4764c5cc79c..8b43167693598 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -80,6 +80,9 @@ def set_current_vllm_config(vllm_config: "VllmConfig"): """ global _current_vllm_config old_vllm_config = _current_vllm_config + from vllm.compilation.counter import compilation_counter + from vllm.config import CompilationLevel + num_models_seen = compilation_counter.num_models_seen try: _current_vllm_config = vllm_config yield @@ -88,6 +91,18 @@ def set_current_vllm_config(vllm_config: "VllmConfig"): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) + if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + and compilation_counter.num_models_seen == num_models_seen: + # If the model supports compilation, + # compilation_counter.num_models_seen should be increased + # by at least 1. + # If it is not increased, it means the model does not support + # compilation (does not have @support_torch_compile decorator). + logger.warning( + "`torch.compile` is turned on, but the model %s" + " does not support it. Please open an issue on GitHub" + "if you want it to be supported.", + vllm_config.model_config.model) _current_vllm_config = old_vllm_config From 25d806e95391a8556deb69bdb214714425f776c9 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 24 Nov 2024 23:40:08 -0800 Subject: [PATCH 143/255] [misc] add torch.compile compatibility check (#10618) Signed-off-by: youkaichao --- tests/v1/engine/test_engine_core_client.py | 2 +- vllm/config.py | 14 ++++++++++++++ vllm/engine/arg_utils.py | 7 +++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 7b241bf836a0e..e248e35ae4069 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -81,7 +81,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - engine_args = EngineArgs(model=MODEL_NAME) + engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) vllm_config = engine_args.create_engine_config() executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( diff --git a/vllm/config.py b/vllm/config.py index dcdaf58b5ccdb..68720f3a3034d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2394,6 +2394,20 @@ def __post_init__(self): self.compilation_config.pass_config.enable_reshape = False self.compilation_config.level = CompilationLevel.PIECEWISE + if self.cache_config is not None and \ + self.cache_config.cpu_offload_gb > 0 and \ + self.compilation_config.level != CompilationLevel.NO_COMPILATION: + logger.warning( + "CPU offload is not supported with `torch.compile` yet." + " Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + if self.lora_config is not None and self.compilation_config.level !=\ + CompilationLevel.NO_COMPILATION: + logger.warning("LoRA is not supported with `torch.compile` yet. " + "Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + current_platform.check_and_update_config(self) def __str__(self): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 82f1ef51255e9..a43e133f21ac2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -197,6 +197,13 @@ def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model + # support `EngineArgs(compilation_config={...})` + # without having to manually construct a + # CompilationConfig object + if isinstance(self.compilation_config, (int, dict)): + self.compilation_config = CompilationConfig.from_cli( + json.dumps(self.compilation_config)) + # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() From 05d1f8c9c64b4458ae7cee2650eb97498146ee50 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Nov 2024 01:27:30 -0800 Subject: [PATCH 144/255] [misc] move functions to config.py (#10624) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 4 +- tests/compile/piecewise/test_toy_llama.py | 4 +- tests/kernels/test_encoder_decoder_attn.py | 3 +- .../model_executor/test_enabled_custom_ops.py | 3 +- vllm/attention/layer.py | 3 +- vllm/compilation/wrapper.py | 3 +- vllm/config.py | 51 +++++++++++++++++ vllm/model_executor/custom_op.py | 2 +- vllm/model_executor/model_loader/loader.py | 3 +- .../model_executor/model_loader/tensorizer.py | 3 +- vllm/plugins/__init__.py | 56 ------------------- 11 files changed, 62 insertions(+), 73 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 0db12d6b6a43c..7ef502abee345 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -10,8 +10,8 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig -from vllm.plugins import set_current_vllm_config +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) from vllm.utils import direct_register_custom_op global_counter = 0 diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index cfe661b8871e0..dbd5a3bbffeab 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -16,8 +16,8 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.config import CompilationConfig, CompilationLevel, VllmConfig -from vllm.plugins import set_current_vllm_config +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) from vllm.utils import direct_register_custom_op # create a library to hold the custom op diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index c4b72ba6bf4ee..d943b048b7934 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -18,10 +18,9 @@ from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) -from vllm.config import VllmConfig +from vllm.config import VllmConfig, set_current_vllm_config from vllm.forward_context import set_forward_context from vllm.platforms import current_platform -from vllm.plugins import set_current_vllm_config # List of support backends for encoder/decoder models LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index c54e30995da49..0a3aba255fd76 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -2,13 +2,12 @@ import pytest -from vllm.config import CompilationConfig, VllmConfig +from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.plugins import set_current_vllm_config # Registered subclass for test diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 1bb335909484b..17157617248f7 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -7,13 +7,12 @@ import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend -from vllm.config import CacheConfig +from vllm.config import CacheConfig, get_current_vllm_config from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform -from vllm.plugins import get_current_vllm_config from vllm.utils import direct_register_custom_op diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 0143d0301ca1a..bc4d292fef402 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -8,7 +8,7 @@ import torch import vllm.envs as envs -from vllm.config import CompilationLevel +from vllm.config import CompilationLevel, get_current_vllm_config class TorchCompileWrapperWithCustomDispatcher: @@ -32,7 +32,6 @@ def __init__(self, # default compilation settings # compiling the forward method - from vllm.plugins import get_current_vllm_config backend = get_current_vllm_config( ).compilation_config.init_backend() diff --git a/vllm/config.py b/vllm/config.py index 68720f3a3034d..0a390c4311ba6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,6 +3,7 @@ import hashlib import json import warnings +from contextlib import contextmanager from dataclasses import dataclass, field, replace from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, @@ -2450,3 +2451,53 @@ def __str__(self): self.cache_config.enable_prefix_caching, self.model_config.use_async_output_proc, self.model_config.mm_processor_kwargs) + + +_current_vllm_config: Optional[VllmConfig] = None + + +@contextmanager +def set_current_vllm_config(vllm_config: VllmConfig): + """ + Temporarily set the current VLLM config. + Used during model initialization. + We save the current VLLM config in a global variable, + so that all modules can access it, e.g. custom ops + can access the VLLM config to determine how to dispatch. + """ + global _current_vllm_config + old_vllm_config = _current_vllm_config + from vllm.compilation.counter import compilation_counter + num_models_seen = compilation_counter.num_models_seen + try: + _current_vllm_config = vllm_config + yield + finally: + logger.debug("enabled custom ops: %s", + vllm_config.compilation_config.enabled_custom_ops) + logger.debug("disabled custom ops: %s", + vllm_config.compilation_config.disabled_custom_ops) + if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + and compilation_counter.num_models_seen == num_models_seen: + # If the model supports compilation, + # compilation_counter.num_models_seen should be increased + # by at least 1. + # If it is not increased, it means the model does not support + # compilation (does not have @support_torch_compile decorator). + logger.warning( + "`torch.compile` is turned on, but the model %s" + " does not support it. Please open an issue on GitHub" + "if you want it to be supported.", + vllm_config.model_config.model) + _current_vllm_config = old_vllm_config + + +def get_current_vllm_config() -> VllmConfig: + if _current_vllm_config is None: + # in ci, usually when we test custom ops/modules directly, + # we don't set the vllm config. In that case, we set a default + # config. + logger.warning("Current VLLM config is not set.") + from vllm.config import VllmConfig + return VllmConfig() + return _current_vllm_config diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index b07966f2ab7d0..fddc8bad09ef5 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -2,9 +2,9 @@ import torch.nn as nn +from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.plugins import get_current_vllm_config from vllm.utils import print_warning_once logger = init_logger(__name__) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 34e0860162260..441dd409b4f9d 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -23,7 +23,7 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME from vllm.config import (LoadConfig, LoadFormat, ModelConfig, ParallelConfig, - VllmConfig) + VllmConfig, set_current_vllm_config) from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE @@ -47,7 +47,6 @@ safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.plugins import set_current_vllm_config from vllm.utils import is_pin_memory_available diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 3fd668765a1b1..87f3fcb5cae00 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -13,13 +13,12 @@ from transformers import PretrainedConfig import vllm.envs as envs -from vllm.config import ModelConfig, ParallelConfig +from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.plugins import set_current_vllm_config from vllm.utils import FlexibleArgumentParser tensorizer_error_msg = None diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 8b43167693598..3c64726ca3344 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,15 +1,10 @@ import logging import os -from contextlib import contextmanager -from typing import TYPE_CHECKING, Optional import torch import vllm.envs as envs -if TYPE_CHECKING: - from vllm.config import VllmConfig - logger = logging.getLogger(__name__) # make sure one process only loads plugins once @@ -64,54 +59,3 @@ def load_general_plugins(): logger.info("plugin %s loaded.", plugin.name) except Exception: logger.exception("Failed to load plugin %s", plugin.name) - - -_current_vllm_config: Optional["VllmConfig"] = None - - -@contextmanager -def set_current_vllm_config(vllm_config: "VllmConfig"): - """ - Temporarily set the current VLLM config. - Used during model initialization. - We save the current VLLM config in a global variable, - so that all modules can access it, e.g. custom ops - can access the VLLM config to determine how to dispatch. - """ - global _current_vllm_config - old_vllm_config = _current_vllm_config - from vllm.compilation.counter import compilation_counter - from vllm.config import CompilationLevel - num_models_seen = compilation_counter.num_models_seen - try: - _current_vllm_config = vllm_config - yield - finally: - logger.debug("enabled custom ops: %s", - vllm_config.compilation_config.enabled_custom_ops) - logger.debug("disabled custom ops: %s", - vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ - and compilation_counter.num_models_seen == num_models_seen: - # If the model supports compilation, - # compilation_counter.num_models_seen should be increased - # by at least 1. - # If it is not increased, it means the model does not support - # compilation (does not have @support_torch_compile decorator). - logger.warning( - "`torch.compile` is turned on, but the model %s" - " does not support it. Please open an issue on GitHub" - "if you want it to be supported.", - vllm_config.model_config.model) - _current_vllm_config = old_vllm_config - - -def get_current_vllm_config() -> "VllmConfig": - if _current_vllm_config is None: - # in ci, usually when we test custom ops/modules directly, - # we don't set the vllm config. In that case, we set a default - # config. - logger.warning("Current VLLM config is not set.") - from vllm.config import VllmConfig - return VllmConfig() - return _current_vllm_config From ed46f143212203b7afcbc8538119b6e8155c643e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 25 Nov 2024 17:51:20 +0800 Subject: [PATCH 145/255] [Model] Support `is_causal` HF config field for Qwen2 model (#10621) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.rst | 13 +++++++++--- .../embedding/language/test_embedding.py | 12 +++++++++-- tests/models/embedding/utils.py | 4 ++-- vllm/config.py | 15 ++++++++++---- vllm/model_executor/models/qwen2.py | 20 +++++++++++++++++-- 5 files changed, 51 insertions(+), 13 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ccd2d8de8ec0b..54e2c4479c2c9 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -342,7 +342,7 @@ Text Embedding - ✅︎ * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. + - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - ✅︎ - ✅︎ * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` @@ -363,6 +363,13 @@ Text Embedding .. tip:: You can override the model's pooling method by passing :code:`--override-pooler-config`. +.. note:: + Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. + You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. + + On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention + despite being described otherwise on its model card. + Reward Modeling --------------- @@ -606,10 +613,10 @@ Text Generation | :sup:`+` Multiple items can be inputted per text prompt for this modality. .. note:: - vLLM currently only supports adding LoRA to the language backbone of multimodal models. + vLLM currently only supports adding LoRA to the language backbone of multimodal models. .. note:: - For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. + The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 Multimodal Embedding diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index c3f351ef707be..36b1e5887981c 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -21,6 +21,7 @@ marks=[pytest.mark.core_model]), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), ], ) @pytest.mark.parametrize("dtype", ["half"]) @@ -31,6 +32,10 @@ def test_models( model, dtype: str, ) -> None: + vllm_extra_kwargs = {} + if model == "Alibaba-NLP/gte-Qwen2-7B-instruct": + vllm_extra_kwargs["hf_overrides"] = {"is_causal": False} + # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: @@ -43,8 +48,11 @@ def test_models( is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, task="embedding", dtype=dtype, - max_model_len=None) as vllm_model: + with vllm_runner(model, + task="embedding", + dtype=dtype, + max_model_len=None, + **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) # This test is for verifying whether the model's extra_repr # can be printed correctly. diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index fd1c44d9c117e..f96c7d2b176db 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -24,7 +24,7 @@ def check_embeddings_close( dim=0) fail_msg = (f"Test{prompt_idx}:" - f"\n{name_0}:\t{embeddings_0!r}" - f"\n{name_1}:\t{embeddings_1!r}") + f"\n{name_0}:\t{embeddings_0[:16]!r}" + f"\n{name_1}:\t{embeddings_1[:16]!r}") assert sim >= 1 - tol, fail_msg diff --git a/vllm/config.py b/vllm/config.py index 0a390c4311ba6..f9ecb02cd5bde 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -27,7 +27,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - identity, print_warning_once, resolve_obj_by_qualname) + print_warning_once, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -183,7 +183,7 @@ def __init__( hf_overrides_fn = hf_overrides else: hf_overrides_kw = hf_overrides - hf_overrides_fn = identity + hf_overrides_fn = None if rope_scaling is not None: hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} @@ -212,8 +212,15 @@ def __init__( self.skip_tokenizer_init = skip_tokenizer_init hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, config_format, **hf_overrides_kw) - hf_config = hf_overrides_fn(hf_config) + code_revision, config_format) + + if hf_overrides_kw: + logger.info("Overriding HF config with %s", hf_overrides_kw) + hf_config.update(hf_overrides_kw) + if hf_overrides_fn: + logger.info("Overriding HF config with %s", hf_overrides_fn) + hf_config = hf_overrides_fn(hf_config) + self.hf_config = hf_config self.hf_text_config = get_hf_text_config(self.hf_config) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 370cff5fa153f..8da75c9935a13 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -27,7 +27,7 @@ from torch import nn from transformers import Qwen2Config -from vllm.attention import Attention, AttentionMetadata +from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -164,11 +164,17 @@ def forward( hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, + attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + attn_output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=attn_type) output, _ = self.o_proj(attn_output) return output @@ -210,6 +216,15 @@ def __init__( self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + # By default, Qwen2 uses causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) + if getattr(config, "is_causal", True): + self._attn_type = AttentionType.DECODER + else: + self._attn_type = AttentionType.ENCODER_ONLY + def forward( self, positions: torch.Tensor, @@ -230,6 +245,7 @@ def forward( hidden_states=hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata, + attn_type=self._attn_type, ) # Fully Connected From 2b0879bfc273a08d339b952890c4e88e77f0a014 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Mon, 25 Nov 2024 21:08:30 +0800 Subject: [PATCH 146/255] Super tiny little typo fix (#10633) --- docs/source/quantization/fp8_e5m2_kvcache.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/quantization/fp8_e5m2_kvcache.rst b/docs/source/quantization/fp8_e5m2_kvcache.rst index 9ae07bcd3b991..b2d824427f786 100644 --- a/docs/source/quantization/fp8_e5m2_kvcache.rst +++ b/docs/source/quantization/fp8_e5m2_kvcache.rst @@ -4,7 +4,7 @@ FP8 E5M2 KV Cache ================== The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. +The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. Here is an example of how to enable this feature: From d04b13a380da422afa1883efc81e0d4c4b18d091 Mon Sep 17 00:00:00 2001 From: Chauncey Date: Tue, 26 Nov 2024 00:21:41 +0800 Subject: [PATCH 147/255] [Bug]: Authorization ignored when root_path is set (#10606) Signed-off-by: chaunceyjiang --- tests/entrypoints/openai/test_root_path.py | 103 +++++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 6 +- 2 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 tests/entrypoints/openai/test_root_path.py diff --git a/tests/entrypoints/openai/test_root_path.py b/tests/entrypoints/openai/test_root_path.py new file mode 100644 index 0000000000000..20f7960619efb --- /dev/null +++ b/tests/entrypoints/openai/test_root_path.py @@ -0,0 +1,103 @@ +import contextlib +import os +from typing import Any, List, NamedTuple + +import openai # use the official client for correctness check +import pytest + +from ...utils import RemoteOpenAIServer + +# # any model with a chat template should work here +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +API_KEY = "abc-123" +ERROR_API_KEY = "abc" +ROOT_PATH = "llm" + + +@pytest.fixture(scope="module") +def server(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--enforce-eager", + "--max-model-len", + "4080", + "--root-path", # use --root-path=/llm for testing + "/" + ROOT_PATH, + "--chat-template", + DUMMY_CHAT_TEMPLATE, + ] + envs = os.environ.copy() + + envs["VLLM_API_KEY"] = API_KEY + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +class TestCase(NamedTuple): + model_name: str + base_url: List[str] + api_key: str + expected_error: Any + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "test_case", + [ + TestCase( + model_name=MODEL_NAME, + base_url=["v1"], # http://localhost:8000/v1 + api_key=ERROR_API_KEY, + expected_error=openai.AuthenticationError), + TestCase( + model_name=MODEL_NAME, + base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 + api_key=ERROR_API_KEY, + expected_error=openai.AuthenticationError), + TestCase( + model_name=MODEL_NAME, + base_url=["v1"], # http://localhost:8000/v1 + api_key=API_KEY, + expected_error=None), + TestCase( + model_name=MODEL_NAME, + base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 + api_key=API_KEY, + expected_error=None), + ], +) +async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer, + test_case: TestCase): + saying: str = "Here is a common saying about apple. An apple a day, keeps" + ctx = contextlib.nullcontext() + if test_case.expected_error is not None: + ctx = pytest.raises(test_case.expected_error) + with ctx: + client = openai.AsyncOpenAI( + api_key=test_case.api_key, + base_url=server.url_for(*test_case.base_url), + max_retries=0) + chat_completion = await client.chat.completions.create( + model=test_case.model_name, + messages=[{ + "role": "user", + "content": "tell me a common saying" + }, { + "role": "assistant", + "content": saying + }], + extra_body={ + "continue_final_message": True, + "add_generation_prompt": False + }) + + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "stop" + message = choice.message + assert len(message.content) > 0 + assert message.role == "assistant" diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2b1f14b89b1f2..bc018be982bff 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -499,10 +499,12 @@ async def validation_exception_handler(_, exc): @app.middleware("http") async def authentication(request: Request, call_next): - root_path = "" if args.root_path is None else args.root_path if request.method == "OPTIONS": return await call_next(request) - if not request.url.path.startswith(f"{root_path}/v1"): + url_path = request.url.path + if app.root_path and url_path.startswith(app.root_path): + url_path = url_path[len(app.root_path):] + if not url_path.startswith("/v1"): return await call_next(request) if request.headers.get("Authorization") != "Bearer " + token: return JSONResponse(content={"error": "Unauthorized"}, From c27df94e1ff98551b987b40bb2049bf4640e202a Mon Sep 17 00:00:00 2001 From: Wallas Henrique Date: Mon, 25 Nov 2024 14:23:32 -0300 Subject: [PATCH 148/255] [Bugfix] Fix chunked prefill with model dtype float32 on Turing Devices (#9850) Signed-off-by: Wallas Santos Co-authored-by: Michael Goin --- pyproject.toml | 1 + tests/conftest.py | 19 +++++++++ tests/kernels/test_prefix_prefill.py | 63 ++++++++++++++++++++++++++++ vllm/attention/ops/prefix_prefill.py | 41 ++++++++++++------ vllm/config.py | 10 +++++ vllm/engine/arg_utils.py | 1 + 6 files changed, 122 insertions(+), 13 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3c8c46cc8621e..253b706a774a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -98,4 +98,5 @@ markers = [ "quant_model: run this model test under Quantized category", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", "skip_v1: do not run this test with v1", + "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/tests/conftest.py b/tests/conftest.py index 29707f975e2a0..d56942d8912af 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1030,3 +1030,22 @@ def dummy_gemma2_embedding_path(): with open(json_path, "w") as f: json.dump(config, f) return _dummy_gemma2_embedding_path + + +# Add the flag `--optional` to allow run tests +# that are marked with @pytest.mark.optional +def pytest_addoption(parser): + parser.addoption("--optional", + action="store_true", + default=False, + help="run optional test") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--optional"): + # --optional given in cli: do not skip optional tests + return + skip_optional = pytest.mark.skip(reason="need --optional option to run") + for item in items: + if "optional" in item.keywords: + item.add_marker(skip_optional) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index a8a187ebaede4..3fdb7996ba4e0 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -40,6 +40,13 @@ def test_contexted_kv_attention( kv_cache_dtype: str, device: str, ) -> None: + + if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( + 89): + pytest.skip( + 'Triton limitation: fp8e4nv data type is not supported on CUDA' + ' arch < 89') + current_platform.seed_everything(0) torch.set_default_device(device) @@ -235,6 +242,13 @@ def test_contexted_kv_attention_alibi( kv_cache_dtype: str, device: str, ) -> None: + + if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( + 89): + pytest.skip( + 'Triton limitation: fp8e4nv data type is not supported on CUDA' + ' arch < 89') + current_platform.seed_everything(0) torch.set_default_device(device) @@ -462,3 +476,52 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6 torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) + + +# These tests are optional to only run when explicitly invoked +# +# pytest -v -s --optional \ +# tests/kernels/test_prefix_prefill.py::test_contexted_kv_attention_f32 +# +# These tests are useful to test model dtype float32 on Turing devices. +# We skip them to not increase the time when running tests on CI +@pytest.mark.optional +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW) +@torch.inference_mode() +def test_contexted_kv_attention_f32( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + sliding_window: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, +) -> None: + test_contexted_kv_attention(num_heads, num_queries_per_kv, head_size, + sliding_window, dtype, kv_cache_dtype, device) + + +@pytest.mark.optional +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_contexted_kv_attention_alibi_f32( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, +) -> None: + test_contexted_kv_attention_alibi(num_heads, num_queries_per_kv, head_size, + dtype, kv_cache_dtype, device) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index a2a649c8ebcfd..9c11a8df55278 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -7,6 +7,13 @@ from vllm.platforms import current_platform +# Static kernels parameters +BASE_BLOCK = 128 if current_platform.has_device_capability(80) else 64 +NUM_WARPS = 8 + +# To check compatibility +IS_TURING = current_platform.get_device_capability() == (7, 5) + if triton.__version__ >= "2.1.0": @triton.jit @@ -50,6 +57,7 @@ def _fwd_kernel( stride_v_cache_d, stride_v_cache_bl, num_queries_per_kv: int, + IN_PRECISION: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, # head size BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 @@ -130,7 +138,7 @@ def _fwd_kernel( k = k_load qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) # [M,N] - qk += tl.dot(q, k) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")) qk *= sm_scale @@ -178,7 +186,7 @@ def _fwd_kernel( v = v_load p = p.to(v.dtype) - acc += tl.dot(p, v) + acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) # # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -204,7 +212,7 @@ def _fwd_kernel( other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) qk *= sm_scale # apply causal mask qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, @@ -238,7 +246,7 @@ def _fwd_kernel( other=0.0) p = p.to(v.dtype) - acc += tl.dot(p, v) + acc = tl.dot(p, v, acc=acc, input_precision=IN_PRECISION) # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -485,6 +493,7 @@ def _fwd_kernel_alibi( stride_v_cache_d, stride_v_cache_bl, num_queries_per_kv: int, + IN_PRECISION: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, # head size BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2 @@ -560,7 +569,7 @@ def _fwd_kernel_alibi( k = k_load qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) + qk = tl.dot(q, k, acc=qk, input_precision=IN_PRECISION) qk = tl.where((start_n + offs_n[None, :]) < cur_batch_ctx_len, qk, float("-inf")) qk *= sm_scale @@ -600,7 +609,7 @@ def _fwd_kernel_alibi( v = v_load p = p.to(v.dtype) - acc += tl.dot(p, v, allow_tf32=False) + acc = tl.dot(p, v, acc=acc, input_precision='ieee') # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -635,7 +644,7 @@ def _fwd_kernel_alibi( other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k, allow_tf32=False) + qk = tl.dot(q, k, acc=qk, input_precision='ieee') qk *= sm_scale qk = tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), qk, float("-inf")) @@ -673,7 +682,7 @@ def _fwd_kernel_alibi( other=0.0) p = p.to(v.dtype) - acc += tl.dot(p, v, allow_tf32=False) + acc = tl.dot(p, v, acc=acc, input_precision='ieee') # update m_i and l_i l_i = l_i_new m_i = m_i_new @@ -709,13 +718,17 @@ def context_attention_fwd(q, alibi_slopes=None, sliding_window=None): - BLOCK = 128 if current_platform.has_device_capability(80) else 64 - NUM_WARPS = 8 - + q_dtype_is_f32 = q.dtype is torch.float32 # need to reduce num. blocks when using fp32 # due to increased use of GPU shared memory - if q.dtype is torch.float32: - BLOCK = BLOCK // 2 + # if q.dtype is torch.float32: + BLOCK = BASE_BLOCK // 2 if q_dtype_is_f32 else BASE_BLOCK + + # Turing does have tensor core for float32 multiplication + # use ieee as fallback for triton kernels work. There is also + # warning on vllm/config.py to inform users this fallback + # implementation + IN_PRECISION = 'ieee' if IS_TURING and q_dtype_is_f32 else None # Conversion of FP8 Tensor from uint8 storage to # appropriate torch.dtype for interpretation by Triton @@ -799,6 +812,7 @@ def context_attention_fwd(q, v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_DMODEL_PADDED=Lk_padded, @@ -850,6 +864,7 @@ def context_attention_fwd(q, v_cache.stride( 3), #[num_blocks, num_kv_heads, head_size, block_size] num_queries_per_kv=num_queries_per_kv, + IN_PRECISION=IN_PRECISION, BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_DMODEL_PADDED=Lk_padded, diff --git a/vllm/config.py b/vllm/config.py index f9ecb02cd5bde..c87feaec3e5f6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2388,6 +2388,16 @@ def __post_init__(self): self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + if self.scheduler_config is not None and \ + self.model_config is not None and \ + self.scheduler_config.chunked_prefill_enabled and \ + self.model_config.dtype == torch.float32 and \ + current_platform.get_device_capability() == (7, 5): + print_warning_once( + "Turing devices tensor cores do not support float32 matmul. " + "To workaround this limitation, vLLM will set 'ieee' input " + "precision for chunked prefill triton kernels.") + if self.compilation_config is None: self.compilation_config = CompilationConfig() if envs.VLLM_USE_V1 and not self.model_config.enforce_eager: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a43e133f21ac2..ca68c1d57151c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1055,6 +1055,7 @@ def create_engine_config(self) -> VllmConfig: msg = "Chunked prefill is not supported for embedding models" raise ValueError(msg) + speculative_config = SpeculativeConfig.maybe_create_spec_config( target_model_config=model_config, target_parallel_config=parallel_config, From 452a4e80c3dfc6596cd89c7a87dfb7036bab8acd Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Mon, 25 Nov 2024 09:34:46 -0800 Subject: [PATCH 149/255] [Docs] Add Snowflake Slides (#10641) Signed-off-by: simon-mo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e1353d98f1dc..cfeb24cbb5823 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Easy, fast, and cheap LLM serving for everyone --- *Latest News* 🔥 -- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing). +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). From b1d920531f6d5fd6c020096499c91a8f26620cd6 Mon Sep 17 00:00:00 2001 From: zhou fan <1247714429@qq.com> Date: Tue, 26 Nov 2024 02:10:55 +0800 Subject: [PATCH 150/255] [Model]: Add support for Aria model (#10514) Signed-off-by: xffxff <1247714429@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/source/models/supported_models.rst | 6 + examples/offline_inference_vision_language.py | 18 + ...e_inference_vision_language_multi_image.py | 20 + tests/models/registry.py | 2 + vllm/entrypoints/chat_utils.py | 2 + vllm/model_executor/models/aria.py | 695 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/configs/aria.py | 47 ++ 8 files changed, 791 insertions(+) create mode 100644 vllm/model_executor/models/aria.py create mode 100644 vllm/transformers_utils/configs/aria.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 54e2c4479c2c9..7a6932d65e653 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -476,6 +476,12 @@ Text Generation - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`AriaForConditionalGeneration` + - Aria + - T + I + - :code:`rhymes-ai/Aria` + - + - ✅︎ * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - T + I\ :sup:`E` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 11af6880e1b5a..f08f22eec164a 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -402,6 +402,23 @@ def run_idefics3(question: str, modality: str): return llm, prompt, stop_token_ids +# Aria +def run_aria(question: str, modality: str): + assert modality == "image" + model_name = "rhymes-ai/Aria" + + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16") + + prompt = (f"<|im_start|>user\n<|img|>\n{question}" + "<|im_end|>\n<|im_start|>assistant\n") + + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -423,6 +440,7 @@ def run_idefics3(question: str, modality: str): "molmo": run_molmo, "glm4v": run_glm4v, "idefics3": run_idefics3, + "aria": run_aria, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index dc12df8d78211..788b604cfd4a0 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -321,6 +321,25 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: ) +def load_aria(question, image_urls: List[str]) -> ModelRequestData: + model_name = "rhymes-ai/Aria" + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + limit_mm_per_prompt={"image": len(image_urls)}) + placeholders = "<|img|>\n" * len(image_urls) + prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None) + + model_example_map = { "phi3_v": load_phi3v, "h2ovl_chat": load_h2onvl, @@ -330,6 +349,7 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: "qwen_vl_chat": load_qwenvl_chat, "mllama": load_mllama, "idefics3": load_idefics3, + "aria": load_aria, } diff --git a/tests/models/registry.py b/tests/models/registry.py index fa0818c4f0bd1..669c832b1df3a 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -43,6 +43,8 @@ class _HfExamplesInfo: trust_remote_code=True), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", + trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index abee5ac46391c..c2054dcbfce0e 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -412,6 +412,8 @@ def _placeholder_str(self, modality: ModalityStr, return "" if model_type == "idefics3": return "" + if model_type == "aria": + return "<|fim_prefix|><|img|><|fim_suffix|>" raise TypeError(f"Unknown {modality} model type: {model_type}") elif modality == "audio": diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py new file mode 100644 index 0000000000000..0356435e9c257 --- /dev/null +++ b/vllm/model_executor/models/aria.py @@ -0,0 +1,695 @@ +import math +from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from transformers import LlamaConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, QuantizationConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_rank +from vllm.inputs import INPUT_REGISTRY, token_inputs +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( + get_compressed_tensors_cache_scale) +from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, + SamplingMetadata) +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.models.idefics2_vision_model import ( + Idefics2VisionTransformer) +from vllm.model_executor.models.interfaces import SupportsMultiModal +from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, + LlamaModel) +from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, + is_pp_missing_parameter, + make_layers, maybe_prefix, + merge_multimodal_embeddings) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.utils import (cached_get_tokenizer, + repeat_and_pad_placeholder_tokens) +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, + AriaVisionConfig) + +from .utils import flatten_bn + + +class AriaImagePixelInputs(TypedDict): + pixel_values: torch.Tensor + pixel_mask: Optional[torch.Tensor] + """ + Shape: + pixel_values: `(batch_size * num_images, num_channels, height, width)` + pixel_mask: `(batch_size * num_images, height, width)` + """ + + +class AriaVisionTransformer(Idefics2VisionTransformer): + """ + AriaVisionTransformer is a modified version of Idefics2VisionTransformer + that replaces the post-layernorm with an identity layer. + """ + + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, quant_config, prefix) + self.post_layernorm = nn.Identity() + + +class AriaVisionModel(nn.Module): + config_class = AriaVisionConfig + + def __init__( + self, + config: AriaVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + self.vision_model = AriaVisionTransformer( + config, + quant_config, + prefix=f"{prefix}.vision_model", + ) + + def forward( + self, + pixel_values: torch.Tensor, + pixel_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + vit_oup = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + image_atts = self._create_image_attention_mask(patch_attention_mask) + + return vit_oup, image_atts + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_model.config.patch_size, + step=self.vision_model.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def _create_image_attention_mask(self, patch_attention_mask): + if patch_attention_mask is None: + return None + + flattened_mask = patch_attention_mask.flatten(1) + return torch.logical_not(flattened_mask) + + +class FFN(nn.Module): + + def __init__(self, embed_dim, ff_dim, output_dim): + super().__init__() + self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) + self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) + self.act = get_act_fn("gelu_new") + + def forward(self, hidden_states): + hidden_states, _ = self.linear_in(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states, _ = self.linear_out(hidden_states) + return hidden_states + + +class CrossAttention(nn.Module): + + def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + super().__init__() + self.num_heads = num_heads + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) + self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) + self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) + + self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) + self.linear = nn.Linear(embed_dim, embed_dim) + self.dropout = nn.Dropout(drop_out_rate) + + self.layer_norm = nn.LayerNorm(embed_dim) + self.ln_kv = nn.LayerNorm(kv_dim) + + def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + normed_hidden_states = self.layer_norm(hidden_states) + query = self.q_proj(normed_hidden_states).permute(1, 0, 2) + + x = self.ln_kv(x) + key = self.k_proj(x).permute(1, 0, 2) + value = self.v_proj(x).permute(1, 0, 2) + + attn_output, _ = self.multihead_attn(query, + key, + value, + attn_mask=attn_mask) + + attn_output = attn_output.permute(1, 0, 2) + + if add_residual: + attn_output = hidden_states + self.dropout( + self.linear(attn_output)) + else: + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + A projection module with one cross attention layer and one FFN layer, which + projects ViT's outputs into MoE's inputs. + + Args: + patch_to_query_dict (dict): Maps patch numbers to their corresponding + query numbers, + e.g., {1225: 128, 4900: 256}. This allows for different query sizes + based on image resolution. + embed_dim (int): Embedding dimension. + num_heads (int): Number of attention heads. + kv_dim (int): Dimension of key and value. + ff_dim (int): Hidden dimension of the feed-forward network. + output_dim (int): Output dimension. + norm_layer (nn.Module): Normalization layer. Default is nn.LayerNorm. + + Outputs: + A tensor with the shape of (batch_size, query_number, output_dim) + """ + + def __init__( + self, + patch_to_query_dict, + embed_dim, + num_heads, + kv_dim, + ff_dim, + output_dim, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.patch_to_query_dict = patch_to_query_dict + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter( + torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)) + + trunc_normal_(self.query, std=0.02) + + self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + + self.ln_ffn = norm_layer(embed_dim) + self.ffn = FFN(embed_dim, ff_dim, output_dim) + + def forward(self, x, attn_mask=None): + bs = x.shape[0] + queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + + query_num = self.patch_to_query_dict.get(x.shape[1], None) + assert (query_num is not None + ), f"Query number for {x.shape[1]} patches is not provided" + + queries = queries[:, :query_num, :] + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(x, queries, attn_mask=attn_mask) + + out = self.ffn(self.ln_ffn(attention_out)) + + return out + + +class AriaFusedMoE(FusedMoE): + + def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, + shard_id: str) -> Set[str]: + # Override the weight_loader to handle the expert weights in the Aria + # model, which are already packed with experts, and merge the gate and + # up weights for each expert. + # Note: Loading expert weights with quantization is not supported + tp_rank = get_tensor_model_parallel_rank() + if shard_id == 'w13': + # the shape of loaded_weight is + # (num_experts, hidden_size, 2 * moe_intermediate_size) + if self.tp_size > 1: + up, gate = loaded_weight.chunk(2, dim=-1) + up_current_rank = up.chunk(self.tp_size, dim=-1)[tp_rank] + gate_current_rank = gate.chunk(self.tp_size, dim=-1)[tp_rank] + up_and_gate = torch.cat([up_current_rank, gate_current_rank], + dim=-1).transpose(1, 2) + param.data.copy_(up_and_gate) + else: + param.data.copy_(loaded_weight.transpose(1, 2)) + elif shard_id == 'w2': + # the shape of loaded_weight is + # (num_experts, moe_intermediate_size, hidden_size) + if self.tp_size > 1: + down_current_rank = loaded_weight.chunk(self.tp_size, + dim=1)[tp_rank] + param.data.copy_(down_current_rank.transpose(1, 2)) + else: + param.data.copy_(loaded_weight.transpose(1, 2)) + + +class MoELayer(nn.Module): + """ + Mixture of Experts (MoE) Layer for the AriaMoE model. + + This layer implements the MoE mechanism, which routes input tokens to + different experts based on a routing algorithm, processes them through the + experts, and then combines the outputs. + """ + + def __init__( + self, + config: AriaMoELMConfig, + quant_config: Optional[QuantizationConfig], + ) -> None: + super().__init__() + self.config = config + + self.router_weight = nn.Parameter( + torch.empty( + (self.config.moe_num_experts, self.config.hidden_size))) + + self.experts = AriaFusedMoE( + num_experts=config.moe_num_experts, + top_k=config.moe_topk, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + quant_config=quant_config, + reduce_results=True, + ) + self.shared_experts = LlamaMLP( + config.hidden_size, + config.moe_intermediate_size * config.moe_num_shared_experts, + "silu", + quant_config=quant_config, + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (torch.Tensor): Input tensor of shape (batch_size, + sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + """ + + router_output = torch.nn.functional.linear(hidden_states, + self.router_weight) + + shared_expert_output = self.shared_experts(hidden_states) + sparse_expert_output = self.experts(hidden_states, router_output) + + return sparse_expert_output + shared_expert_output + + +class MoEDecoderLayer(LlamaDecoderLayer): + """ + Custom Decoder Layer for the AriaMoE model which modifies the standard + `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of + Experts (MoE) Layer. + """ + + def __init__( + self, + config: LlamaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, cache_config, quant_config, prefix) + self.mlp = MoELayer(config, quant_config=quant_config) + + +class AriaMoELMModel(LlamaModel): + """ + Custom LlamaModel for the AriaMoE model which modifies the standard + LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + # FIXME: this is a hack to disable the compilation of the model + self.do_not_compile = True + + self.layers = None + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: MoEDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + + # Adapted from LlamaModel.load_weights with the modification of adding + # the expert weights mapping to `stacked_params_mapping` + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ("experts.w13_weight", "experts.fc1.weight", 'w13'), + ("experts.w2_weight", "experts.fc2.weight", 'w2'), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + if scale_name := get_compressed_tensors_cache_scale(name): + # Loading kv cache scales for compressed-tensors quantization + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = loaded_weight[0] + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +def build_mm_projector(config): + return AriaProjector( + patch_to_query_dict=config.projector_patch_to_query_dict, + embed_dim=config.vision_config.hidden_size, + num_heads=config.vision_config.num_attention_heads, + kv_dim=config.vision_config.hidden_size, + ff_dim=config.text_config.hidden_size, + output_dim=config.text_config.hidden_size, + ) + + +def get_max_multimodal_tokens(ctx): + return max(ctx.model_config.hf_config.image_size2tokens.values()) + + +def input_mapper_for_aria(ctx, data): + return MultiModalInputs(data) + + +def input_processor(ctx, llm_inputs): + multi_modal_data = llm_inputs.get("multi_modal_data") + # if it is pure text input, use it as is + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + + tokenizer = cached_get_tokenizer(model_config.tokenizer) + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + hf_config = model_config.hf_config + + # prepare image tokens, the max_image_size is used to determine the number + # of patch_size for every image + max_image_size = multi_modal_data.pop("max_image_size", 980) + _split_image = multi_modal_data.pop("split_image", False) + + assert isinstance(max_image_size, + (int, float)), "max_image_size should be float or int" + images = (multi_modal_data["image"] if isinstance( + multi_modal_data["image"], list) else [multi_modal_data["image"]]) + + image_inputs = image_processor.preprocess(images, + max_image_size=max_image_size, + split_image=_split_image, + return_tensors="pt").data + image_inputs['pixel_values'] = image_inputs['pixel_values'].to( + ctx.model_config.dtype) + num_crops = image_inputs.pop("num_crops") + + prompt_token_ids = llm_inputs["prompt_token_ids"] + if num_crops.sum().item() > 0: + _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( + tokenizer, + None, + prompt_token_ids, + placeholder_token_id=hf_config.image_token_index, + repeat_count=num_crops, + ) + + repeat_count = [hf_config.image_size2tokens[max_image_size] + ] * sum(num_crops).item() + new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( + tokenizer, + None, + prompt_token_ids, + placeholder_token_id=hf_config.image_token_index, + repeat_count=repeat_count, + ) + + return token_inputs( + prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data={"image": image_inputs}, + ) + + +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) +@INPUT_REGISTRY.register_input_processor(input_processor) +class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): + """ + Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language + model to perform tasks that involve both image and text inputs. + """ + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + # prepare the image_size to tokens mapping for the image preprocess, see + # input_processor + config.image_size2tokens = { + int(math.sqrt(k) * config.vision_config.patch_size): v + for k, v in config.projector_patch_to_query_dict.items() + } + self.config = config + self.vision_tower = AriaVisionModel(config.vision_config) + self.multi_modal_projector = build_mm_projector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AriaMoELMModel( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "language_model.model"), + ) + self.pad_token_id = (self.config.pad_token_id + if self.config.pad_token_id is not None else -1) + self.unpadded_vocab_size = config.text_config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.text_config.hidden_size, + org_num_embeddings=self.language_model.org_vocab_size, + quant_config=quant_config, + ) + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + self.vocab_size, logit_scale) + self.sampler = Sampler() + + def _validate_image_sizes( + self, images: List[torch.Tensor]) -> List[torch.Tensor]: + if not all(img.shape == images[0].shape for img in images): + raise ValueError("All images must be the same size") + return images + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[AriaImagePixelInputs]: + pixel_values = kwargs.pop("pixel_values", None) + pixel_mask = kwargs.pop("pixel_mask", None) + + if pixel_values is None: + return None + + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + pixel_values = self._validate_image_sizes(pixel_values) + pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + pixel_mask = flatten_bn(pixel_mask, concat=True) + + return AriaImagePixelInputs( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + ) + + def _process_image_input( + self, image_input: AriaImagePixelInputs + ) -> Tuple[torch.Tensor, torch.Tensor]: + assert self.vision_tower is not None + + pixel_values = image_input['pixel_values'] + pixel_mask = image_input['pixel_mask'] + + image_feature, image_attn_mask = self.vision_tower( + pixel_values, pixel_mask=pixel_mask) + return self.multi_modal_projector(image_feature, image_attn_mask) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + multimodal_embeddings = self._process_image_input(image_input) + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_index) + return inputs_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None + + hidden_states = self.language_model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "language_model.model": "language_model", + "language_model.lm_head": "lm_head", + }, + orig_to_new_suffix={ + "router.weight": "router_weight", + }, + ) + + loader = AutoWeightsLoader(self) + loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 789ffb4d3bde0..184f4b2bc1526 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -133,6 +133,7 @@ _MULTIMODAL_MODELS = { # [Decoder-only] + "AriaForConditionalGeneration": ("aria", "AriaForConditionalGeneration"), "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py new file mode 100644 index 0000000000000..d253da0d96a34 --- /dev/null +++ b/vllm/transformers_utils/configs/aria.py @@ -0,0 +1,47 @@ +from transformers.models.idefics2.configuration_idefics2 import ( + Idefics2VisionConfig) +from transformers.models.llama.configuration_llama import LlamaConfig + + +class AriaVisionConfig(Idefics2VisionConfig): + model_type = "aria_vision_model" + + +class AriaMoELMConfig(LlamaConfig): + """ + Configuration class for AriaMoE language model. + + This class extends the LlamaConfig to include additional parameters specific + to the Mixture of Experts (MoE) architecture. + """ + + model_type = "aria_moe_lm" + + def __init__( + self, + moe_intermediate_size: int = 4096, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + **kwargs, + ): + """ + Initialize the AriaMoELMConfig. + + Args: + moe_intermediate_size (int): The intermediate size for MoE layers. + Default is 4096. + moe_num_experts (int): The number of experts in the MoE layer. + Default is 8. + moe_topk (int): The number of top experts to route to for each + token. Default is 2. + moe_num_shared_experts (int): The number of shared experts. Default + is 2. + **kwargs: Additional keyword arguments to be passed to the parent + LlamaConfig. + """ + super().__init__(**kwargs) + self.moe_intermediate_size = moe_intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts From cf73f0c95e09836efff876d5bfd9b9c6cc1ba06e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 02:14:33 +0800 Subject: [PATCH 151/255] [Model] Enable optional prefix when loading embedding models (#10639) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/bert.py | 9 +++++---- vllm/model_executor/models/gemma2.py | 4 +++- vllm/model_executor/models/llama.py | 5 ++++- vllm/model_executor/models/qwen2.py | 12 ++++++------ vllm/model_executor/models/roberta.py | 3 ++- 5 files changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index f570d6d3c12b3..1fff72b3490e9 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -14,18 +14,17 @@ RowParallelLinear) from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, PoolingType) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) -from .utils import maybe_prefix +from .interfaces import SupportsCrossEncoding +from .utils import WeightsMapper, maybe_prefix class BertEmbedding(nn.Module): @@ -442,6 +441,8 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) self.model.load_weights(weights) def _build_model(self, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index fd8223dd9be1b..d229eb74669ee 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -42,7 +42,7 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, extract_layer_index, +from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -511,4 +511,6 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) self.model.load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 66b29e72cfa89..33d78d74129c8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -53,7 +53,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -689,6 +690,8 @@ def pooler( return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) self.model.load_weights(weights) def load_kv_cache_scales(self, quantization_param_path: str) -> None: diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 8da75c9935a13..46640226d4cf8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -50,7 +50,8 @@ from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -585,8 +586,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - loader = AutoWeightsLoader(self, - ignore_unexpected_prefixes=["lm_head."]) - return loader.load_weights(weights) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) + self.model.load_weights(weights) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 5a296e311f079..ba1a78ac640fd 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -11,13 +11,14 @@ VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel -from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.model_executor.models.utils import maybe_prefix from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) +from .interfaces import SupportsCrossEncoding + class RobertaEmbedding(nn.Module): From 1b583cfefad4ffa030bda1c1265aec6e7755a6d2 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 02:15:45 +0800 Subject: [PATCH 152/255] [Doc] Fix typos in docs (#10636) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.rst | 2 +- docs/source/serving/compatibility_matrix.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 7a6932d65e653..3f012284bfbff 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -365,7 +365,7 @@ Text Embedding .. note:: Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. - You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. + You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention despite being described otherwise on its model card. diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index a4300761d2635..fa03d2cde1486 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -393,7 +393,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - ✗ + - ? * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✅ - ✅ From 9db713a1dca7e1bc9b6ecf5303c63c7352c52a13 Mon Sep 17 00:00:00 2001 From: Shane A Date: Mon, 25 Nov 2024 14:26:40 -0800 Subject: [PATCH 153/255] [Model] Add OLMo November 2024 model (#10503) --- docs/source/models/supported_models.rst | 5 + tests/distributed/test_pipeline_parallel.py | 1 + tests/models/registry.py | 1 + vllm/model_executor/models/olmo2.py | 432 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 5 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/olmo2.py | 166 ++++++++ 8 files changed, 611 insertions(+), 2 deletions(-) create mode 100644 vllm/model_executor/models/olmo2.py create mode 100644 vllm/transformers_utils/configs/olmo2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3f012284bfbff..b5cbe6915d581 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -234,6 +234,11 @@ Text Generation - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. - - ✅︎ + * - :code:`OLMo2ForCausalLM` + - OLMo2 + - :code:`allenai/OLMo2-7B-1124`, etc. + - + - ✅︎ * - :code:`OLMoEForCausalLM` - OLMoE - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index c49ed9802cde8..386877e0e0a2c 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -167,6 +167,7 @@ def iter_params(self, model_name: str): "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), + "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(), "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), diff --git a/tests/models/registry.py b/tests/models/registry.py index 669c832b1df3a..865e90b3f8b0e 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -93,6 +93,7 @@ class _HfExamplesInfo: "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), + "Olmo2ForCausalLM": _HfExamplesInfo("shanearora/OLMo-7B-1124-hf"), "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"), "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py new file mode 100644 index 0000000000000..a35c911f90d96 --- /dev/null +++ b/vllm/model_executor/models/olmo2.py @@ -0,0 +1,432 @@ +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py +# Copyright 2024 The vLLM team. +# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only OLMo2 model compatible with HuggingFace weights.""" + +from functools import partial +from typing import Iterable, List, Optional, Tuple, Union + +import torch +from torch import nn + +from vllm.attention import Attention, AttentionMetadata +from vllm.config import VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.distributed.communication_op import tensor_model_parallel_all_gather +from vllm.distributed.parallel_state import get_tensor_model_parallel_rank +from vllm.distributed.utils import split_tensor_along_last_dim +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.models.utils import ( + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, + make_layers, maybe_prefix) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.olmo2 import Olmo2Config + + +class Olmo2Attention(nn.Module): + """ + This is the attention block where the output is computed as + ``Attention(LN(x))`` in ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + assert isinstance(self.config, Olmo2Config) + + hidden_size = self.config.hidden_size + self.tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = self.config.num_attention_heads + + assert hidden_size % self.total_num_heads == 0 + assert self.total_num_heads % self.tp_size == 0 + + self.num_heads = self.total_num_heads // self.tp_size + self.total_num_kv_heads = (self.config.num_key_value_heads + or self.total_num_heads) + if self.total_num_kv_heads >= self.tp_size: + assert self.total_num_kv_heads % self.tp_size == 0 + else: + assert self.tp_size % self.total_num_kv_heads == 0 + + self.num_kv_heads = max(1, self.total_num_kv_heads // self.tp_size) + self.head_dim = hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.max_position_embeddings = self.config.max_position_embeddings + self.rope_theta = self.config.rope_theta + + # Attention input projection. Projects x -> (q, k, v) + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.tp_rank = get_tensor_model_parallel_rank() + self.k_norm = RMSNorm( + self.total_num_kv_heads * self.head_dim, + eps=self.config.rms_norm_eps, + ) + self.q_norm = RMSNorm(self.config.hidden_size, + eps=self.config.rms_norm_eps) + + # Rotary embeddings. + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=self.max_position_embeddings, + base=self.rope_theta, # type: ignore + ) + self.scaling = self.head_dim**-0.5 + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=vllm_config.cache_config, + quant_config=vllm_config.quant_config, + prefix=prefix, + ) + + # Attention output projection. + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.o_proj", + ) + + def _apply_qk_norm(self, q: torch.Tensor, + k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + if self.tp_size > 1: + q = tensor_model_parallel_all_gather(q.contiguous()) + k = tensor_model_parallel_all_gather(k.contiguous()) + q = self.q_norm.forward_native(q) + k = self.k_norm.forward_native(k) + if self.tp_size > 1: + splitter = partial(split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + return q, k + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.chunk(chunks=3, dim=-1) + q, k = self._apply_qk_norm(q, k) + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) + output, _ = self.o_proj(attn_output) + return output + + +class Olmo2MLP(nn.Module): + """ + This is the MLP block where the output is computed as + ``MLP(x)`` in ``LN(MLP(x + LN(Attention(x))))`` + (plus another skip connection). + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + assert isinstance(config, Olmo2Config) + hidden_size = config.hidden_size + intermediate_size = config.intermediate_size + + # Feed-forward input projection. + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + + # Activation function. + self.act_fn = SiluAndMul() + + # Feed-forward output projection. + self.down_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=False, + quant_config=vllm_config.quant_config, + prefix=f"{prefix}.down_proj", + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Olmo2DecoderLayer(nn.Module): + """ + This is a typical transformer block where the output is + computed as ``MLP(LN(x + Attention(LN(x))))`` + (plus another skip connection). + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + assert isinstance(config, Olmo2Config) + # Attention block. + self.self_attn = Olmo2Attention(vllm_config=vllm_config, + prefix=f"{prefix}.self_attn") + + # MLP block. + self.mlp = Olmo2MLP(vllm_config=vllm_config, prefix=f"{prefix}.mlp") + + # LayerNorm + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + self.post_feedforward_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + # Attention block. + residual = hidden_states + hidden_states = self.self_attn(positions, hidden_states, kv_cache, + attn_metadata) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = hidden_states + residual + + # MLP block. + residual = hidden_states + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + return hidden_states + + +class Olmo2Model(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + assert isinstance(self.config, Olmo2Config) + + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + prefix=f"{prefix}.embed_tokens", + ) + self.start_layer, self.end_layer, self.layers = make_layers( + self.config.num_hidden_layers, + lambda prefix: Olmo2DecoderLayer(vllm_config=vllm_config, + prefix=prefix), + prefix=f"{prefix}.layers", + ) + self.norm = RMSNorm( + self.config.hidden_size, + eps=self.config.rms_norm_eps, + ) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory(["hidden_states"], + self.config.hidden_size)) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors], + ) -> Union[torch.Tensor, IntermediateTensors]: + """ + :param input_ids: A tensor of shape `(batch_size, seq_len)`. + """ + if get_pp_group().is_first_rank: + # Get embeddings of input. + # shape: (batch_size, seq_len, d_model) + inputs_embeds = self.embed_tokens(input_ids) + + # embed positions + hidden_states = inputs_embeds + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + assert isinstance(hidden_states, torch.Tensor) + + # Apply blocks one-by-one. + for i in range(self.start_layer, self.end_layer): + # shape: (batch_size, seq_len, d_model) + hidden_states = self.layers[i]( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({"hidden_states": hidden_states}) + + # Apply final layer norm. + # shape: (batch_size, seq_len or 1, d_model) + hidden_states = self.norm(hidden_states) + return hidden_states + + +class Olmo2ForCausalLM(nn.Module, SupportsPP): + """ + Extremely barebones HF model wrapper. + """ + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + assert isinstance(config, Olmo2Config) + self.config = config + self.model = Olmo2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=vllm_config.quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + self.sampler = Sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + params_dict = dict(self.named_parameters(remove_duplicate=False)) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + if is_pp_missing_parameter(name, self): + continue + # With tie_word_embeddings, we can skip lm_head.weight + # The weight might appear unnecessarily in the files if the model is + # processed with quantization, LoRA, fine-tuning, etc. + if self.config.tie_word_embeddings and "lm_head.weight" in name: + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader # type: ignore + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 184f4b2bc1526..f5a02a5b25ca2 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -74,6 +74,7 @@ "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), + "Olmo2ForCausalLM": ("olmo2", "Olmo2ForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 70d18d40b7aa7..4c096acdf2035 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -28,8 +28,8 @@ MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, - RWConfig, SolarConfig, - UltravoxConfig) + Olmo2Config, RWConfig, + SolarConfig, UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -62,6 +62,7 @@ "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, + "olmo2": Olmo2Config, "solar": SolarConfig, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index d1e19c9a33c24..4c721001d8434 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -15,6 +15,7 @@ from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config +from vllm.transformers_utils.configs.olmo2 import Olmo2Config from vllm.transformers_utils.configs.solar import SolarConfig from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -33,6 +34,7 @@ "MLPSpeculatorConfig", "NemotronConfig", "NVLM_D_Config", + "Olmo2Config", "SolarConfig", "UltravoxConfig", ] \ No newline at end of file diff --git a/vllm/transformers_utils/configs/olmo2.py b/vllm/transformers_utils/configs/olmo2.py new file mode 100644 index 0000000000000..0e6d8e4879b06 --- /dev/null +++ b/vllm/transformers_utils/configs/olmo2.py @@ -0,0 +1,166 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# Copied from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/configuration_olmo2.py +"""OLMo 2 configuration.""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class Olmo2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the [allenai/Olmo2-7B-1124-hf](https://huggingface.co/allenai/Olmo2-7B-1124-hf). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50304): + Vocabulary size of the Olmo2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Olmo2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 50279): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + + ```python + >>> from transformers import Olmo2Model, Olmo2Config + + >>> # Initializing a Olmo2 7B style configuration + >>> configuration = Olmo2Config() + + >>> # Initializing a model from the Olmo2 7B style configuration + >>> model = Olmo2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "olmo2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=50304, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + use_cache=True, + pad_token_id=1, + bos_token_id=None, + eos_token_id=50279, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + rms_norm_eps=1e-5, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + self.rms_norm_eps = rms_norm_eps + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") From 6e9ff050c8e83ad6d5e5eab621e83549e35933a1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Nov 2024 17:04:50 -0800 Subject: [PATCH 154/255] [misc] do not read HOST_IP (#10644) Signed-off-by: youkaichao --- vllm/envs.py | 2 +- vllm/executor/ray_gpu_executor.py | 4 ++-- vllm/executor/ray_hpu_executor.py | 4 ++-- vllm/utils.py | 7 +++++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 14c1617f1be19..c896770e5f6bc 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -153,7 +153,7 @@ def get_default_config_root(): # If you are using multi-node inference, you should set this differently # on each node. 'VLLM_HOST_IP': - lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""), + lambda: os.getenv('VLLM_HOST_IP', ""), # used in distributed environment to manually set the communication port # Note: if VLLM_PORT is set, and some code asks for multiple ports, the diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 810b0f06ff7b2..6542b18ae70b1 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -216,8 +216,8 @@ def sort_by_driver_then_worker_ip(worker): f"Every node should have a unique IP address. Got {n_nodes}" f" nodes with node ids {list(node_workers.keys())} and " f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" + " network configuration. If you set `VLLM_HOST_IP`" + " environment variable, make sure it is unique for" " each node.") VLLM_INSTANCE_ID = get_vllm_instance_id() diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 6fe8c6c403358..a74328e5aa272 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -192,8 +192,8 @@ def sort_by_driver_then_worker_ip(worker): f"Every node should have a unique IP address. Got {n_nodes}" f" nodes with node ids {list(node_workers.keys())} and " f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" + " network configuration. If you set `VLLM_HOST_IP` " + "environment variable, make sure it is unique for" " each node.") VLLM_INSTANCE_ID = get_vllm_instance_id() diff --git a/vllm/utils.py b/vllm/utils.py index dd4283e3ac381..bec876d983701 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -467,6 +467,13 @@ async def collect_from_async_generator( def get_ip() -> str: host_ip = envs.VLLM_HOST_IP + if "HOST_IP" in os.environ and "VLLM_HOST_IP" not in os.environ: + logger.warning( + "The environment variable HOST_IP is deprecated and ignored, as" + " it is often used by Docker and other software to" + "interact with the container's network stack. Please" + "use VLLM_HOST_IP instead to set the IP address for vLLM processes" + " to communicate with each other.") if host_ip: return host_ip From 45ac4ff270b267765457159c0b75e1bb7ebf6d79 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 25 Nov 2024 18:32:09 -0800 Subject: [PATCH 155/255] [bugfix] fix aria model and add torch.compile (#10645) Signed-off-by: youkaichao --- vllm/model_executor/models/aria.py | 26 ++++---------------------- vllm/model_executor/models/llama.py | 16 ++++++++++------ 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 0356435e9c257..fa6b95f5481ad 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -29,7 +29,7 @@ LlamaModel) from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, - make_layers, maybe_prefix, + maybe_prefix, merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs @@ -363,27 +363,9 @@ class AriaMoELMModel(LlamaModel): """ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - # FIXME: this is a hack to disable the compilation of the model - self.do_not_compile = True - - self.layers = None - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: MoEDecoderLayer( - config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix, - ), - prefix=f"{prefix}.layers", - ) + super().__init__(vllm_config=vllm_config, + prefix=prefix, + layer_type=MoEDecoderLayer) # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 33d78d74129c8..355b2f3ef8b28 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type, Union import torch from torch import nn @@ -273,7 +273,11 @@ def forward( @support_torch_compile class LlamaModel(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + layer_type: Type[LlamaDecoderLayer] = LlamaDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -299,10 +303,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_tokens = PPMissingLayer() self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: LlamaDecoderLayer(config=config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + lambda prefix: layer_type(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix), prefix=f"{prefix}.layers", ) if get_pp_group().is_last_rank: From a6760f6456b714409685e23301c820a85da856ca Mon Sep 17 00:00:00 2001 From: Sanket Kale Date: Tue, 26 Nov 2024 08:02:39 +0530 Subject: [PATCH 156/255] [Feature] vLLM ARM Enablement for AARCH64 CPUs (#9228) Signed-off-by: Sanket Kale Co-authored-by: Sanket Kale Co-authored-by: mgoin --- Dockerfile.arm | 62 +++ cmake/cpu_extension.cmake | 33 +- csrc/cpu/attention.cpp | 18 +- csrc/cpu/cpu_types.hpp | 6 +- csrc/cpu/cpu_types_arm.hpp | 515 ++++++++++++++++++ .../getting_started/arm-installation.rst | 50 ++ docs/source/index.rst | 1 + examples/offline_inference.py | 2 +- requirements-cpu.txt | 7 +- 9 files changed, 678 insertions(+), 16 deletions(-) create mode 100644 Dockerfile.arm create mode 100644 csrc/cpu/cpu_types_arm.hpp create mode 100644 docs/source/getting_started/arm-installation.rst diff --git a/Dockerfile.arm b/Dockerfile.arm new file mode 100644 index 0000000000000..093ee2209222f --- /dev/null +++ b/Dockerfile.arm @@ -0,0 +1,62 @@ +# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. + +FROM ubuntu:22.04 AS cpu-test-arm + +ENV CCACHE_DIR=/root/.cache/ccache + +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update -y \ + && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores + +# Set LD_PRELOAD for tcmalloc on ARM +ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +WORKDIR /workspace + +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + pip install --upgrade pip && \ + pip install -r requirements-build.txt + +FROM cpu-test-arm AS build + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ + pip install -v -r requirements-cpu.txt + +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +# Disabling AVX512 specific optimizations for ARM +ARG VLLM_CPU_DISABLE_AVX512="true" +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ + pip install dist/*.whl && \ + rm -rf dist + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 426189481575b..68f7ca1af05ad 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -16,16 +16,15 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # # Check the compile flags # -if (CMAKE_SYSTEM_PROCESSOR STREQUAL "ppc64le") - list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" - "-DVLLM_CPU_EXTENSION") -else() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") list(APPEND CXX_COMPILE_FLAGS - "-fopenmp" "-mf16c" - "-DVLLM_CPU_EXTENSION") + ) endif() +list(APPEND CXX_COMPILE_FLAGS + "-fopenmp" + "-DVLLM_CPU_EXTENSION") execute_process(COMMAND cat /proc/cpuinfo RESULT_VARIABLE CPUINFO_RET @@ -59,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) +find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support +find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -78,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) else() message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() + elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") + elseif (POWER9_FOUND OR POWER10_FOUND) message(STATUS "PowerPC detected") # Check for PowerPC VSX support @@ -88,8 +91,20 @@ elseif (POWER9_FOUND OR POWER10_FOUND) "-mvsx" "-mcpu=native" "-mtune=native") + +elseif (ASIMD_FOUND) + message(STATUS "ARMv8 or later architecture detected") + if(ARM_BF16_FOUND) + message(STATUS "BF16 extension detected") + set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") + add_compile_definitions(ARM_BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") + set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") + endif() + list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() # @@ -159,4 +174,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e6c03dcb034fd..e21832ba7582f 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -51,6 +51,10 @@ struct KernelVecType { using v_load_vec_type = vec_op::BF16Vec16; }; #else + #ifdef __aarch64__ + #ifndef ARM_BF16_SUPPORT + // pass + #else template <> struct KernelVecType { using q_load_vec_type = vec_op::BF16Vec8; @@ -60,6 +64,18 @@ struct KernelVecType { using qk_acc_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::BF16Vec16; }; + #endif + #else +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; + #endif #endif template @@ -779,4 +795,4 @@ void paged_attention_v2( CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) }); -} +} \ No newline at end of file diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 0213be09105ed..28db0479748bf 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -1,4 +1,3 @@ - #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP @@ -8,8 +7,11 @@ #elif defined(__POWER9_VECTOR__) //ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__aarch64__) + //arm implementation + #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" #endif -#endif +#endif \ No newline at end of file diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp new file mode 100644 index 0000000000000..73e0f8cb2e0fb --- /dev/null +++ b/csrc/cpu/cpu_types_arm.hpp @@ -0,0 +1,515 @@ +#include +#include +#include + +namespace vec_op { + +#ifdef ARM_BF16_SUPPORT + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) +#endif + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { + template + constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); + }; +}; + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + float16x8_t reg; + + explicit FP16Vec8(const void *ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; + + explicit FP16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { + vst1q_f16(static_cast<__fp16 *>(ptr), reg); + } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void *ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void *ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void *ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + } + + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + for (int i = 0; i < remainder; ++i) { + reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + } + } + } +}; + + +#ifdef ARM_BF16_SUPPORT +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + bfloat16x8_t reg; + + explicit BF16Vec8(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; + + explicit BF16Vec8(const FP32Vec8 &); + + explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + bfloat16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; + + explicit BF16Vec16(const FP32Vec16 &); + + explicit BF16Vec16(float32x4x4_t v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) + }){}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; +}; + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + bfloat16x8x4_t reg; + + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + + union AliasReg { + float32x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4_t reg; + + explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {}; + + explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; + + explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + + explicit FP32Vec4(float32x4_t data) : reg(data) {}; + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + float32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x2_t reg; + + explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {}; + + explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; + + explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + + explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + + explicit FP32Vec8(const FP16Vec8 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; + + explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + + #ifdef ARM_BF16_SUPPORT + + explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + + explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + + #endif + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])}; + float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])}; + float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])}; + float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1); + float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])}; + float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])}; + float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])}; + float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1); + float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + + float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); + float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + float32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x4_t reg; + + explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + + explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + + explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + + explicit FP32Vec16(float32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + + #ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) : reg({ + vcvtq_low_f32_bf16(v.val[0]), + vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), + vcvtq_high_f32_bf16(v.val[1]) + }) {}; + #endif + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + }; + + #ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16 &v) : reg({ + vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1]) + }) {}; + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; + #endif + + explicit FP32Vec16(const FP16Vec16 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); + }; + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3]) + })); + }; + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3]) + })); + }; + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + }; + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float answer = 0; + const int start = idx * group_size; + unroll_loop( + [&answer, &start, ar](int i) { answer += ar.values[start + i]; }); + + return answer; + }; + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + vst1q_f32(ptr + 8, reg.val[2]); + vst1q_f32(ptr + 12, reg.val[3]); + }; +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = FP16Vec8; }; + +#ifdef ARM_BF16_SUPPORT +template <> struct VecType { using vec_type = BF16Vec8; }; +#endif + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<__fp16 *>(ptr) = v; +} + +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); + + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); +}; + +inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); + + reg = vcombine_f16(lower_half, upper_half); +}; + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); +}; + +#ifdef ARM_BF16_SUPPORT +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); + float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); + float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); + float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1])); + + float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0])); + float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0])); + float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1])); + float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1])); + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); +}; +#endif + +#ifdef ARM_BF16_SUPPORT +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) + }){}; +#endif + +inline void prefetch(const void *addr) { + __builtin_prefetch(addr, 0, 1); +}; + +#ifdef ARM_BF16_SUPPORT +template <> +inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +}; +#endif +}; \ No newline at end of file diff --git a/docs/source/getting_started/arm-installation.rst b/docs/source/getting_started/arm-installation.rst new file mode 100644 index 0000000000000..7b457df92c11d --- /dev/null +++ b/docs/source/getting_started/arm-installation.rst @@ -0,0 +1,50 @@ +.. _installation_arm: + +Installation for ARM CPUs +========================= + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: + +* CPU backend inference capabilities +* Relevant runtime environment variables +* Performance optimization tips + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. +Contents: + +1. :ref:`Requirements ` +2. :ref:`Quick Start with Dockerfile ` +3. :ref:`Building from Source ` + +.. _arm_backend_requirements: + +Requirements +------------ + +* **Operating System**: Linux or macOS +* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +* **Instruction Set Architecture (ISA)**: NEON support is required + +.. _arm_backend_quick_start_dockerfile: + +Quick Start with Dockerfile +--------------------------- + +You can quickly set up vLLM on ARM using Docker: + +.. code-block:: console + + $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env + +.. _build_arm_backend_from_source: + +Building from Source +-------------------- + +To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/docs/source/index.rst b/docs/source/index.rst index c2afd806c50f9..0692e949f1c77 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -67,6 +67,7 @@ Documentation getting_started/openvino-installation getting_started/cpu-installation getting_started/gaudi-installation + getting_started/arm-installation getting_started/neuron-installation getting_started/tpu-installation getting_started/xpu-installation diff --git a/examples/offline_inference.py b/examples/offline_inference.py index 9b758fa2479f6..23cc6e8539431 100644 --- a/examples/offline_inference.py +++ b/examples/offline_inference.py @@ -19,4 +19,4 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 749b03a0603d8..db8ad9d3a015d 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -1,6 +1,7 @@ # Common dependencies -r requirements-common.txt -# Dependencies for x86_64 CPUs -torch == 2.5.1+cpu; platform_machine != "ppc64le" -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +# Dependencies for CPUs +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" +torch==2.5.1; platform_machine == "aarch64" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch \ No newline at end of file From 519e8e4182af8e25d78b062ba5e613df661e6e5d Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Mon, 25 Nov 2024 21:09:43 -0800 Subject: [PATCH 157/255] [v1] EngineArgs for better config handling for v1 (#10382) Signed-off-by: rickyx --- .buildkite/test-pipeline.yaml | 2 +- tests/v1/engine/test_async_llm.py | 3 ++ tests/v1/engine/test_engine_args.py | 42 +++++++++++++++++ tests/v1/engine/test_engine_core.py | 3 +- tests/v1/engine/test_engine_core_client.py | 6 ++- vllm/engine/arg_utils.py | 53 ++++++++++++++++++++-- vllm/engine/async_llm_engine.py | 2 +- vllm/engine/llm_engine.py | 2 +- vllm/engine/multiprocessing/engine.py | 2 +- vllm/entrypoints/openai/api_server.py | 4 +- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 13 ------ vllm/v1/engine/llm_engine.py | 2 +- 13 files changed, 109 insertions(+), 27 deletions(-) create mode 100644 tests/v1/engine/test_engine_args.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff33d35b423e..fc23c9cff0d87 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -172,7 +172,7 @@ steps: - vllm/ - tests/v1 commands: - - pytest -v -s v1 + - VLLM_USE_V1=1 pytest -v -s v1 - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 1f26fe0fc892f..fffb5b8100ec7 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -32,6 +32,9 @@ async def generate(engine: AsyncLLM, request_id: str, @pytest.mark.asyncio async def test_load(monkeypatch): + # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 + # so that in the future when we switch, we don't have to change all the + # tests. with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py new file mode 100644 index 0000000000000..69cfdf5a395c1 --- /dev/null +++ b/tests/v1/engine/test_engine_args.py @@ -0,0 +1,42 @@ +import pytest + +from vllm import envs +from vllm.config import VllmConfig +from vllm.engine.arg_utils import EngineArgs +from vllm.usage.usage_lib import UsageContext + +if not envs.VLLM_USE_V1: + pytest.skip( + "Skipping V1 tests. Rerun with `VLLM_USE_V1=1` to test.", + allow_module_level=True, + ) + + +def test_defaults(): + engine_args = EngineArgs(model="facebook/opt-125m") + + # Assert V1 defaults + assert (engine_args.enable_prefix_caching + ), "V1 turns on prefix caching by default" + + +def test_defaults_with_usage_context(): + engine_args = EngineArgs(model="facebook/opt-125m") + vllm_config: VllmConfig = engine_args.create_engine_config( + UsageContext.LLM_CLASS) + + assert vllm_config.scheduler_config.max_num_seqs == 1024 + assert vllm_config.scheduler_config.max_num_batched_tokens == 8192 + + engine_args = EngineArgs(model="facebook/opt-125m") + vllm_config = engine_args.create_engine_config( + UsageContext.OPENAI_API_SERVER) + assert vllm_config.scheduler_config.max_num_seqs == 1024 + assert vllm_config.scheduler_config.max_num_batched_tokens == 2048 + + +def test_prefix_cache_disabled_with_multimodel(): + engine_args = EngineArgs(model="llava-hf/llava-1.5-7b-hf") + + vllm_config = engine_args.create_engine_config(UsageContext.LLM_CLASS) + assert not vllm_config.cache_config.enable_prefix_caching diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index b3692b594326a..bd11ff1877064 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -43,7 +43,8 @@ def test_engine_core(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index e248e35ae4069..582192196aaf9 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -82,7 +82,8 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( vllm_config, @@ -153,7 +154,8 @@ async def test_engine_core_client_asyncio(monkeypatch): m.setenv("VLLM_USE_V1", "1") engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config( + usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( vllm_config, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ca68c1d57151c..60ad5ee54a2f2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file +from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean if TYPE_CHECKING: @@ -113,7 +114,7 @@ class EngineArgs: # NOTE(kzawora): default block size for Gaudi should be 128 # smaller sizes still work, but very inefficiently block_size: int = 16 if not current_platform.is_hpu() else 128 - enable_prefix_caching: bool = False + enable_prefix_caching: Optional[bool] = None disable_sliding_window: bool = False use_v2_block_manager: bool = True swap_space: float = 4 # GiB @@ -197,6 +198,11 @@ def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model + # Override the default value of enable_prefix_caching if it's not set + # by user. + if self.enable_prefix_caching is None: + self.enable_prefix_caching = bool(envs.VLLM_USE_V1) + # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object @@ -953,7 +959,12 @@ def create_load_config(self) -> LoadConfig: ignore_patterns=self.ignore_patterns, ) - def create_engine_config(self) -> VllmConfig: + def create_engine_config(self, + usage_context: Optional[UsageContext] = None + ) -> VllmConfig: + if envs.VLLM_USE_V1: + self._override_v1_engine_args(usage_context) + # gguf file needs a specific model loader and doesn't use hf_repo if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" @@ -1170,7 +1181,7 @@ def create_engine_config(self) -> VllmConfig: or "all" in detailed_trace_modules, ) - return VllmConfig( + config = VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, @@ -1185,6 +1196,42 @@ def create_engine_config(self) -> VllmConfig: compilation_config=self.compilation_config, ) + if envs.VLLM_USE_V1: + self._override_v1_engine_config(config) + return config + + def _override_v1_engine_args(self, usage_context: UsageContext) -> None: + """ + Override the EngineArgs's args based on the usage context for V1. + """ + assert envs.VLLM_USE_V1, "V1 is not enabled" + + if self.max_num_batched_tokens is None: + # When no user override, set the default values based on the + # usage context. + if usage_context == UsageContext.LLM_CLASS: + logger.warning("Setting max_num_batched_tokens to 8192 " + "for LLM_CLASS usage context.") + self.max_num_seqs = 1024 + self.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + logger.warning("Setting max_num_batched_tokens to 2048 " + "for OPENAI_API_SERVER usage context.") + self.max_num_seqs = 1024 + self.max_num_batched_tokens = 2048 + + def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: + """ + Override the EngineConfig's configs based on the usage context for V1. + """ + assert envs.VLLM_USE_V1, "V1 is not enabled" + # TODO (ywang96): Enable APC by default when VLM supports it. + if engine_config.model_config.is_multimodal_model: + logger.warning( + "Prefix caching is currently not supported for multimodal " + "models and has been disabled.") + engine_config.cache_config.enable_prefix_caching = False + @dataclass class AsyncEngineArgs(EngineArgs): diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5a5388708b1c6..3224577c567f8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -680,7 +680,7 @@ def from_engine_args( """Creates an async LLM engine from the engine arguments.""" # Create the engine configs. if engine_config is None: - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index fb21b2dedeb74..a4975cece9a81 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -568,7 +568,7 @@ def from_engine_args( ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. engine = cls( diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 7de23643a2e1c..49a90b321dac4 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -111,7 +111,7 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, from vllm.plugins import load_general_plugins load_general_plugins() - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = LLMEngine._get_executor_cls(engine_config) use_async_sockets = engine_config.model_config.use_async_output_proc diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bc018be982bff..6bc31ef83ded4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -135,8 +135,8 @@ async def build_async_engine_client_from_engine_args( # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): - - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config( + UsageContext.OPENAI_API_SERVER) uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), "uses_ray", False) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index c44ebb2a85ba0..a17c8eac4b77c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -94,7 +94,7 @@ def from_engine_args( # Create the engine configs. if engine_config is None: - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config(usage_context) else: vllm_config = engine_config diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 1a978fbe7355f..34f99dd30ef2e 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -41,19 +41,6 @@ def __init__( executor_class: Type[GPUExecutor], usage_context: UsageContext, ): - # Override the configs for V1. - # FIXME - if usage_context == UsageContext.LLM_CLASS: - vllm_config.scheduler_config.max_num_seqs = 1024 - vllm_config.scheduler_config.max_num_batched_tokens = 8192 - elif usage_context == UsageContext.OPENAI_API_SERVER: - vllm_config.scheduler_config.max_num_seqs = 1024 - vllm_config.scheduler_config.max_num_batched_tokens = 2048 - - # TODO (ywang96): Enable APC by default when VLM supports it. - if not vllm_config.model_config.is_multimodal_model: - vllm_config.cache_config.enable_prefix_caching = True - assert vllm_config.model_config.task != "embedding" logger.info("Initializing an LLM engine (v%s) with config: %s", diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 75a77be750acd..7a5482f03b6fa 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -82,7 +82,7 @@ def from_engine_args( """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - vllm_config = engine_args.create_engine_config() + vllm_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(vllm_config) if VLLM_ENABLE_V1_MULTIPROCESSING: From 9a88f897993a83fad79d1bf6b95595be25a8d68a Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Tue, 26 Nov 2024 00:00:16 -0600 Subject: [PATCH 158/255] custom allreduce + torch.compile (#10121) Signed-off-by: youkaichao Co-authored-by: youkaichao --- docs/source/getting_started/debugging.rst | 1 - tests/distributed/test_pynccl.py | 15 +-- tests/distributed/test_utils.py | 2 - .../device_communicators/pynccl.py | 26 ++--- vllm/distributed/parallel_state.py | 110 ++++++------------ vllm/v1/worker/gpu_model_runner.py | 6 +- 6 files changed, 59 insertions(+), 101 deletions(-) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 77bf550601346..0c1afcbd7c0b9 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -86,7 +86,6 @@ If GPU/CPU communication cannot be established, you can use the following Python from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) - pynccl.disabled = False s = torch.cuda.Stream() with torch.cuda.stream(s): diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index f702d7c46ea73..fb24d6bc2c100 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -60,7 +60,7 @@ def worker_fn(): tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): - pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) result = tensor.mean().cpu().item() assert result == pynccl_comm.world_size @@ -84,12 +84,12 @@ def multiple_allreduce_worker_fn(): with pynccl_comm.change_state(enable=True): # two groups can communicate independently if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.all_reduce(tensor) - pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) result = tensor.mean().cpu().item() assert result == 4 else: - pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) result = tensor.mean().cpu().item() assert result == 2 @@ -140,14 +140,11 @@ def worker_fn_with_cudagraph(): with torch.cuda.graph( graph, stream=pynccl_comm.stream), pynccl_comm.change_state( enable=True): - # operation during the graph capture is recorded but not executed - # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa - pynccl_comm.all_reduce(a) + a_out = pynccl_comm.all_reduce(a) pynccl_comm.stream.synchronize() - assert a.mean().cpu().item() == pynccl_comm.world_size**0 graph.replay() pynccl_comm.stream.synchronize() - assert a.mean().cpu().item() == pynccl_comm.world_size**1 + assert a_out.mean().cpu().item() == pynccl_comm.world_size**1 @worker_fn_wrapper diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 686b697c98e03..5fb1ae7b29fd2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -70,14 +70,12 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): rank=rank, world_size=WORLD_SIZE) pynccl1 = PyNcclCommunicator(pg1, device=rank) - pynccl1.disabled = False if rank <= 2: pg2 = StatelessProcessGroup.create(host="127.0.0.1", port=port2, rank=rank, world_size=3) pynccl2 = PyNcclCommunicator(pg2, device=rank) - pynccl2.disabled = False data = torch.tensor([rank]).cuda() pynccl1.all_reduce(data) pg1.barrier() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 7411304eb18fa..d4e3f81747038 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -106,30 +106,30 @@ def __init__( self.stream.synchronize() del data - # by default it is disabled, e.g. in profiling models and prefill phase. - # to use it, use under `with obj.change_state(enable=True)`, usually - # when we are using CUDA graph. - self.disabled = True - def all_reduce(self, - tensor: torch.Tensor, + in_tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM, - stream=None): + stream=None) -> torch.Tensor: if self.disabled: - return + return None # nccl communicator created on a specific device # will only work on tensors on the same device # otherwise it will cause "illegal memory access" - assert tensor.device == self.device, ( + assert in_tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " - f"but the input tensor is on {tensor.device}") + f"but the input tensor is on {in_tensor.device}") + + out_tensor = torch.empty_like(in_tensor) + if stream is None: stream = self.stream - self.nccl.ncclAllReduce(buffer_type(tensor.data_ptr()), - buffer_type(tensor.data_ptr()), tensor.numel(), - ncclDataTypeEnum.from_torch(tensor.dtype), + self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()), + buffer_type(out_tensor.data_ptr()), + in_tensor.numel(), + ncclDataTypeEnum.from_torch(in_tensor.dtype), ncclRedOpTypeEnum.from_torch(op), self.comm, cudaStream_t(stream.cuda_stream)) + return out_tensor def all_gather(self, output_tensor: torch.Tensor, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 87ade377266a2..ccbe00386c5da 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -96,42 +96,24 @@ def _register_group(group: "GroupCoordinator") -> None: _groups[group.unique_name] = weakref.ref(group) -if supports_custom_op(): - - def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None: - assert group_name in _groups, f"Group {group_name} is not found." - group = _groups[group_name]() - if group is None: - raise ValueError(f"Group {group_name} is destroyed.") - group._all_reduce_in_place(tensor) - - def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None: - return +def all_reduce(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + assert group_name in _groups, f"Group {group_name} is not found." + group = _groups[group_name]() + if group is None: + raise ValueError(f"Group {group_name} is destroyed.") + return group._all_reduce_out_place(tensor) - direct_register_custom_op( - op_name="inplace_all_reduce", - op_func=inplace_all_reduce, - mutates_args=["tensor"], - fake_impl=inplace_all_reduce_fake, - ) - def outplace_all_reduce(tensor: torch.Tensor, - group_name: str) -> torch.Tensor: - assert group_name in _groups, f"Group {group_name} is not found." - group = _groups[group_name]() - if group is None: - raise ValueError(f"Group {group_name} is destroyed.") - return group._all_reduce_out_place(tensor) +def all_reduce_fake(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + return torch.empty_like(tensor) - def outplace_all_reduce_fake(tensor: torch.Tensor, - group_name: str) -> torch.Tensor: - return torch.empty_like(tensor) +if supports_custom_op(): direct_register_custom_op( - op_name="outplace_all_reduce", - op_func=outplace_all_reduce, + op_name="all_reduce", + op_func=all_reduce, mutates_args=[], - fake_impl=outplace_all_reduce_fake, + fake_impl=all_reduce_fake, ) @@ -317,30 +299,13 @@ def graph_capture( stream.wait_stream(curr_stream) with torch.cuda.stream(stream), maybe_ca_context: - # In graph mode, we have to be very careful about the collective - # operations. The current status is: - # allreduce \ Mode | Eager | Graph | - # -------------------------------------------- - # custom allreduce | enabled | enabled | - # PyNccl | disabled| enabled | - # torch.distributed | enabled | disabled| - # - # Note that custom allreduce will have a runtime check, if the - # tensor size is too large, it will fallback to the next - # available option. - # In summary: When using CUDA graph, we use - # either custom all-reduce kernel or pynccl. When not using - # CUDA graph, we use either custom all-reduce kernel or - # PyTorch NCCL. We always prioritize using custom all-reduce - # kernel but fall back to PyTorch or pynccl if it is - # disabled or not supported. pynccl_comm = self.pynccl_comm maybe_pynccl_context: Any if not pynccl_comm: maybe_pynccl_context = nullcontext() else: maybe_pynccl_context = pynccl_comm.change_state( - enable=True, stream=torch.cuda.current_stream()) + stream=torch.cuda.current_stream()) with maybe_pynccl_context: yield graph_capture_context @@ -356,8 +321,8 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: coordinator. In addition, PyTorch custom ops do not support mutation or returning - a new tensor in the same op. So we need to figure out if the op is - in-place or out-of-place ahead of time. + a new tensor in the same op. So we always make the all-reduce operation + out-of-place. """ # Bypass the function if we are using only 1 GPU. if self.world_size == 1: @@ -368,10 +333,6 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: ipex.distributed.all_reduce(input_, group=self.device_group) return input_ - if not supports_custom_op(): - self._all_reduce_in_place(input_) - return input_ - if self.tpu_communicator is not None and \ not self.tpu_communicator.disabled: # TPU handles Dynamo with its own logic. @@ -385,30 +346,31 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: not self.xpu_communicator.disabled: return self.xpu_communicator.all_reduce(input_) - if self.ca_comm is not None and \ - not self.ca_comm.disabled and \ - self.ca_comm.should_custom_ar(input_): - return torch.ops.vllm.outplace_all_reduce( - input_, group_name=self.unique_name) - else: - torch.ops.vllm.inplace_all_reduce(input_, - group_name=self.unique_name) - return input_ + return torch.ops.vllm.all_reduce(input_, group_name=self.unique_name) def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor: + # always try custom allreduce first, + # and then pynccl. ca_comm = self.ca_comm - assert ca_comm is not None - assert not ca_comm.disabled - out = ca_comm.custom_all_reduce(input_) - assert out is not None - return out - - def _all_reduce_in_place(self, input_: torch.Tensor) -> None: + if ca_comm is not None and not ca_comm.disabled and \ + ca_comm.should_custom_ar(input_): + out = ca_comm.custom_all_reduce(input_) + assert out is not None + return out pynccl_comm = self.pynccl_comm - if (pynccl_comm is not None and not pynccl_comm.disabled): - pynccl_comm.all_reduce(input_) - else: - torch.distributed.all_reduce(input_, group=self.device_group) + assert pynccl_comm is not None + # TODO: pynccl should not use `stream=` + # it can just always use the current stream. + out = pynccl_comm.all_reduce(input_, + stream=torch.cuda.current_stream()) + if out is None: + # fall back to the default all-reduce using PyTorch. + # this usually happens during testing. + # when we run the model, allreduce only happens for the TP + # group, where we always have either custom allreduce or pynccl. + out = input_.clone() + torch.distributed.all_reduce(out, group=self.device_group) + return out def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: world_size = self.world_size diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 02f9498142bb7..13cbc8fa39c03 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -10,6 +10,7 @@ from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig +from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -570,8 +571,9 @@ def capture_model(self) -> None: # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. - for num_tokens in reversed(self.cudagraph_batch_sizes): - self._dummy_run(self.model, num_tokens, self.kv_caches) + with graph_capture(): + for num_tokens in reversed(self.cudagraph_batch_sizes): + self._dummy_run(self.model, num_tokens, self.kv_caches) end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] From 940635343a087a5fb6548449989b84de77af5e73 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 26 Nov 2024 14:55:00 +0800 Subject: [PATCH 159/255] [Misc] Remove outdated init protocols (#10655) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/interfaces.py | 30 ------------------- vllm/model_executor/models/interfaces_base.py | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 4f0c75b2c6a57..9b4a97abf9b51 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -10,7 +10,6 @@ from .interfaces_base import is_embedding_model if TYPE_CHECKING: - from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig from vllm.sequence import IntermediateTensors logger = init_logger(__name__) @@ -29,9 +28,6 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ - def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: - ... - # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead @@ -39,9 +35,6 @@ def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: class _SupportsMultiModalType(Protocol): supports_multimodal: Literal[True] - def __call__(self, *, multimodal_config: "MultiModalConfig") -> None: - ... - @overload def supports_multimodal( @@ -81,10 +74,6 @@ class SupportsLoRA(Protocol): embedding_modules: ClassVar[Dict[str, str]] embedding_padding_modules: ClassVar[List[str]] - # lora_config is None when LoRA is not enabled - def __init__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: - ... - # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead @@ -97,9 +86,6 @@ class _SupportsLoRAType(Protocol): embedding_modules: Dict[str, str] embedding_padding_modules: List[str] - def __call__(self, *, lora_config: Optional["LoRAConfig"] = None) -> None: - ... - @overload def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: @@ -276,21 +262,11 @@ class HasInnerState(Protocol): for max_num_seqs, etc. True for e.g. both Mamba and Jamba. """ - def __init__(self, - *, - scheduler_config: Optional["SchedulerConfig"] = None) -> None: - ... - @runtime_checkable class _HasInnerStateType(Protocol): has_inner_state: ClassVar[Literal[True]] - def __init__(self, - *, - scheduler_config: Optional["SchedulerConfig"] = None) -> None: - ... - @overload def has_inner_state(model: object) -> TypeIs[HasInnerState]: @@ -323,17 +299,11 @@ class IsAttentionFree(Protocol): True for Mamba but not Jamba. """ - def __init__(self) -> None: - ... - @runtime_checkable class _IsAttentionFreeType(Protocol): is_attention_free: ClassVar[Literal[True]] - def __init__(self) -> None: - ... - @overload def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 7bb43beff255c..957a5a6e26b5c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -71,7 +71,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: and issubclass(model, nn.Module)): logger.warning( "The model (%s) is missing " - "vLLM-specific keywords from its initializer: %s", + "vLLM-specific keywords from its `forward` method: %s", model, missing_kws, ) From 334d64d1e816cc7c9fa2f67e22d24638e63c8e15 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 26 Nov 2024 00:20:04 -0800 Subject: [PATCH 160/255] [ci] add vllm_test_utils (#10659) Signed-off-by: youkaichao --- Dockerfile | 4 ++ Dockerfile.cpu | 4 ++ Dockerfile.hpu | 3 ++ Dockerfile.neuron | 3 ++ Dockerfile.openvino | 3 ++ Dockerfile.ppc64le | 3 ++ Dockerfile.rocm | 3 ++ Dockerfile.tpu | 3 ++ Dockerfile.xpu | 3 +- tests/entrypoints/llm/test_lazy_outlines.py | 23 +++++--- tests/test_lazy_torch_compile.py | 54 +------------------ tests/vllm_test_utils/setup.py | 7 +++ .../vllm_test_utils/__init__.py | 8 +++ .../vllm_test_utils/vllm_test_utils/blame.py | 53 ++++++++++++++++++ 14 files changed, 113 insertions(+), 61 deletions(-) create mode 100644 tests/vllm_test_utils/setup.py create mode 100644 tests/vllm_test_utils/vllm_test_utils/__init__.py create mode 100644 tests/vllm_test_utils/vllm_test_utils/blame.py diff --git a/Dockerfile b/Dockerfile index 220dbe26712ec..682f046d4b6ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,10 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -e tests/vllm_test_utils + # enable fast downloads from hf (for testing) RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install hf_transfer diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 287b4958da4e5..d2f72ea975a3d 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -62,4 +62,8 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -e tests/vllm_test_utils + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.hpu b/Dockerfile.hpu index d18fc016387bf..87e0c1a6a934e 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -11,6 +11,9 @@ ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 2143315d2a078..76dbd4c04d3f3 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -38,4 +38,7 @@ ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ pip install --no-build-isolation -v -e . +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index a05ff452cd36e..8bd188ffde408 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC COPY examples/ /workspace/examples COPY benchmarks/ /workspace/benchmarks +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index b19c6ddec7948..971248577983f 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=bind,source=.git,target=.git \ VLLM_TARGET_DEVICE=cpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 62d4a9b4909c3..e733994f8c33e 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -168,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ if ls libs/*.whl; then \ python3 -m pip install libs/*.whl; fi +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.tpu b/Dockerfile.tpu index 0a507b6ecdf60..b617932a85b47 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -22,4 +22,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ -r requirements-tpu.txt RUN python3 setup.py develop +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 63bc682770422..a374f20d7d949 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -64,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image \ TRITON_XPU_PROFILE 1 - +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index cbfb0cc32c1ce..81fb000d8ac56 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,12 +1,12 @@ import sys +from vllm_test_utils import blame + from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory -def test_lazy_outlines(sample_regex): - """If users don't use guided decoding, outlines should not be imported. - """ +def run_normal(): prompts = [ "Hello, my name is", "The president of the United States is", @@ -25,13 +25,12 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - # make sure outlines is not imported - assert 'outlines' not in sys.modules - # Destroy the LLM object and free up the GPU memory. del llm cleanup_dist_env_and_memory() + +def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. llm = LLM(model="facebook/opt-125m", enforce_eager=True, @@ -51,5 +50,15 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def test_lazy_outlines(sample_regex): + """If users don't use guided decoding, outlines should not be imported. + """ # make sure outlines is not imported - assert 'outlines' not in sys.modules + module_name = "outlines" + with blame(lambda: module_name in sys.modules) as result: + run_normal() + run_lmfe(sample_regex) + assert not result.found, ( + f"Module {module_name} is already imported, the" + f" first import location is:\n{result.trace_stack}") diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py index b8ac4dd93732b..4756fac8e2a8d 100644 --- a/tests/test_lazy_torch_compile.py +++ b/tests/test_lazy_torch_compile.py @@ -1,61 +1,9 @@ # Description: Test the lazy import module # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script - -import contextlib -import dataclasses import sys -import traceback -from typing import Callable, Generator - - -@dataclasses.dataclass -class BlameResult: - found: bool = False - trace_stack: str = "" - - -@contextlib.contextmanager -def blame(func: Callable) -> Generator[BlameResult, None, None]: - """ - Trace the function calls to find the first function that satisfies the - condition. The trace stack will be stored in the result. - - Usage: - - ```python - with blame(lambda: some_condition()) as result: - # do something - - if result.found: - print(result.trace_stack) - """ - result = BlameResult() - - def _trace_calls(frame, event, arg=None): - nonlocal result - if event in ['call', 'return']: - # for every function call or return - try: - # Temporarily disable the trace function - sys.settrace(None) - # check condition here - if not result.found and func(): - result.found = True - result.trace_stack = "".join(traceback.format_stack()) - # Re-enable the trace function - sys.settrace(_trace_calls) - except NameError: - # modules are deleted during shutdown - pass - return _trace_calls - - sys.settrace(_trace_calls) - - yield result - - sys.settrace(None) +from vllm_test_utils import blame module_name = "torch._inductor.async_compile" diff --git a/tests/vllm_test_utils/setup.py b/tests/vllm_test_utils/setup.py new file mode 100644 index 0000000000000..790e891ec837d --- /dev/null +++ b/tests/vllm_test_utils/setup.py @@ -0,0 +1,7 @@ +from setuptools import setup + +setup( + name='vllm_test_utils', + version='0.1', + packages=['vllm_test_utils'], +) diff --git a/tests/vllm_test_utils/vllm_test_utils/__init__.py b/tests/vllm_test_utils/vllm_test_utils/__init__.py new file mode 100644 index 0000000000000..bf0b62a5b75e3 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/__init__.py @@ -0,0 +1,8 @@ +""" +vllm_utils is a package for vLLM testing utilities. +It does not import any vLLM modules. +""" + +from .blame import BlameResult, blame + +__all__ = ["blame", "BlameResult"] diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py new file mode 100644 index 0000000000000..ad23ab83c2d81 --- /dev/null +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -0,0 +1,53 @@ +import contextlib +import dataclasses +import sys +import traceback +from typing import Callable, Generator + + +@dataclasses.dataclass +class BlameResult: + found: bool = False + trace_stack: str = "" + + +@contextlib.contextmanager +def blame(func: Callable) -> Generator[BlameResult, None, None]: + """ + Trace the function calls to find the first function that satisfies the + condition. The trace stack will be stored in the result. + + Usage: + + ```python + with blame(lambda: some_condition()) as result: + # do something + + if result.found: + print(result.trace_stack) + """ + result = BlameResult() + + def _trace_calls(frame, event, arg=None): + nonlocal result + if event in ['call', 'return']: + # for every function call or return + try: + # Temporarily disable the trace function + sys.settrace(None) + # check condition here + if not result.found and func(): + result.found = True + result.trace_stack = "".join(traceback.format_stack()) + # Re-enable the trace function + sys.settrace(_trace_calls) + except NameError: + # modules are deleted during shutdown + pass + return _trace_calls + + sys.settrace(_trace_calls) + + yield result + + sys.settrace(None) From 1f6584ee851501cfae672973b9e55d000729818c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 26 Nov 2024 18:36:45 +0800 Subject: [PATCH 161/255] [V1] Enable profile for LLMEngine (#10665) --- vllm/v1/engine/llm_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7a5482f03b6fa..bd19d998a4adb 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -161,13 +161,13 @@ def step(self) -> List[RequestOutput]: # TODO(rob): Can we get rid of these? def get_model_config(self): - pass + return self.model_config def start_profile(self): - pass + self.engine_core.profile(True) def stop_profile(self): - pass + self.engine_core.profile(False) def get_tokenizer_group(self, group_type): pass From db66e018eaabcc5e5855e994b49931dbb4800ce1 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Tue, 26 Nov 2024 09:11:16 -0800 Subject: [PATCH 162/255] [Bugfix] Fix for Spec model TP + Chunked Prefill (#10232) Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com> Signed-off-by: Sourashis Roy Co-authored-by: Sourashis Roy --- docs/source/serving/compatibility_matrix.rst | 2 +- tests/core/test_chunked_prefill_scheduler.py | 39 +++++++++++++ tests/spec_decode/e2e/test_compatibility.py | 46 --------------- .../e2e/test_integration_dist_tp2.py | 57 +++++++++++++++++++ tests/spec_decode/test_spec_decode_worker.py | 3 +- vllm/config.py | 10 ---- vllm/core/scheduler.py | 28 ++++++--- vllm/spec_decode/spec_decode_worker.py | 33 +++++++++-- 8 files changed, 145 insertions(+), 73 deletions(-) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index fa03d2cde1486..a93632ff36fb8 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -118,7 +118,7 @@ Feature x Feature - - * - :ref:`SD ` - - ✗ + - ✅ - ✅ - ✗ - ✅ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index acd82065ae457..eaaf004df38b2 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -413,6 +413,45 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens +@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) +def test_chunked_prefill_spec_prefill(num_scheduler_steps): + """Verify that the num_lookahead_slots is set appropriately for an all""" + """prefill batch depending on whether multi-step scheduling is enabled""" + """or not""" + block_size = 4 + max_seqs = 30 + max_model_len = 200 + max_num_batched_tokens = 30 + num_lookahead_slots = 4 + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + max_seqs, + max_model_len, + enable_chunked_prefill=True, + num_lookahead_slots=num_lookahead_slots, + num_scheduler_steps=num_scheduler_steps, + ) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 16 + cache_config.num_gpu_blocks = 16 + scheduler = Scheduler(scheduler_config, cache_config, None) + + _, seq_group = create_dummy_prompt("1", + prompt_length=30, + block_size=block_size) + scheduler.add_seq_group(seq_group) + _, out = schedule_and_update_computed_tokens(scheduler) + # The request is chunked. + # prefill scheduled now. + assert len(out.scheduled_seq_groups) == 1 + assert out.num_prefill_groups == 1 + assert out.num_batched_tokens == max_num_batched_tokens + print(out.num_lookahead_slots) + assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else + num_lookahead_slots) + + def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index a3f0464e79675..af8397c235f48 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -50,49 +50,3 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): with pytest.raises(ValueError, match="cannot be larger than"): get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) - - -@pytest.mark.parametrize("common_llm_kwargs", - [{ - "model": "meta-llama/Llama-2-7b-chat-hf", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": "True", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "tensor_parallel_size": 2, - "speculative_draft_tensor_parallel_size": 2, - }, - { - "tensor_parallel_size": 4, - "speculative_draft_tensor_parallel_size": 4, - }, - { - "tensor_parallel_size": 8, - "speculative_draft_tensor_parallel_size": 8, - }, -]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_chunked_prefill_draft_model_tp_not_one( - test_llm_generator): - """Verify that speculative decoding fails if chunked prefill is enabled for - draft model with tensor parallelism of more than 1. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, match="with tensor parallel size 1"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 25562ca85adf4..02cba92795142 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -115,3 +115,60 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, max_output_len=32, seed=seed, temperature=0.0) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +@pytest.mark.parametrize( + "common_llm_kwargs", + [[ + # Skip cuda graph recording for fast test. + "--enforce-eager", + "--tensor_parallel_size", + "2", + + # precision + "--dtype", + "bfloat16", + ]]) +@pytest.mark.parametrize( + "per_test_common_llm_kwargs", + [["--enable-chunked-prefill", "False"], + [ + "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", + "--max-num-seqs", "4" + ]]) +@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) +@pytest.mark.parametrize("model, test_llm_kwargs", + [("JackFram/llama-68m", [ + "--speculative-model", + "JackFram/llama-68m", + "--num_speculative-tokens", + "3", + ]), + ("JackFram/llama-68m", [ + "--speculative-model", + "JackFram/llama-68m", + "--num_speculative-tokens", + "3", + "--speculative-draft-tensor-parallel-size", + "1", + ])]) +@pytest.mark.parametrize("batch_size", [2]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, test_llm_kwargs, + batch_size: int, seed: int): + """Verify spec decode works well with same and different TP size for + the draft model with chunked prefill. + """ + run_equality_correctness_test_tp(model, + common_llm_kwargs, + per_test_common_llm_kwargs, + baseline_llm_kwargs, + test_llm_kwargs, + batch_size, + max_output_len=32, + seed=seed, + temperature=0.0) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 8df143104c279..d7caf57147278 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -867,7 +867,8 @@ def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): target_group_metadata_list = prefill + decodes execute_model_req = ExecuteModelRequest( seq_group_metadata_list=target_group_metadata_list, - num_lookahead_slots=k) + # For prefill only batches we expect num_lookahead_slots = 0. + num_lookahead_slots=k if n_decodes > 0 else 0) target_token_ids = torch.randint(low=0, high=vocab_size, diff --git a/vllm/config.py b/vllm/config.py index c87feaec3e5f6..eae6f909e3933 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1409,16 +1409,6 @@ def maybe_create_spec_config( draft_hf_config ) - if (enable_chunked_prefill and \ - speculative_draft_tensor_parallel_size != 1): - # TODO - Investigate why the error reported in - # https://github.com/vllm-project/vllm/pull/9291#issuecomment-2463266258 - # is happening and re-enable it. - raise ValueError( - "Chunked prefill and speculative decoding can be enabled " - "simultaneously only for draft models with tensor " - "parallel size 1.") - draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 530cbdc3a9190..d23009dae01ee 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1201,15 +1201,25 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) # Put prefills first due to Attention backend ordering assumption. + scheduled_seq_groups = (prefills.seq_groups + + running_scheduled.prefill_seq_groups + + swapped_in.prefill_seq_groups + + running_scheduled.decode_seq_groups + + swapped_in.decode_seq_groups) + num_prefill_groups = (len(prefills.seq_groups) + + len(swapped_in.prefill_seq_groups) + + len(running_scheduled.prefill_seq_groups)) + # If all prompts, then we set num_lookahead_slots to 0 + # this allows us to go through the `no_spec` path in + # `spec_decode_worker.py` + all_prefills = (len(scheduled_seq_groups) == num_prefill_groups) + num_lookahead_slots = (0 if + (all_prefills + and not self.scheduler_config.is_multi_step) + else running_scheduled.num_lookahead_slots) return SchedulerOutputs( - scheduled_seq_groups=(prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups), - num_prefill_groups=(len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)), + scheduled_seq_groups=scheduled_seq_groups, + num_prefill_groups=num_prefill_groups, num_batched_tokens=budget.num_batched_tokens + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, @@ -1218,7 +1228,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, + num_lookahead_slots=num_lookahead_slots, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b57742c2ebfdd..b279931ca4b02 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -408,7 +408,20 @@ def execute_model( disable_all_speculation = self._should_disable_all_speculation( execute_model_req) num_lookahead_slots = execute_model_req.num_lookahead_slots - + all_prompt = True + atleast_one_prompt = False + all_zero_spec_tokens = True + for sgm in execute_model_req.seq_group_metadata_list: + all_prompt = all_prompt and sgm.is_prompt + atleast_one_prompt = atleast_one_prompt or sgm.is_prompt + all_zero_spec_tokens = all_zero_spec_tokens and ( + sgm.num_speculative_tokens == 0) + + if all_prompt and execute_model_req.seq_group_metadata_list: + assert num_lookahead_slots == 0, ( + "Prompt only runs should have num_lookahead_slots equal to 0. " + "This should never happen, please file a bug at " + "https://github.com/vllm-project/vllm/issues") # Speculative decoding is disabled in the following cases: # 1. Prefill phase: Speculative decoding is not # used during the prefill phase. @@ -419,11 +432,8 @@ def execute_model( # In any of these cases, the proposer and scorer workers # are called normally. # We expect `num_speculative_tokens` to be None for prefills. - no_spec = all( - sgm.is_prompt for sgm in execute_model_req.seq_group_metadata_list - ) or num_lookahead_slots == 0 or disable_all_speculation or all( - sgm.num_speculative_tokens == 0 - for sgm in execute_model_req.seq_group_metadata_list) + no_spec = (num_lookahead_slots == 0 or disable_all_speculation + or all_zero_spec_tokens) # Broadcast how many lookahead slots are scheduled for this step, and # whether all speculation is disabled, to all non-driver workers. @@ -442,6 +452,15 @@ def execute_model( num_lookahead_slots=num_lookahead_slots, no_spec=no_spec, disable_all_speculation=disable_all_speculation, + # When both chunked prefill and speculative decoding are enabled + # it is possible that the same batch contains both prefill + # and decodes. If that happens in the scorer we run the batch + # as one single forward pass. However, in the proposer we + # run them as 2 different batches - one for prefill and + # the other for decodes. The variable indicates to the non-driver + # worker that there are prefills as part of the speculative batch + # and hence it needs to run an extra prefill forward pass. + run_spec_proposer_for_prefill=atleast_one_prompt, ) broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) @@ -653,6 +672,8 @@ def _run_non_driver_rank(self) -> bool: if not data["no_spec"]: self.scorer_worker.execute_model() + if data["run_spec_proposer_for_prefill"]: + self.proposer_worker.execute_model() return True From f5792c7c4a63ecdd2dcaa068ac7986dc4a22436b Mon Sep 17 00:00:00 2001 From: Conroy Cheers Date: Wed, 27 Nov 2024 05:26:28 +1100 Subject: [PATCH 163/255] [Hardware][NVIDIA] Add non-NVML CUDA mode for Jetson (#9735) Signed-off-by: Conroy Cheers --- CMakeLists.txt | 10 +- vllm/platforms/__init__.py | 10 +- vllm/platforms/cuda.py | 222 +++++++++++++++++++++++-------------- 3 files changed, 155 insertions(+), 87 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff34225537cdd..882d4412632a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") # Supported AMD GPU architectures. set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") @@ -249,7 +249,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" @@ -300,8 +300,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -427,7 +427,7 @@ set_gencode_flags_for_srcs( CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 1f68fc2e25df3..7cb8ac4b0a1e0 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -28,7 +28,15 @@ finally: pynvml.nvmlShutdown() except Exception: - pass + # CUDA is supported on Jetson, but NVML may not be. + import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): + is_cuda = True is_rocm = False diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 70724b8be4c45..0d07050fd1b6a 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,7 +4,7 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Callable, List, Tuple, TypeVar +from typing import TYPE_CHECKING, Callable, List, TypeVar import pynvml import torch @@ -38,10 +38,23 @@ # see https://github.com/huggingface/diffusers/issues/9704 for details torch.backends.cuda.enable_cudnn_sdp(False) -# NVML utils -# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, -# all the related functions work on real physical device ids. -# the major benefit of using NVML is that it will not initialize CUDA + +def device_id_to_physical_device_id(device_id: int) -> int: + if "CUDA_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") + if device_ids == [""]: + msg = ( + "CUDA_VISIBLE_DEVICES is set to empty string, which means" + " GPU support is disabled. If you are using ray, please unset" + " the environment variable `CUDA_VISIBLE_DEVICES` inside the" + " worker/actor. " + "Check https://github.com/vllm-project/vllm/issues/8402 for" + " more information.") + raise RuntimeError(msg) + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id def with_nvml_context(fn: Callable[_P, _R]) -> Callable[_P, _R]: @@ -57,87 +70,75 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: return wrapper -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return pynvml.nvmlDeviceGetCudaComputeCapability(handle) - - -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_name(device_id: int = 0) -> str: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return pynvml.nvmlDeviceGetName(handle) - - -@lru_cache(maxsize=8) -@with_nvml_context -def get_physical_device_total_memory(device_id: int = 0) -> int: - handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) - return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) - +class CudaPlatformBase(Platform): + _enum = PlatformEnum.CUDA + device_type: str = "cuda" + dispatch_key: str = "CUDA" -@with_nvml_context -def warn_if_different_devices(): - device_ids: int = pynvml.nvmlDeviceGetCount() - if device_ids > 1: - device_names = [get_physical_device_name(i) for i in range(device_ids)] - if len(set(device_names)) > 1 and os.environ.get( - "CUDA_DEVICE_ORDER") != "PCI_BUS_ID": - logger.warning( - "Detected different devices in the system: \n%s\nPlease" - " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " - "avoid unexpected behavior.", "\n".join(device_names)) + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + raise NotImplementedError + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + raise NotImplementedError -try: - from sphinx.ext.autodoc.mock import _MockModule + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + raise NotImplementedError - if not isinstance(pynvml, _MockModule): - warn_if_different_devices() -except ModuleNotFoundError: - warn_if_different_devices() + @classmethod + def is_full_nvlink(cls, device_ids: List[int]) -> bool: + raise NotImplementedError + @classmethod + def log_warnings(cls): + pass -def device_id_to_physical_device_id(device_id: int) -> int: - if "CUDA_VISIBLE_DEVICES" in os.environ: - device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",") - if device_ids == [""]: - msg = ( - "CUDA_VISIBLE_DEVICES is set to empty string, which means" - " GPU support is disabled. If you are using ray, please unset" - " the environment variable `CUDA_VISIBLE_DEVICES` inside the" - " worker/actor. " - "Check https://github.com/vllm-project/vllm/issues/8402 for" - " more information.") - raise RuntimeError(msg) - physical_device_id = device_ids[device_id] - return int(physical_device_id) - else: - return device_id + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + parallel_config = vllm_config.parallel_config + scheduler_config = vllm_config.scheduler_config + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = \ + "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" -class CudaPlatform(Platform): - _enum = PlatformEnum.CUDA - device_type: str = "cuda" - dispatch_key: str = "CUDA" +# NVML utils +# Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, +# all the related functions work on real physical device ids. +# the major benefit of using NVML is that it will not initialize CUDA +class NvmlCudaPlatform(CudaPlatformBase): @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: physical_device_id = device_id_to_physical_device_id(device_id) - major, minor = get_physical_device_capability(physical_device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + major, minor = pynvml.nvmlDeviceGetCudaComputeCapability(handle) return DeviceCapability(major=major, minor=minor) @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_name(cls, device_id: int = 0) -> str: physical_device_id = device_id_to_physical_device_id(device_id) - return get_physical_device_name(physical_device_id) + return cls._get_physical_device_name(physical_device_id) @classmethod + @lru_cache(maxsize=8) + @with_nvml_context def get_device_total_memory(cls, device_id: int = 0) -> int: physical_device_id = device_id_to_physical_device_id(device_id) - return get_physical_device_total_memory(physical_device_id) + handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) + return int(pynvml.nvmlDeviceGetMemoryInfo(handle).total) @classmethod @with_nvml_context @@ -153,27 +154,86 @@ def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: if i < j: try: p2p_status = pynvml.nvmlDeviceGetP2PStatus( - handle, peer_handle, - pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + handle, + peer_handle, + pynvml.NVML_P2P_CAPS_INDEX_NVLINK, + ) if p2p_status != pynvml.NVML_P2P_STATUS_OK: return False except pynvml.NVMLError: logger.exception( - "NVLink detection failed. This is normal if your" - " machine has no NVLink equipped.") + "NVLink detection failed. This is normal if" + " your machine has no NVLink equipped.") return False return True @classmethod - def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config - if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - parallel_config.worker_cls = \ - "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + def _get_physical_device_name(cls, device_id: int = 0) -> str: + handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) + return pynvml.nvmlDeviceGetName(handle) + + @classmethod + @with_nvml_context + def log_warnings(cls): + device_ids: int = pynvml.nvmlDeviceGetCount() + if device_ids > 1: + device_names = [ + cls._get_physical_device_name(i) for i in range(device_ids) + ] + if (len(set(device_names)) > 1 + and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID"): + logger.warning( + "Detected different devices in the system: \n%s\nPlease" + " make sure to set `CUDA_DEVICE_ORDER=PCI_BUS_ID` to " + "avoid unexpected behavior.", + "\n".join(device_names), + ) + + +class NonNvmlCudaPlatform(CudaPlatformBase): + + @classmethod + def get_device_capability(cls, device_id: int = 0) -> DeviceCapability: + major, minor = torch.cuda.get_device_capability(device_id) + return DeviceCapability(major=major, minor=minor) + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return torch.cuda.get_device_name(device_id) + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory + + @classmethod + def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: + logger.exception( + "NVLink detection not possible, as context support was" + " not found. Assuming no NVLink available.") + return False + + +# Autodetect either NVML-enabled or non-NVML platform +# based on whether NVML is available. +nvml_available = False +try: + try: + pynvml.nvmlInit() + nvml_available = True + except Exception: + # On Jetson, NVML is not supported. + nvml_available = False +finally: + if nvml_available: + pynvml.nvmlShutdown() + +CudaPlatform = NvmlCudaPlatform if nvml_available else NonNvmlCudaPlatform + +try: + from sphinx.ext.autodoc.mock import _MockModule + + if not isinstance(pynvml, _MockModule): + CudaPlatform.log_warnings() +except ModuleNotFoundError: + CudaPlatform.log_warnings() From 9a99273b482a3e90431069f37858d60827983e2f Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 26 Nov 2024 13:44:01 -0500 Subject: [PATCH 164/255] [Bugfix] Fix using `-O[0,3]` with LLM entrypoint (#10677) Signed-off-by: mgoin --- vllm/engine/arg_utils.py | 5 ++++- vllm/entrypoints/llm.py | 10 ++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 60ad5ee54a2f2..90b4798f17a13 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -206,7 +206,10 @@ def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object - if isinstance(self.compilation_config, (int, dict)): + if isinstance(self.compilation_config, (int)): + self.compilation_config = CompilationConfig.from_cli( + str(self.compilation_config)) + elif isinstance(self.compilation_config, (dict)): self.compilation_config = CompilationConfig.from_cli( json.dumps(self.compilation_config)) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e07f4c04abd84..1551a9a998160 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -185,8 +185,14 @@ def __init__( kwargs["disable_log_stats"] = True if compilation_config is not None: - compilation_config_instance = CompilationConfig.from_cli( - json.dumps(compilation_config)) + if isinstance(compilation_config, (int)): + compilation_config_instance = CompilationConfig.from_cli( + str(compilation_config)) + elif isinstance(compilation_config, (dict)): + compilation_config_instance = CompilationConfig.from_cli( + json.dumps(compilation_config)) + else: + compilation_config_instance = compilation_config else: compilation_config_instance = None From 7576cd38dfdf1672d04f4fe659f8260a9d319e8b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 26 Nov 2024 15:29:00 -0500 Subject: [PATCH 165/255] [Bugfix] Check bnb_4bit_quant_storage for bitsandbytes (#10642) --- .../layers/quantization/bitsandbytes.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 39965ac9115c2..6a0de3034142a 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -20,6 +20,7 @@ def __init__( load_in_8bit: bool = False, load_in_4bit: bool = True, bnb_4bit_compute_dtype: str = "float32", + bnb_4bit_quant_storage: str = "uint8", bnb_4bit_quant_type: str = "fp4", bnb_4bit_use_double_quant: bool = False, llm_int8_enable_fp32_cpu_offload: bool = False, @@ -31,6 +32,7 @@ def __init__( self.load_in_8bit = load_in_8bit self.load_in_4bit = load_in_4bit self.bnb_4bit_compute_dtype = bnb_4bit_compute_dtype + self.bnb_4bit_quant_storage = bnb_4bit_quant_storage self.bnb_4bit_quant_type = bnb_4bit_quant_type self.bnb_4bit_use_double_quant = bnb_4bit_use_double_quant self.llm_int8_enable_fp32_cpu_offload = llm_int8_enable_fp32_cpu_offload @@ -38,10 +40,15 @@ def __init__( self.llm_int8_skip_modules = llm_int8_skip_modules or [] self.llm_int8_threshold = llm_int8_threshold + if self.bnb_4bit_quant_storage not in ["uint8"]: + raise ValueError("Unsupported bnb_4bit_quant_storage: " + f"{self.bnb_4bit_quant_storage}") + def __repr__(self) -> str: return (f"BitsAndBytesConfig(load_in_8bit={self.load_in_8bit}, " f"load_in_4bit={self.load_in_4bit}, " f"bnb_4bit_compute_dtype={self.bnb_4bit_compute_dtype}, " + f"bnb_4bit_quant_storage={self.bnb_4bit_quant_storage}, " f"bnb_4bit_quant_type={self.bnb_4bit_quant_type}, " f"llm_int8_skip_modules={self.llm_int8_skip_modules})") @@ -80,6 +87,9 @@ def get_safe_value(config, keys, default_value=None): bnb_4bit_compute_dtype = get_safe_value(config, ["bnb_4bit_compute_dtype"], default_value="float32") + bnb_4bit_quant_storage = get_safe_value(config, + ["bnb_4bit_quant_storage"], + default_value="uint8") bnb_4bit_quant_type = get_safe_value(config, ["bnb_4bit_quant_type"], default_value="fp4") bnb_4bit_use_double_quant = get_safe_value( @@ -99,6 +109,7 @@ def get_safe_value(config, keys, default_value=None): load_in_8bit=load_in_8bit, load_in_4bit=load_in_4bit, bnb_4bit_compute_dtype=bnb_4bit_compute_dtype, + bnb_4bit_quant_storage=bnb_4bit_quant_storage, bnb_4bit_quant_type=bnb_4bit_quant_type, bnb_4bit_use_double_quant=bnb_4bit_use_double_quant, llm_int8_enable_fp32_cpu_offload=llm_int8_enable_fp32_cpu_offload, From 2f0a0a17a47436fe9709462dfee3bb9d2f91e0a0 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 26 Nov 2024 12:46:11 -0800 Subject: [PATCH 166/255] [V1] Refactor model executable interface for multimodal models (#10570) Signed-off-by: Roger Wang --- vllm/model_executor/models/blip2.py | 61 ++++++----- vllm/model_executor/models/chameleon.py | 58 +++++++--- vllm/model_executor/models/chatglm.py | 54 ++++++---- vllm/model_executor/models/fuyu.py | 43 +++++--- vllm/model_executor/models/interfaces.py | 36 ++++++- vllm/model_executor/models/internvl.py | 54 +++++++--- vllm/model_executor/models/llava.py | 15 +-- vllm/model_executor/models/llava_next.py | 51 +++++---- .../model_executor/models/llava_next_video.py | 44 +++++--- vllm/model_executor/models/llava_onevision.py | 74 +++++++++---- vllm/model_executor/models/molmo.py | 88 +++++++-------- vllm/model_executor/models/paligemma.py | 52 +++++---- vllm/model_executor/models/phi3v.py | 16 +-- vllm/model_executor/models/qwen2_audio.py | 59 ++++++---- vllm/model_executor/models/qwen2_vl.py | 102 ++++++++++++------ vllm/model_executor/models/ultravox.py | 72 ++++++++----- vllm/model_executor/models/utils.py | 5 +- vllm/v1/worker/gpu_model_runner.py | 3 +- 18 files changed, 581 insertions(+), 306 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 7d7639b4a92ce..d2592016aff34 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -609,6 +610,25 @@ def _process_image_input(self, return self.language_projection(query_output) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + BLIP2_IMAGE_TOKEN_ID) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -616,6 +636,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: """Run forward pass for BLIP-2. @@ -648,32 +669,24 @@ def forward( See also: :class:`Blip2ImageInputs` """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - BLIP2_IMAGE_TOKEN_ID) - - input_ids = None - else: - inputs_embeds = None - - hidden_states = self.language_model.model( - input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 5a6d6432112f0..a40c321ce0a58 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -29,6 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) @@ -38,7 +39,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. @@ -987,6 +988,29 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), ) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + assert self.model.vqmodel is not None + image_tokens = self.model.get_image_tokens(image_input["data"].to( + self.config.torch_dtype)) + vision_embeddings = self.model.get_input_embeddings(image_tokens) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.model.vocabulary_mapping.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -994,27 +1018,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - assert self.model.vqmodel is not None - image_tokens = self.model.get_image_tokens( - image_input["data"].to(self.config.torch_dtype)) - image_token_id = self.model.vocabulary_mapping.image_token_id - special_image_mask = input_ids == image_token_id - image_tokens = image_tokens.to(input_ids.device, - input_ids.dtype) - input_ids = input_ids.masked_scatter(special_image_mask, - image_tokens) - - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 5bcbce7180ca4..6c50882d83c3b 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -33,7 +33,8 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs +from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs, + NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -545,6 +546,30 @@ def _parse_and_validate_image_input( """) return GLMImagePixelInputs(pixel_values=pixel_values) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input["pixel_values"] is None: + return None + pixel_values = image_input["pixel_values"].to( + dtype=self.config.torch_dtype) + vision_embeddings = self.vision(pixel_values) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embedding(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_glm_vision_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + vision_embeddings=multimodal_embeddings, + boi_token_id=self.config.boi_token_id, + eoi_token_id=self.config.eoi_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -552,26 +577,17 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> torch.Tensor: - if intermediate_tensors is None: - inputs_embeds = self.embedding(input_ids) - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input["pixel_values"] is not None: - pixel_values = image_input["pixel_values"].to( - dtype=inputs_embeds.dtype) - image_embeds = self.vision(pixel_values) - - boi_token_id = self.config.boi_token_id - eoi_token_id = self.config.eoi_token_id - - inputs_embeds = merge_glm_vision_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - vision_embeddings=image_embeds, - boi_token_id=boi_token_id, - eoi_token_id=eoi_token_id) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + if intermediate_tensors is None and inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None else: inputs_embeds = intermediate_tensors["hidden_states"] diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7b46907ac83ab..6e86900326c4b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,6 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -302,6 +303,25 @@ def _process_image_input( vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) return vision_embeddings + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + _IMAGE_TOKEN_ID) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -309,24 +329,19 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ): if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model( input_ids=input_ids, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 9b4a97abf9b51..1545ce332309f 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -2,7 +2,7 @@ Protocol, Type, Union, overload, runtime_checkable) import torch -from typing_extensions import TypeIs +from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger from vllm.utils import supports_kw @@ -10,10 +10,14 @@ from .interfaces_base import is_embedding_model if TYPE_CHECKING: + from vllm.attention import AttentionMetadata + from vllm.multimodal.inputs import NestedTensors # noqa: F401 from vllm.sequence import IntermediateTensors logger = init_logger(__name__) +T = TypeVar("T", default="NestedTensors") + @runtime_checkable class SupportsMultiModal(Protocol): @@ -28,6 +32,36 @@ class SupportsMultiModal(Protocol): MRO of your model class. """ + def get_multimodal_embeddings(self, **kwargs) -> Optional[T]: + """ + Returns multimodal embeddings generated from multimodal kwargs + to be merged with text embeddings. + """ + ... + + # Only for models that support v0 chunked prefill + # TODO(ywang96): Remove this overload once v0 is deprecated + @overload + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[T] = None, + attn_metadata: Optional["AttentionMetadata"] = None, + ) -> torch.Tensor: + ... + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[T] = None, + ) -> torch.Tensor: + """ + Returns the input embeddings merged from the text embeddings from + input_ids and the multimodal embeddings generated from multimodal + kwargs. + """ + ... + # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 47ac00b6afe9b..b1c0065afbf30 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,6 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -641,6 +642,26 @@ def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: visual_token_mask = None return visual_token_mask + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + assert self.img_context_token_id is not None + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.img_context_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -648,26 +669,22 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: + + visual_token_mask = None if intermediate_tensors is not None: input_ids = None inputs_embeds = None - visual_token_mask = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.img_context_token_id) - visual_token_mask = self._get_visual_token_mask(input_ids) - input_ids = None - else: - inputs_embeds = None - visual_token_mask = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None forward_kwargs = { "input_ids": input_ids, @@ -677,6 +694,13 @@ def forward( "intermediate_tensors": intermediate_tensors, "inputs_embeds": inputs_embeds, } + if self.img_context_token_id is not None: + visual_token_mask = self._get_visual_token_mask(input_ids) + + # We always overwrite it back to None after computing visual token + # mask so that this doesn't need to depend on encoder output + self.img_context_token_id = None + if self.is_mono: forward_kwargs.update({"visual_token_mask": visual_token_mask}) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 05c6cc62efcd7..e7757b3c7d405 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -478,7 +478,7 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) - def process_mm_inputs(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -488,12 +488,12 @@ def process_mm_inputs(self, **kwargs): def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.config.image_token_index) return inputs_embeds @@ -544,10 +544,11 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. elif inputs_embeds is None: - vision_embeddings = self.process_mm_inputs(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent + vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) input_ids = None diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index abeebb45fc4a7..e113f5862830d 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -19,6 +19,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of @@ -565,6 +566,30 @@ def _process_image_input( for i, patch_features_batch in enumerate(patch_embeddings) ] + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + if multimodal_embeddings is None: + return self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = embed_multimodal( + input_ids, + self.config.image_token_index, + self.language_model.model.get_input_embeddings, + multimodal_embeddings, + ) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -572,6 +597,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-NeXT. @@ -620,24 +646,14 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - inputs_embeds = embed_multimodal( - input_ids, - self.config.image_token_index, - self.language_model.model.get_input_embeddings, - lambda _: self._process_image_input(image_input), - ) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, @@ -645,7 +661,6 @@ def forward( attn_metadata, intermediate_tensors, inputs_embeds=inputs_embeds) - return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index e2880c76cf43d..b130791808924 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -18,6 +18,7 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -388,6 +389,25 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs): raise ValueError( f"Unsupported type of video input {type(video_pixels)}") + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + video_input = self._parse_and_validate_video_input(**kwargs) + if video_input is None: + return None + vision_embeddings = self._process_video_pixels(video_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.video_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -395,6 +415,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-NeXT-Video. @@ -404,22 +425,15 @@ def forward( pixel_values_videos: Pixels in each frames for each input videos. """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - video_input = self._parse_and_validate_video_input(**kwargs) - if video_input is not None: - video_embeddings = self._process_video_pixels(video_input) - inputs_embeds = self.language_model \ - .model.get_input_embeddings(input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, video_embeddings, - self.config.video_token_index) - - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 705ca1e4ab6e6..3166737d61582 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -824,6 +825,49 @@ def apply_pooling(self, image_features, stride=2): image_feature = image_feature.view(batch_frames, -1, dim) return image_feature + def get_multimodal_embeddings( + self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # We make a tuple of each embedding with its modality string. This is a + # temporary workaround for models to handle mixed modalities when + # get_multimodal_embeddings and get_input_embeddings are called + # separately. + # TODO(ywang96): Add support for mixed-modality inference for v1. + multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + + if "images" in modalities: + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings.append((vision_embeddings, "image")) + if "videos" in modalities: + video_input = modalities["videos"] + video_embeddings = self._process_video_pixels(video_input) + multimodal_embeddings.append((video_embeddings, "video")) + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[List[Tuple[NestedTensors, + str]]] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + for embeddings, modality in multimodal_embeddings: + if modality == "image": + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, embeddings, + self.config.image_token_index) + if modality == "video": + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, embeddings, + self.config.video_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -831,6 +875,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-Onevision. @@ -840,28 +885,15 @@ def forward( pixel_values_videos: Pixels in each frames for each input videos. """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - modalities = self._parse_and_validate_multimodal_inputs(**kwargs) - if modalities: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - if "images" in modalities: - image_input = modalities["images"] - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - if "videos" in modalities: - video_input = modalities["videos"] - video_embeddings = self._process_video_pixels(video_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, video_embeddings, - self.config.video_token_index) - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index ee7b560fe1ee4..acedddd84d7cb 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -3,7 +3,7 @@ from array import array from dataclasses import dataclass from functools import lru_cache, partial -from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union +from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict import torch from einops import rearrange @@ -36,6 +36,7 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -756,6 +757,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -1098,19 +1105,16 @@ def _process_image_input( return image_features - def _merge_multimodal_embeddings( - self, - inputs_embeds: torch.Tensor, - image_features: torch.Tensor, - image_input_idx: torch.Tensor, - seq_len: Union[torch.Tensor, List[torch.Tensor]], - ) -> torch.Tensor: + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + image_features = self._process_image_input(image_input) + image_input_idx = image_input["image_input_idx"] + seq_len = image_input["seq_len"] batch_size, num_image, num_patch = image_features.shape[:3] assert image_input_idx.shape == (batch_size, num_image, num_patch) - image_features = image_features.to(inputs_embeds.device) - seq_len = seq_len.to(inputs_embeds.device) - # insert the image feature into the embedding. image_features = image_features.view(batch_size, num_image * num_patch, -1) @@ -1130,12 +1134,24 @@ def _merge_multimodal_embeddings( image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx.flatten()[:, None] mat = image_input_idx == torch.arange( - seq_len.sum().item(), device=inputs_embeds.device)[None, :] + seq_len.sum().item(), device=image_features.device)[None, :] mat = mat.to(image_features.dtype) - inputs_embeds = inputs_embeds + torch.einsum('nd,nm->md', - image_features, mat) + # Note: In this original implementation from AI2, the final + # vision_embeddings will be always be the same length + # of input embedddings, which is not very efficient. + # TODO(ywang96): see if this can be optimized. + vision_embeddings = torch.einsum('nd,nm->md', image_features, mat) + return vision_embeddings + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = inputs_embeds + multimodal_embeddings return inputs_embeds def forward( @@ -1145,39 +1161,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> SamplerOutput: + if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - inputs_embeds = self.model.embed_tokens(input_ids) - image_features = self._process_image_input(image_input) - - inputs_embeds = self._merge_multimodal_embeddings( - inputs_embeds, - image_features, - image_input["image_input_idx"], - image_input["seq_len"], - ) - else: - inputs_embeds = self.model.embed_tokens(input_ids) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None - - hidden_states = self.model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index dd5256eb87ab3..2e5b6bee784e7 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors @@ -240,36 +241,45 @@ def _process_image_input( return self.multi_modal_projector(image_features) + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa + vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]: if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - parsed_image_input = self._parse_and_validate_image_input(**kwargs) - - if parsed_image_input is not None: - vision_embeddings = self._process_image_input( - parsed_image_input) - # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa - vision_embeddings = vision_embeddings * ( - self.config.hidden_size**-0.5) - - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 2e583bb08e87a..4cb874a13e0c1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -676,7 +676,7 @@ def _process_image_input( return image_embeds - def process_mm_inputs(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -686,12 +686,12 @@ def process_mm_inputs(self, **kwargs): def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.embed_tokens(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.image_token_id) return inputs_embeds @@ -703,12 +703,14 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): + if intermediate_tensors is not None: inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility elif inputs_embeds is None: - vision_embeddings = self.process_mm_inputs(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent + vision_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) input_ids = None diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0c2374c3c3fc9..a0605fee82aca 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -42,10 +42,12 @@ from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP +from .utils import merge_multimodal_embeddings logger = init_logger(__name__) @@ -371,6 +373,25 @@ def _process_audio_input(self, return masked_audio_features + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + audio_input = self._parse_and_validate_audio_input(**kwargs) + if audio_input is None: + return None + masked_audio_features = self._process_audio_input(audio_input) + return masked_audio_features + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.audio_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -378,33 +399,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - audio_input = self._parse_and_validate_audio_input(**kwargs) - if audio_input is None: - inputs_embeds = None - else: - inputs_embeds = self.language_model.embed_tokens(input_ids) - masked_audio_features = self._process_audio_input(audio_input) - # merge llm embeddings and audio features - mask = (input_ids == self.config.audio_token_index) - inputs_embeds[mask, :] = masked_audio_features - - input_ids = None - - hidden_states = self.language_model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None + + hidden_states = self.language_model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 531608a877f2f..7956a98b21569 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -63,7 +63,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, - MultiModalKwargs) + MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData @@ -1238,6 +1238,55 @@ def _merge_multimodal_embeddings( inputs_embeds[mask, :] = multimodal_embeddings return inputs_embeds + def get_multimodal_embeddings( + self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + if image_input is None and video_input is None: + return None + + # We make a tuple of each embedding with its modality string. This is a + # temporary workaround for models to handle mixed modalities when + # get_multimodal_embeddings and get_input_embeddings are called + # separately. + # TODO(ywang96): Add support for mixed-modality inference for v1. + multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + + if image_input is not None: + image_embeds = self._process_image_input(image_input) + multimodal_embeddings.append((image_embeds, "image")) + if video_input is not None: + video_embeds = self._process_video_input(video_input) + multimodal_embeddings.append((video_embeds, "video")) + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[List[Tuple[NestedTensors, + str]]] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + for embeddings, modality in multimodal_embeddings: + if modality == "image": + inputs_embeds = self._merge_multimodal_embeddings( + input_ids, + inputs_embeds, + embeddings, + placeholder_token_id=self.config.image_token_id, + ) + if modality == "video": + inputs_embeds = self._merge_multimodal_embeddings( + input_ids, + inputs_embeds, + embeddings, + placeholder_token_id=self.config.video_token_id, + ) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -1245,6 +1294,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for Qwen2-VL. @@ -1266,42 +1316,26 @@ def forward( video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM. `None` if no videos are passed. """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - - if image_input is None and video_input is None: - inputs_embeds = None - else: - if uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - - inputs_embeds = self.model.embed_tokens(input_ids) - - if image_input is not None: - image_embeds = self._process_image_input(image_input) - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - image_embeds, - placeholder_token_id=self.config.image_token_id, - ) - - if video_input is not None: - video_embeds = self._process_video_input(video_input) - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - video_embeds, - placeholder_token_id=self.config.video_token_id, - ) - input_ids = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + + # We need to check for usage of mrope here in case there is + # multimodal data. + # TODO (ywang96): move this to model runner in V1. + if multimodal_embeddings is not None and uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings) + input_ids = None hidden_states = self.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 512adbc7db35e..b61deccde45b7 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -449,10 +449,36 @@ def _process_audio_input( return result - def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + audio_input = self._parse_and_validate_audio_input(**kwargs) + if audio_input is None: + return None + audio_embeddings = self._process_audio_input(audio_input) + return audio_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + + # TODO(ywang96): use merge_multimodal_embeddings after + # v0 is deprecated + merge_multimodal_embeddings_from_map( + inputs_embeds, multimodal_embeddings, + attn_metadata.multi_modal_placeholder_index_maps["audio"]) + return inputs_embeds + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[torch.Tensor], + intermediate_tensors: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for Ultravox @@ -466,30 +492,28 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, Args: audio_features: A batch of audio inputs [B, N, 80, M]. """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - audio_input = self._parse_and_validate_audio_input(**kwargs) - if audio_input is not None: - audio_embeddings = self._process_audio_input(audio_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - merge_multimodal_embeddings_from_map( - inputs_embeds, audio_embeddings, - attn_metadata.multi_modal_placeholder_index_maps["audio"]) - input_ids = None - else: - inputs_embeds = None - - hidden_states = self.language_model.model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) + + # TODO(ywang96): remove attn_metadata from get_input_embeddings + # after v0 is deprecated + inputs_embeds = self.get_input_embeddings(input_ids, + multimodal_embeddings, + attn_metadata) + input_ids = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index dcfd2cb7d2622..4c13cbc953273 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -356,8 +356,7 @@ def embed_multimodal( input_ids: torch.Tensor, multimodal_token_id: int, get_text_embeds: Callable[[torch.Tensor], torch.Tensor], - get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor, - List[torch.Tensor]]], + multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]], ) -> torch.Tensor: """ Embed token IDs and multimodal inputs and combine their embeddings. @@ -374,8 +373,6 @@ def embed_multimodal( is_text = ~is_multimodal text_embeds = get_text_embeds(input_ids[is_text]) - multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal]) - merged_embeds = torch.empty( (input_ids.shape[0], text_embeds.shape[1]), dtype=text_embeds.dtype, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 13cbc8fa39c03..1fa47f553dfd6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -363,7 +363,8 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # 2. A list (length: num_images) of tensors, each of shape # [feature_size, hidden_size] in case when the feature size is # dynamic depending on input images. - encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + encoder_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) # Cache the encoder outputs. for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): From 0a71900bc92b4a18d5545e9d5dc0ca750add3c69 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 26 Nov 2024 19:57:11 -0600 Subject: [PATCH 167/255] Remove hard-dependencies of Speculative decode to CUDA workers (#10587) Signed-off-by: Chendi Xue --- tests/spec_decode/test_spec_decode_worker.py | 4 +- vllm/config.py | 1 + .../layers/spec_decode_base_sampler.py | 17 +++++++- vllm/platforms/cpu.py | 8 +++- vllm/platforms/cuda.py | 4 +- vllm/spec_decode/draft_model_runner.py | 24 ++++++------ vllm/spec_decode/interfaces.py | 8 ++-- vllm/spec_decode/medusa_worker.py | 9 +++-- vllm/spec_decode/metrics.py | 15 ++++++- vllm/spec_decode/multi_step_worker.py | 31 +++++++++++---- vllm/spec_decode/ngram_worker.py | 3 +- vllm/spec_decode/spec_decode_worker.py | 36 +++++++++++------ vllm/spec_decode/target_model_runner.py | 33 ++++++---------- vllm/spec_decode/util.py | 12 ++++-- vllm/worker/cpu_model_runner.py | 39 ++++++++++++++++++- vllm/worker/cpu_worker.py | 27 ++++++++++++- vllm/worker/model_runner_base.py | 15 +++++++ vllm/worker/worker.py | 7 ++-- vllm/worker/worker_base.py | 3 ++ 19 files changed, 219 insertions(+), 77 deletions(-) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index d7caf57147278..caf7a7e625b46 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -595,8 +595,8 @@ def test_init_device(acceptance_sampler_method: str): target_worker.init_device.assert_called_once() - metrics_collector.init_gpu_tensors.assert_called_once() - spec_decode_sampler.init_gpu_tensors.assert_called_once() + metrics_collector.init_tensors.assert_called_once() + spec_decode_sampler.init_tensors.assert_called_once() @pytest.mark.parametrize("acceptance_sampler_method", diff --git a/vllm/config.py b/vllm/config.py index eae6f909e3933..68f73bf4b4dc9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -990,6 +990,7 @@ class ParallelConfig: # the full name of the worker class to use. If "auto", the worker class # will be determined based on the platform. worker_cls: str = "auto" + sd_worker_cls: str = "auto" world_size: int = field(init=False) diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py index 7e750a744e25f..6aa4b8bd34cde 100644 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ b/vllm/model_executor/layers/spec_decode_base_sampler.py @@ -43,6 +43,21 @@ def init_gpu_tensors(self, device: Union[int, str]) -> None: dtype=torch.long, device=device) + def init_tensors(self, + device: Union[int, str], + device_type: Union[torch.device, str] = 'cuda') -> None: + assert self.num_accepted_tokens is None + if isinstance(device_type, torch.device): + device_type = device_type.type + if isinstance(device, int): + device = f"{device_type}:{device}" + self.num_accepted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + self.num_emitted_tokens = torch.tensor(0, + dtype=torch.long, + device=device) + @property def probs_dtype(self): return torch.float32 @@ -77,7 +92,7 @@ def _create_output( tensor is [batch_size, k + num_bonus_tokens] """ batch_size, k = substitute_token_ids.shape - bonus_token_ids = bonus_token_ids.squeeze() + bonus_token_ids = bonus_token_ids.squeeze(-1) # Determine the index of the first False value for each row. limits = (accepted == 0).max(1).indices limits[~(accepted == 0).any(1)] = k diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cbc982752c6b4..3e22c87f61fac 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -86,4 +86,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config.distributed_executor_backend) parallel_config.distributed_executor_backend = "mp" if parallel_config.worker_cls == "auto": - parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" + if vllm_config.speculative_config: + parallel_config.worker_cls = \ + "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.cpu_worker.CPUWorker" + else: + parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 0d07050fd1b6a..5e9ce551f2332 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -106,6 +106,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: elif vllm_config.speculative_config: parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = \ + "vllm.worker.worker.Worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" @@ -236,4 +238,4 @@ def is_full_nvlink(cls, physical_device_ids: List[int]) -> bool: if not isinstance(pynvml, _MockModule): CudaPlatform.log_warnings() except ModuleNotFoundError: - CudaPlatform.log_warnings() + CudaPlatform.log_warnings() \ No newline at end of file diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index cf166e3eb5bad..fe5fd39f42ac9 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -20,8 +20,9 @@ from vllm.logger import init_logger from vllm.multimodal import MultiModalKwargs from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, - ModelRunner) +from vllm.worker.model_runner_base import (ModelRunnerBase, + ModelRunnerInputBase, + ModelRunnerWrapperBase) logger = init_logger(__name__) @@ -33,7 +34,7 @@ allow_gpu_advance_step = True -class TP1DraftModelRunner(ModelRunner): +class TP1DraftModelRunner(ModelRunnerWrapperBase): """Specialized model runner for speculative decoding draft model. Since the draft model always execute k forward passes consecutively to generate k speculative tokens in a single speculative decoding step, @@ -46,13 +47,14 @@ class TP1DraftModelRunner(ModelRunner): any broadcasting inside execute_model). """ - def __init__(self, *args, **kwargs): - if kwargs.get("return_hidden_states"): + def __init__(self, model_runner: ModelRunnerBase): + if hasattr( + model_runner, + "return_hidden_states") and model_runner.return_hidden_states: raise ValueError( "return_hidden_states is not supported for TP1DraftModelRunner." ) - - super().__init__(*args, **kwargs) + super().__init__(model_runner) self.indices_of_seq_with_bonus_tokens = None @@ -73,10 +75,8 @@ def _update_sampling_metadata(self, sampling_metadata, num_seqs, assert seq_group.prompt_logprob_indices == [] # No prompt assert seq_group.sample_indices == [i] # Simple - def _gpu_advance_step( - self, model_input: ModelInputForGPUWithSamplingMetadata, - last_output: SamplerOutput - ) -> ModelInputForGPUWithSamplingMetadata: + def _gpu_advance_step(self, model_input: ModelRunnerInputBase, + last_output: SamplerOutput) -> ModelRunnerInputBase: # Currently, we expect "decode mode" only assert not model_input.is_prompt @@ -168,7 +168,7 @@ def set_indices_of_seq_with_bonus_tokens(self, @torch.inference_mode() def execute_model( self, - model_input: ModelInputForGPUWithSamplingMetadata, + model_input: ModelRunnerInputBase, kv_caches: List[torch.Tensor], previous_hidden_states: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index 029f56460f5c1..a4fe0f13c8db1 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set +from typing import Optional, Set, Union import torch @@ -75,9 +75,11 @@ def get_spec_proposals( class SpeculativeScorer(ABC): - def __init__(self, scorer_worker: WorkerBase, device: str, - vocab_size: int): + def __init__(self, scorer_worker: WorkerBase, + device: Union[torch.device, str], vocab_size: int): self._scorer_worker = scorer_worker + if isinstance(device, torch.device): + device = device.type self._device = device self._vocab_size = vocab_size diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py index 0d233f393cb8c..1ab691a7ef047 100644 --- a/vllm/spec_decode/medusa_worker.py +++ b/vllm/spec_decode/medusa_worker.py @@ -9,21 +9,22 @@ from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker import Worker +from vllm.worker.worker_base import WorkerWrapperBase -class MedusaWorker(NonLLMProposerWorkerBase, Worker): +class MedusaWorker(NonLLMProposerWorkerBase, WorkerWrapperBase): """Worker for Medusa. """ def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(kwargs.get("vllm_config")) + self.init_worker(*args, **kwargs) # Lazy initialization list. self._proposer: Top1Proposer def init_device(self): - super().init_device() + self.worker.init_device() self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 89ccaba70e93c..03dc46600d8a9 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -1,11 +1,12 @@ import time -from typing import Callable, Optional +from typing import Callable, Optional, Union import msgspec import torch from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeBaseSampler) +from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -81,8 +82,20 @@ def init_gpu_tensors(self, rank: int) -> None: self._rank = rank self._copy_stream = torch.cuda.Stream() + def init_tensors(self, + rank: int, + device_type: Union[torch.device, str] = 'cuda') -> None: + self._rank = rank + if isinstance(device_type, torch.device): + device_type = device_type.type + if device_type == 'cuda': + self._copy_stream = torch.cuda.Stream() + def maybe_collect_rejsample_metrics( self, k: int) -> Optional[SpecDecodeWorkerMetrics]: + # currently using cuda.Event, skip for any non_cuda_alike platform + if not current_platform.is_cuda_alike(): + return None # If a copy was initiated in the previous call, collect and return. if self._in_flight_copy is not None: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index f49b98f5c9528..d249b37c780e4 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -5,17 +5,21 @@ import torch from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData, SequenceGroupMetadata) -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + +if current_platform.is_cuda_alike(): + from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeProposer) from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker import Worker +from vllm.worker.worker_base import WorkerWrapperBase -class MultiStepWorker(Worker, ProposerWorkerBase): +class MultiStepWorker(ProposerWorkerBase, WorkerWrapperBase): """The MultiStepWorker is equivalent to a Worker except that it allows multiple forward passes in a single call, assuming the scheduler has allocated enough space to store the additional KV. This reduces overhead @@ -28,13 +32,14 @@ class MultiStepWorker(Worker, ProposerWorkerBase): """ def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + super().__init__(kwargs.get("vllm_config")) + self.init_worker(*args, **kwargs) # Lazy initialization list. self._proposer: SpeculativeProposer def init_device(self) -> None: - super().init_device() + self.worker.init_device() self._proposer = Top1Proposer( weakref.proxy(self), # type: ignore[arg-type] @@ -51,6 +56,18 @@ def set_should_modify_greedy_probs_inplace(self) -> None: self.model_runner.model.sampler.should_modify_greedy_probs_inplace = ( True) + def determine_num_available_blocks(self) -> Tuple[int, int]: + return self.worker.determine_num_available_blocks() + + def get_cache_block_size_bytes(self) -> int: + return self.worker.get_cache_block_size_bytes() + + def initialize_cache(self, *args, **kwargs) -> None: + self.worker.initialize_cache(*args, **kwargs) + + def execute_model(self, *args, **kwargs) -> List[SamplerOutput]: + return self.worker.execute_model(*args, **kwargs) + @torch.inference_mode() def sampler_output( self, @@ -75,7 +92,7 @@ def sampler_output( # Run model sample_len times. model_outputs: List[SamplerOutput] = [] - if isinstance( + if current_platform.is_cuda_alike() and isinstance( self.model_runner, TP1DraftModelRunner ) and self.model_runner.supports_gpu_multi_step(expanded_request): # Here we run the draft_model_runner with multi-step prepare @@ -92,7 +109,7 @@ def sampler_output( # and other restrictions that are part of DraftModelRunner's # supports_gpu_multi_step(..) for _ in range(sample_len): - model_output: List[SamplerOutput] = super().execute_model( + model_output: List[SamplerOutput] = self.worker.execute_model( execute_model_req=expanded_request) assert (len(model_output) == 1 ), "composing multistep workers not supported" diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index debb3b2d5ec30..bb6b99135580e 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -22,6 +22,7 @@ def __init__(self, *args, **kwargs): # Get local_rank/vocab_size from kwargs attribute self.local_rank = kwargs["local_rank"] self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size() + self.device_type = kwargs.get("device_type", "cuda") # Lazy initialization list. self._proposer: Top1Proposer @@ -34,7 +35,7 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, self.ngram_prompt_lookup_min = ngram_prompt_lookup_min def init_device(self): - self.device = torch.device(f"cuda:{self.local_rank}") + self.device = torch.device(f"{self.device_type}:{self.local_rank}") self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index b279931ca4b02..53634f7b0b366 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -14,12 +14,16 @@ SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) from vllm.model_executor.layers.typical_acceptance_sampler import ( TypicalAcceptanceSampler) +from vllm.platforms import current_platform from vllm.sequence import (VLLM_INVALID_TOKEN_ID, CompletionSequenceGroupOutput, ExecuteModelRequest, HiddenStates, SequenceGroupMetadata, get_all_seq_ids_and_request_ids) from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + +if current_platform.is_cuda_alike(): + from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner + from vllm.spec_decode.interfaces import (SpeculativeProposals, SpeculativeScorer, SpeculativeScores) from vllm.spec_decode.medusa_worker import MedusaWorker @@ -36,8 +40,8 @@ get_all_num_logprobs, get_sampled_token_logprobs, nvtx_range, split_batch_by_proposal_len) -from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase +from vllm.worker.worker_base import (LoraNotSupportedWorkerBase, WorkerBase, + WorkerWrapperBase) logger = init_logger(__name__) @@ -53,7 +57,11 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": draft_worker_kwargs = kwargs.copy() kwargs["model_runner_cls"] = TargetModelRunner - target_worker = Worker(*args, **kwargs) + target_worker_config = copy.deepcopy(vllm_config) + target_worker_config.parallel_config.worker_cls =\ + target_worker_config.parallel_config.sd_worker_cls + target_worker = WorkerWrapperBase(vllm_config=target_worker_config) + target_worker.init_worker(*args, **kwargs) # Set the disable_logprobs variable in the TargetModelRunner instance # as per its value specified in the SpeculativeConfig. target_worker.model_runner.disable_logprobs =\ @@ -65,6 +73,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": draft_worker_config.model_config, vllm_config.load_config, ) + speculative_config.draft_parallel_config.worker_cls =\ + draft_worker_config.parallel_config.sd_worker_cls draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa # TODO allow draft-model specific load config. @@ -125,7 +135,7 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): @classmethod def create_worker( cls, - scorer_worker: Worker, + scorer_worker: WorkerBase, draft_worker_kwargs: Dict[str, Any], disable_mqa_scorer: bool, disable_by_batch_size: Optional[int], @@ -145,6 +155,8 @@ def create_worker( draft_parallel_config: ParallelConfig = draft_worker_kwargs[ 'vllm_config'].parallel_config if ngram_prompt_lookup_max > 0: + draft_worker_kwargs[ + "device_type"] = scorer_worker.device_config.device.type proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) @@ -158,8 +170,9 @@ def create_worker( proposer_worker = MedusaWorker(**draft_worker_kwargs) else: if draft_tp == 1: - draft_worker_kwargs[ - "model_runner_cls"] = TP1DraftModelRunner + if current_platform.is_cuda_alike(): + draft_worker_kwargs[ + "model_runner_cls"] = TP1DraftModelRunner else: if draft_model_config.hf_config.model_type == "eagle": raise NotImplementedError( @@ -306,8 +319,9 @@ def init_device(self) -> None: self.scorer_worker.load_model() self.proposer_worker.load_model() - self._metrics.init_gpu_tensors(self.rank) - self.spec_decode_sampler.init_gpu_tensors(self.rank) + self._metrics.init_tensors(self.rank, device_type=self.device) + self.spec_decode_sampler.init_tensors(self.rank, + device_type=self.device) scorer_cls: Type[SpeculativeScorer] if self.disable_mqa_scorer: @@ -1111,11 +1125,11 @@ def get_cache_block_size_bytes(self): raise NotImplementedError def start_profile(self): - if isinstance(self.scorer_worker, Worker): + if isinstance(self.scorer_worker, WorkerBase): self.scorer_worker.start_profile() def stop_profile(self): - if isinstance(self.scorer_worker, Worker): + if isinstance(self.scorer_worker, WorkerBase): self.scorer_worker.stop_profile() diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index e61cde5b17f20..56540744b73a9 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,12 +1,12 @@ from typing import List, Optional -from vllm.config import VllmConfig from vllm.sequence import SequenceGroupMetadata -from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, - ModelRunner) +from vllm.worker.model_runner_base import (ModelRunnerBase, + ModelRunnerInputBase, + ModelRunnerWrapperBase) -class TargetModelRunner(ModelRunner): +class TargetModelRunner(ModelRunnerWrapperBase): """Specialized model runner for speculative decoding target model. In speculative decoding, the log probabilities selected finally may not be the same ones as selected by the target model sampling. This means @@ -18,32 +18,21 @@ class TargetModelRunner(ModelRunner): requested or not. """ - def __init__( - self, - vllm_config: VllmConfig, - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - return_hidden_states: bool = False, - ): + def __init__(self, model_runner: ModelRunnerBase): # An internal boolean member variable to indicate if token log # probabilities are needed or not. + super().__init__(model_runner) self.disable_logprobs = True - super().__init__( - vllm_config=vllm_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - return_hidden_states=return_hidden_states, - ) def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForGPUWithSamplingMetadata: - model_input: ModelInputForGPUWithSamplingMetadata = super( - ).prepare_model_input(seq_group_metadata_list, virtual_engine, - finished_requests_ids) + finished_requests_ids: Optional[List[str]] = None, + ) -> ModelRunnerInputBase: + model_input: ModelRunnerInputBase =\ + self.model_runner.prepare_model_input( + seq_group_metadata_list, virtual_engine, finished_requests_ids) # If token log probabilities is disabled then skip generating sampler # CPU output. We directly serialize the GPU sampled_token_id tensors # as needed. If log probabilities is enabled then synchronize all the diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index 193ef870dfceb..da8706658d09a 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -5,6 +5,7 @@ import torch from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SequenceGroupMetadata, SequenceOutput) @@ -247,11 +248,14 @@ def nvtx_range(msg, *args, **kwargs): Arguments: msg (string): message to associate with the range """ - torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) - try: + if current_platform.is_cuda_alike(): + torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) + try: + yield + finally: + torch.cuda.nvtx.range_pop() + else: yield - finally: - torch.cuda.nvtx.range_pop() class Timer: diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index b08171d79f002..420aaf8a1b4cd 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -80,6 +80,7 @@ class ModelInputForCPUWithSamplingMetadata(ModelInputForCPU): Used by the ModelRunner. """ sampling_metadata: Optional["SamplingMetadata"] = None + is_prompt: Optional[bool] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -395,6 +396,7 @@ def __init__( vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, + return_hidden_states: bool = False, *args, **kwargs, ): @@ -403,19 +405,25 @@ def __init__( cache_config = self.cache_config self.is_driver_worker = is_driver_worker + self.return_hidden_states = return_hidden_states self.device = self.device_config.device + self.pin_memory = False self.kv_cache_dtype = kv_cache_dtype self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size + num_attn_heads = self.model_config.get_num_attention_heads( + self.parallel_config) + needs_attn_backend = (num_attn_heads != 0 + or self.model_config.is_attention_free) self.attn_backend = get_attn_backend( self.model_config.get_head_size(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, self.model_config.is_attention_free, - ) + ) if needs_attn_backend else None # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY @@ -444,6 +452,15 @@ def _prepare_model_input_tensors( return builder.build() # type: ignore + # sampler property will be used by spec_decode_worker + @property + def sampler(self): + return self.model.sampler + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( @@ -480,9 +497,12 @@ def prepare_model_input( pin_memory=False, generators=generators) + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) return dataclasses.replace(model_input, sampling_metadata=sampling_metadata, - virtual_engine=virtual_engine) + virtual_engine=virtual_engine, + is_prompt=is_prompt) @torch.no_grad() def execute_model( @@ -491,16 +511,22 @@ def execute_model( kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, + previous_hidden_states: Optional[torch.Tensor] = None, ) -> Optional[List[SamplerOutput]]: if num_steps > 1: raise ValueError( "CPU worker does not support multi-step execution.") model_executable = self.model + multimodal_kwargs = {} if model_input.multi_modal_kwargs is not None: multimodal_kwargs = MultiModalKwargs.as_kwargs( model_input.multi_modal_kwargs, device=self.device) + execute_model_kwargs = {} + if previous_hidden_states is not None: + execute_model_kwargs.update( + {"previous_hidden_states": previous_hidden_states}) with set_forward_context(model_input.attn_metadata, self.vllm_config): hidden_states = model_executable( @@ -509,6 +535,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, + **execute_model_kwargs, **multimodal_kwargs, ) @@ -525,4 +552,12 @@ def execute_model( logits=logits, sampling_metadata=model_input.sampling_metadata, ) + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + if model_input.is_prompt: + output.prefill_hidden_states = hidden_states + output.hidden_states = hidden_states return [output] + + def generate_proposals(self, *args, **kwargs): + return self.model.generate_proposals(*args, **kwargs) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index bc9164bd9d5df..cf04808b73372 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -128,6 +128,7 @@ def __init__( distributed_init_method: str, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, + model_runner_cls: Optional[Type[CPUModelRunner]] = None, ) -> None: WorkerBase.__init__(self, vllm_config=vllm_config) @@ -151,6 +152,16 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type + not in ["medusa", "mlp_speculator", "eagle"]) \ + else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.task == "embedding": ModelRunnerClass = CPUEmbeddingModelRunner @@ -159,7 +170,11 @@ def __init__( self.model_runner: CPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker) + is_driver_worker=is_driver_worker, + **speculative_args, + ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] @@ -197,7 +212,7 @@ def init_device(self) -> None: ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) if ret: logger.info(ret) - + self.device = torch.device("cpu") self.init_distributed_environment() # Set random seed. set_random_seed(self.model_config.seed) @@ -297,6 +312,14 @@ def do_metadata_broadcast(self) -> bool: def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: return self.cpu_cache + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + def execute_worker( self, worker_input: WorkerInput, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 9e529f86b46bb..cd4770202a186 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -289,3 +289,18 @@ def get_generators(self, finished_request_ids: Optional[List[str]] = None): self.generators.pop(request_id, None) return self.generators + + +class ModelRunnerWrapperBase: + """ + The whole point of this class is to lazily initialize the model_runner. + """ + + def __init__( + self, + moderl_runner: ModelRunnerBase, + ) -> None: + self.model_runner: ModelRunnerBase = moderl_runner + + def __getattr__(self, attr): + return getattr(self.model_runner, attr) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 80fd7bc3b67cc..24e7bc760b0c0 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -74,9 +74,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_runner_cls is not None: - ModelRunnerClass = model_runner_cls - elif model_config.task == "embedding": + if model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner @@ -86,6 +84,9 @@ def __init__( is_driver_worker=is_driver_worker, **speculative_args, ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) + # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e7fec6d17eecd..7aaa8b453cff1 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -466,6 +466,9 @@ def execute_method(self, method, *args, **kwargs): logger.exception(msg) raise e + def __getattr__(self, attr): + return getattr(self.worker, attr) + def extract_previous_hidden_states( data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ From 0a4d96850013eb2c295b25df53177ad2302110ca Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 26 Nov 2024 18:04:01 -0800 Subject: [PATCH 168/255] [V1] Update interface for idefics3 (#10680) Signed-off-by: Roger Wang --- vllm/model_executor/models/idefics3.py | 73 ++++++++++++++++---------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 5d176b2a4e416..58f7635275c05 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -39,6 +39,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of @@ -597,6 +598,12 @@ def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: image_features = self._process_image_pixels(image_input) return self.connector(image_features) + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.text_model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -604,26 +611,8 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - **kwargs: object, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - if intermediate_tensors is not None: - input_ids = None - inputs_embeds = None - else: - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.text_model.get_input_embeddings(input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - else: - inputs_embeds = self.text_model.get_input_embeddings(input_ids) - input_ids = None hidden_states = self.text_model( input_ids, @@ -718,6 +707,25 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.text_config.vocab_size) self.sampler = Sampler() + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self.model._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self.model._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -725,16 +733,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors, - **kwargs, - ) + if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.model.text_model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) + return hidden_states def compute_logits(self, hidden_states: torch.Tensor, From 1bf905ddaa969e6458fe0d15a1db80318f39fade Mon Sep 17 00:00:00 2001 From: jeongin601 <78595701+jeongin601@users.noreply.github.com> Date: Wed, 27 Nov 2024 14:07:30 +0900 Subject: [PATCH 169/255] [Bugfix][SpecDecode] apply sampling parameters to target probabilities for consistency in rejection sampling. (#10198) Signed-off-by: jeongin601 <0200angela@gmail.com> Signed-off-by: jeong_in.bae --- tests/spec_decode/e2e/test_mlp_correctness.py | 2 +- tests/spec_decode/test_batch_expansion.py | 8 ++++++++ vllm/spec_decode/batch_expansion.py | 14 +------------- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 5ecc0d4e95719..183ff2f5db274 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -203,7 +203,7 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("temperature", [0.1, 1.0]) +@pytest.mark.parametrize("temperature", [1.0]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 0d6aaa449d856..3504fcf43e361 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -90,6 +90,14 @@ def test_create_single_target_seq_group_metadata(k: int): ) assert output.request_id == input_seq_group_metadata.request_id + assert output.sampling_params.repetition_penalty == \ + input_seq_group_metadata.sampling_params.repetition_penalty + assert output.sampling_params.temperature == \ + input_seq_group_metadata.sampling_params.temperature + assert output.sampling_params.top_p == \ + input_seq_group_metadata.sampling_params.top_p + assert output.sampling_params.top_k == \ + input_seq_group_metadata.sampling_params.top_k assert len(output.seq_data) == 1 assert output.seq_data[target_seq_id].get_prompt_token_ids() == tuple( prompt_tokens) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 25ef27b8378f0..01b9cdad963da 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -307,28 +307,16 @@ def _create_target_seq_group_metadata( token_ids_to_score = self._get_token_ids_to_score( proposal_token_ids[batch_index]) - # Use simpler sampling parameters apart from for final token - # (in particular don't do seeded sampling) since those sampled tokens - # aren't used. - # We don't replace the sampling_params in the greedy case because - # this also controls whether the probs get modified in the sampler - # (see use of _modify_greedy_probs_inplace there). sampling_params = input_seq_group_metadata.sampling_params - non_bonus_sampling_params = DEFAULT_SIMPLE_SAMPLING_PARAMS \ - if sampling_params.temperature else sampling_params - target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] - last_index = len(token_ids_to_score) - 1 for i, token_ids in enumerate(token_ids_to_score): - target_sampling_params = sampling_params if i == last_index \ - else non_bonus_sampling_params target_seq_group_metadata_list.append( self._create_single_target_seq_group_metadata( input_seq_group_metadata, input_seq_id, next(target_seq_ids_iter), token_ids, - sampling_params=target_sampling_params, + sampling_params=sampling_params, )) return target_seq_group_metadata_list From cfb3bf25fb981494fa6c575fb0714388c9df99b0 Mon Sep 17 00:00:00 2001 From: yansh97 Date: Wed, 27 Nov 2024 13:55:23 +0800 Subject: [PATCH 170/255] [bugfix] fix the default value of llm_int8_threshold in BitsAndBytesConfig (#10657) --- vllm/model_executor/layers/quantization/bitsandbytes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 6a0de3034142a..e01c713dd14db 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -26,7 +26,7 @@ def __init__( llm_int8_enable_fp32_cpu_offload: bool = False, llm_int8_has_fp16_weight: bool = False, llm_int8_skip_modules: Optional[List[str]] = None, - llm_int8_threshold: float = 0.0, + llm_int8_threshold: float = 6.0, ) -> None: self.load_in_8bit = load_in_8bit @@ -103,7 +103,7 @@ def get_safe_value(config, keys, default_value=None): ["llm_int8_skip_modules"], default_value=[]) llm_int8_threshold = get_safe_value(config, ["llm_int8_threshold"], - default_value=0.0) + default_value=6.0) return cls( load_in_8bit=load_in_8bit, From e85250b1d164c9975816fa7aaf591aa5abad577d Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Wed, 27 Nov 2024 14:49:40 +0800 Subject: [PATCH 171/255] [Hardware][Gaudi]add get_name method for HPUAttentionBackend (#10667) Signed-off-by: Kunshang Ji --- vllm/attention/backends/hpu_attn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 4a3ddd5db94e5..5359941d41fde 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -22,6 +22,10 @@ class HPUAttentionBackend(AttentionBackend): + @staticmethod + def get_name() -> str: + return "HPU_ATTN" + @staticmethod def get_impl_cls() -> Type["HPUAttentionImpl"]: return HPUAttentionImpl From 15cc2a9f1acb70b68366da0a6d2a4549da3d32f4 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 27 Nov 2024 14:54:12 +0800 Subject: [PATCH 172/255] [Misc]Further reduce BNB static variable (#10597) Signed-off-by: Jee Jee Li --- vllm/model_executor/model_loader/loader.py | 218 ++++++++++++--------- vllm/model_executor/models/baichuan.py | 8 - vllm/model_executor/models/falcon.py | 6 - vllm/model_executor/models/gemma.py | 9 - vllm/model_executor/models/gemma2.py | 9 - vllm/model_executor/models/idefics3.py | 15 -- vllm/model_executor/models/llama.py | 9 - vllm/model_executor/models/minicpmv.py | 34 ---- vllm/model_executor/models/mllama.py | 14 -- vllm/model_executor/models/opt.py | 3 - vllm/model_executor/models/phi.py | 3 - vllm/model_executor/models/phi3.py | 6 - vllm/model_executor/models/qwen.py | 7 +- vllm/model_executor/models/qwen2.py | 9 - 14 files changed, 131 insertions(+), 219 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 441dd409b4f9d..37c2d789030b6 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -28,7 +28,8 @@ get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (LinearBase, + MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) @@ -78,12 +79,14 @@ def device_loading_context(module: torch.nn.Module, original_device: torch.device = original_device_states[name] if original_device.type == "cpu": # `torch.empty_like` does not support `pin_memory` argument - cpu_data = torch.empty_strided(size=p.data.size(), - stride=p.data.stride(), - dtype=p.data.dtype, - layout=p.data.layout, - device="cpu", - pin_memory=pin_memory) + cpu_data = torch.empty_strided( + size=p.data.size(), + stride=p.data.stride(), + dtype=p.data.dtype, + layout=p.data.layout, + device="cpu", + pin_memory=pin_memory, + ) cpu_data.copy_(p.data) p.data = cpu_data else: @@ -112,7 +115,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: logger.warning(msg) logger.warning( "Trying to guess the arguments for old-style model class %s", - model_class) + model_class, + ) # try to be compatible with old-style model class kwargs = {} if "prefix" in all_params: @@ -198,14 +202,17 @@ def _maybe_download_from_modelscope( return model_path return None - def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str], - fall_back_to_pt: bool) -> Tuple[str, List[str], bool]: + def _prepare_weights( + self, + model_name_or_path: str, + revision: Optional[str], + fall_back_to_pt: bool, + ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. If the model is not local, it will be downloaded.""" - model_name_or_path = self._maybe_download_from_modelscope( - model_name_or_path, revision) or model_name_or_path + model_name_or_path = (self._maybe_download_from_modelscope( + model_name_or_path, revision) or model_name_or_path) is_local = os.path.isdir(model_name_or_path) load_format = self.load_config.load_format @@ -258,8 +265,11 @@ def _prepare_weights(self, model_name_or_path: str, # any files not found in the index. if not is_local: download_safetensors_index_file_from_hf( - model_name_or_path, index_file, - self.load_config.download_dir, revision) + model_name_or_path, + index_file, + self.load_config.download_dir, + revision, + ) hf_weights_files = filter_duplicate_safetensors_files( hf_weights_files, hf_folder, index_file) else: @@ -282,8 +292,11 @@ def _get_weights_iterator( # Currently np_cache only support *.bin checkpoints assert use_safetensors is False weights_iterator = np_cache_weights_iterator( - source.model_or_path, self.load_config.download_dir, hf_folder, - hf_weights_files) + source.model_or_path, + self.load_config.download_dir, + hf_folder, + hf_weights_files, + ) elif use_safetensors: weights_iterator = safetensors_weights_iterator(hf_weights_files) else: @@ -310,17 +323,19 @@ def _get_all_weights( model_config: ModelConfig, model: nn.Module, ) -> Generator[Tuple[str, torch.Tensor], None, None]: - primary_weights = DefaultModelLoader.Source( model_config.model, model_config.revision, prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", - True)) + True), + ) yield from self._get_weights_iterator(primary_weights) - secondary_weights = cast(Iterable[DefaultModelLoader.Source], - getattr(model, "secondary_weights", ())) + secondary_weights = cast( + Iterable[DefaultModelLoader.Source], + getattr(model, "secondary_weights", ()), + ) for source in secondary_weights: yield from self._get_weights_iterator(source) @@ -416,7 +431,7 @@ def _verify_config(self, model_config: ModelConfig, self.tensorizer_config.verify_with_parallel_config(parallel_config) def _get_weights_iterator( - self) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, ) -> Generator[Tuple[str, torch.Tensor], None, None]: tensorizer_args = self.tensorizer_config._construct_tensorizer_args() return tensorizer_weights_iterator(tensorizer_args) @@ -479,9 +494,10 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: if parallel_config.tensor_parallel_size > 1: from vllm.distributed import get_tensor_model_parallel_rank - self.tensorizer_config.tensorizer_uri = \ - self.tensorizer_config.tensorizer_uri \ - % get_tensor_model_parallel_rank() + + self.tensorizer_config.tensorizer_uri = ( + self.tensorizer_config.tensorizer_uri % + get_tensor_model_parallel_rank()) if is_vllm_tensorized(self.tensorizer_config): return self._load_model_serialized(vllm_config=vllm_config) @@ -520,13 +536,13 @@ def __init__(self, load_config: LoadConfig): @staticmethod def _filter_subtensors( - tensors: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]: """ Filter out all tensors that share the same memory or a subset of the memory of another tensor. """ - same_storage_groups: Dict[Any, List[Tuple[ - str, torch.Tensor]]] = collections.defaultdict(list) + same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = ( + collections.defaultdict(list)) for key, tensor in tensors.items(): if tensor.numel(): ptr = tensor.untyped_storage().data_ptr() @@ -615,8 +631,11 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: if tensor.shape != param_shape: logger.warning( "loading tensor of shape %s into " - "parameter '%s' of shape %s", tensor.shape, - key, param_shape) + "parameter '%s' of shape %s", + tensor.shape, + key, + param_shape, + ) param_data.copy_(tensor) state_dict.pop(key) if state_dict: @@ -634,6 +653,7 @@ def save_model( from safetensors.torch import save_file from vllm.distributed import get_tensor_model_parallel_rank + if pattern is None: pattern = ShardedStateLoader.DEFAULT_PATTERN rank = get_tensor_model_parallel_rank() @@ -667,24 +687,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): possible_config_file_names = ["adapter_config.json"] - default_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - '.fc1.', - '.fc2.', - '.dense.', - '.query_key_value.', - '.qkv_proj.', - '.dense_h_to_4h.', - '.dense_4h_to_h.', - '.out_proj.', - ] - def __init__(self, load_config: LoadConfig): super().__init__(load_config) @@ -709,6 +711,11 @@ def __init__(self, load_config: LoadConfig): with open(config_file_path) as f: config = json.load(f) self.target_modules = config["target_modules"] + # TODO: target_modules could be either a list or a regex string. + # We need to handle both cases. + assert isinstance(self.target_modules, + list), "Unsupported target_modules: " + f"{self.target_modules}" def _get_config_file(self, qlora_adapter: str) -> str: is_local = os.path.isdir(qlora_adapter) @@ -734,12 +741,13 @@ def _get_config_file(self, qlora_adapter: str) -> str: return config_file_path def _get_weight_files( - self, - model_name_or_path: str, - allowed_patterns: List[str], - revision: Optional[str] = None) -> Tuple[List[str], str]: - """Retrieve weight files. Download the files if necessary. - + self, + model_name_or_path: str, + allowed_patterns: List[str], + revision: Optional[str] = None, + ) -> Tuple[List[str], str]: + """Retrieve weight files. Download the files if necessary. + Return the weight files and the file pattern.""" is_local = os.path.isdir(model_name_or_path) @@ -806,6 +814,7 @@ def _get_quantized_weights_iterator( # only load the bitsandbytes module when needed try: import bitsandbytes + if bitsandbytes.__version__ < "0.44.0": raise ImportError("bitsandbytes version is wrong. Please " "install bitsandbytes>=0.44.0.") @@ -839,8 +848,11 @@ def _is_8bit_weight_name(self, weight_name: str): def _is_4bit_weight_name(self, weight_name: str): quantized_suffix = { - "absmax", "quant_map", "nested_absmax", "nested_quant_map", - "bitsandbytes" + "absmax", + "quant_map", + "nested_absmax", + "nested_quant_map", + "bitsandbytes", } suffix = weight_name.split(".")[-1] return any(q_suffix in suffix for q_suffix in quantized_suffix) @@ -857,7 +869,6 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, for weight_name, weight_tensor in self._hf_weight_iter( hf_weights_files, use_safetensors): - if self._is_8bit_weight_name(weight_name): continue @@ -899,14 +910,13 @@ def _parse_quant_state(param_name: str, # pre quantized weights would have a quant_state for weight_name, weight_tensor in self._hf_weight_iter( hf_weights_files, use_safetensors): - if self._is_4bit_weight_name(weight_name): continue - if (f"{weight_name}.quant_state.bitsandbytes__nf4" \ - in temp_state_dict) or \ - (f"{weight_name}.quant_state.bitsandbytes__fp4" \ - in temp_state_dict): + if (f"{weight_name}.quant_state.bitsandbytes__nf4" + in temp_state_dict) or ( + f"{weight_name}.quant_state.bitsandbytes__fp4" + in temp_state_dict): quant_state = _parse_quant_state(weight_name, temp_state_dict) quant_state_dict[weight_name] = quant_state yield weight_name, weight_tensor @@ -916,12 +926,12 @@ def _parse_quant_state(param_name: str, def _unquantized_generator(self, hf_weights_files, use_safetensors, quant_state_dict) -> Generator: from bitsandbytes.functional import quantize_4bit + tp_size = get_tensor_model_parallel_world_size() tp_rank = get_tensor_model_parallel_rank() for weight_name, weight_tensor in self._hf_weight_iter( hf_weights_files, use_safetensors): - if any(target_module in weight_name for target_module in self.target_modules) and weight_name.endswith(".weight"): # Without sharding @@ -954,12 +964,11 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, # get the start/end index of each shard weight tensor total_start_index = list( itertools.accumulate([0] + total_shard_sizes))[:-1] - shard_weights_index = [ - (idx + size // tp_size * tp_rank, - idx + size // tp_size * (tp_rank + 1)) - for idx, size in zip(total_start_index, - total_shard_sizes) - ] + shard_weights_index = [( + idx + size // tp_size * tp_rank, + idx + size // tp_size * (tp_rank + 1), + ) for idx, size in zip(total_start_index, + total_shard_sizes)] # slice and reorder the weight tensor weight_tensor = [ weight_tensor[start_index:end_index, ...] @@ -989,7 +998,8 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, processed_weight, quant_state = quantize_4bit( loaded_weight, compress_statistics=True, - quant_type="nf4") + quant_type="nf4", + ) quant_state_dict[weight_name] = quant_state else: @@ -997,28 +1007,58 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, yield weight_name, processed_weight + def _get_bnb_target_modules(self, model: nn.Module) -> None: + + # TODO: Maybe we can replace bitsandbytes_stacked_params_mapping with + # packed_modules_mapping. + inverse_stacked_mapping: Dict[str, List[str]] = {} + for orig, ( + packed, + idx, + ) in model.bitsandbytes_stacked_params_mapping.items(): + if packed not in inverse_stacked_mapping: + inverse_stacked_mapping[packed] = [] + inverse_stacked_mapping[packed].insert(idx, orig) + + linear_module_lst = [] + for name, module in model.named_modules(): + if isinstance(module, (LinearBase, )): + last_name = name.split(".")[-1] + if sub_modules := inverse_stacked_mapping.get(last_name, []): + # Map vllm's names to transformers' names. + for sub_name in sub_modules: + linear_module_lst.append( + name.replace(last_name, sub_name)) + else: + linear_module_lst.append(name) + if self.target_modules: + # Update self.target_modules + self.target_modules = [ + qual_name for qual_name in linear_module_lst + if any(t in qual_name for t in self.target_modules) + ] + else: + self.target_modules = linear_module_lst + assert (self.target_modules + ), "vllm currently does not support BNB quantization for" + f" {type(model).__name__}" + def _load_weights(self, model_config: ModelConfig, model: nn.Module) -> None: - if not hasattr(model, 'load_weights'): + if not hasattr(model, "load_weights"): raise AttributeError( "The required method 'load_weights' is not defined in class" f" {type(model).__name__}.") - if not hasattr(model, 'bitsandbytes_stacked_params_mapping'): + if not hasattr(model, "bitsandbytes_stacked_params_mapping"): raise AttributeError( f"Model {type(model).__name__} does not support BitsAndBytes " "quantization yet.") - if len(self.target_modules) == 0: - if hasattr(model, 'default_bitsandbytes_target_modules'): - self.target_modules = model.default_bitsandbytes_target_modules - else: - self.target_modules = self.default_target_modules - # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP self.maybe_fused_weights_modules: Dict[str, List[int]] = {} - + self._get_bnb_target_modules(model) for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights # sharded. The reason for implementing it this way is to avoid new @@ -1046,7 +1086,7 @@ def _load_weights(self, model_config: ModelConfig, pre_quant = False if quant_config is not None: - quant_method = quant_config.get('quant_method') + quant_method = quant_config.get("quant_method") if quant_method == "bitsandbytes": pre_quant = True else: @@ -1063,11 +1103,12 @@ def _load_weights(self, model_config: ModelConfig, load_8bit = False if pre_quant: - load_8bit = quant_config.get('load_in_8bit', False) + load_8bit = quant_config.get("load_in_8bit", False) - qweight_iterator, quant_state_dict = \ - self._get_quantized_weights_iterator( - model_config.model, model_config.revision, pre_quant, load_8bit) + qweight_iterator, quant_state_dict = ( + self._get_quantized_weights_iterator(model_config.model, + model_config.revision, + pre_quant, load_8bit)) model.load_weights(qweight_iterator) @@ -1078,6 +1119,7 @@ def _load_weights(self, model_config: ModelConfig, # TODO: Change this lazy import to normal import # after the checks are updated to run on a new version from vllm.model_executor.models.utils import is_pp_missing_parameter + for quant_param_name in quant_state_dict: if is_pp_missing_parameter(quant_param_name, model): continue @@ -1086,9 +1128,9 @@ def _load_weights(self, model_config: ModelConfig, shard_index = 0 for shard_name, ( - weight_name, index + weight_name, + index, ) in model.bitsandbytes_stacked_params_mapping.items(): - shard_pos = quant_param_name.find(shard_name) # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' @@ -1123,8 +1165,8 @@ def _load_weights(self, model_config: ModelConfig, num_elements = [0] * len(quant_states) for seq, quant_state in quant_states.items(): - num_elements[seq] = math.prod( - quant_state.shape) // pack_ratio + num_elements[seq] = (math.prod(quant_state.shape) // + pack_ratio) offsets = np.concatenate(([0], np.cumsum(num_elements))) set_weight_attrs(param, {"bnb_shard_offsets": offsets}) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 39cb5a8b2cbbe..5e68b7f165bf4 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -351,14 +351,6 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".W_pack.", - ".o_proj.", - ".down_proj.", - ".up_proj.", - ".gate_proj.", - ".up_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "gate_proj": ("gate_up_proj", 0), diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 096ad32b38e86..8660cf79b9cdb 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -412,12 +412,6 @@ class FalconForCausalLM(nn.Module, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = {} - default_bitsandbytes_target_modules = [ - ".query_key_value.", - ".dense.", - ".dense_h_to_4h.", - ".dense_4h_to_h.", - ] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 131e9af139c2a..b28715c48adfb 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -350,15 +350,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "down_proj", ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index d229eb74669ee..c93223c740272 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -386,15 +386,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 58f7635275c05..014e27bc869d4 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -656,21 +656,6 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - # vision_model - ".fc1.", - ".fc2.", - ".out_proj.", - # connector - ".proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 355b2f3ef8b28..7cc5547b4a4d5 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -463,15 +463,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = ["lm_head"] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 99bf1d42d0355..aacce477e0460 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -822,25 +822,6 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - # vision encoder - ".fc1.", - ".fc2.", - # Currently, vllm does not support BNB quantization for the `out_proj` - # of the resampler, so it's necessary to distinguish between the - # vision encoder and the resampler's out_proj. The same applies to - # MiniCPMV2_6. - ".self_attn.out_proj.", # vision encoder out_proj - # resampler - ".kv_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -964,21 +945,6 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): ] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - # vision encoder - ".fc1.", - ".fc2.", - ".self_attn.out_proj.", - # resampler - ".kv_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 9e6634a9a7579..6536f9807730c 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1104,20 +1104,6 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_mllama) class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ".fc1.", - ".fc2.", - # The `multi_modal_projector` is at the top level of the model, - # so we can't add a dot in front of it. - "multi_modal_projector." - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index db85a494980a7..7edafcd20b5db 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -337,9 +337,6 @@ class OPTForCausalLM(nn.Module, SupportsPP): "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), } - default_bitsandbytes_target_modules = [ - ".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2." - ] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 998d3723a0d7d..f9e972688ddd1 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -286,9 +286,6 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "k_proj": ("qkv_proj", 1), "v_proj": ("qkv_proj", 2), } - default_bitsandbytes_target_modules = [ - ".q_proj.", ".k_proj.", ".v_proj.", ".fc1.", ".fc2.", ".dense." - ] embedding_modules = {} embedding_padding_modules = [] diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 54158bc141235..937858ee3b8c2 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -16,11 +16,5 @@ class Phi3ForCausalLM(LlamaForCausalLM): } # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_up_proj.", - ".down_proj.", - ".qkv_proj.", - ".o_proj.", - ] # Initialize an empty dict when there is no stacked parameter mapping. bitsandbytes_stacked_params_mapping = {} diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 8f001200308fe..63d1374ab4092 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1028,12 +1028,7 @@ class QWenLLM(QWenBaseModel): embedding_modules = {} embedding_padding_modules = [] - default_bitsandbytes_target_modules = [ - ".c_attn.", - ".c_proj.", - ".w1.", - ".w2.", - ] + # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "w2": ("gate_up_proj", 0), diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 46640226d4cf8..9f706610a129a 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -419,15 +419,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] # BitandBytes specific attributes - default_bitsandbytes_target_modules = [ - ".gate_proj.", - ".down_proj.", - ".up_proj.", - ".q_proj.", - ".k_proj.", - ".v_proj.", - ".o_proj.", - ] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), From e2251109c746f0d08ab9b37b5abcf44ca105d426 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Wed, 27 Nov 2024 01:55:32 -0500 Subject: [PATCH 173/255] [Kernel] Remove if-else with identical branches in marlin 2:4 (#10687) Signed-off-by: Tyler Michael Smith --- .../marlin/sparse/marlin_24_cuda_kernel.cu | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index 8fce76eb52f9b..17837351324be 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -296,13 +296,9 @@ __global__ void Marlin_24( // We use a different scale layout for grouped and column-wise quantization as // we scale a `half2` tile in column-major layout in the former and in // row-major in the latter case. - if (group_blocks != -1) { - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - } else { - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - } + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; // Note that in the original Marlin kernel + // this is (threadIdx.x % 32) / 4 // Precompute which thread should not read memory in which iterations; this is // needed if there are more threads than required for a certain tilesize or From 1209261e937f7cc5a933da48d625d17e6ee8eea9 Mon Sep 17 00:00:00 2001 From: shunxing12345 <168084185+shunxing12345@users.noreply.github.com> Date: Wed, 27 Nov 2024 19:32:35 +0800 Subject: [PATCH 174/255] [Model] Support telechat2 (#10311) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: xiangw2 Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/source/models/supported_models.rst | 5 + tests/models/registry.py | 2 + vllm/model_executor/models/llama.py | 6 +- vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/telechat2.py | 131 +++++++++++++++++++ vllm/transformers_utils/config.py | 4 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/telechat2.py | 61 +++++++++ 8 files changed, 210 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/models/telechat2.py create mode 100644 vllm/transformers_utils/configs/telechat2.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index b5cbe6915d581..c5fbb30b24e28 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -309,6 +309,11 @@ Text Generation - :code:`upstage/solar-pro-preview-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`TeleChat2ForCausalLM` + - TeleChat2 + - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. + - ✅︎ + - ✅︎ * - :code:`XverseForCausalLM` - XVERSE - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. diff --git a/tests/models/registry.py b/tests/models/registry.py index 865e90b3f8b0e..a93bfe907e0d7 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -115,6 +115,8 @@ class _HfExamplesInfo: "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B", + trust_remote_code=True), "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", is_available_online=False, trust_remote_code=True), diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 7cc5547b4a4d5..fffb3fe53b94c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -501,8 +501,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config - self.model = LlamaModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + self.model = self._init_model(vllm_config=vllm_config, prefix=prefix) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -539,6 +538,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=False) + def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): + return LlamaModel(vllm_config=vllm_config, prefix=prefix) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f5a02a5b25ca2..4462f6ed55a9c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -91,6 +91,7 @@ "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), + "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), "XverseForCausalLM": ("xverse", "XverseForCausalLM"), # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), @@ -118,6 +119,7 @@ "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 + "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py new file mode 100644 index 0000000000000..39c9103527f01 --- /dev/null +++ b/vllm/model_executor/models/telechat2.py @@ -0,0 +1,131 @@ +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Iterable, Set, Tuple + +import torch + +from vllm.config import VllmConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.llama import LlamaForCausalLM, LlamaModel + +from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, + is_pp_missing_parameter) + + +class TeleChat2Model(LlamaModel): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + # 1. Initialize the LlamaModel with bias + vllm_config.model_config.hf_config.bias = True + vllm_config.model_config.hf_config.mlp_bias = True + super().__init__(vllm_config=vllm_config, prefix=prefix) + # 2. Remove the bias from the qkv_proj and gate_up_proj based on config + # Telechat2's gate_up_proj and qkv_proj don't have bias + # see: https://github.com/vllm-project/vllm/pull/10311#issuecomment-2490297566 + for layer in self.layers: + if not isinstance(layer, PPMissingLayer): + layer.self_attn.qkv_proj.bias = None + layer.self_attn.qkv_proj.skip_bias_add = True + layer.mlp.gate_up_proj.bias = None + layer.mlp.gate_up_proj.skip_bias_add = True + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + ('gate_up_proj', 'gate_proj', 0), + ('gate_up_proj', 'up_proj', 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + total_num_heads = self.config.n_head + head_dim = self.config.hidden_size // total_num_heads + for name, loaded_weight in weights: + if "self_attn.key_value" in name: + k_weight = [] + v_weight = [] + for i in range(total_num_heads): + start = i * head_dim * 2 + k_weight.append(loaded_weight[start:start + head_dim, :]) + v_weight.append(loaded_weight[start + head_dim:start + + 2 * head_dim:]) + k_weight = torch.cat(k_weight, dim=0) + v_weight = torch.cat(v_weight, dim=0) + name = name.replace("key_value", "qkv_proj") + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, k_weight, "k") + weight_loader(param, v_weight, "v") + elif "query" in name: + name = name.replace("query", "qkv_proj") + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, "q") + else: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class TeleChat2ForCausalLM(LlamaForCausalLM): + + def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): + return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "transformer.": "model.", + }, + orig_to_new_substr={ + ".h.": ".layers.", + ".self_attention.": ".self_attn.", + ".word_embeddings.": ".embed_tokens.", + ".dense.": ".o_proj.", + ".ln_f.": ".norm.", + }, + ) + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4c096acdf2035..3da99bcbee9ae 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -29,7 +29,8 @@ MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, Olmo2Config, RWConfig, - SolarConfig, UltravoxConfig) + SolarConfig, Telechat2Config, + UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import resolve_obj_by_qualname @@ -64,6 +65,7 @@ "NVLM_D": NVLM_D_Config, "olmo2": Olmo2Config, "solar": SolarConfig, + "telechat": Telechat2Config, "ultravox": UltravoxConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 4c721001d8434..c24433cd436b4 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -17,6 +17,7 @@ from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.olmo2 import Olmo2Config from vllm.transformers_utils.configs.solar import SolarConfig +from vllm.transformers_utils.configs.telechat2 import Telechat2Config from vllm.transformers_utils.configs.ultravox import UltravoxConfig __all__ = [ @@ -36,5 +37,6 @@ "NVLM_D_Config", "Olmo2Config", "SolarConfig", + "Telechat2Config", "UltravoxConfig", ] \ No newline at end of file diff --git a/vllm/transformers_utils/configs/telechat2.py b/vllm/transformers_utils/configs/telechat2.py new file mode 100644 index 0000000000000..eb6f5a059169f --- /dev/null +++ b/vllm/transformers_utils/configs/telechat2.py @@ -0,0 +1,61 @@ +# adapted from https://www.modelscope.cn/models/TeleAI/TeleChat2-3B/resolve/master/configuration_telechat2.py +""" Telechat configuration compatible with LlamaConfig. """ + +from transformers.configuration_utils import PretrainedConfig + + +class Telechat2Config(PretrainedConfig): + + model_type = "telechat" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + "intermediate_size": "ffn_hidden_size", + "rms_norm_eps": "layer_norm_epsilon" + } + + def __init__( + self, + vocab_size=160256, + hidden_size=4096, + n_layer=30, + n_head=32, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + ffn_hidden_size=12288, + training_seqlen=8192, + logn=True, + embed_layernorm=False, + hidden_act="silu", + **kwargs, + ): + self.vocab_size = vocab_size + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.apply_residual_connection_post_layernorm = ( + apply_residual_connection_post_layernorm) + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.logn = logn + self.training_seqlen = training_seqlen + self.embed_layernorm = embed_layernorm + self.num_key_value_heads = kwargs.pop("num_key_value_heads", None) + self.ffn_hidden_size = ffn_hidden_size + self.hidden_act = hidden_act + super().__init__(bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs) From 418cb3b93fbf85f0735b5c0ed3f62d4b36808968 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Wed, 27 Nov 2024 19:55:38 +0800 Subject: [PATCH 175/255] [Bugfix][Hardware][CPU] Fix intel-omp version to avoid segfault (#10700) Signed-off-by: jiang1.li --- Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.cpu b/Dockerfile.cpu index d2f72ea975a3d..ebe226cf6d148 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -16,7 +16,7 @@ RUN --mount=type=cache,target=/var/cache/apt \ # intel-openmp provides additional performance improvement vs. openmp # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-openmp + pip install intel-openmp==2025.0.1 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" From 9e0a147d502758ed31b35df1361e37ea6bacd4a0 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 27 Nov 2024 04:26:27 -0800 Subject: [PATCH 176/255] [V1] Update interface for mistral-format Pixtral (#10703) Signed-off-by: Roger Wang --- vllm/model_executor/models/pixtral.py | 47 ++++++++++++++++----------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6711cbf5694b9..45171c1a04b17 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -31,7 +31,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import PlaceholderRange +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, resolve_visual_encoder_outputs) @@ -190,6 +190,25 @@ def sampler(self): return get_sampler() + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.vision_args.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -197,31 +216,21 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for pixtral. - - TODO - """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.vision_args.image_token_id) - - input_ids = None - else: - inputs_embeds = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, From 308cc5e21e12fb0eea0a960d147dca7efc59d92f Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 27 Nov 2024 09:26:14 -0800 Subject: [PATCH 177/255] [ci] fix slow tests (#10698) Signed-off-by: youkaichao --- tests/entrypoints/llm/test_lazy_outlines.py | 22 ++++++++++++++----- tests/test_lazy_torch_compile.py | 22 ++++++++++++++----- .../vllm_test_utils/vllm_test_utils/blame.py | 10 ++++----- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 81fb000d8ac56..2c53676c5f5dd 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,6 +1,7 @@ import sys +from contextlib import nullcontext -from vllm_test_utils import blame +from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory @@ -56,9 +57,20 @@ def test_lazy_outlines(sample_regex): """ # make sure outlines is not imported module_name = "outlines" - with blame(lambda: module_name in sys.modules) as result: + # In CI, we only check finally if the module is imported. + # If it is indeed imported, we can rerun the test with `use_blame=True`, + # which will trace every function call to find the first import location, + # and help find the root cause. + # We don't run it in CI by default because it is slow. + use_blame = False + context = blame( + lambda: module_name in sys.modules) if use_blame else nullcontext() + with context as result: run_normal() run_lmfe(sample_regex) - assert not result.found, ( - f"Module {module_name} is already imported, the" - f" first import location is:\n{result.trace_stack}") + if use_blame: + assert isinstance(result, BlameResult) + print(f"the first import location is:\n{result.trace_stack}") + assert module_name not in sys.modules, ( + f"Module {module_name} is imported. To see the first" + f" import location, run the test with `use_blame=True`.") diff --git a/tests/test_lazy_torch_compile.py b/tests/test_lazy_torch_compile.py index 4756fac8e2a8d..b950877a4337b 100644 --- a/tests/test_lazy_torch_compile.py +++ b/tests/test_lazy_torch_compile.py @@ -2,15 +2,27 @@ # The utility function cannot be placed in `vllm.utils` # this needs to be a standalone script import sys +from contextlib import nullcontext -from vllm_test_utils import blame +from vllm_test_utils import BlameResult, blame module_name = "torch._inductor.async_compile" -with blame(lambda: module_name in sys.modules) as result: +# In CI, we only check finally if the module is imported. +# If it is indeed imported, we can rerun the test with `use_blame=True`, +# which will trace every function call to find the first import location, +# and help find the root cause. +# We don't run it in CI by default because it is slow. +use_blame = False +context = blame( + lambda: module_name in sys.modules) if use_blame else nullcontext() +with context as result: import vllm # noqa -assert not result.found, (f"Module {module_name} is already imported, the" - f" first import location is:\n{result.trace_stack}") +if use_blame: + assert isinstance(result, BlameResult) + print(f"the first import location is:\n{result.trace_stack}") -print(f"Module {module_name} is not imported yet") +assert module_name not in sys.modules, ( + f"Module {module_name} is imported. To see the first" + f" import location, run the test with `use_blame=True`.") diff --git a/tests/vllm_test_utils/vllm_test_utils/blame.py b/tests/vllm_test_utils/vllm_test_utils/blame.py index ad23ab83c2d81..1ddd3471d357b 100644 --- a/tests/vllm_test_utils/vllm_test_utils/blame.py +++ b/tests/vllm_test_utils/vllm_test_utils/blame.py @@ -46,8 +46,8 @@ def _trace_calls(frame, event, arg=None): pass return _trace_calls - sys.settrace(_trace_calls) - - yield result - - sys.settrace(None) + try: + sys.settrace(_trace_calls) + yield result + finally: + sys.settrace(None) From c411def234b0e85a349c8d95b5f32eade4aa1ed6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 27 Nov 2024 10:16:10 -0800 Subject: [PATCH 178/255] [torch.compile] fix shape specialization (#10722) Signed-off-by: youkaichao --- vllm/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 68f73bf4b4dc9..cd24e9ffdf598 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2151,7 +2151,7 @@ class CompilationConfig(BaseModel): use_inductor: bool = True inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None - inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) + inductor_compile_sizes: Optional[List[int]] = Field(default=None) inductor_compile_config: Dict = Field(default_factory=dict) inductor_passes: Dict[str, str] = Field(default_factory=dict) @@ -2290,9 +2290,8 @@ def init_during_runtime(self): if x <= self.inductor_specialize_for_cudagraph_no_more_than ] else: - assert self.inductor_compile_sizes is not None, ( - "inductor_compile_sizes should not be None when " - "inductor_specialize_for_cudagraph_no_more_than is None") + if self.inductor_compile_sizes is None: + self.inductor_compile_sizes = [] self.compile_sizes = self.inductor_compile_sizes From b98c62ba4947b93673c522b13464854acf8090a4 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 28 Nov 2024 02:43:17 +0800 Subject: [PATCH 179/255] [Bugfix] Fix GGUF inference with FP16 unquantized checkpoint (#10675) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../layers/quantization/gguf.py | 69 ++++++++++++++++--- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index 24138662eb25c..f0943efa0039d 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -2,6 +2,7 @@ import gguf import torch +from gguf import GGMLQuantizationType as WeightType from torch.nn.parameter import Parameter, UninitializedParameter from vllm import _custom_ops as ops @@ -49,19 +50,65 @@ def get_quant_method(self, layer: torch.nn.Module, return None +UNQUANTIZED_TYPES = {WeightType.F32, WeightType.F16, WeightType.BF16} +STANDARD_QUANT_TYPES = { + WeightType.Q4_0, + WeightType.Q4_1, + WeightType.Q5_0, + WeightType.Q5_1, + WeightType.Q8_0, + WeightType.Q8_1, +} +KQUANT_TYPES = { + WeightType.Q2_K, + WeightType.Q3_K, + WeightType.Q4_K, + WeightType.Q5_K, + WeightType.Q6_K, +} +IMATRIX_QUANT_TYPES = { + WeightType.IQ1_M, + WeightType.IQ1_S, + WeightType.IQ2_XXS, + WeightType.IQ2_XS, + WeightType.IQ2_S, + WeightType.IQ3_XXS, + WeightType.IQ3_S, + WeightType.IQ4_XS, + WeightType.IQ4_NL, +} +# TODO(Isotr0py): Currently, we don't have MMQ kernel for I-Matrix quantization. +# Consolidate DEQUANT_TYPES, MMVQ_QUANT_TYPES and MMQ_QUANT_TYPES after we add +# MMQ kernel for I-Matrix quantization. +DEQUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES +MMVQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES | IMATRIX_QUANT_TYPES +MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES + + def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: - # use dequantize mulmat for IQmatrix, mmq for k-quants - if x.shape[0] == 1: - # enable mmvq in contiguous batching + # there is no need to call any kernel for fp16/bf16 + if qweight_type in UNQUANTIZED_TYPES: + return x @ qweight.T + # enable MMVQ in contiguous batching with batch_size=1 + if x.shape[0] == 1 and qweight_type in MMVQ_QUANT_TYPES: y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0]) - elif qweight_type >= 16: + # Use MMQ Kernel if it's available (standard + k-quants) + elif qweight_type in MMQ_QUANT_TYPES: + y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) + # If there is no available MMQ kernel, fallback to dequantize + elif qweight_type in DEQUANT_TYPES: block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type] shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size) weight = ops.ggml_dequantize(qweight, qweight_type, *shape) y = x @ weight.T else: - y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0]) + # Raise an error if the quantization type is not supported. + # Might be useful if llama.cpp adds a new quantization type. + # Wrap to GGMLQuantizationType IntEnum to make sure it's a valid type. + qweight_type = WeightType(qweight_type) + raise NotImplementedError( + f"Unsupported GGUF quantization type: {qweight_type}") return y @@ -121,9 +168,9 @@ def apply(self, shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id qweight = layer.qweight.unbind(0) result = [] - for id in shard_id: - q_idx = layer.qweight.shard_id_map[id] - qweight_type = layer.qweight_type.shard_weight_type[id] + for idx in shard_id: + q_idx = layer.qweight.shard_id_map[idx] + qweight_type = layer.qweight_type.shard_weight_type[idx] result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type)) out = torch.cat(result, axis=1) else: @@ -163,9 +210,13 @@ class GGUFUninitializedParameter(UninitializedParameter): data_container: List[torch.Tensor] def materialize_nested(self) -> Parameter: + dtype = {data.dtype for data in self.data_container} + assert len(dtype) == 1, ValueError( + f"Data container has mixed dtypes: {dtype}") + dtype = next(iter(dtype)) nested_data = torch.nested.nested_tensor(self.data_container, device=self.device, - dtype=torch.uint8) + dtype=dtype) self.data_container.clear() param = torch.Tensor._make_subclass(self.cls_to_become, nested_data, From 197b4484a3fba4a98921f903d6242677f97c63db Mon Sep 17 00:00:00 2001 From: Mor Zusman Date: Wed, 27 Nov 2024 21:02:27 +0200 Subject: [PATCH 180/255] [Bugfix][Mamba] Fix Multistep on Mamba-like models (#10705) Signed-off-by: mzusman --- .../decoder_only/language/test_jamba.py | 38 +++++++++++++++++++ .../decoder_only/language/test_mamba.py | 36 ++++++++++++++++++ vllm/engine/async_llm_engine.py | 7 +++- vllm/engine/llm_engine.py | 7 +++- 4 files changed, 84 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 6542689c3f277..87a05b3011393 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -275,6 +275,44 @@ def test_state_cleanup( "could be related to finished_requests_ids") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_multistep( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is verifying that multistep works correctly + #on mamba-like models + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 10, 1) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness(vllm_runner, model: str, dtype: str, + max_tokens: int, example_prompts) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) + + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 78eab8d5354fd..01e208347bff4 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -283,3 +283,39 @@ def test_state_cleanup( except ValueError: pytest.fail("Mamba inner state wasn't cleaned up between states, " "could be related to finished_requests_ids") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_multistep( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 10, 1) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness(vllm_runner, model: str, dtype: str, + max_tokens: int, example_prompts) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3224577c567f8..31a15b04314d5 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -300,6 +300,9 @@ async def step_async( ctx.seq_group_metadata_list = seq_group_metadata_list ctx.scheduler_outputs = scheduler_outputs + finished_requests_ids = self.scheduler[ + virtual_engine].get_and_reset_finished_requests_ids() + # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) @@ -311,13 +314,13 @@ async def step_async( self._cache_scheduler_outputs_for_multi_step( virtual_engine, seq_group_metadata_list, scheduler_outputs, allow_async_output_proc) + else: + finished_requests_ids = list() assert seq_group_metadata_list is not None assert scheduler_outputs is not None if not scheduler_outputs.is_empty(): - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() # Check if we have a cached last_output from the previous iteration. # For supporting PP this is probably the best way to pass the diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a4975cece9a81..ecc222f692c41 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1398,6 +1398,9 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: ctx.seq_group_metadata_list = seq_group_metadata_list ctx.scheduler_outputs = scheduler_outputs + finished_requests_ids = self.scheduler[ + virtual_engine].get_and_reset_finished_requests_ids() + # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) @@ -1409,13 +1412,13 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: self._cache_scheduler_outputs_for_multi_step( virtual_engine, seq_group_metadata_list, scheduler_outputs, allow_async_output_proc) + else: + finished_requests_ids = list() assert seq_group_metadata_list is not None assert scheduler_outputs is not None if not scheduler_outputs.is_empty(): - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() # Check if we have a cached last_output from the previous iteration. # For supporting PP this is probably the best way to pass the From 9b4b150395d509a35031e58fb6e0f3331b532055 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 28 Nov 2024 03:05:29 +0800 Subject: [PATCH 181/255] [Bugfix] Ignore `lm_head` when loading embedding models (#10719) Signed-off-by: DarkLight1337 --- vllm/model_executor/models/bert.py | 2 ++ vllm/model_executor/models/gemma2.py | 2 ++ vllm/model_executor/models/llama.py | 2 ++ vllm/model_executor/models/qwen2.py | 2 ++ 4 files changed, 8 insertions(+) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 1fff72b3490e9..053d838432885 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -443,6 +443,8 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) def _build_model(self, diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c93223c740272..d35fcb012e166 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -504,4 +504,6 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fffb3fe53b94c..fe94bb352961b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -689,6 +689,8 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) def load_kv_cache_scales(self, quantization_param_path: str) -> None: diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 9f706610a129a..87943e53d861c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -580,4 +580,6 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) self.model.load_weights(weights) From 395b1c74543053ebf25d4ab3af828cd145506caa Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Wed, 27 Nov 2024 23:21:10 +0200 Subject: [PATCH 182/255] [Frontend] don't block event loop in tokenization (preprocess) in OpenAI compatible server (#10635) Signed-off-by: Tomer Asida --- .../openai/test_async_tokenization.py | 137 ++++++++++++++++++ vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_embedding.py | 15 +- vllm/entrypoints/openai/serving_engine.py | 75 +++++----- vllm/entrypoints/openai/serving_score.py | 10 +- .../openai/serving_tokenization.py | 15 +- vllm/utils.py | 8 +- 7 files changed, 206 insertions(+), 56 deletions(-) create mode 100644 tests/entrypoints/openai/test_async_tokenization.py diff --git a/tests/entrypoints/openai/test_async_tokenization.py b/tests/entrypoints/openai/test_async_tokenization.py new file mode 100644 index 0000000000000..fcce8b46c4344 --- /dev/null +++ b/tests/entrypoints/openai/test_async_tokenization.py @@ -0,0 +1,137 @@ +import asyncio +import contextlib +import random +import time +from typing import Callable + +import openai +import pytest +import pytest_asyncio +import requests + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + + +@pytest.fixture(scope="module") +def server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + "--load-format", + "dummy", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ids=["completion", "chat"], + argnames=["create_func_gen", "content_body"], + argvalues=[ + (lambda x: x.completions.create, { + "prompt": " ".join(['A'] * 10_000) + }), + (lambda x: x.chat.completions.create, { + "messages": [{ + "role": "user", + "content": " ".join(['A'] * 10_000) + }] + }), + ], +) +async def test_with_and_without_truncate( + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + create_func_gen: Callable, + content_body: dict, +): + create_func = create_func_gen(client) + body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} + + num_requests = 10 + truncate_prompt_tokens = ([1000] * (num_requests // 2) + [None] * + (num_requests - num_requests // 2)) + random.shuffle(truncate_prompt_tokens) + + bodies = [{ + **body, "extra_body": { + 'truncate_prompt_tokens': t + } + } for t in truncate_prompt_tokens] + + async def get_status_code(**kwargs): + try: + await create_func(**kwargs) + return 200 + except openai.APIStatusError as e: + return e.status_code + + responses = await asyncio.gather(*[get_status_code(**b) for b in bodies]) + assert 500 not in responses + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ids=["single completion", "multiple completions", "chat"], + argnames=["create_func_gen", "content_body"], + argvalues=[ + (lambda x: x.completions.create, { + "prompt": " ".join(['A'] * 300_000) + }), + (lambda x: x.completions.create, { + "prompt": [" ".join(['A'] * 300_000)] * 2 + }), + (lambda x: x.chat.completions.create, { + "messages": [{ + "role": "user", + "content": " ".join(['A'] * 300_000) + }] + }), + ], +) +async def test_healthcheck_response_time( + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + create_func_gen: Callable, + content_body: dict, +): + num_requests = 50 + + create_func = create_func_gen(client) + body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} + + def get_response_time(url): + start_time = time.monotonic() + res = requests.get(url) + end_time = time.monotonic() + assert res.status_code == 200 + return end_time - start_time + + no_load_response_time = get_response_time(server.url_for("health")) + tasks = [ + asyncio.create_task(create_func(**body)) for _ in range(num_requests) + ] + await asyncio.sleep(1) # give the tasks a chance to start running + load_response_time = get_response_time(server.url_for("health")) + + with contextlib.suppress(openai.APIStatusError): + await asyncio.gather(*tasks) + + assert load_response_time < 100 * no_load_response_time + assert load_response_time < 0.1 diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 936aae8f1c267..fc1c4908d6650 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -101,7 +101,7 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) - request_prompts, engine_prompts = self._preprocess_completion( + request_prompts, engine_prompts = await self._preprocess_completion( request, tokenizer, request.prompt, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index c84a7d2d8e13e..78e2416d9d4da 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -156,13 +156,14 @@ async def create_embedding( add_special_tokens=request.add_special_tokens, ) else: - request_prompts, engine_prompts = self._preprocess_completion( - request, - tokenizer, - request.input, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) + (request_prompts, + engine_prompts) = await self._preprocess_completion( + request, + tokenizer, + request.input, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index cae2877ea7e99..8232c6116c1bd 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,5 +1,6 @@ import json import pathlib +from concurrent.futures.thread import ThreadPoolExecutor from dataclasses import dataclass from http import HTTPStatus from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, @@ -46,7 +47,7 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import AtomicCounter, is_list_of +from vllm.utils import AtomicCounter, is_list_of, make_async logger = init_logger(__name__) @@ -140,6 +141,14 @@ def __init__( self.request_logger = request_logger self.return_tokens_as_token_ids = return_tokens_as_token_ids + self._tokenizer_executor = ThreadPoolExecutor(max_workers=1) + + self._tokenize_prompt_input_async = make_async( + self._tokenize_prompt_input, executor=self._tokenizer_executor) + self._tokenize_prompt_input_or_inputs_async = make_async( + self._tokenize_prompt_input_or_inputs, + executor=self._tokenizer_executor) + async def show_available_models(self) -> ModelList: """Show available models. Right now we only have one model.""" model_cards = [ @@ -368,7 +377,7 @@ def _tokenize_prompt_input_or_inputs( input_or_inputs: Union[str, List[str], List[int], List[List[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Iterator[TextTokensPrompt]: + ) -> List[TextTokensPrompt]: """ Tokenize/detokenize depending on the input format. @@ -376,45 +385,41 @@ def _tokenize_prompt_input_or_inputs( , each input can be a string or array of tokens. Note that each request can pass one or more inputs. """ - for prompt_input in parse_and_batch_prompt(input_or_inputs): - # Although our type checking is based on mypy, - # VSCode Pyright extension should still work properly - # "is True" is required for Pyright to perform type narrowing - # See: https://github.com/microsoft/pyright/issues/7672 - if prompt_input["is_tokens"] is False: - yield self._normalize_prompt_text_to_input( - request, - tokenizer, - prompt=prompt_input["content"], - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=add_special_tokens, - ) - else: - yield self._normalize_prompt_tokens_to_input( - request, - tokenizer, - prompt_ids=prompt_input["content"], - truncate_prompt_tokens=truncate_prompt_tokens, - ) + # Although our type checking is based on mypy, + # VSCode Pyright extension should still work properly + # "is True" is required for Pyright to perform type narrowing + # See: https://github.com/microsoft/pyright/issues/7672 + return [ + self._normalize_prompt_text_to_input( + request, + tokenizer, + prompt=prompt_input["content"], + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens) + if prompt_input["is_tokens"] is False else + self._normalize_prompt_tokens_to_input( + request, + tokenizer, + prompt_ids=prompt_input["content"], + truncate_prompt_tokens=truncate_prompt_tokens) + for prompt_input in parse_and_batch_prompt(input_or_inputs) + ] - def _preprocess_completion( + async def _preprocess_completion( self, request: CompletionLikeRequest, tokenizer: AnyTokenizer, input_or_inputs: Union[str, List[str], List[int], List[List[int]]], truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, add_special_tokens: bool = True, - ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]: - request_prompts = [ - request_prompt - for request_prompt in self._tokenize_prompt_input_or_inputs( - request, - tokenizer, - input_or_inputs, - truncate_prompt_tokens=truncate_prompt_tokens, - add_special_tokens=add_special_tokens, - ) - ] + ) -> Tuple[List[TextTokensPrompt], List[TokensPrompt]]: + request_prompts = await self._tokenize_prompt_input_or_inputs_async( + request, + tokenizer, + input_or_inputs, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) engine_prompts = [ TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"]) @@ -493,7 +498,7 @@ async def _preprocess_chat( request=request) if isinstance(request_prompt, str): - prompt_inputs = self._tokenize_prompt_input( + prompt_inputs = await self._tokenize_prompt_input_async( request, tokenizer, request_prompt, diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 156fea6f47982..7cd8ff08b5608 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.outputs import EmbeddingRequestOutput from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer -from vllm.utils import merge_async_iterators, random_uuid +from vllm.utils import make_async, merge_async_iterators, random_uuid logger = init_logger(__name__) @@ -145,9 +145,11 @@ async def create_score( tokenization_kwargs["truncation"] = True tokenization_kwargs["max_length"] = truncate_prompt_tokens - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=q, + text_pair=t, + **tokenization_kwargs) engine_prompt = TokensPrompt( prompt_token_ids=prompt_inputs["input_ids"], token_type_ids=prompt_inputs.get("token_type_ids")) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 59b3b1311f881..9c3dc2c98b2dd 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -81,12 +81,13 @@ async def create_tokenize( add_special_tokens=request.add_special_tokens, ) else: - request_prompts, engine_prompts = self._preprocess_completion( - request, - tokenizer, - request.prompt, - add_special_tokens=request.add_special_tokens, - ) + (request_prompts, + engine_prompts) = await self._preprocess_completion( + request, + tokenizer, + request.prompt, + add_special_tokens=request.add_special_tokens, + ) except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -134,7 +135,7 @@ async def create_detokenize( # Silently ignore prompt adapter since it does not affect tokenization # (Unlike in Embeddings API where an error is raised) - prompt_input = self._tokenize_prompt_input( + prompt_input = await self._tokenize_prompt_input_async( request, tokenizer, request.tokens, diff --git a/vllm/utils.py b/vllm/utils.py index bec876d983701..6f7a6f8c54e47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1,5 +1,6 @@ import argparse import asyncio +import concurrent import contextlib import datetime import enum @@ -351,7 +352,10 @@ def in_wsl() -> bool: return "microsoft" in " ".join(uname()).lower() -def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: +def make_async( + func: Callable[P, T], + executor: Optional[concurrent.futures.Executor] = None +) -> Callable[P, Awaitable[T]]: """Take a blocking function, and run it on in an executor thread. This function prevents the blocking function from blocking the @@ -362,7 +366,7 @@ def make_async(func: Callable[P, T]) -> Callable[P, Awaitable[T]]: def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: loop = asyncio.get_event_loop() p_func = partial(func, *args, **kwargs) - return loop.run_in_executor(executor=None, func=p_func) + return loop.run_in_executor(executor=executor, func=p_func) return _async_wrapper From cb4e1c3f3aee507130b64c9bacf5778ed265785d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 27 Nov 2024 19:54:58 -0800 Subject: [PATCH 183/255] [misc] upgrade filelock version (#10731) Signed-off-by: youkaichao --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index f62ad66a1ecc4..02e3d65fb774c 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -20,7 +20,7 @@ tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 typing_extensions >= 4.10 -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs pyzmq msgspec From 70dc14fbd09d054ff75850036b81212ca67e5275 Mon Sep 17 00:00:00 2001 From: zixuanzhang226 Date: Wed, 27 Nov 2024 23:58:02 -0800 Subject: [PATCH 184/255] [Model] support bitsandbytes quantization with minicpm3 model (#10682) Signed-off-by: Ubuntu --- vllm/model_executor/models/minicpm3.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c38c31a0d4953..c66be2d9c2d07 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -241,6 +241,12 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): # `embedding_modules` and `embedding_padding_modules` # are inherited from MiniCPMForCausalLM + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = MiniCPM3Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) From 278be671a355ea89843141928a426a303bfd8036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=BD=97=E6=B3=BD=E8=BD=A9?= Date: Thu, 28 Nov 2024 15:58:39 +0800 Subject: [PATCH 185/255] [Doc] Update model in arch_overview.rst to match comment (#10701) Signed-off-by: spacewander --- docs/source/design/arch_overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/design/arch_overview.rst b/docs/source/design/arch_overview.rst index a9e7b4bd69bc7..bc3f509f0a66e 100644 --- a/docs/source/design/arch_overview.rst +++ b/docs/source/design/arch_overview.rst @@ -42,7 +42,7 @@ Here is a sample of `LLM` class usage: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Initialize the LLM engine with the OPT-125M model - llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct") + llm = LLM(model="facebook/opt-125m") # Generate outputs for the input prompts outputs = llm.generate(prompts, sampling_params) From d9b4b3f069a9f602b067a5bb3efe57b106d39c09 Mon Sep 17 00:00:00 2001 From: Ricky Xu Date: Wed, 27 Nov 2024 23:59:28 -0800 Subject: [PATCH 186/255] [Bug][CLI] Allow users to disable prefix caching explicitly (#10724) Signed-off-by: rickyx --- tests/engine/test_arg_utils.py | 19 +++++++++++++++++++ tests/v1/engine/test_engine_args.py | 19 +++++++++++++++++++ vllm/engine/arg_utils.py | 10 +++++++--- 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index 5b0e76fe53685..de78d41ad12eb 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -59,6 +59,25 @@ def test_compilation_config(): assert args.compilation_config.level == 3 +def test_prefix_cache_default(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args([]) + + engine_args = EngineArgs.from_cli_args(args=args) + assert (not engine_args.enable_prefix_caching + ), "prefix caching defaults to off." + + # with flag to turn it on. + args = parser.parse_args(["--enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.enable_prefix_caching + + # with disable flag to turn it off. + args = parser.parse_args(["--no-enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert not engine_args.enable_prefix_caching + + def test_valid_pooling_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) args = parser.parse_args([ diff --git a/tests/v1/engine/test_engine_args.py b/tests/v1/engine/test_engine_args.py index 69cfdf5a395c1..ac5e7dde525a7 100644 --- a/tests/v1/engine/test_engine_args.py +++ b/tests/v1/engine/test_engine_args.py @@ -4,6 +4,7 @@ from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.usage.usage_lib import UsageContext +from vllm.utils import FlexibleArgumentParser if not envs.VLLM_USE_V1: pytest.skip( @@ -12,6 +13,24 @@ ) +def test_prefix_caching_from_cli(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args([]) + engine_args = EngineArgs.from_cli_args(args=args) + assert (engine_args.enable_prefix_caching + ), "V1 turns on prefix caching by default." + + # Turn it off possible with flag. + args = parser.parse_args(["--no-enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert not engine_args.enable_prefix_caching + + # Turn it on with flag. + args = parser.parse_args(["--enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.enable_prefix_caching + + def test_defaults(): engine_args = EngineArgs(model="facebook/opt-125m") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 90b4798f17a13..f0020562c3c3a 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -416,9 +416,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'tokens. This is ignored on neuron devices and ' 'set to max-model-len') - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='Enables automatic prefix caching.') + parser.add_argument( + "--enable-prefix-caching", + action=argparse.BooleanOptionalAction, + default=EngineArgs.enable_prefix_caching, + help="Enables automatic prefix caching. " + "Use --no-enable-prefix-caching to disable explicitly.", + ) parser.add_argument('--disable-sliding-window', action='store_true', help='Disables sliding window, ' From a79b1224005836bdf0ab6d3bab807d2f5d8a5ef1 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 00:13:15 -0800 Subject: [PATCH 187/255] [V1] Do not allocate beyond the max_model_len (#10730) Signed-off-by: Woosuk Kwon --- tests/v1/core/test_prefix_caching.py | 24 ++++++++++++++++-------- vllm/v1/core/kv_cache_manager.py | 17 +++++++++++++++++ vllm/v1/core/scheduler.py | 15 ++++++++------- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 83bfbb6ade8d7..b44d3e5cb0678 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -23,7 +23,8 @@ def test_prefill(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -121,7 +122,8 @@ def test_decode(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -172,7 +174,8 @@ def test_evict(): manager = KVCacheManager( block_size=16, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=16, ) @@ -220,7 +223,8 @@ def test_hash_block_correct_reuse(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=1, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) @@ -256,7 +260,8 @@ def test_computed_blocks_not_evicted(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=2, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) @@ -303,7 +308,8 @@ def test_basic_prefix_caching_disabled(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=4, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=False, num_preallocate_tokens=0, ) @@ -342,7 +348,8 @@ def test_preallocate_blocks(num_preallocate_tokens: int, block_size: int): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=10, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=num_preallocate_tokens, ) @@ -370,7 +377,8 @@ def test_cache_blocks(): manager = KVCacheManager( block_size=block_size, num_gpu_blocks=5, - sliding_window=False, + max_model_len=8192, + sliding_window=None, enable_caching=True, num_preallocate_tokens=0, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 8eb3fb976eb87..b492a755e6dd5 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -17,12 +17,15 @@ def __init__( self, block_size: int, num_gpu_blocks: int, + max_model_len: int, sliding_window: Optional[int] = None, enable_caching: bool = True, num_preallocate_tokens: int = 64, ) -> None: self.block_size = block_size self.num_gpu_blocks = num_gpu_blocks + self.max_model_len = max_model_len + self.max_num_blocks_per_req = cdiv(max_model_len, block_size) self.sliding_window = sliding_window self.enable_caching = enable_caching # NOTE(woosuk): To avoid frequent block allocation, we preallocate some @@ -132,7 +135,14 @@ def append_slots( num_new_blocks = min( num_new_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks, + # Should not exceed the maximum number of blocks per request. + # This is especially because the block table has the shape + # [..., max_num_blocks_per_req]. + # TODO(woosuk): Check and reject requests if + # num_prompt_tokens + max_tokens > max_model_len. + self.max_num_blocks_per_req - len(req_blocks), ) + assert num_new_blocks > 0 new_blocks = self._get_new_blocks(num_new_blocks) req_blocks.extend(new_blocks) @@ -212,7 +222,14 @@ def allocate_slots( num_required_blocks + self.num_preallocate_blocks, self.free_block_queue.num_free_blocks - num_evictable_computed_blocks, + # Should not exceed the maximum number of blocks per request. + # This is especially because the block table has the shape + # [..., max_num_blocks_per_req]. + # TODO(woosuk): Check and reject requests if + # num_prompt_tokens + max_tokens > max_model_len. + self.max_num_blocks_per_req - len(computed_blocks), ) + assert num_new_blocks > 0 # Concatenate the computed block IDs and the new block IDs. new_blocks = self._get_new_blocks(num_new_blocks) diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ba50a9786d805..f1f26f4e8d443 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -33,22 +33,23 @@ def __init__( # TODO: Support LoRA. assert lora_config is None, "V1 does not support LoRA yet." + # Scheduling constraints. + self.max_num_running_reqs = self.scheduler_config.max_num_seqs + self.max_num_scheduled_tokens = \ + self.scheduler_config.max_num_batched_tokens + self.max_model_len = self.scheduler_config.max_model_len + num_gpu_blocks = cache_config.num_gpu_blocks assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 - # Create the block space manager. + # Create the KV cache manager. self.kv_cache_manager = KVCacheManager( block_size=self.cache_config.block_size, num_gpu_blocks=num_gpu_blocks, + max_model_len=self.max_model_len, sliding_window=self.cache_config.sliding_window, enable_caching=self.cache_config.enable_prefix_caching) self.block_size = self.cache_config.block_size - # Scheduling constraints. - self.max_num_running_reqs = self.scheduler_config.max_num_seqs - self.max_num_scheduled_tokens = \ - self.scheduler_config.max_num_batched_tokens - self.max_model_len = self.scheduler_config.max_model_len - # req_id -> Request self.requests: Dict[str, Request] = {} # Priority queues for requests. From 9a8bff028595d1c5c52bc225013908ca7a7b66d8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 02:25:59 -0800 Subject: [PATCH 188/255] [Kernel] Update vllm-flash-attn version (#10736) Signed-off-by: Woosuk Kwon --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 882d4412632a5..45a3b484e0360 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 + GIT_TAG d886f88165702b3c7e7744502772cd98b06be9e1 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 3ed5e7314667f0a9c0c47e6d635ac82fd93296a2 Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Thu, 28 Nov 2024 02:30:48 -0800 Subject: [PATCH 189/255] [TPU] Update requirements-tpu (#10726) Signed-off-by: Richard Liu --- requirements-tpu.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 3d1e80f6be620..b8f0b15469e77 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -16,8 +16,8 @@ ray[default] --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241114+cpu -torchvision==0.20.0.dev20241114+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241114-cp310-cp310-linux_x86_64.whl -jaxlib==0.4.32.dev20240829 -jax==0.4.32.dev20240829 +torch==2.6.0.dev20241126+cpu +torchvision==0.20.0.dev20241126+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.36.dev20241122 +jax==0.4.36.dev20241122 From 5fc5ce0fe45f974fc8840175e8321652238400f0 Mon Sep 17 00:00:00 2001 From: sixgod Date: Thu, 28 Nov 2024 22:53:31 +0800 Subject: [PATCH 190/255] [Model] Added GLM-4 series hf format model support vllm==0.6.4 (#10561) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- docs/source/models/supported_models.rst | 5 +++++ tests/models/registry.py | 1 + tests/models/test_initialization.py | 2 +- vllm/model_executor/models/glm.py | 21 +++++++++++++++++++++ vllm/model_executor/models/registry.py | 2 ++ 5 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/glm.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index c5fbb30b24e28..fd0671beacee7 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -139,6 +139,11 @@ Text Generation - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. - ✅︎ - ✅︎ + * - :code:`GlmForCausalLM` + - GLM-4 + - :code:`THUDM/glm-4-9b-chat-hf`, etc. + - ✅︎ + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. diff --git a/tests/models/registry.py b/tests/models/registry.py index a93bfe907e0d7..461f453d8b1c3 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -63,6 +63,7 @@ class _HfExamplesInfo: "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), + "GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"), "GPT2LMHeadModel": _HfExamplesInfo("gpt2"), "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"), "GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"), diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b8312c2d9b7cc..2a072737db043 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): - if (model_arch == "Idefics3ForConditionalGeneration" + if (model_arch in {"Idefics3ForConditionalGeneration", "GlmForCausalLM"} and transformers.__version__ < "4.46.0"): pytest.skip(reason="Model introduced in HF >= 4.46.0") diff --git a/vllm/model_executor/models/glm.py b/vllm/model_executor/models/glm.py new file mode 100644 index 0000000000000..942d1e14baed1 --- /dev/null +++ b/vllm/model_executor/models/glm.py @@ -0,0 +1,21 @@ +"""Inference-only HF format GLM-4 model compatible with THUDM weights.""" +from vllm.config import VllmConfig +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import PPMissingLayer + + +class GlmForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + # Hack Llama model to fit HF format GLM implementation + # Attention difference between GLM and Llama: + # 1. Half partial rotary_dim and no Neox style. + # 2. There is no bias for o_proj in attention + for layer in self.model.layers: + if not isinstance(layer, PPMissingLayer): + layer.self_attn.rotary_emb.rotary_dim //= 2 + layer.self_attn.rotary_emb.is_neox_style = False + layer.self_attn.o_proj.bias = None + layer.self_attn.o_proj.skip_bias_add = True diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 4462f6ed55a9c..c400c7d59828c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -48,6 +48,7 @@ "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), + "GlmForCausalLM": ("glm", "GlmForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), @@ -107,6 +108,7 @@ "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), + "GlmForCausalLM": ("glm", "GlmForCausalLM"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), **{ # Multiple models share the same architecture, so we include them all From 8c1e77fb585c4f42783a3d88c1efc7c9e15fd89f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 08:31:28 -0800 Subject: [PATCH 191/255] [Kernel] Update vllm-flash-attn version to reduce CPU overheads (#10742) Signed-off-by: Woosuk Kwon --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45a3b484e0360..f43bf8143458b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG d886f88165702b3c7e7744502772cd98b06be9e1 + GIT_TAG fdf6d72b48aea41f4ae6a89139a453dae554abc8 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn From 98f47f2a4032f8c395268de80858c64ffcfc60fa Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 28 Nov 2024 09:01:02 -0800 Subject: [PATCH 192/255] [V1] Optimize the CPU overheads in FlashAttention custom op (#10733) Signed-off-by: Woosuk Kwon --- vllm/v1/attention/backends/flash_attn.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 5f8535eaa303f..e618edf7d35bf 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -135,6 +135,13 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + # Reshape the query, key, and value tensors. + # NOTE(woosuk): We do this outside the custom op to minimize the CPU + # overheads from the non-CUDA-graph regions. + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + output = torch.empty_like(query) torch.ops.vllm.unified_v1_flash_attention( output, @@ -153,7 +160,7 @@ def forward( self.alibi_slopes, self.logits_soft_cap, ) - return output + return output.view(-1, self.num_heads * self.head_size) def unified_v1_flash_attention( @@ -184,11 +191,6 @@ def unified_v1_flash_attention( attn_metadata: FlashAttentionMetadata = current_metadata num_actual_tokens = attn_metadata.num_actual_tokens - # Reshape the query, key, and value tensors. - query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - # Reshape the input keys and values and store them in the cache. key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -218,8 +220,7 @@ def unified_v1_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - attn_output = attn_output.view(num_actual_tokens, -1) - # TODO(woosuk): Optimize this. + # TODO(woosuk): Remove this unnecessary copy. output[:num_actual_tokens].copy_(attn_output) From c83919c7a6bd47bb452321f08017ef5a5cdd553a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 29 Nov 2024 01:29:04 +0800 Subject: [PATCH 193/255] [Model] Add Internlm2 LoRA support (#5064) Signed-off-by: Isotr0py <2037008807@qq.com> --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/internlm2.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index fd0671beacee7..7b7a83f20871b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -182,7 +182,7 @@ Text Generation * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. - - + - ✅︎ - ✅︎ * - :code:`JAISLMHeadModel` - Jais diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 906128940ff76..41b9f110d771f 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -27,7 +27,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -319,7 +319,21 @@ def forward( return hidden_states -class InternLM2ForCausalLM(nn.Module, SupportsPP): +class InternLM2ForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + packed_modules_mapping = { + "wqkv": ["wqkv"], + "gate_up_proj": ["w1", "w3"], + } + + # LoRA specific attributes + supported_lora_modules = [ + "wqkv", + "wo", + "gate_up_proj", + "w2", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, *, @@ -329,8 +343,12 @@ def __init__(self, super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.quant_config = quant_config + self.lora_config = lora_config + self.model = model_type(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, From fa6ecb9aa7a55a99f87fdec7a75011f87af2176c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 29 Nov 2024 12:47:06 +0800 Subject: [PATCH 194/255] [Model] Clean up MiniCPMV (#10751) Signed-off-by: DarkLight1337 --- .../vision_language/test_models.py | 19 ++- .../vision_language/vlm_utils/model_utils.py | 13 +- vllm/model_executor/layers/fused_moe/layer.py | 10 +- vllm/model_executor/models/minicpm.py | 153 +++++++++--------- vllm/model_executor/models/minicpm3.py | 5 +- vllm/model_executor/models/minicpmv.py | 136 ++++------------ vllm/model_executor/models/utils.py | 28 +--- 7 files changed, 149 insertions(+), 215 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3f6d8ef42cd5f..3457ec6b8e73b 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -295,16 +295,29 @@ ) ], ), - "minicpmv": VLMTestInfo( + "minicpmv_25": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 img_idx_to_prompt=lambda idx: "(./)\n", max_model_len=4096, max_num_seqs=2, get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], postprocess_inputs=model_utils.wrap_inputs_post_processor, - hf_output_post_proc=model_utils.minicmpv_trunc_hf_output, + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, + ), + "minicpmv_26": VLMTestInfo( + models=["openbmb/MiniCPM-V-2_6"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "(./)\n", + max_model_len=4096, + max_num_seqs=2, + get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 + postprocess_inputs=model_utils.ignore_inputs_post_processor( + "image_sizes" + ), + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 849857b4232e7..15f15dd7d8030 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -170,7 +170,7 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, ####### Post-processors for HF outputs -def minicmpv_trunc_hf_output(hf_output: RunnerOutput, +def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput: output_ids, output_str, out_logprobs = hf_output if output_str.endswith("<|eot_id|>"): @@ -197,6 +197,17 @@ def process(hf_inputs: BatchEncoding, dtype: str): return process +def ignore_inputs_post_processor( + hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]: + """Gets a handle to a post processor which ignores a given key.""" + + def process(hf_inputs: BatchEncoding, dtype: str): + del hf_inputs[hf_inp_key] + return hf_inputs + + return process + + def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): return {"model_inputs": hf_inputs} diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5570771ac917b..8c6f7c6e06515 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -242,7 +242,7 @@ def _load_per_tensor_weight_scale(self, shard_id: str, def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, shard_id: str, - loaded_weight: torch.tensor, + loaded_weight: torch.Tensor, tp_rank: int): # Load grouped weight scales for group quantization # or model weights @@ -261,7 +261,7 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, shard_dim: int, shard_id: str, - loaded_weight: torch.tensor, + loaded_weight: torch.Tensor, tp_rank: int): # for per channel weight quantization if shard_id == "w2": @@ -274,7 +274,7 @@ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, tp_rank=tp_rank) def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): # Index the loaded weight for tp sharding. # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim @@ -292,7 +292,7 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, expert_data.copy_(loaded_weight) def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + shard_id: str, loaded_weight: torch.Tensor, tp_rank: int): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim @@ -311,7 +311,7 @@ def _load_single_value(self, param: torch.nn.Parameter, param_data[expert_id] = loaded_weight def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor, - shard_dim: int, loaded_weight: torch.tensor, tp_rank: int): + shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int): if shard_id == "w2": self._load_w2(shard_id=shard_id, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index c9a573278a136..6254d26c7060d 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -52,7 +52,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -378,6 +378,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, org_num_embeddings=config.vocab_size, ) + self.num_experts = getattr(self.config, "num_experts", 0) self._init_layers(prefix, config, cache_config, quant_config) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.make_empty_intermediate_tensors = ( @@ -437,6 +438,73 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + expert_params_mapping = [ + # (param_name, weight_name, expert_id) + ("ws" if weight_name in ["w1", "w3"] else "w2s", + f"experts.{expert_id}.{weight_name}.weight", expert_id) + for expert_id in range(self.num_experts) + for weight_name in ["w1", "w2", "w3"] + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for param_name, weight_name, expert_id in expert_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + weight_name, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { @@ -480,8 +548,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.cache_config = cache_config self.quant_config = quant_config - self.num_experts = getattr(self.config, "num_experts", 0) - self._init_model(vllm_config=vllm_config, prefix=prefix) + self.model = self._init_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -506,8 +575,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors) def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): - self.model = MiniCPMModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + return MiniCPMModel(vllm_config=vllm_config, prefix=prefix) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -546,72 +614,9 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - expert_params_mapping = [ - # (param_name, weight_name, expert_id) - ("ws" if weight_name in ["w1", "w3"] else "w2s", - f"experts.{expert_id}.{weight_name}.weight", expert_id) - for expert_id in range(self.num_experts) - for weight_name in ["w1", "w2", "w3"] - ] - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - # With tie_word_embeddings, we can skip lm_head.weight - # The weight might appear unnecessarily in the files if the model is - # processed with quantization, LoRA, fine-tuning, etc. - if self.config.tie_word_embeddings and "lm_head.weight" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - for param_name, weight_name, expert_id in expert_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, - loaded_weight, - weight_name, - expert_id=expert_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index c66be2d9c2d07..e9d7eada1d16c 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -40,7 +40,7 @@ MiniCPMForCausalLM, MiniCPMModel) -from .utils import make_layers, maybe_prefix +from .utils import make_layers class MiniCPM3Attention(nn.Module): @@ -248,5 +248,4 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): } def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): - self.model = MiniCPM3Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + return MiniCPM3Model(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index aacce477e0460..1e8f9bd4cf418 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -22,7 +22,7 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re -from functools import partial +from functools import cached_property, partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -37,19 +37,15 @@ from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) -from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.llama import LlamaModel -from vllm.model_executor.models.minicpm import MiniCPMModel +from vllm.model_executor.models.llama import LlamaForCausalLM +from vllm.model_executor.models.minicpm import MiniCPMForCausalLM from vllm.model_executor.models.module_mapping import MultiModelKeys -from vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.model_executor.models.utils import LLMWrapper +from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor @@ -58,11 +54,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import is_pp_missing_parameter, maybe_prefix - -_KEYS_TO_MODIFY_MAPPING = { - "llm.lm_head": "lm_head", -} +from .utils import AutoWeightsLoader, maybe_prefix RawImageType = Union[Image.Image, torch.Tensor] @@ -297,10 +289,9 @@ def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): def get_placeholder(image_size: Tuple[int, int], num_image: int): if version == (2, 0) or version == (2, 5): - return image_processor. \ - get_slice_image_placeholder(image_size) - return image_processor. \ - get_slice_image_placeholder(image_size, num_image) + return image_processor.get_slice_image_placeholder(image_size) + return image_processor.get_slice_image_placeholder( + image_size, num_image) prompt = inputs.get("prompt") token_ids = inputs.get("prompt_token_ids") @@ -400,37 +391,32 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.vpm = self.init_vision_module(config, quant_config, prefix=maybe_prefix(prefix, "vpm")) - param_dtype = torch.get_default_dtype() - self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else self.vpm.embeddings.embed_dim) self.embed_dim = self.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim, quant_config=quant_config, prefix=maybe_prefix( prefix, "resampler")) - self.resampler.to(device="cuda", dtype=param_dtype) - # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix( - prefix, "llm.lm_head")) - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) + @cached_property + def sampler(self): + if hasattr(self.llm, "sampler"): + return self.llm.sampler + + return get_sampler() + def get_embedding( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], ) -> Tuple[torch.Tensor, torch.Tensor]: - vlm_embedding: torch.Tensor = self.llm.embed_tokens(input_ids) - if hasattr(self.config, "scale_emb"): - vlm_embedding *= self.config.scale_emb + vlm_embedding: torch.Tensor = self.llm.get_input_embeddings(input_ids) if image_inputs is None: # No image vision_hidden_states = torch.tensor([], device=input_ids.device) @@ -575,7 +561,7 @@ def forward( # for `torch.compile` integration input_ids = None - output = self.llm( + output = self.llm.model( input_ids=input_ids, positions=positions, kv_caches=kv_caches, @@ -590,9 +576,7 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits + return self.llm.compute_logits(hidden_states, sampling_metadata) def sample( self, @@ -604,52 +588,8 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in name: - name = name.replace(key_to_modify, new_key) - if "rotary_emb.inv_freq" in name: - continue - if ("rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - # Models trained using ColossalAI may include these tensors in - # the checkpoint. Skip them. - continue - use_default_weight_loading = False - if self.is_default_weight_loading(name): - use_default_weight_loading = True - else: - for param_name, weight_name, shard_id in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - use_default_weight_loading = True - if use_default_weight_loading: - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) def get_mm_mapping(self) -> MultiModelKeys: """ @@ -693,9 +633,6 @@ def get_vision_hidden_states(self, data: MiniCPMVImageInputs) -> torch.Tensor: raise NotImplementedError - def is_default_weight_loading(self, name: str) -> bool: - raise NotImplementedError - class MiniCPMV2_0(MiniCPMVBaseModel): @@ -708,8 +645,7 @@ def init_llm( vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(MiniCPMModel(vllm_config=vllm_config, prefix=prefix), - name="model") + return MiniCPMForCausalLM(vllm_config=vllm_config, prefix=prefix) def init_vision_module( self, @@ -717,11 +653,12 @@ def init_vision_module( quant_config: Optional[QuantizationConfig], prefix: str = "", ) -> nn.Module: - # TODO :refactor this vision model + # TODO: refactor this vision model try: import timm except ImportError: raise ImportError("Please install timm==0.9.10") from ImportError + with set_default_torch_dtype(torch.float16): model = timm.create_model( "vit_so400m_patch14_siglip_384.webli", @@ -731,6 +668,8 @@ def init_vision_module( dynamic_img_pad=True, ) + model = model.to(dtype=torch.get_default_dtype()) + if (isinstance(model, timm.models.VisionTransformer) and model.attn_pool is not None): model.attn_pool = torch.nn.Identity() @@ -759,7 +698,7 @@ def init_resampler(self, quant_config=quant_config, prefix=prefix) - return resampler + return resampler.to(device="cuda", dtype=torch.get_default_dtype()) def get_vision_embedding( self, @@ -790,9 +729,6 @@ def get_vision_hidden_states(self, return self.get_vision_embedding(pixel_values) - def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name or "vpm" in name - class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): packed_modules_mapping = { @@ -843,8 +779,7 @@ def init_llm( vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(LlamaModel(vllm_config=vllm_config, prefix=prefix), - name="model") + return LlamaForCausalLM(vllm_config=vllm_config, prefix=prefix) def init_vision_module( self, @@ -871,7 +806,8 @@ def init_resampler(self, kv_dim=vision_dim, quant_config=quant_config, prefix=prefix) - return resampler + + return resampler.to(device="cuda", dtype=torch.get_default_dtype()) def get_vision_embedding( self, @@ -913,9 +849,6 @@ def get_vision_hidden_states(self, return self.get_vision_embedding(all_pixel_values.type(dtype), patch_attn_mask, tgt_sizes) - def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name - class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): packed_modules_mapping = { @@ -966,8 +899,7 @@ def init_llm( vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(Qwen2Model(vllm_config=vllm_config, prefix=prefix), - name="model") + return Qwen2ForCausalLM(vllm_config=vllm_config, prefix=prefix) def init_vision_module( self, @@ -995,7 +927,8 @@ def init_resampler(self, kv_dim=vision_dim, quant_config=quant_config, prefix=prefix) - return resampler + + return resampler.to(device="cuda", dtype=torch.get_default_dtype()) def get_vision_embedding( self, @@ -1043,9 +976,6 @@ def get_vision_hidden_states(self, return self.resampler(vision_embedding, tgt_sizes) - def is_default_weight_loading(self, name: str) -> bool: - return "resampler" in name - _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 4c13cbc953273..a6b40a233439b 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Protocol, Set, Tuple, Union, overload) +from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, Union, overload) import torch import torch.nn as nn @@ -560,30 +560,6 @@ def make_empty_intermediate_tensors( return make_empty_intermediate_tensors -class LLMWrapper(nn.Module): - """ - To align with the key names of LoRA trained with PEFT, we need to add an - additional layer to the llm's implementation. - """ - - def __init__(self, llm: nn.Module, name: str) -> None: - super().__init__() - self.model_name = name - setattr(self, name, llm) - - def __getattr__(self, key: str): - llm = super().__getattr__(self.model_name) - if key == self.model_name: - return llm - - return getattr(llm, key) - - # We need to explicitly override this - def __call__(self, *args: Any, **kwargs: Any) -> Any: - llm = super().__getattr__(self.model_name) - return llm(*args, **kwargs) - - def get_vit_attn_backend(support_fa: bool = False) -> _Backend: """ Get the available attention backend for Vision Transformer. From c82b432d4a40fd6376a35fd38cb5fc37e9c53798 Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Fri, 29 Nov 2024 13:17:57 +0800 Subject: [PATCH 195/255] [Misc] typo find in sampling_metadata.py (#10740) --- vllm/model_executor/sampling_metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 84f35f75a0c32..1df8f84ed4093 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -454,6 +454,7 @@ def from_sampling_metadata( if do_penalties: for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids + sampling_params = seq_group.sampling_params if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) From 3132aac04326286ae996bf0887e920096b2bb210 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 29 Nov 2024 21:56:46 +0800 Subject: [PATCH 196/255] [Bugfix] Fix Idefics3 bug (#10778) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/idefics3.py | 92 +++++++++++++------------- 1 file changed, 47 insertions(+), 45 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 014e27bc869d4..e5d2edbd81eb1 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -267,54 +267,56 @@ def input_processor_for_idefics3(ctx: InputContext, n_images_in_text = [] text = inputs.get("prompt") - if text is not None: - if isinstance(text, str): - text = [text] - elif not isinstance(text, list) and not isinstance(text[0], str): - raise ValueError("Invalid input text. Please provide a string, " - "or a list of strings") - - fake_image_token = processor.fake_image_token.content - image_token = processor.image_token.content - global_img_token = processor.global_image_tag - - prompt_strings = [] - for sample, sample_rows, sample_cols in zip(text, image_rows, - image_cols): - n_images_in_text.append(sample.count(image_token)) - - # Replace the image token with fake tokens around the expanded - # image token sequence of length `image_seq_len` - image_prompt_strings = [] - for n_rows, n_cols in zip(sample_rows, sample_cols): - image_prompt_string = _get_image_prompt_string( - n_rows, - n_cols, - processor.image_seq_len, - image_token=image_token, - fake_token_around_image=fake_image_token, - global_img_token=global_img_token, - ) - image_prompt_strings.append(image_prompt_string) - - split_sample = sample.split(image_token) - if len(split_sample) == 0: - raise ValueError( - "The image token should be present in the text.") + if text is None: + prompt_token_ids = inputs.get("prompt_token_ids", []) + assert prompt_token_ids + text = tokenizer.decode(prompt_token_ids) + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, " + "or a list of strings") + + fake_image_token = processor.fake_image_token.content + image_token = processor.image_token.content + global_img_token = processor.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols): + n_images_in_text.append(sample.count(image_token)) + + # Replace the image token with fake tokens around the expanded + # image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = _get_image_prompt_string( + n_rows, + n_cols, + processor.image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) - # Place in the image prompt strings where the image tokens are - sample = split_sample[0] - for i, image_prompt_string in enumerate(image_prompt_strings): - sample += image_prompt_string + split_sample[i + 1] - prompt_strings.append(sample) + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError("The image token should be present in the text.") - prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) - return token_inputs( - prompt_token_ids=prompt_token_ids, - prompt=prompt_strings[0], - multi_modal_data=multi_modal_data, - ) + prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + + return token_inputs( + prompt_token_ids=prompt_token_ids, + prompt=prompt_strings[0], + multi_modal_data=multi_modal_data, + ) def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int: From 661175bc826f4caba04182a1faeeca9e7a3259ac Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Fri, 29 Nov 2024 23:22:21 +0800 Subject: [PATCH 197/255] [platform] Add verify_quantization in platform. (#10757) Signed-off-by: wangxiyuan --- vllm/config.py | 28 +--------------------------- vllm/platforms/cpu.py | 1 + vllm/platforms/cuda.py | 1 + vllm/platforms/hpu.py | 1 + vllm/platforms/interface.py | 13 +++++++++++++ vllm/platforms/neuron.py | 2 ++ vllm/platforms/openvino.py | 1 + vllm/platforms/rocm.py | 15 +++++++++++++++ vllm/platforms/tpu.py | 2 ++ vllm/platforms/xpu.py | 1 + 10 files changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index cd24e9ffdf598..b1e5b412fec8f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -393,17 +393,11 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = QUANTIZATION_METHODS - rocm_supported_quantization = [ - "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", - "fbgemm_fp8", "gguf" - ] optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", "compressed-tensors", "experts_int8" ] - tpu_supported_quantization = ["tpu_int8"] - neuron_supported_quantization = ["neuron_quant"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -438,32 +432,12 @@ def _verify_quantization(self) -> None: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}.") - if current_platform.is_rocm( - ) and self.quantization not in rocm_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in ROCm.") - if current_platform.is_tpu( - ) and self.quantization not in tpu_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in TPU Backend.") + current_platform.verify_quantization(self.quantization) if self.quantization not in optimized_quantization_methods: logger.warning( "%s quantization is not fully " "optimized yet. The speed can be slower than " "non-quantized models.", self.quantization) - if (self.quantization == "awq" and current_platform.is_rocm() - and not envs.VLLM_USE_TRITON_AWQ): - logger.warning( - "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" - " is not set, enabling VLLM_USE_TRITON_AWQ.") - envs.VLLM_USE_TRITON_AWQ = True - if current_platform.is_neuron( - ) and self.quantization not in neuron_supported_quantization: - raise ValueError( - f"{self.quantization} quantization is currently not " - f"supported in Neuron Backend.") def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 3e22c87f61fac..b5333fbd6f502 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -19,6 +19,7 @@ class CpuPlatform(Platform): _enum = PlatformEnum.CPU + device_name: str = "cpu" device_type: str = "cpu" dispatch_key: str = "CPU" diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 5e9ce551f2332..846a1869da228 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -72,6 +72,7 @@ def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _R: class CudaPlatformBase(Platform): _enum = PlatformEnum.CUDA + device_name: str = "cuda" device_type: str = "cuda" dispatch_key: str = "CUDA" diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 3071136e43b85..10aaa6d54962c 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -12,6 +12,7 @@ class HpuPlatform(Platform): _enum = PlatformEnum.HPU + device_name: str = "hpu" device_type: str = "hpu" dispatch_key: str = "HPU" diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 3328665029039..eac2b413f9271 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -56,11 +56,13 @@ def to_int(self) -> int: class Platform: _enum: PlatformEnum + device_name: str device_type: str # available dispatch keys: # check https://github.com/pytorch/pytorch/blob/313dac6c1ca0fa0cde32477509cce32089f8532a/torchgen/model.py#L134 # noqa # use "CPU" as a fallback for platforms not registered in PyTorch dispatch_key: str = "CPU" + supported_quantization: list[str] = [] def is_cuda(self) -> bool: return self._enum == PlatformEnum.CUDA @@ -171,6 +173,17 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ pass + @classmethod + def verify_quantization(cls, quant: str) -> None: + """ + Verify whether the quantization is supported by the current platform. + """ + if cls.supported_quantization and \ + quant not in cls.supported_quantization: + raise ValueError( + f"{quant} quantization is currently not supported in " + f"{cls.device_name}.") + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 4c4d778ed3dd4..87655ea198303 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -10,7 +10,9 @@ class NeuronPlatform(Platform): _enum = PlatformEnum.NEURON + device_name: str = "neuron" device_type: str = "neuron" + supported_quantization: list[str] = ["neuron_quant"] @classmethod def get_device_name(cls, device_id: int = 0) -> str: diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index ea5ec7b40b95c..29b61e955d9ab 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -23,6 +23,7 @@ class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO + device_name: str = "openvino" device_type: str = "openvino" dispatch_key: str = "CPU" diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d2f44c3e423e3..3c14fbc179f69 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -4,6 +4,7 @@ import torch +import vllm.envs as envs from vllm.logger import init_logger from .interface import DeviceCapability, Platform, PlatformEnum, _Backend @@ -35,8 +36,13 @@ class RocmPlatform(Platform): _enum = PlatformEnum.ROCM + device_name: str = "rocm" device_type: str = "cuda" dispatch_key: str = "CUDA" + supported_quantization: list[str] = [ + "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", + "fbgemm_fp8", "gguf" + ] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: @@ -79,3 +85,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: "vllm.spec_decode.spec_decode_worker.create_spec_worker" else: parallel_config.worker_cls = "vllm.worker.worker.Worker" + + @classmethod + def verify_quantization(cls, quant: str) -> None: + super().verify_quantization(quant) + if quant == "awq" and not envs.VLLM_USE_TRITON_AWQ: + logger.warning( + "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" + " is not set, enabling VLLM_USE_TRITON_AWQ.") + envs.VLLM_USE_TRITON_AWQ = True diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 137af57023ea9..b138f7e1c54c5 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -16,8 +16,10 @@ class TpuPlatform(Platform): _enum = PlatformEnum.TPU + device_name: str = "tpu" device_type: str = "tpu" dispatch_key: str = "XLA" + supported_quantization: list[str] = ["tpu_int8"] @classmethod def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 69388a8e0f27c..9665786f4c499 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -16,6 +16,7 @@ class XPUPlatform(Platform): _enum = PlatformEnum.XPU + device_name: str = "xpu" device_type: str = "xpu" dispatch_key: str = "XPU" From 40bc242579d260e6da7614e1494cbd80a6f985b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Sat, 30 Nov 2024 05:07:13 +0100 Subject: [PATCH 198/255] [Bugfix] Fix OpenVino/Neuron `driver_worker` init (#10779) Signed-off-by: NickLucche Signed-off-by: Cyrus Leung Co-authored-by: Cyrus Leung --- vllm/executor/neuron_executor.py | 6 ++++-- vllm/executor/openvino_executor.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index 31e6fdc3ab1bb..a9efc4f9a801c 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -29,11 +29,13 @@ def _init_worker(self): wrapper = WorkerWrapperBase(vllm_config=self.vllm_config) distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.driver_worker = wrapper.init_worker( + wrapper.init_worker( vllm_config=self.vllm_config, local_rank=0, rank=0, - distributed_init_method=distributed_init_method) + distributed_init_method=distributed_init_method, + ) + self.driver_worker = wrapper.worker self.driver_worker.init_device() self.driver_worker.load_model() diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index db0070ce510ee..057a32364e512 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -36,7 +36,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) - self.driver_worker = wrapper.init_worker( + wrapper.init_worker( ov_core=ov.Core(), vllm_config=self.vllm_config, local_rank=0, @@ -45,6 +45,7 @@ def _init_worker(self): kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) + self.driver_worker = wrapper.worker self.driver_worker.init_device() self.driver_worker.load_model() From 16ee07f22ade57eb882b3c16ad3a6944635996df Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 30 Nov 2024 12:19:14 +0800 Subject: [PATCH 199/255] [Model] Refactor Molmo weights loading to use AutoWeightsLoader (#10771) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/molmo.py | 213 +++++++++++++++------------- 1 file changed, 111 insertions(+), 102 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index acedddd84d7cb..98caa6857e211 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -3,7 +3,7 @@ from array import array from dataclasses import dataclass from functools import lru_cache, partial -from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict +from typing import Iterable, List, Mapping, Optional, Set, Tuple, TypedDict import torch from einops import rearrange @@ -44,7 +44,8 @@ from vllm.transformers_utils.processor import get_processor from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (get_vit_attn_backend, +from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -720,6 +721,42 @@ def forward( # image_features: (batch_size, num_image, num_patch, d_model) return image_features + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + @support_torch_compile class MolmoModel(nn.Module): @@ -804,6 +841,28 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + if "gate_up_proj" in name: + up_proj, gate_proj = loaded_weight.chunk(2, dim=0) + loaded_weight = torch.cat([gate_proj, up_proj], dim=0) + + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + cached_get_processor = lru_cache(get_processor) @@ -1200,103 +1259,53 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - - params_mapping = [ - ("model.transformer.ln_f.weight", "model.norm.weight"), - ("attn_out", "self_attn.o_proj"), - ("att_proj", "self_attn.qkv_proj"), - ("q_norm", "self_attn.q_norm"), - ("k_norm", "self_attn.k_norm"), - ("attn_norm", "input_layernorm"), - ("ff_norm", "post_attention_layernorm"), - ] - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - - embedding_weight = dict() - projector_weight = dict() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if self.config.tie_word_embeddings and "lm_head.weight" in name: - continue - - if "wte.embedding" in name: - embedding_weight["embedding"] = loaded_weight - continue - - if "wte.new_embedding" in name: - embedding_weight["new_embedding"] = loaded_weight - continue - - if "vision_backbone" in name: - if name.startswith("model"): - name = name[len("model."):] - if 'image_projector' in name: - if 'w1' in name: - projector_weight['gate_proj'] = loaded_weight - elif 'w3' in name: - projector_weight['up_proj'] = loaded_weight - elif 'w2' in name: - projector_weight['down_proj'] = loaded_weight - else: - raise ValueError( - f"Unexpected projector weight: {name}") - continue - else: - if "transformer.blocks" in name: - name = name.replace("transformer.blocks", "layers") - - if "ff_proj" in name: - name = name.replace("ff_proj", "mlp.gate_up_proj") - assert 'weight' in name - up_weight, gate_weight = loaded_weight.chunk(2, dim=0) - loaded_weight = torch.cat([gate_weight, up_weight], dim=0) - - elif "ff_out" in name: - if "layers" in name: - name = name.replace("ff_out", "mlp.down_proj") - else: - # lm head - name = name.replace("model.transformer.ff_out", - "lm_head") - - else: - for (param_name, weight_name) in params_mapping: - if param_name in name: - name = name.replace(param_name, weight_name) - break - - try: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - except KeyError: - raise ValueError(f"Unexpected weight: {name}") from None - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - - gate_up_proj_weight = torch.cat( - [projector_weight["gate_proj"], projector_weight["up_proj"]], - dim=0) - name = "vision_backbone.image_projector.gate_up_proj.weight" - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, gate_up_proj_weight) - - down_proj_weight = projector_weight["down_proj"] - name = "vision_backbone.image_projector.down_proj.weight" - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, down_proj_weight) - - embedding_weight = torch.cat( - [embedding_weight["embedding"], embedding_weight["new_embedding"]], - dim=0) - name = "model.embed_tokens.weight" - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - weight_loader(param, embedding_weight) + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={ + # vision backbone mapping + "image_projector.w1.": "image_projector.gate_proj.", + "image_projector.w3.": "image_projector.up_proj.", + "image_projector.w2.": "image_projector.down_proj.", + # language backbone mapping + "att_proj": "self_attn.qkv_proj", + "attn_out": "self_attn.o_proj", + "q_norm": "self_attn.q_norm", + "k_norm": "self_attn.k_norm", + "ff_proj": "mlp.gate_up_proj", + "ff_out": "mlp.down_proj", + "attn_norm": "input_layernorm", + "ff_norm": "post_attention_layernorm", + }, + orig_to_new_prefix={ + # vision backbone mapping + "model.vision_backbone.": "vision_backbone.", + # language backbone mapping + "model.transformer.blocks.": "model.layers.", + "model.transformer.ln_f.": "model.norm.", + # lm_head is renamed to model.transformer.mlp.down_proj firstly, + # we need to run a second renaming for it + "model.transformer.mlp.down_proj.": "lm_head.", + }, + ) + loader = AutoWeightsLoader(self) + weights = _get_weights_with_merged_embedding(weights) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) + + +def _get_weights_with_merged_embedding( + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: + embedding_weights = {} + for name, weight in weights: + if "wte.embedding" in name: + embedding_weights["embedding"] = weight + elif "wte.new_embedding" in name: + embedding_weights["new_embedding"] = weight + else: + yield (name, weight) + # this is compatible with most of quantization, + # because they won't quantize embed_tokens + embedding_weights = torch.cat( + [embedding_weights["embedding"], embedding_weights["new_embedding"]], + dim=0, + ) + yield ("model.embed_tokens.weight", embedding_weights) From e7cfc4ef4cc017e0a0229adff9f4b143b38fb421 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 30 Nov 2024 08:45:50 +0100 Subject: [PATCH 200/255] [Interleaved ATTN] Support for Mistral-8B (#10591) Signed-off-by: youkaichao Co-authored-by: youkaichao --- vllm/model_executor/models/llama.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fe94bb352961b..ff0ab011a9158 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -54,7 +54,7 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - is_pp_missing_parameter, + extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -114,6 +114,7 @@ def __init__( prefix: str = "", ) -> None: super().__init__() + layer_idx = extract_layer_index(prefix) self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() self.total_num_heads = num_heads @@ -168,6 +169,18 @@ def __init__( rope_scaling=rope_scaling, is_neox_style=is_neox_style, ) + + if hasattr(config, "interleaved_sliding_window"): + if isinstance(config.interleaved_sliding_window, int): + sliding_window = config.interleaved_sliding_window + elif isinstance(config.interleaved_sliding_window, list): + sw_idx = layer_idx % len(config.interleaved_sliding_window) + sliding_window = config.interleaved_sliding_window[sw_idx] + else: + raise ValueError(f"{type(sliding_window)} is not supported.") + else: + sliding_window = None + self.attn = Attention( self.num_heads, self.head_dim, @@ -175,6 +188,7 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, + per_layer_sliding_window=sliding_window, prefix=f"{prefix}.attn", ) From 7e4bbda5735eaca3ce01860b8168feed32e339f4 Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Sat, 30 Nov 2024 19:38:40 +0800 Subject: [PATCH 201/255] [doc] format fix (#10789) Signed-off-by: wangxiyuan --- .../automatic_prefix_caching/details.md | 2 +- .../getting_started/gaudi-installation.rst | 36 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/docs/source/automatic_prefix_caching/details.md b/docs/source/automatic_prefix_caching/details.md index 2d3214e28ed93..17f806217aa65 100644 --- a/docs/source/automatic_prefix_caching/details.md +++ b/docs/source/automatic_prefix_caching/details.md @@ -25,7 +25,7 @@ With this mapping, we can add another indirection in vLLM’s KV cache managemen This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. -# Generalized Caching Policy +## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 68c1a56660fa4..249e08278ff8f 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -4,7 +4,7 @@ Installation with Intel® Gaudi® AI Accelerators This README provides instructions on running vLLM with Intel Gaudi devices. Requirements and Installation -============================= +----------------------------- Please follow the instructions provided in the `Gaudi Installation Guide `__ @@ -13,7 +13,7 @@ please follow the methods outlined in the `Optimizing Training Platform Guide `__. Requirements ------------- +~~~~~~~~~~~~ - OS: Ubuntu 22.04 LTS - Python: 3.10 @@ -22,7 +22,7 @@ Requirements Quick start using Dockerfile ----------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. code:: console $ docker build -f Dockerfile.hpu -t vllm-hpu-env . @@ -34,10 +34,10 @@ Quick start using Dockerfile Build from source ------------------ +~~~~~~~~~~~~~~~~~ Environment verification -~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^ To verify that the Intel Gaudi software was correctly installed, run: @@ -53,7 +53,7 @@ Verification `__ @@ -107,7 +107,7 @@ Supported Features - Attention with Linear Biases (ALiBi) Unsupported Features -==================== +-------------------- - Beam search - LoRA adapters @@ -115,7 +115,7 @@ Unsupported Features - Prefill chunking (mixed-batch inferencing) Supported Configurations -======================== +------------------------ The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. @@ -152,10 +152,10 @@ Gaudi2 devices. Configurations that are not listed may or may not work. with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling Performance Tuning -================== +------------------ Execution modes ---------------- +~~~~~~~~~~~~~~~ Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. @@ -184,7 +184,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected Bucketing mechanism -------------------- +~~~~~~~~~~~~~~~~~~~ Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. @@ -233,7 +233,7 @@ As an example, if a request of 3 sequences, with max sequence length of 412 come Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. Warmup ------- +~~~~~~ Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: @@ -257,7 +257,7 @@ This example uses the same buckets as in *Bucketing mechanism* section. Each out Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. HPU Graph capture ------------------ +~~~~~~~~~~~~~~~~~ `HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. @@ -321,7 +321,7 @@ Each described step is logged by vLLM server, as follows (negative values corres Recommended vLLM Parameters ---------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~ - We recommend running inference on Gaudi 2 with ``block_size`` of 128 for BF16 data type. Using default values (16, 32) might lead to @@ -333,7 +333,7 @@ Recommended vLLM Parameters If you encounter out-of-memory issues, see troubleshooting section. Environment variables ---------------------- +~~~~~~~~~~~~~~~~~~~~~ **Diagnostic and profiling knobs:** @@ -380,7 +380,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs Troubleshooting: Tweaking HPU Graphs -==================================== +------------------------------------ If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following From 133707123e730a3544875d432a9435bdfe5e34cf Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Dec 2024 08:02:54 +0800 Subject: [PATCH 202/255] [Model] Replace embedding models with pooling adapter (#10769) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 4 +- docs/source/models/supported_models.rst | 15 ++- tests/conftest.py | 1 - .../embedding/language/test_embedding.py | 5 + tests/models/test_registry.py | 31 +++--- .../my_gemma_embedding.py | 45 +++++++- tests/test_config.py | 3 +- vllm/config.py | 25 +++++ vllm/inputs/registry.py | 16 +-- vllm/model_executor/layers/pooler.py | 4 +- vllm/model_executor/model_loader/loader.py | 18 +++- vllm/model_executor/model_loader/utils.py | 18 +++- vllm/model_executor/models/adapters.py | 98 +++++++++++++++++ vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/gemma2.py | 58 +--------- vllm/model_executor/models/internvl.py | 5 +- vllm/model_executor/models/llama.py | 102 ++---------------- vllm/model_executor/models/llava.py | 5 +- vllm/model_executor/models/llava_next.py | 26 +---- .../model_executor/models/llava_next_video.py | 5 +- vllm/model_executor/models/llava_onevision.py | 5 +- vllm/model_executor/models/paligemma.py | 5 +- vllm/model_executor/models/phi3v.py | 39 +++---- vllm/model_executor/models/pixtral.py | 5 +- vllm/model_executor/models/qwen2.py | 28 +++-- vllm/model_executor/models/qwen2_vl.py | 18 +--- vllm/model_executor/models/registry.py | 59 ++++++---- vllm/model_executor/models/ultravox.py | 5 +- vllm/model_executor/models/utils.py | 24 ++++- vllm/multimodal/base.py | 6 +- vllm/multimodal/registry.py | 5 +- vllm/utils.py | 22 +++- 32 files changed, 387 insertions(+), 323 deletions(-) create mode 100644 vllm/model_executor/models/adapters.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fc23c9cff0d87..46692506f01d4 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -334,7 +334,6 @@ steps: commands: - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model - - pytest -v -s models/embedding/vision_language -m core_model - label: Language Models Test (Extended) # 50min optional: true @@ -346,7 +345,6 @@ steps: commands: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' - - pytest -v -s models/embedding/vision_language -m 'not core_model' - label: Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] @@ -359,6 +357,7 @@ steps: commands: - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/embedding/vision_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model @@ -376,6 +375,7 @@ steps: # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' - pytest -v -s models/encoder_decoder/language -m 'not core_model' - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 7b7a83f20871b..f571b8bf6735e 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -357,7 +357,7 @@ Text Embedding - ✅︎ * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - ✅︎ - ✅︎ * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` @@ -378,6 +378,10 @@ Text Embedding .. tip:: You can override the model's pooling method by passing :code:`--override-pooler-config`. +.. note:: + :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. + You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. + .. note:: Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. @@ -397,12 +401,21 @@ Reward Modeling - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlamaForCausalLM` + - Llama-based + - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. + - ✅︎ + - ✅︎ * - :code:`Qwen2ForRewardModel` - Qwen2-based - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - ✅︎ - ✅︎ +.. important:: + For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, + e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. + .. note:: As an interim measure, these models are supported in both offline and online inference via Embeddings API. diff --git a/tests/conftest.py b/tests/conftest.py index d56942d8912af..36f1d477fab59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -263,7 +263,6 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, - is_embedding_model: bool = False, is_sentence_transformer: bool = False, is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 36b1e5887981c..5ef8540265d14 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,6 +4,8 @@ """ import pytest +from vllm.config import PoolerConfig + from ..utils import check_embeddings_close @@ -33,6 +35,9 @@ def test_models( dtype: str, ) -> None: vllm_extra_kwargs = {} + if model == "ssmits/Qwen2-7B-Instruct-embed-base": + vllm_extra_kwargs["override_pooler_config"] = \ + PoolerConfig(pooling_type="MEAN") if model == "Alibaba-NLP/gte-Qwen2-7B-instruct": vllm_extra_kwargs["hf_overrides"] = {"is_causal": False} diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 289ea66b5ebc5..1886b1f9898ad 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -6,11 +6,8 @@ from vllm.model_executor.models import (is_embedding_model, is_text_generation_model, supports_multimodal) -# yapf conflicts with isort for this block -# yapf: disable -from vllm.model_executor.models.registry import (_CROSS_ENCODER_MODELS, - _EMBEDDING_MODELS, - _MULTIMODAL_MODELS, +from vllm.model_executor.models.adapters import as_embedding_model +from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS, _SPECULATIVE_DECODING_MODELS, _TEXT_GENERATION_MODELS, ModelRegistry) @@ -26,18 +23,18 @@ def test_registry_imports(model_arch): model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) if model_arch in _SPECULATIVE_DECODING_MODELS: - pass # Ignore these models which do not have a unified format - else: - assert is_text_generation_model(model_cls) is ( - model_arch in _TEXT_GENERATION_MODELS - or model_arch in _MULTIMODAL_MODELS) - - embedding_models = {**_EMBEDDING_MODELS, **_CROSS_ENCODER_MODELS} - assert is_embedding_model(model_cls) is (model_arch - in embedding_models) - - assert supports_multimodal(model_cls) is (model_arch - in _MULTIMODAL_MODELS) + return # Ignore these models which do not have a unified format + + if (model_arch in _TEXT_GENERATION_MODELS + or model_arch in _MULTIMODAL_MODELS): + assert is_text_generation_model(model_cls) + + # All vLLM models should be convertible to an embedding model + embed_model = as_embedding_model(model_cls) + assert is_embedding_model(embed_model) + + if model_arch in _MULTIMODAL_MODELS: + assert supports_multimodal(model_cls) @fork_new_process_for_each_test diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index 21958b1640204..d676eacffb056 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -1,13 +1,34 @@ -from typing import List, Optional, Union +from typing import Iterable, List, Optional, Tuple, Union import torch +import torch.nn as nn from vllm.attention import AttentionMetadata -from vllm.model_executor.models.gemma2 import Gemma2EmbeddingModel -from vllm.sequence import IntermediateTensors +from vllm.config import VllmConfig +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.models.gemma2 import Gemma2Model +from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.sequence import IntermediateTensors, PoolerOutput -class MyGemma2Embedding(Gemma2EmbeddingModel): +class MyGemma2Embedding(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self._pooler = Pooler.from_config_with_defaults( + vllm_config.model_config.pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False, + ) + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) def forward( self, @@ -18,7 +39,7 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = super().forward( + hidden_states = self.model( input_ids, positions, kv_caches, @@ -32,3 +53,17 @@ def forward( # Return all-zero embeddings return torch.zeros_like(hidden_states) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = hf_to_vllm_mapper.apply(weights) + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) + return self.model.load_weights(weights) diff --git a/tests/test_config.py b/tests/test_config.py index 3cf90297ce177..45b0b938af215 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -26,8 +26,7 @@ def test_auto_task(model_id, expected_task): @pytest.mark.parametrize(("model_id", "bad_task"), [ - ("facebook/opt-125m", "embedding"), - ("intfloat/e5-mistral-7b-instruct", "generate"), + ("Qwen/Qwen2.5-Math-RM-72B", "generate"), ]) def test_incorrect_task(model_id, bad_task): with pytest.raises(ValueError, match=r"does not support the .* task"): diff --git a/vllm/config.py b/vllm/config.py index b1e5b412fec8f..51b8cf24803ab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -370,6 +370,31 @@ def _resolve_task( selected_task = next(iter(supported_tasks_lst)) if len(supported_tasks) > 1: + suffix_to_preferred_task: List[Tuple[str, _Task]] = [ + # Hardcode the models that are exceptions + ("AquilaModel", "generate"), + ("ChatGLMModel", "generate"), + # Other models follow this pattern + ("ForCausalLM", "generate"), + ("ForConditionalGeneration", "generate"), + ("ChatModel", "generate"), + ("LMHeadModel", "generate"), + ("EmbeddingModel", "embedding"), + ("RewardModel", "embedding"), + ("ForSequenceClassification", "embedding"), + ] + info, arch = ModelRegistry.inspect_model_cls(architectures) + + for suffix, pref_task in suffix_to_preferred_task: + if arch.endswith(suffix) and pref_task in supported_tasks: + selected_task = pref_task + break + else: + if (arch.endswith("Model") + and info.architecture.endswith("ForCausalLM") + and "embedding" in supported_tasks): + selected_task = "embedding" + logger.info( "This model supports multiple tasks: %s. " "Defaulting to '%s'.", supported_tasks, selected_task) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 68b4756331e6d..85ab4355cc2e4 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -11,8 +11,8 @@ from vllm.logger import init_logger from vllm.transformers_utils.processor import cached_get_processor from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, - resolve_mm_processor_kwargs) +from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, + print_warning_once, resolve_mm_processor_kwargs) from .data import ProcessorInputs, SingletonInputs from .parse import is_encoder_decoder_inputs @@ -136,12 +136,12 @@ class InputRegistry: """ def __init__(self) -> None: - self._dummy_factories_by_model_type: Dict[Type[nn.Module], - DummyDataFactory] = {} - self._dummy_encoder_factories_by_model_type: Dict[ - Type[nn.Module], DummyDataFactory] = {} - self._input_processors_by_model_type: Dict[Type[nn.Module], - InputProcessor] = {} + self._dummy_factories_by_model_type = \ + ClassRegistry[nn.Module, DummyDataFactory]() + self._dummy_encoder_factories_by_model_type = \ + ClassRegistry[nn.Module, DummyDataFactory]() + self._input_processors_by_model_type = \ + ClassRegistry[nn.Module, InputProcessor]() def _default_dummy_data_factory( self, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index f9437b4112ceb..e0d42e30ebef3 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -60,9 +60,7 @@ def from_config_with_defaults( softmax: bool, step_tag_id: Optional[int] = None, returned_token_ids: Optional[List[int]] = None, - ) -> Optional["Pooler"]: - if pooler_config is None: - return None + ) -> "Pooler": return cls( pooling_type=PoolingType[pooler_config.pooling_type] if pooler_config.pooling_type is not None else pooling_type, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 37c2d789030b6..0e12bc5691538 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -9,6 +9,7 @@ import json import math import os +import warnings from abc import ABC, abstractmethod from contextlib import contextmanager from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast @@ -97,22 +98,31 @@ def device_loading_context(module: torch.nn.Module, logger = init_logger(__name__) -def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: +def _initialize_model( + vllm_config: VllmConfig, + *, + prefix: str = "", + architectures: Optional[list[str]] = None, +) -> nn.Module: """Initialize a model with the given configurations.""" model_config = vllm_config.model_config - model_class, _ = get_model_architecture(model_config) + model_class, _ = get_model_architecture(model_config, + architectures=architectures) + signatures = inspect.signature(model_class.__init__) all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class with set_current_vllm_config(vllm_config): return model_class(vllm_config=vllm_config, prefix=prefix) + msg = ("vLLM model class should accept `vllm_config` and `prefix` as " "input arguments. Possibly you have an old-style model class" " registered from out of tree and it is used for new vLLM version. " "Check https://docs.vllm.ai/en/latest/design/arch_overview.html " "for the design and update the model class accordingly.") - logger.warning(msg) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + logger.warning( "Trying to guess the arguments for old-style model class %s", model_class, @@ -356,7 +366,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: weights_to_load = {name for name, _ in model.named_parameters()} loaded_weights = model.load_weights( self._get_all_weights(model_config, model)) - # We only enable strict check for non-quantiized models + # We only enable strict check for non-quantized models # that have loaded weights tracking currently. if model_config.quantization is None and loaded_weights is not None: weights_not_loaded = weights_to_load - loaded_weights diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index b95c0b7cd0612..864dd04e79921 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,12 +1,13 @@ """Utilities for selecting and loading models.""" import contextlib -from typing import Tuple, Type +from typing import Optional, Tuple, Type import torch from torch import nn from vllm.config import ModelConfig from vllm.model_executor.models import ModelRegistry +from vllm.model_executor.models.adapters import as_embedding_model @contextlib.contextmanager @@ -19,8 +20,13 @@ def set_default_torch_dtype(dtype: torch.dtype): def get_model_architecture( - model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: - architectures = getattr(model_config.hf_config, "architectures", []) + model_config: ModelConfig, + *, + architectures: Optional[list[str]] = None, +) -> Tuple[Type[nn.Module], str]: + if architectures is None: + architectures = getattr(model_config.hf_config, "architectures", []) + # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. mixtral_supported = [ @@ -32,7 +38,11 @@ def get_model_architecture( and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] - return ModelRegistry.resolve_model_cls(architectures) + model_cls, arch = ModelRegistry.resolve_model_cls(architectures) + if model_config.task == "embedding": + model_cls = as_embedding_model(model_cls) + + return model_cls, arch def get_architecture_class_name(model_config: ModelConfig) -> str: diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py new file mode 100644 index 0000000000000..360433a07c5b8 --- /dev/null +++ b/vllm/model_executor/models/adapters.py @@ -0,0 +1,98 @@ +from collections.abc import Iterable +from typing import Any, TypeVar + +import torch +import torch.nn as nn + +from .interfaces_base import VllmModelForEmbedding, is_embedding_model + +_T = TypeVar("_T", bound=type[nn.Module]) + + +def as_embedding_model(cls: _T) -> _T: + """Subclass an existing vLLM model to support embeddings.""" + # Avoid modifying existing embedding models + if is_embedding_model(cls): + return cls + + # Lazy import + from vllm.config import VllmConfig + from vllm.model_executor.layers.pooler import (Pooler, PoolerOutput, + PoolingType) + from vllm.model_executor.pooling_metadata import PoolingMetadata + + from .utils import AutoWeightsLoader, WeightsMapper + + class ModelForEmbedding(cls, VllmModelForEmbedding): + + def __init__( + self, + *, + vllm_config: "VllmConfig", + prefix: str = "", + **kwargs: Any, + ) -> None: + super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) + + # These are not used in embedding models + for attr in ("lm_head", "logits_processor"): + if hasattr(self, attr): + delattr(self, attr) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + # If the model already defines a pooler instance, don't overwrite it + if not getattr(self, "_pooler", None): + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False, + ) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + # TODO: Support uninitialized params tracking + + # We have deleted this attribute, so don't load it + weights = ((name, data) for name, data in weights + if not name.startswith("lm_head.")) + + # If `*ForCausalLM` defines `load_weights` on the inner model + # and there are no other inner modules with parameters, + # we support loading from both `*Model` and `*ForCausalLM` + if hasattr(self, "model") and hasattr(self.model, "load_weights"): + # Whether only `self.model` contains parameters + model_is_only_param = all( + name == "model" or next(child.parameters(), None) is None + for name, child in self.named_children()) + + if model_is_only_param: + mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) + weights = mapper.apply(weights) + + self.model.load_weights(weights) + return + + # For most other models + if hasattr(cls, "load_weights"): + cls.load_weights(self, weights) # type: ignore + # Fallback + else: + loader = AutoWeightsLoader(self) + loader.load_weights(weights) + + ModelForEmbedding.__name__ = cls.__name__ \ + .removesuffix("ForCausalLM") \ + .removesuffix("ForConditionalGeneration") \ + .removesuffix("ChatModel") \ + .removesuffix("LMHeadModel") + "ForEmbedding" + + return ModelForEmbedding # type: ignore diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index d2592016aff34..76b8505ee1c2a 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -512,9 +512,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index d35fcb012e166..4664aa53ea092 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -30,19 +30,17 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, extract_layer_index, +from .utils import (AutoWeightsLoader, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -455,55 +453,3 @@ def load_weights(self, weights: Iterable[Tuple[str, if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) - - -class Gemma2EmbeddingModel(nn.Module, SupportsPP): - """ - A model that uses Gemma2 with additional embedding functionalities. - - This class encapsulates the Gemma2Model and provides an interface for - embedding operations and customized pooling functions. - - Attributes: - model: An instance of Gemma2Model used for forward operations. - _pooler: An instance of Pooler used for pooling operations. - """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - self.model = Gemma2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - vllm_config.model_config.pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - return self.model(input_ids, positions, kv_caches, attn_metadata, - intermediate_tensors, inputs_embeds) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights - if not name.startswith("lm_head.")) - self.model.load_weights(weights) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index b1c0065afbf30..86aab38032450 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -474,9 +474,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: ) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.mlp1 = self._init_mlp1(config) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index ff0ab011a9158..31dfb235ae877 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -37,7 +37,6 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) @@ -47,14 +46,13 @@ DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - extract_layer_index, is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -511,11 +509,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config - self.model = self._init_model(vllm_config=vllm_config, prefix=prefix) + self.model = self._init_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -544,13 +543,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.STEP, - normalize=False, - softmax=False) def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): return LlamaModel(vllm_config=vllm_config, prefix=prefix) @@ -581,14 +576,6 @@ def compute_logits( sampling_metadata) return logits - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - logits = self.compute_logits(hidden_states, None) - return self._pooler(logits, pooling_metadata) - def sample(self, logits: torch.Tensor, sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: next_tokens = self.sampler(logits, sampling_metadata) @@ -639,78 +626,3 @@ def permute(w: torch.Tensor, n_heads: int): name = name.replace(item, mapping[item]) return name, loaded_weight - - -class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): - """ - A model that uses Llama with additional embedding functionalities. - - This class encapsulates the LlamaModel and provides an interface for - embedding operations and customized pooling functions. - - Attributes: - model: An instance of LlamaModel used for forward operations. - _pooler: An instance of Pooler used for pooling operations. - """ - packed_modules_mapping = { - "qkv_proj": ["q_proj", "k_proj", "v_proj"], - "gate_up_proj": ["gate_proj", "up_proj"] - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - } - embedding_padding_modules = [] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - pooler_config = vllm_config.model_config.pooler_config - - self.model = LlamaModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def forward( - self, - input_ids: Optional[torch.Tensor], - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - return self.model(input_ids, positions, kv_caches, attn_metadata, - intermediate_tensors, inputs_embeds) - - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={"model.": ""}) - weights = hf_to_vllm_mapper.apply(weights) - weights = ((name, data) for name, data in weights - if not name.startswith("lm_head.")) - self.model.load_weights(weights) - - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - self.model.load_kv_cache_scales(quantization_param_path) - - # LRUCacheWorkerLoRAManager instantiation requires model config. - @property - def config(self): - return self.model.config diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index e7757b3c7d405..7fd4b32774798 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -319,9 +319,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e113f5862830d..a39f2f4124d05 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -14,13 +14,11 @@ from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext) -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, @@ -286,7 +284,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config vision_feature_layer = config.vision_feature_layer @@ -321,17 +318,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) - - # The same model class supports both language generation and embedding - # because the architecture name is the same - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -678,13 +669,6 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b130791808924..0de9d8c5ea572 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -275,9 +275,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: text_hidden_size=config.text_config.hidden_size, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 3166737d61582..0bebc1c745e2b 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -422,9 +422,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: prefix=maybe_prefix(prefix, "vision_tower")) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 2e5b6bee784e7..253e689e50a3b 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -151,9 +151,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config config.text_config.architectures = ["GemmaForCausalLM"] self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) logit_scale = getattr(config, "logit_scale", 1.0) self.language_model.logits_processor.scale *= logit_scale diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4cb874a13e0c1..eef23029a2aca 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -29,24 +29,22 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.clip import CLIPVisionModel -from vllm.model_executor.models.llama import LlamaForCausalLM -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token -from vllm.sequence import IntermediateTensors, PoolerOutput +from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -536,7 +534,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -556,18 +553,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config, prefix=maybe_prefix(prefix, "model.vision_embed_tokens")) - # The prefix is empty intentionally because default prefix of - # LlamaForCausalLM is "model" - self.language_model = LlamaForCausalLM(vllm_config=vllm_config, - prefix="") - - # The same model class supports both language generation and embedding - # because the architecture name is the same - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + # The prefix is empty intentionally because default prefix of + # LlamaForCausalLM is "model" + prefix="", + # We don't directly initialize vLLM's LlamaForCausalLM so we + # can automatically apply embedding wrapper if this model is + # initialized as an embedding model + architectures=["LlamaForCausalLM"], + ) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -739,13 +735,6 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 45171c1a04b17..215727cadd954 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -172,9 +172,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # init MistralForCausalLM self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) self.vision_encoder = VisionTransformer(self.vision_args) self.vision_language_adapter = VisionLanguageAdapter( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 87943e53d861c..7d4cc4b69e614 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -31,6 +31,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -55,6 +56,8 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) +logger = init_logger(__name__) + class Qwen2MLP(nn.Module): @@ -433,7 +436,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -454,14 +456,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() - # The same model class supports both language generation and embedding - # because the architecture name is the same - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -499,13 +493,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( @@ -553,6 +540,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + # TODO: Replace this model class with for_embedding(Qwen2ForCausalLM), + # after changing the default pooling method + if pooler_config.pooling_type is None: + logger.warning( + "This embedding model will default to last-token pooling in " + "an upcoming version. To avoid breaking changes, you should " + "pass `--override-pooler-config '{\"pooling_type\": \"MEAN\"}'`" + " explicitly.") + self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.MEAN, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 7956a98b21569..27175dbae7483 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -50,7 +50,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( @@ -59,14 +58,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend -from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData +from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor @@ -1070,7 +1068,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -1102,11 +1099,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False) + self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -1361,13 +1354,6 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def pooler( - self, - hidden_states: torch.Tensor, - pooling_metadata: PoolingMetadata, - ) -> Optional[PoolerOutput]: - return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c400c7d59828c..7d2bfce9ba264 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -20,6 +20,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform +from .adapters import as_embedding_model from .interfaces import (has_inner_state, is_attention_free, supports_cross_encoding, supports_multimodal, supports_pp) @@ -107,15 +108,15 @@ "RobertaForMaskedLM": ("roberta", "RobertaEmbeddingModel"), "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), - "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), + "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), - "LlamaModel": ("llama", "LlamaEmbeddingModel"), + "LlamaModel": ("llama", "LlamaForCausalLM"), **{ # Multiple models share the same architecture, so we include them all k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items() if arch == "LlamaForCausalLM" }, - "MistralModel": ("llama", "LlamaEmbeddingModel"), + "MistralModel": ("llama", "LlamaForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), @@ -125,7 +126,7 @@ # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), - "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 } _CROSS_ENCODER_MODELS = { @@ -208,6 +209,7 @@ @dataclass(frozen=True) class _ModelInfo: + architecture: str is_text_generation_model: bool is_embedding_model: bool supports_cross_encoding: bool @@ -218,9 +220,19 @@ class _ModelInfo: @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + is_embedding_model_ = is_embedding_model(model) + if not is_embedding_model_: + try: + as_embedding_model(model) + except Exception: + pass + else: + is_embedding_model_ = True + return _ModelInfo( + architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), - is_embedding_model=is_embedding_model(model), + is_embedding_model=is_embedding_model_, supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), @@ -399,13 +411,13 @@ def _normalize_archs( def inspect_model_cls( self, architectures: Union[str, List[str]], - ) -> _ModelInfo: + ) -> Tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: model_info = self._try_inspect_model_cls(arch) if model_info is not None: - return model_info + return (model_info, arch) return self._raise_for_unsupported(architectures) @@ -426,39 +438,50 @@ def is_text_generation_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).is_text_generation_model + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_text_generation_model def is_embedding_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).is_embedding_model + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_embedding_model def is_cross_encoder_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).supports_cross_encoding + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_cross_encoding def is_multimodal_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).supports_multimodal + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_multimodal def is_pp_supported_model( self, architectures: Union[str, List[str]], ) -> bool: - return self.inspect_model_cls(architectures).supports_pp + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.supports_pp - def model_has_inner_state(self, architectures: Union[str, - List[str]]) -> bool: - return self.inspect_model_cls(architectures).has_inner_state + def model_has_inner_state( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.has_inner_state - def is_attention_free_model(self, architectures: Union[str, - List[str]]) -> bool: - return self.inspect_model_cls(architectures).is_attention_free + def is_attention_free_model( + self, + architectures: Union[str, List[str]], + ) -> bool: + model_cls, _ = self.inspect_model_cls(architectures) + return model_cls.is_attention_free ModelRegistry = _ModelRegistry({ diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index b61deccde45b7..ea1e5401d42c0 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -360,9 +360,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): )) self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "language_model")) + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + ) if config.text_model_id is not None: # this prefix is not for initialization, but for loading weights # note the trailing dot diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index a6b40a233439b..7a1e1f9bf2be4 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -173,8 +173,15 @@ def _load_module( module_load_weights = getattr(module, "load_weights", None) if callable(module_load_weights): loaded_params = module_load_weights(weights) - yield from map(lambda x: self._get_qualname(base_prefix, x), - loaded_params) + if loaded_params is None: + logger.warning( + "Unable to collect loaded parameters " + "for module %s", module) + else: + yield from map( + lambda x: self._get_qualname(base_prefix, x), + loaded_params, + ) child_modules = dict(module.named_children()) child_params = dict(module.named_parameters(recurse=False)) @@ -232,17 +239,24 @@ def load_weights( def init_vllm_registered_model( - hf_config: PretrainedConfig, vllm_config: VllmConfig, + *, prefix: str = "", + hf_config: Optional[PretrainedConfig] = None, + architectures: Optional[list[str]] = None, ) -> nn.Module: """ Helper function to initialize an inner model registered to vLLM, based on the arguments passed to the outer vLLM model. """ from vllm.model_executor.model_loader.loader import _initialize_model - vllm_config = vllm_config.with_hf_config(hf_config) - return _initialize_model(vllm_config, prefix) + + if hf_config is not None: + vllm_config = vllm_config.with_hf_config(hf_config) + + return _initialize_model(vllm_config=vllm_config, + prefix=prefix, + architectures=architectures) @overload diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6eec660e42ac4..bbb8fb4bc1cd1 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -7,7 +7,7 @@ from vllm.inputs import InputContext from vllm.logger import init_logger -from vllm.utils import (get_allowed_kwarg_only_overrides, +from vllm.utils import (ClassRegistry, get_allowed_kwarg_only_overrides, resolve_mm_processor_kwargs) if TYPE_CHECKING: @@ -54,8 +54,8 @@ class MultiModalPlugin(ABC): """ def __init__(self) -> None: - self._input_mappers: Dict[Type[nn.Module], MultiModalInputMapper] = {} - self._max_mm_tokens: Dict[Type[nn.Module], MultiModalTokensCalc] = {} + self._input_mappers = ClassRegistry[nn.Module, MultiModalInputMapper]() + self._max_mm_tokens = ClassRegistry[nn.Module, MultiModalTokensCalc]() @abstractmethod def get_data_key(self) -> str: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b992442d3b314..b73daee98bd80 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -9,6 +9,7 @@ from vllm.inputs import InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import ClassRegistry from .audio import AudioPlugin from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc @@ -62,8 +63,8 @@ def __init__( plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: self._plugins = {p.get_data_key(): p for p in plugins} - self._processor_factories: Dict[Type[nn.Module], - MultiModalProcessorFactory] = {} + self._processor_factories = ClassRegistry[nn.Module, + MultiModalProcessorFactory]() # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} diff --git a/vllm/utils.py b/vllm/utils.py index 6f7a6f8c54e47..0165a22582e7b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -20,7 +20,7 @@ import warnings import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task -from collections import defaultdict +from collections import UserDict, defaultdict from collections.abc import Iterable, Mapping from functools import lru_cache, partial, wraps from platform import uname @@ -1517,13 +1517,13 @@ def value(self): # Adapted from: https://stackoverflow.com/a/47212782/5082708 -class LazyDict(Mapping, Generic[T]): +class LazyDict(Mapping[str, T], Generic[T]): def __init__(self, factory: Dict[str, Callable[[], T]]): self._factory = factory self._dict: Dict[str, T] = {} - def __getitem__(self, key) -> T: + def __getitem__(self, key: str) -> T: if key not in self._dict: if key not in self._factory: raise KeyError(key) @@ -1540,6 +1540,22 @@ def __len__(self): return len(self._factory) +class ClassRegistry(UserDict[type[T], _V]): + + def __getitem__(self, key: type[T]) -> _V: + for cls in key.mro(): + if cls in self.data: + return self.data[cls] + + raise KeyError(key) + + def __contains__(self, key: object) -> bool: + if not isinstance(key, type): + return False + + return any(cls in self.data for cls in key.mro()) + + def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: """ Create a weak reference to a tensor. From f877a7d12a0490705e6bea0987c89548d1a015ea Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Dec 2024 09:48:35 +0800 Subject: [PATCH 203/255] [Misc] Improve type annotations for `support_torch_compile` (#10763) Signed-off-by: DarkLight1337 --- vllm/compilation/decorators.py | 38 ++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 8b81a29936989..8700243c9d904 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,7 +1,8 @@ import inspect -from typing import Dict, List, Optional, Union +from typing import Callable, Dict, List, Optional, TypeVar, Union, overload import torch +import torch.nn as nn from vllm.compilation.counter import compilation_counter from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher @@ -12,10 +13,27 @@ logger = init_logger(__name__) +_T = TypeVar("_T", bound=type[nn.Module]) + + +@overload +def support_torch_compile( + *, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]], +) -> Callable[[_T], _T]: + ... + + +@overload +def support_torch_compile(cls: _T) -> _T: + ... + def support_torch_compile( - cls: Optional[type] = None, - dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None): + cls: Optional[_T] = None, + *, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None, +) -> Union[Callable[[_T], _T], _T]: """ A decorator to add support for compiling the forward method of a class. @@ -66,7 +84,7 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): computation graph. """ - def cls_decorator_helper(cls: type): + def cls_decorator_helper(cls: _T) -> _T: # helper to pass `dynamic_arg_dims`` to `_support_torch_compile`` # to avoid too much indentation for `_support_torch_compile`` if not hasattr(cls, 'forward'): @@ -105,8 +123,10 @@ def cls_decorator_helper(cls: type): return cls_decorator_helper -def _support_torch_compile(cls: type, - dynamic_arg_dims: Dict[str, Union[int, List[int]]]): +def _support_torch_compile( + cls: _T, + dynamic_arg_dims: Dict[str, Union[int, List[int]]], +) -> _T: """ A decorator to add support for compiling the forward method of a class. """ @@ -119,7 +139,7 @@ def _support_torch_compile(cls: type, # other than TorchCompileWrapperWithCustomDispatcher cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) - old_init = cls.__init__ # type: ignore + old_init = cls.__init__ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) @@ -135,7 +155,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): TorchCompileWrapperWithCustomDispatcher.__init__( self, compilation_level=vllm_config.compilation_config.level) - cls.__init__ = __init__ # type: ignore + cls.__init__ = __init__ def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation @@ -180,5 +200,5 @@ def __call__(self, *args, **kwargs): model_output = self.forward(*args, **kwargs) return model_output - cls.__call__ = __call__ # type: ignore + cls.__call__ = __call__ return cls From d2f058e76c2a28d2109e163dc1123ead6983943c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 1 Dec 2024 14:36:51 +0800 Subject: [PATCH 204/255] [Misc] Rename embedding classes to pooling (#10801) Signed-off-by: DarkLight1337 --- examples/offline_inference_embedding.py | 2 +- tests/entrypoints/llm/test_encode.py | 6 +- tests/models/test_registry.py | 4 +- tests/worker/test_model_input.py | 4 +- vllm/__init__.py | 31 +++++++++-- vllm/config.py | 2 +- vllm/engine/async_llm_engine.py | 24 ++++---- vllm/engine/llm_engine.py | 8 +-- vllm/engine/multiprocessing/client.py | 14 ++--- vllm/engine/protocol.py | 5 +- vllm/entrypoints/llm.py | 30 +++++----- vllm/entrypoints/openai/serving_embedding.py | 12 ++-- vllm/entrypoints/openai/serving_score.py | 10 ++-- vllm/model_executor/models/__init__.py | 11 ++-- vllm/model_executor/models/adapters.py | 6 +- vllm/model_executor/models/interfaces.py | 4 +- vllm/model_executor/models/interfaces_base.py | 15 +++-- vllm/model_executor/models/registry.py | 16 +++--- vllm/outputs.py | 55 +++++++++++++------ vllm/v1/engine/async_llm.py | 4 +- vllm/v1/engine/async_stream.py | 8 +-- ..._runner.py => cpu_pooling_model_runner.py} | 4 +- vllm/worker/cpu_worker.py | 4 +- ...odel_runner.py => pooling_model_runner.py} | 6 +- vllm/worker/worker.py | 4 +- 25 files changed, 166 insertions(+), 123 deletions(-) rename vllm/worker/{cpu_embedding_model_runner.py => cpu_pooling_model_runner.py} (98%) rename vllm/worker/{embedding_model_runner.py => pooling_model_runner.py} (98%) diff --git a/examples/offline_inference_embedding.py b/examples/offline_inference_embedding.py index 7d5ef128bc8e0..ae158eef2ca4c 100644 --- a/examples/offline_inference_embedding.py +++ b/examples/offline_inference_embedding.py @@ -10,7 +10,7 @@ # Create an LLM. model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) -# Generate embedding. The output is a list of EmbeddingRequestOutputs. +# Generate embedding. The output is a list of PoolingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. for output in outputs: diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 4c9f796e5ed71..41163809237e9 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -3,7 +3,7 @@ import pytest -from vllm import LLM, EmbeddingRequestOutput, PoolingParams +from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -43,8 +43,8 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[EmbeddingRequestOutput], - o2: List[EmbeddingRequestOutput]): +def assert_outputs_equal(o1: List[PoolingRequestOutput], + o2: List[PoolingRequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 1886b1f9898ad..b5368aab3ecf1 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -3,7 +3,7 @@ import pytest import torch.cuda -from vllm.model_executor.models import (is_embedding_model, +from vllm.model_executor.models import (is_pooling_model, is_text_generation_model, supports_multimodal) from vllm.model_executor.models.adapters import as_embedding_model @@ -31,7 +31,7 @@ def test_registry_imports(model_arch): # All vLLM models should be convertible to an embedding model embed_model = as_embedding_model(model_cls) - assert is_embedding_model(embed_model) + assert is_pooling_model(embed_model) if model_arch in _MULTIMODAL_MODELS: assert supports_multimodal(model_cls) diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index b36e8bfe73ff3..309854e6babf3 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -8,10 +8,10 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.model_executor import SamplingMetadata from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.worker.embedding_model_runner import ( - ModelInputForGPUWithPoolingMetadata) from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from vllm.worker.multi_step_model_runner import StatefulModelInput +from vllm.worker.pooling_model_runner import ( + ModelInputForGPUWithPoolingMetadata) class MockAttentionBackend(AttentionBackend): diff --git a/vllm/__init__.py b/vllm/__init__.py index 8f477ea84756d..a10f6d3128cb6 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -7,8 +7,8 @@ from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.model_executor.models import ModelRegistry -from vllm.outputs import (CompletionOutput, EmbeddingOutput, - EmbeddingRequestOutput, RequestOutput) +from vllm.outputs import (CompletionOutput, PoolingOutput, + PoolingRequestOutput, RequestOutput) from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -25,8 +25,8 @@ "SamplingParams", "RequestOutput", "CompletionOutput", - "EmbeddingOutput", - "EmbeddingRequestOutput", + "PoolingOutput", + "PoolingRequestOutput", "LLMEngine", "EngineArgs", "AsyncLLMEngine", @@ -34,3 +34,26 @@ "initialize_ray_cluster", "PoolingParams", ] + + +def __getattr__(name: str): + import warnings + + if name == "EmbeddingOutput": + msg = ("EmbeddingOutput has been renamed to PoolingOutput. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingOutput + + if name == "EmbeddingRequestOutput": + msg = ("EmbeddingRequestOutput has been renamed to " + "PoolingRequestOutput. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingRequestOutput + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/config.py b/vllm/config.py index 51b8cf24803ab..da043afbe1ae7 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -359,7 +359,7 @@ def _resolve_task( # NOTE: Listed from highest to lowest priority, # in case the model supports multiple of them "generate": ModelRegistry.is_text_generation_model(architectures), - "embedding": ModelRegistry.is_embedding_model(architectures), + "embedding": ModelRegistry.is_pooling_model(architectures), } supported_tasks_lst: List[_Task] = [ task for task, is_supported in task_support.items() if is_supported diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 31a15b04314d5..7b1bb7b05708d 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -25,7 +25,7 @@ from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -74,7 +74,7 @@ def _log_task_completion(task: asyncio.Task, class AsyncStream: - """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + """A stream of RequestOutputs or PoolingRequestOutputs for a request that can be iterated over asynchronously via an async generator.""" def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: @@ -83,7 +83,7 @@ def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + def put(self, item: Union[RequestOutput, PoolingRequestOutput, Exception]) -> None: if not self._finished: self._queue.put_nowait(item) @@ -103,7 +103,7 @@ def finished(self) -> bool: async def generator( self - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: try: while True: result = await self._queue.get() @@ -154,7 +154,7 @@ def propagate_exception(self, def process_request_output(self, request_output: Union[RequestOutput, - EmbeddingRequestOutput], + PoolingRequestOutput], *, verbose: bool = False) -> None: """Process a request output from the engine.""" @@ -265,7 +265,7 @@ def __init__(self, *args, **kwargs): async def step_async( self, virtual_engine: int - ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + ) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. The workers are ran asynchronously if possible. @@ -907,7 +907,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Coroutine[None, None, AsyncGenerator[Union[ - RequestOutput, EmbeddingRequestOutput], None]]: + RequestOutput, PoolingRequestOutput], None]]: ... @overload @@ -922,7 +922,7 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Coroutine[None, None, AsyncGenerator[Union[ - RequestOutput, EmbeddingRequestOutput], None]]: + RequestOutput, PoolingRequestOutput], None]]: ... @deprecate_kwargs( @@ -941,7 +941,7 @@ async def add_request( priority: int = 0, *, inputs: Optional[PromptType] = None, # DEPRECATED - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: if inputs is not None: prompt = inputs assert prompt is not None and params is not None @@ -1070,7 +1070,7 @@ async def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from an embedding model. Generate outputs for a request. This method is a coroutine. It adds the @@ -1088,7 +1088,7 @@ async def encode( Only applicable with priority scheduling. Yields: - The output `EmbeddingRequestOutput` objects from the LLMEngine + The output `PoolingRequestOutput` objects from the LLMEngine for the request. Details: @@ -1141,7 +1141,7 @@ async def encode( trace_headers=trace_headers, priority=priority, ): - yield LLMEngine.validate_output(output, EmbeddingRequestOutput) + yield LLMEngine.validate_output(output, PoolingRequestOutput) async def abort(self, request_id: str) -> None: """Abort a request. diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index ecc222f692c41..7911dc8d04500 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -40,7 +40,7 @@ get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, +from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -80,7 +80,7 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) -_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) +_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) @dataclass @@ -112,7 +112,7 @@ class SchedulerContext: def __init__(self, multi_step_stream_outputs: bool = False): self.output_queue: Deque[OutputData] = deque() self.request_outputs: List[Union[RequestOutput, - EmbeddingRequestOutput]] = [] + PoolingRequestOutput]] = [] self.seq_group_metadata_list: Optional[ List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None @@ -1314,7 +1314,7 @@ def _advance_to_next_step( else: seq.append_token_id(sample.output_token, sample.logprobs) - def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. figure:: https://i.imgur.com/sv2HssD.png diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index fe21c58c775fe..d26728e8c6e67 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -35,7 +35,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs @@ -495,7 +495,7 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: ... @overload @@ -507,7 +507,7 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: ... @deprecate_kwargs( @@ -524,7 +524,7 @@ def encode( priority: int = 0, *, inputs: Optional[PromptType] = None # DEPRECATED - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from an embedding model. Generate outputs for a request. This method is a coroutine. It adds the @@ -540,7 +540,7 @@ def encode( trace_headers: OpenTelemetry trace headers. Yields: - The output `EmbeddingRequestOutput` objects from the LLMEngine + The output `PoolingRequestOutput` objects from the LLMEngine for the request. """ if inputs is not None: @@ -549,7 +549,7 @@ def encode( and request_id is not None) return cast( - AsyncGenerator[EmbeddingRequestOutput, None], + AsyncGenerator[PoolingRequestOutput, None], self._process_request(prompt, pooling_params, request_id, @@ -567,7 +567,7 @@ async def _process_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> Union[AsyncGenerator[RequestOutput, None], AsyncGenerator[ - EmbeddingRequestOutput, None]]: + PoolingRequestOutput, None]]: """Send an RPCGenerateRequest to the RPCServer and stream responses.""" # If already dead, error out. diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e15395d75c91f..4079de7d36793 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -11,8 +11,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import (CompletionOutput, EmbeddingRequestOutput, - RequestOutput) +from vllm.outputs import CompletionOutput, PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams @@ -209,7 +208,7 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ) -> AsyncGenerator[PoolingRequestOutput, None]: """Generate outputs for a request from an embedding model.""" ... diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 1551a9a998160..a25c401b4ea10 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -26,7 +26,7 @@ from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest, LLMGuidedOptions) -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import (BeamSearchParams, GuidedDecodingParams, @@ -679,7 +679,7 @@ def encode( prompt_token_ids: Optional[List[int]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: multi (prompt + optional token ids) @@ -691,7 +691,7 @@ def encode( prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: single (token ids + optional prompt) @@ -704,7 +704,7 @@ def encode( prompt_token_ids: List[int], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: multi (token ids + optional prompt) @@ -717,7 +717,7 @@ def encode( prompt_token_ids: List[List[int]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload # LEGACY: single or multi token ids [pos-only] @@ -728,7 +728,7 @@ def encode( prompt_token_ids: Union[List[int], List[List[int]]], use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @overload @@ -741,7 +741,7 @@ def encode( Sequence[PoolingParams]]] = None, use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: ... @deprecate_kwargs( @@ -759,7 +759,7 @@ def encode( use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: """Generates the completions for the input prompts. This class automatically batches the given prompts, considering @@ -778,7 +778,7 @@ def encode( generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of ``PoolingRequestOutput`` objects containing the generated embeddings in the same order as the input prompts. Note: @@ -821,7 +821,7 @@ def encode( outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, - EmbeddingRequestOutput) + PoolingRequestOutput) def score( self, @@ -832,7 +832,7 @@ def score( use_tqdm: bool = True, lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> List[EmbeddingRequestOutput]: + ) -> List[PoolingRequestOutput]: """Generates similarity scores for all pairs . The inputs can be 1 -> 1, 1 -> N or N -> N. In the 1 - N case @@ -854,7 +854,7 @@ def score( generation, if any. Returns: - A list of ``EmbeddingRequestOutput`` objects containing the + A list of ``PoolingRequestOutput`` objects containing the generated scores in the same order as the input prompts. """ task = self.llm_engine.model_config.task @@ -943,7 +943,7 @@ def ensure_str(prompt: SingletonPrompt): outputs = self._run_engine(use_tqdm=use_tqdm) return self.engine_class.validate_outputs(outputs, - EmbeddingRequestOutput) + PoolingRequestOutput) def start_profile(self) -> None: self.llm_engine.start_profile() @@ -1085,7 +1085,7 @@ def _add_guided_params( def _run_engine( self, *, use_tqdm: bool - ) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + ) -> List[Union[RequestOutput, PoolingRequestOutput]]: # Initialize tqdm. if use_tqdm: num_requests = self.llm_engine.get_num_unfinished_requests() @@ -1098,7 +1098,7 @@ def _run_engine( ) # Run the engine. - outputs: List[Union[RequestOutput, EmbeddingRequestOutput]] = [] + outputs: List[Union[RequestOutput, PoolingRequestOutput]] = [] total_in_toks = 0 total_out_toks = 0 while self.llm_engine.has_unfinished_requests(): diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 78e2416d9d4da..2cbb252610e39 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -18,14 +18,14 @@ ErrorResponse, UsageInfo) from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.logger import init_logger -from vllm.outputs import EmbeddingOutput, EmbeddingRequestOutput +from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) def _get_embedding( - output: EmbeddingOutput, + output: PoolingOutput, encoding_format: Literal["float", "base64"], ) -> Union[List[float], str]: if encoding_format == "float": @@ -40,7 +40,7 @@ def _get_embedding( def request_output_to_embedding_response( - final_res_batch: List[EmbeddingRequestOutput], request_id: str, + final_res_batch: List[PoolingRequestOutput], request_id: str, created_time: int, model_name: str, encoding_format: Literal["float", "base64"]) -> EmbeddingResponse: data: List[EmbeddingResponseData] = [] @@ -169,7 +169,7 @@ async def create_embedding( return self.create_error_response(str(e)) # Schedule the request and get the result generator. - generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] try: pooling_params = request.to_pooling_params() @@ -207,7 +207,7 @@ async def create_embedding( num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[EmbeddingRequestOutput]] + final_res_batch: List[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: async for i, res in result_generator: @@ -215,7 +215,7 @@ async def create_embedding( assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[EmbeddingRequestOutput], + final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch) response = request_output_to_embedding_response( diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 7cd8ff08b5608..a1f14449ba9c3 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -13,7 +13,7 @@ from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger -from vllm.outputs import EmbeddingRequestOutput +from vllm.outputs import PoolingRequestOutput from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import make_async, merge_async_iterators, random_uuid @@ -21,7 +21,7 @@ def request_output_to_score_response( - final_res_batch: List[EmbeddingRequestOutput], request_id: str, + final_res_batch: List[PoolingRequestOutput], request_id: str, created_time: int, model_name: str) -> ScoreResponse: data: List[ScoreResponseData] = [] score = None @@ -133,7 +133,7 @@ async def create_score( return self.create_error_response(str(e)) # Schedule the request and get the result generator. - generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] input_pairs = make_pairs(request.text_1, request.text_2) @@ -194,7 +194,7 @@ async def create_score( num_prompts = len(engine_prompts) # Non-streaming response - final_res_batch: List[Optional[EmbeddingRequestOutput]] + final_res_batch: List[Optional[PoolingRequestOutput]] final_res_batch = [None] * num_prompts try: @@ -203,7 +203,7 @@ async def create_score( assert all(final_res is not None for final_res in final_res_batch) - final_res_batch_checked = cast(List[EmbeddingRequestOutput], + final_res_batch_checked = cast(List[PoolingRequestOutput], final_res_batch) response = request_output_to_score_response( diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index d66373512b95e..a3ef9adad16d9 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,15 +1,14 @@ from .interfaces import (HasInnerState, SupportsLoRA, SupportsMultiModal, SupportsPP, has_inner_state, supports_lora, supports_multimodal, supports_pp) -from .interfaces_base import (VllmModelForEmbedding, - VllmModelForTextGeneration, is_embedding_model, - is_text_generation_model) +from .interfaces_base import (VllmModelForPooling, VllmModelForTextGeneration, + is_pooling_model, is_text_generation_model) from .registry import ModelRegistry __all__ = [ "ModelRegistry", - "VllmModelForEmbedding", - "is_embedding_model", + "VllmModelForPooling", + "is_pooling_model", "VllmModelForTextGeneration", "is_text_generation_model", "HasInnerState", @@ -20,4 +19,4 @@ "supports_multimodal", "SupportsPP", "supports_pp", -] \ No newline at end of file +] diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 360433a07c5b8..9cc43ae9181b9 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -4,7 +4,7 @@ import torch import torch.nn as nn -from .interfaces_base import VllmModelForEmbedding, is_embedding_model +from .interfaces_base import VllmModelForPooling, is_pooling_model _T = TypeVar("_T", bound=type[nn.Module]) @@ -12,7 +12,7 @@ def as_embedding_model(cls: _T) -> _T: """Subclass an existing vLLM model to support embeddings.""" # Avoid modifying existing embedding models - if is_embedding_model(cls): + if is_pooling_model(cls): return cls # Lazy import @@ -23,7 +23,7 @@ def as_embedding_model(cls: _T) -> _T: from .utils import AutoWeightsLoader, WeightsMapper - class ModelForEmbedding(cls, VllmModelForEmbedding): + class ModelForEmbedding(cls, VllmModelForPooling): def __init__( self, diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 1545ce332309f..01a381381ccec 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.utils import supports_kw -from .interfaces_base import is_embedding_model +from .interfaces_base import is_pooling_model if TYPE_CHECKING: from vllm.attention import AttentionMetadata @@ -389,4 +389,4 @@ def _supports_cross_encoding( def supports_cross_encoding( model: Union[Type[object], object], ) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: - return is_embedding_model(model) and _supports_cross_encoding(model) + return is_pooling_model(model) and _supports_cross_encoding(model) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 957a5a6e26b5c..de733b6d49a53 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -141,7 +141,7 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForEmbedding(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): def pooler( self, @@ -153,23 +153,22 @@ def pooler( @overload -def is_embedding_model( - model: Type[object]) -> TypeIs[Type[VllmModelForEmbedding]]: +def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: ... @overload -def is_embedding_model(model: object) -> TypeIs[VllmModelForEmbedding]: +def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: ... -def is_embedding_model( +def is_pooling_model( model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForEmbedding]], TypeIs[VllmModelForEmbedding]]: +) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False if isinstance(model, type): - return isinstance(model, VllmModelForEmbedding) + return isinstance(model, VllmModelForPooling) - return isinstance(model, VllmModelForEmbedding) + return isinstance(model, VllmModelForPooling) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 7d2bfce9ba264..2b7b69e8c3a95 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -24,7 +24,7 @@ from .interfaces import (has_inner_state, is_attention_free, supports_cross_encoding, supports_multimodal, supports_pp) -from .interfaces_base import is_embedding_model, is_text_generation_model +from .interfaces_base import is_pooling_model, is_text_generation_model logger = init_logger(__name__) @@ -211,7 +211,7 @@ class _ModelInfo: architecture: str is_text_generation_model: bool - is_embedding_model: bool + is_pooling_model: bool supports_cross_encoding: bool supports_multimodal: bool supports_pp: bool @@ -220,19 +220,19 @@ class _ModelInfo: @staticmethod def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": - is_embedding_model_ = is_embedding_model(model) - if not is_embedding_model_: + is_pooling_model_ = is_pooling_model(model) + if not is_pooling_model_: try: as_embedding_model(model) except Exception: pass else: - is_embedding_model_ = True + is_pooling_model_ = True return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), - is_embedding_model=is_embedding_model_, + is_pooling_model=is_pooling_model_, supports_cross_encoding=supports_cross_encoding(model), supports_multimodal=supports_multimodal(model), supports_pp=supports_pp(model), @@ -441,12 +441,12 @@ def is_text_generation_model( model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model - def is_embedding_model( + def is_pooling_model( self, architectures: Union[str, List[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) - return model_cls.is_embedding_model + return model_cls.is_pooling_model def is_cross_encoder_model( self, diff --git a/vllm/outputs.py b/vllm/outputs.py index 2d256803edfe8..86264f604f6bc 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -53,8 +53,8 @@ def __repr__(self) -> str: @dataclass -class EmbeddingOutput: - """The output data of one completion output of a request. +class PoolingOutput: + """The output data of one pooling output of a request. Args: embedding: The embedding vector, which is a list of floats. The @@ -63,7 +63,7 @@ class EmbeddingOutput: embedding: List[float] def __repr__(self) -> str: - return (f"EmbeddingOutput(" + return (f"PoolingOutput(" f"embedding={len(self.embedding)})") @@ -316,18 +316,18 @@ def __repr__(self) -> str: f"multi_modal_placeholders={self.multi_modal_placeholders})") -class EmbeddingRequestOutput: +class PoolingRequestOutput: """ - The output data of an embedding request to the LLM. + The output data of a pooling request to the LLM. Args: - request_id (str): A unique identifier for the embedding request. - outputs (EmbeddingOutput): The embedding results for the given input. + request_id (str): A unique identifier for the pooling request. + outputs (PoolingOutput): The pooling results for the given input. prompt_token_ids (List[int]): A list of token IDs used in the prompt. - finished (bool): A flag indicating whether the embedding is completed. + finished (bool): A flag indicating whether the pooling is completed. """ - def __init__(self, request_id: str, outputs: "EmbeddingOutput", + def __init__(self, request_id: str, outputs: "PoolingOutput", prompt_token_ids: List[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids @@ -336,11 +336,11 @@ def __init__(self, request_id: str, outputs: "EmbeddingOutput", @classmethod def from_seq_group(cls, - seq_group: 'SequenceGroup') -> "EmbeddingRequestOutput": + seq_group: 'SequenceGroup') -> "PoolingRequestOutput": if seq_group.embeddings is None: raise ValueError( "Embeddings are missing in seq_group for EmbeddingRequest.") - output = EmbeddingOutput(seq_group.embeddings) + output = PoolingOutput(seq_group.embeddings) prompt_token_ids = seq_group.prompt_token_ids finished = seq_group.is_finished() @@ -348,15 +348,15 @@ def from_seq_group(cls, def __repr__(self): """ - Returns a string representation of an EmbeddingRequestOutput instance. + Returns a string representation of an PoolingRequestOutput instance. The representation includes the request_id and the number of outputs, - providing a quick overview of the embedding request's results. + providing a quick overview of the pooling request's results. Returns: - str: A string representation of the EmbeddingRequestOutput instance. + str: A string representation of the PoolingRequestOutput instance. """ - return (f"EmbeddingRequestOutput(request_id='{self.request_id}', " + return (f"PoolingRequestOutput(request_id='{self.request_id}', " f"outputs={repr(self.outputs)}, " f"prompt_token_ids={self.prompt_token_ids}, " f"finished={self.finished})") @@ -415,7 +415,30 @@ def create(seq_group: SequenceGroup, # Determine the type based on a condition, for example: if hasattr(seq_group, 'embeddings') and seq_group.embeddings is not None: - return EmbeddingRequestOutput.from_seq_group(seq_group) + return PoolingRequestOutput.from_seq_group(seq_group) else: return RequestOutput.from_seq_group(seq_group, use_cache, seq_id_to_seq_group) + + +def __getattr__(name: str): + import warnings + + if name == "EmbeddingOutput": + msg = ("EmbeddingOutput has been renamed to PoolingOutput. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingOutput + + if name == "EmbeddingRequestOutput": + msg = ("EmbeddingRequestOutput has been renamed to " + "PoolingRequestOutput. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PoolingRequestOutput + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a17c8eac4b77c..7335c637f0f79 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -9,7 +9,7 @@ from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -133,7 +133,7 @@ async def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: """Add new request to the AsyncLLM.""" if self.detokenizer.is_request_active(request_id): diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py index 3e6c759ad5ebd..35449238c3259 100644 --- a/vllm/v1/engine/async_stream.py +++ b/vllm/v1/engine/async_stream.py @@ -1,11 +1,11 @@ import asyncio from typing import Any, AsyncGenerator, Callable, Optional, Type, Union -from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput class AsyncStream: - """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + """A stream of RequestOutputs or PoolingRequestOutputs for a request that can be iterated over asynchronously via an async generator.""" STOP_ITERATION = Exception() # Sentinel @@ -16,7 +16,7 @@ def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: self._queue: asyncio.Queue = asyncio.Queue() self._finished = False - def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + def put(self, item: Union[RequestOutput, PoolingRequestOutput, Exception]) -> None: if not self._finished: self._queue.put_nowait(item) @@ -32,7 +32,7 @@ def finish( async def generator( self - ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]: finished = False try: while True: diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py similarity index 98% rename from vllm/worker/cpu_embedding_model_runner.py rename to vllm/worker/cpu_pooling_model_runner.py index 3954e4c4c8a5b..17b2fd2564a04 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -16,12 +16,12 @@ @dataclasses.dataclass(frozen=True) class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): """ - Used by the CPUEmbeddingModelRunner. + Used by the CPUPoolingModelRunner. """ pooling_metadata: Optional["PoolingMetadata"] = None -class CPUEmbeddingModelRunner( +class CPUPoolingModelRunner( CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( ModelInputForCPUWithPoolingMetadata) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index cf04808b73372..4fad1a3f4caeb 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -14,9 +14,9 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from vllm.worker.cpu_embedding_model_runner import CPUEmbeddingModelRunner from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase +from vllm.worker.cpu_pooling_model_runner import CPUPoolingModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerBase, WorkerInput) @@ -164,7 +164,7 @@ def __init__( else {"return_hidden_states": True} ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.task == "embedding": - ModelRunnerClass = CPUEmbeddingModelRunner + ModelRunnerClass = CPUPoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunnerBase = ModelRunnerClass( diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/pooling_model_runner.py similarity index 98% rename from vllm/worker/embedding_model_runner.py rename to vllm/worker/pooling_model_runner.py index f56805918fd15..1beae1e3884c5 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -21,12 +21,12 @@ @dataclasses.dataclass(frozen=True) class ModelInputForGPUWithPoolingMetadata(ModelInputForGPU): """ - Used by the EmbeddingModelRunner. + Used by the PoolingModelRunner. """ pooling_metadata: Optional["PoolingMetadata"] = None -class EmbeddingModelRunner( +class PoolingModelRunner( GPUModelRunnerBase[ModelInputForGPUWithPoolingMetadata]): _model_input_cls: Type[ModelInputForGPUWithPoolingMetadata] = ( ModelInputForGPUWithPoolingMetadata) @@ -52,7 +52,7 @@ def execute_model( ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( - "EmbeddingModelRunner does not support multi-step execution.") + "PoolingModelRunner does not support multi-step execution.") if self.lora_config: assert model_input.lora_requests is not None diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 24e7bc760b0c0..d58cb029618e9 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -22,9 +22,9 @@ from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) from vllm.worker.cache_engine import CacheEngine -from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner +from vllm.worker.pooling_model_runner import PoolingModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, WorkerInput) @@ -75,7 +75,7 @@ def __init__( ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner if model_config.task == "embedding": - ModelRunnerClass = EmbeddingModelRunner + ModelRunnerClass = PoolingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( From 169a0ff911134b930adc0afc0d8c6f370091e10d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 1 Dec 2024 00:41:38 -0800 Subject: [PATCH 205/255] [doc] add warning about comparing hf and vllm outputs (#10805) Signed-off-by: youkaichao --- docs/source/models/supported_models.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index f571b8bf6735e..9f3b6f59068e2 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -701,6 +701,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. +.. tip:: + When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json `__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. From c11f172187b6f44710e1f011ca8bff923ce49a7f Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sun, 1 Dec 2024 00:47:05 -0800 Subject: [PATCH 206/255] [Misc] Adding `MMMU-Pro` vision dataset to serving benchmark (#10804) Signed-off-by: Roger Wang Co-authored-by: Chen Zhang Co-authored-by: Isotr0py <2037008807@qq.com> --- benchmarks/benchmark_serving.py | 65 +++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e9fc037a46965..3256692142c5e 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -199,6 +199,56 @@ def sample_sonnet_requests( return sampled_requests +def sample_mmmu_pro_vision_requests( + dataset, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int] = None, +) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + sampled_requests: List[Tuple[str, int, int, Dict[str, + Collection[str]]]] = [] + for data in dataset: + if len(sampled_requests) == num_requests: + break + + # MMMU-Pro vision direct prompt + # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 + prompt = ( + "Answer with the option letter from the given choices directly. " + "The last line of your response should be of the following " + "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " + "options.") + + prompt_token_ids = tokenizer(prompt).input_ids + if fixed_output_len is None: + # Default max output len is set to 128 + print("--hf-output-len is not provided. Using default value 128.") + fixed_output_len = 128 + + prompt_len = len(prompt_token_ids) + output_len = fixed_output_len + + assert isinstance( + data["image"], + Image), ("Input image format must be `PIL.Image.Image`, " + f"given {type(data['image'])}.") + image: Image = data["image"] + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + mm_content = { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + sampled_requests.append((prompt, prompt_len, output_len, mm_content)) + + return sampled_requests + + def sample_hf_requests( dataset_path: str, dataset_subset: str, @@ -208,6 +258,21 @@ def sample_hf_requests( random_seed: int, fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + + # Special case for MMMU-Pro vision dataset + if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': + assert dataset_split == "test" + dataset = load_dataset(dataset_path, + name=dataset_subset, + split=dataset_split, + streaming=True) + assert "image" in dataset.features, ( + "MMMU/MMMU_Pro vision dataset must have 'image' column.") + filter_func = lambda x: isinstance(x["image"], Image) + dataset = dataset.shuffle(seed=random_seed).filter(filter_func) + return sample_mmmu_pro_vision_requests(dataset, num_requests, + tokenizer, fixed_output_len) + dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, From 0590ec3fd9857063c43c80df281e24c16c51b2ec Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 1 Dec 2024 19:01:00 -0600 Subject: [PATCH 207/255] [Core] Implement disagg prefill by StatelessProcessGroup (#10502) This PR provides initial support for single-node disaggregated prefill in 1P1D scenario. Signed-off-by: KuntaiDu Co-authored-by: ApostaC Co-authored-by: YaoJiayi <120040070@link.cuhk.edu.cn> --- .buildkite/test-pipeline.yaml | 4 + .../disagg_overhead_benchmark.sh | 144 +++++++++ .../disagg_performance_benchmark.sh | 164 +++++++++++ .../disagg_prefill_proxy_server.py | 61 ++++ .../disagg_benchmarks/round_robin_proxy.py | 60 ++++ .../visualize_benchmark_results.py | 46 +++ examples/disaggregated_prefill.sh | 109 +++++++ tests/kv_transfer/disagg_test.py | 119 ++++++++ tests/kv_transfer/module_test.py | 64 ++++ tests/kv_transfer/test_lookup_buffer.py | 160 ++++++++++ tests/kv_transfer/test_lookup_buffer.sh | 3 + tests/kv_transfer/test_send_recv.py | 155 ++++++++++ tests/kv_transfer/test_send_recv.sh | 3 + vllm/config.py | 84 ++++++ vllm/distributed/kv_transfer/README.md | 30 ++ vllm/distributed/kv_transfer/__init__.py | 0 .../kv_transfer/disagg_prefill_workflow.jpg | Bin 0 -> 142656 bytes .../kv_transfer/kv_connector/__init__.py | 0 .../kv_transfer/kv_connector/base.py | 122 ++++++++ .../kv_transfer/kv_connector/factory.py | 19 ++ .../kv_connector/simple_connector.py | 261 +++++++++++++++++ .../kv_transfer/kv_lookup_buffer/__init__.py | 0 .../kv_transfer/kv_lookup_buffer/base.py | 108 +++++++ .../kv_lookup_buffer/simple_buffer.py | 242 +++++++++++++++ .../kv_transfer/kv_pipe/__init__.py | 0 vllm/distributed/kv_transfer/kv_pipe/base.py | 65 +++++ .../kv_transfer/kv_pipe/pynccl_pipe.py | 276 ++++++++++++++++++ .../kv_transfer/kv_transfer_agent.py | 75 +++++ vllm/distributed/parallel_state.py | 35 ++- vllm/engine/arg_utils.py | 18 +- vllm/worker/model_runner.py | 105 ++++++- vllm/worker/worker.py | 13 +- vllm/worker/worker_base.py | 1 + 33 files changed, 2525 insertions(+), 21 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py create mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.py create mode 100644 benchmarks/disagg_benchmarks/visualize_benchmark_results.py create mode 100644 examples/disaggregated_prefill.sh create mode 100644 tests/kv_transfer/disagg_test.py create mode 100644 tests/kv_transfer/module_test.py create mode 100644 tests/kv_transfer/test_lookup_buffer.py create mode 100644 tests/kv_transfer/test_lookup_buffer.sh create mode 100644 tests/kv_transfer/test_send_recv.py create mode 100644 tests/kv_transfer/test_send_recv.sh create mode 100644 vllm/distributed/kv_transfer/README.md create mode 100644 vllm/distributed/kv_transfer/__init__.py create mode 100644 vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg create mode 100644 vllm/distributed/kv_transfer/kv_connector/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/base.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/factory.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/simple_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/base.py create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/base.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py create mode 100644 vllm/distributed/kv_transfer/kv_transfer_agent.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 46692506f01d4..f5591f1098534 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -430,6 +430,9 @@ steps: - vllm/model_executor/models/ - tests/distributed/ - vllm/compilation + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/model_runner.py commands: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -443,6 +446,7 @@ steps: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh new file mode 100644 index 0000000000000..2924ea4a49f54 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# benchmark the overhead of disaggregated prefill. +# methodology: +# - send all request to prefill vLLM instance. It will buffer KV cache. +# - then send all request to decode instance. +# - The TTFT of decode instance is the overhead. + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pkill -f pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=10 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + + # let the prefill instance finish prefill + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate "inf" + + + # send the request to decode. + # The TTFT of this command will be the overhead of disagg prefill impl. + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate "$qps" + kill_gpu_processes + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=1 + default_output_len=1 + benchmark $default_qps $default_output_len + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh new file mode 100644 index 0000000000000..d8d9e976dce76 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Requirement: 8x H100 GPUs. + + +# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV +# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests +# Resource: 8x H100 +# Approaches: +# 1. Chunked prefill: 1 vllm instance with tp=8 +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done + sleep 1 +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +launch_chunked_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + wait_for_server 8100 + wait_for_server 8200 + python3 round_robin_proxy.py & + sleep 1 +} + + +launch_disagg_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + python3 disagg_prefill_proxy_server.py & + sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=100 + qps=$1 + prefix_len=50 + input_len=1024 + output_len=$2 + tag=$3 + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" + + sleep 2 + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx matplotlib aiohttp + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt so that we can sample 2048 tokens for input + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_output_len=6 + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + launch_chunked_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len chunked_prefill + done + kill_gpu_processes + + launch_disagg_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len disagg_prefill + done + kill_gpu_processes + + python3 visualize_benchmark_results.py + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py new file mode 100644 index 0000000000000..4058b1c0a3b79 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -0,0 +1,61 @@ +import os + +import aiohttp +from quart import Quart, make_response, request + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +app = Quart(__name__) + + +async def forward_request(url, data): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + async with session.post(url=url, json=data, + headers=headers) as response: + if response.status == 200: + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: + async for chunk_bytes in response.content.iter_chunked( + 1024): + yield chunk_bytes + else: + content = await response.read() + yield content + + +@app.route('/v1/completions', methods=['POST']) +async def handle_request(): + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill + prefill_request['max_tokens'] = 1 + + # finish prefill + async for _ in forward_request('http://localhost:8100/v1/completions', + prefill_request): + continue + + # return decode + generator = forward_request('http://localhost:8200/v1/completions', + original_request_data) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + + +if __name__ == '__main__': + app.run(port=8000) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py new file mode 100644 index 0000000000000..6eb5f63980070 --- /dev/null +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -0,0 +1,60 @@ +import asyncio +import itertools + +import aiohttp +from aiohttp import web + + +class RoundRobinProxy: + + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) + + async def handle_request(self, request): + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, + ) as response: + # Start sending the response + resp = web.StreamResponse(status=response.status, + headers=response.headers) + await resp.prepare(request) + + # Stream the response content + async for chunk in response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) + + +async def main(): + proxy = RoundRobinProxy([8100, 8200]) + app = web.Application() + app.router.add_route('*', '/{path:.*}', proxy.handle_request) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8000) + await site.start() + + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py new file mode 100644 index 0000000000000..e59d8bb0e6c8c --- /dev/null +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -0,0 +1,46 @@ +import json + +import matplotlib.pyplot as plt +import pandas as pd + +if __name__ == "__main__": + + data = [] + for name in ['disagg_prefill', 'chunked_prefill']: + for qps in [2, 4, 6, 8]: + with open(f"results/{name}-qps-{qps}.json") as f: + x = json.load(f) + x['name'] = name + x['qps'] = qps + data.append(x) + + df = pd.DataFrame.from_dict(data) + dis_df = df[df['name'] == 'disagg_prefill'] + chu_df = df[df['name'] == 'chunked_prefill'] + + plt.style.use('bmh') + plt.rcParams['font.size'] = 20 + + for key in [ + 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', + 'median_itl_ms', 'p99_itl_ms' + ]: + + fig, ax = plt.subplots(figsize=(11, 7)) + plt.plot(dis_df['qps'], + dis_df[key], + label='disagg_prefill', + marker='o', + linewidth=4) + plt.plot(chu_df['qps'], + chu_df[key], + label='chunked_prefill', + marker='o', + linewidth=4) + ax.legend() + + ax.set_xlabel('QPS') + ax.set_ylabel(key) + ax.set_ylim(bottom=0) + fig.savefig(f'results/{key}.png') + plt.close(fig) diff --git a/examples/disaggregated_prefill.sh b/examples/disaggregated_prefill.sh new file mode 100644 index 0000000000000..87155273a81d1 --- /dev/null +++ b/examples/disaggregated_prefill.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# This file demonstrates the example usage of disaggregated prefilling +# We will launch 2 vllm instances (1 for prefill and 1 for decode), +# and then transfer the KV cache between them. + +echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" +sleep 1 + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'cleanup' INT + +# Cleanup function +cleanup() { + echo "Caught Ctrl+C, cleaning up..." + # Cleanup commands + pgrep python | xargs kill -9 + pkill -f python + echo "Cleanup complete. Exiting." + exit 0 +} + +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +# install quart first -- required for disagg prefill proxy serve +if python3 -c "import quart" &> /dev/null; then + echo "Quart is already installed." +else + echo "Quart is not installed. Installing..." + python3 -m pip install quart +fi + +# a function that waits vLLM server to start +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +# You can also adjust --kv-ip and --kv-port for distributed inference. + +# prefilling instance, which is the KV producer +CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + +# decoding instance, which is the KV consumer +CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8200 \ + --max-model-len 100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + +# wait until prefill and decode instances are ready +wait_for_server 8100 +wait_for_server 8200 + +# launch a proxy server that opens the service at port 8000 +# the workflow of this proxy: +# - send the request to prefill vLLM instance (port 8100), change max_tokens +# to 1 +# - after the prefill vLLM finishes prefill, send the request to decode vLLM +# instance +# NOTE: the usage of this API is subject to change --- in the future we will +# introduce "vllm connect" to connect between prefill and decode instances +python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +sleep 1 + +# serve two example requests +output1=$(curl -X POST -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "San Francisco is a", +"max_tokens": 10, +"temperature": 0 +}') + +output2=$(curl -X POST -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "Santa Clara is a", +"max_tokens": 10, +"temperature": 0 +}') + + +# Cleanup commands +pgrep python | xargs kill -9 +pkill -f python + +echo "" + +sleep 1 + +# Print the outputs of the curl requests +echo "" +echo "Output of first request: $output1" +echo "Output of second request: $output2" + +echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉" +echo "" diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py new file mode 100644 index 0000000000000..adc6150edece6 --- /dev/null +++ b/tests/kv_transfer/disagg_test.py @@ -0,0 +1,119 @@ +import os +import subprocess +import sys +import time +from subprocess import Popen + +import pytest +import requests +import torch + + +# Fixture to set up environment variables and teardown servers after tests +@pytest.fixture(scope="module", autouse=True) +def setup_servers(): + if torch.cuda.device_count() < 4: + pytest.skip("Skipping test: fewer than 4 GPUs available") + + # Set up environment variables + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", + shell=True).decode().strip() + os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP + + # Start prefill instance + prefill_cmd = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8100", + "--gpu-memory-utilization", + "0.5", + "--max-model-len", + "1000", + "--kv-transfer-config", + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",'\ + '"kv_rank":0,"kv_parallel_size":2}', + ] + prefill_env = os.environ.copy() + prefill_env["CUDA_VISIBLE_DEVICES"] = "0" + prefill_proc = Popen(prefill_cmd, env=prefill_env) + + # Start decode instance + decode_cmd = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8200", + "--gpu-memory-utilization", + "0.5", + "--max-model-len", + "1000", + "--kv-transfer-config", + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",'\ + '"kv_rank":1,"kv_parallel_size":2}', + ] + decode_env = os.environ.copy() + decode_env["CUDA_VISIBLE_DEVICES"] = "1" + decode_proc = Popen(decode_cmd, env=decode_env) + + # Wait for servers to be ready + assert wait_for_server(8100), "Prefill server did not start in time" + assert wait_for_server(8200), "Decode server did not start in time" + + # Yield to the test function and handle teardown after tests + yield + + # Cleanup: kill the processes + prefill_proc.terminate() + decode_proc.terminate() + + # Additional cleanup if needed + prefill_proc.wait() + decode_proc.wait() + + +# Helper function to wait for server +def wait_for_server(port, timeout=240): + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(f"http://localhost:{port}/v1/completions") + if response.status_code in [200, 405]: + return True + except requests.ConnectionError: + time.sleep(1) + return False + + +# Test function to send curl requests and validate responses +@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) +def test_disaggregated_prefilling(prompt): + # Send to prefill + response = requests.post("http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + }) + assert response.status_code == 200 + + # Send to decode + response = requests.post("http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + }) + assert response.status_code == 200 diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py new file mode 100644 index 0000000000000..355461919cd7c --- /dev/null +++ b/tests/kv_transfer/module_test.py @@ -0,0 +1,64 @@ +import subprocess +import sys + +import pytest +import torch + + +def run_python_script(script_name, timeout): + script_name = f'kv_transfer/{script_name}' + try: + # Start both processes asynchronously using Popen + process0 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": + "0"}, # Set the RANK environment variable for process 0 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + process1 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": + "1"}, # Set the RANK environment variable for process 1 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + # Wait for both processes to complete, with a timeout + process0.wait(timeout=timeout) + process1.wait(timeout=timeout) + + # Check the return status of both processes + if process0.returncode != 0: + pytest.fail( + f"Test {script_name} failed for RANK=0, {process0.returncode}") + if process1.returncode != 0: + pytest.fail( + f"Test {script_name} failed for RANK=1, {process1.returncode}") + + except subprocess.TimeoutExpired: + # If either process times out, terminate both and fail the test + process0.terminate() + process1.terminate() + pytest.fail(f"Test {script_name} timed out") + except Exception as e: + pytest.fail(f"Test {script_name} failed with error: {str(e)}") + + +# Define the test cases using pytest's parametrize +@pytest.mark.parametrize( + "script_name,timeout", + [ + ("test_lookup_buffer.py", + 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout + ]) +def test_run_python_script(script_name, timeout): + # Check the number of GPUs + if torch.cuda.device_count() < 2: + pytest.skip( + f"Skipping test {script_name} because <2 GPUs are available") + + # Run the test if there are at least 2 GPUs + run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py new file mode 100644 index 0000000000000..96b0e58713332 --- /dev/null +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -0,0 +1,160 @@ +import os +import random + +import torch +from tqdm import tqdm + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( + SimpleBuffer) +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe + +# TODO: the test depends on a lot of fields in the current implementation. +# We should have standard interface instead direct field access + + +def test_run(my_rank, buffer, device): + + # buffer should be empty in the beginning + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + print("My rank: %d, device: %s" % (my_rank, device)) + + # insert + tokens = torch.tensor([1, 2, 3]).to(device) + roi = (tokens > 0) + if my_rank == 0: + key = 2.0 * torch.ones([5, 6]).to(device) + value = 3.0 * torch.ones([5, 6]).to(device) + + placeholder = torch.tensor([1]).to(device) + + buffer.insert(tokens, roi, key, value, placeholder) + + torch.distributed.barrier() + + # drop_select + if my_rank == 1: + tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) + assert torch.allclose(tokens, tok) + assert torch.allclose(roi, roi_) + assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device)) + assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device)) + torch.distributed.barrier() + + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + print("Test run passed!") + + +def stress_test(my_rank, buf, device): + + torch.distributed.barrier() + torch.manual_seed(100) + + reqs = [ + ( + torch.rand(100).to(device), # tokens + torch.ones(100).bool().to(device), # roi + torch.rand(100).to(device), # key + torch.rand(100).to(device), # value + torch.rand(100).to(device), # hidden + ) for i in tqdm(range(200)) + ] + + random.seed(my_rank) + random.shuffle(reqs) + + torch.distributed.barrier() + + n = 0 + + # the buffer size can only store 100 reqs + # so the sender will occasionally block to wait for the receiver. + for req in tqdm(reqs): + if my_rank == 0: + buf.insert(*req) + else: + tok, roi, k, v, h = req + tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) + + if tok_ is None: + assert roi_ is None + assert k_ is None + assert v_ is None + assert h_ is None + n += 1 + else: + assert torch.allclose(tok, tok_) + assert torch.allclose(roi, roi_) + assert torch.allclose(k, k_) + assert torch.allclose(v, v_) + assert torch.allclose(h, h_) + print('Rank %d done' % my_rank) + torch.distributed.barrier() + + if my_rank == 0: + x = torch.tensor([0]) + torch.distributed.recv(x, 1) + # the # of None received is the kv that are not selected + assert x.item() == len(buf.buffer) + # and the size of the buffer should be 2000 * buffer len + print(buf.buffer_size) + assert buf.buffer_size == 1700 * len(buf.buffer) + else: + torch.distributed.send(torch.tensor([n]), 0) + + print("Passed stress test!") + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + torch.distributed.init_process_group( + backend='gloo', + init_method='tcp://localhost:12398', + world_size=2, + rank=my_rank, + ) + + print("initialized! My rank is %d" % my_rank) + + config = KVTransferConfig( + kv_connector='PyNcclConnector', + kv_buffer_device='cuda', + kv_buffer_size=1e9, + kv_rank=my_rank, + kv_role="kv_both", # this arg doesn't matter in this test + kv_parallel_size=2, + kv_ip="127.0.0.1", + kv_port=12345, + ) + + data_pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + device="cuda", + port_offset=0, + ) + cpu_pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + device="cpu", + port_offset=1, + ) + + buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000) + + test_run(my_rank, buffer, data_pipe.device) + + stress_test(my_rank, buffer, data_pipe.device) + + buffer.close() + data_pipe.close() + cpu_pipe.close() + print('Done') diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh new file mode 100644 index 0000000000000..09d7ee018c3f4 --- /dev/null +++ b/tests/kv_transfer/test_lookup_buffer.sh @@ -0,0 +1,3 @@ +#!/bin/bash +RANK=0 python test_lookup_buffer.py & +RANK=1 python test_lookup_buffer.py & \ No newline at end of file diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py new file mode 100644 index 0000000000000..65973bf10a4d7 --- /dev/null +++ b/tests/kv_transfer/test_send_recv.py @@ -0,0 +1,155 @@ +import os +import time +from typing import List + +import torch +from tqdm import tqdm + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe + + +def test_run(my_rank, pipe): + # test run + x = torch.tensor([1]).to(pipe.device) + y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) + if my_rank == 0: + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + + else: + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + + assert torch.allclose(x, x2) + assert torch.allclose(y, y2) + + +def stress_test(my_rank, pipe): + + torch.distributed.barrier() + + tensors: List[torch.Tensor] = [] + + torch.manual_seed(0) + + for i in tqdm(range(500)): + mean = torch.rand(1).item() * 100 + std = torch.rand(1).item() * 100 + size = torch.randint(900, 1000, (2, )) + x = torch.normal(mean * 1.0, std * 1.0, + size=size.tolist()).to(pipe.device) + + # 5% probability of sending a None + if torch.rand(1).item() < 0.05: + tensors.append(None) + tensors.append(None) + tensors.append(None) + else: + tensors.append(x) + tensors.append(x.mean().unsqueeze(0)) + tensors.append(x.std().unsqueeze(0)) + + torch.distributed.barrier() + + for i in tqdm(range(500)): + if my_rank == int((i % 10) > 3): + pipe.send_tensor(tensors[3 * i]) + pipe.send_tensor(tensors[3 * i + 1]) + pipe.send_tensor(tensors[3 * i + 2]) + else: + x = pipe.recv_tensor() + mean = pipe.recv_tensor() + std = pipe.recv_tensor() + + if x is None: + assert mean is None + assert std is None + else: + assert torch.allclose(x, tensors[3 * i]) + assert x.mean() == mean[0] + assert x.std() == std[0] + + torch.distributed.barrier() + + +def latency_test(my_rank, pipe, nelement, ntensor): + + latencies = [] + + torch.distributed.barrier() + + for i in tqdm(range(500)): + + tensors = [] + + if my_rank == 0: + # create tensor + tensors = [ + torch.rand(nelement).to(pipe.device) for _ in range(ntensor) + ] + + torch.distributed.barrier() + + if my_rank == 0: + t = torch.tensor([time.time()], + dtype=torch.float64).to(pipe.device) + for tensor in tensors: + pipe.send_tensor(tensor) + pipe.send_tensor(t) + else: + for _ in range(ntensor): + pipe.recv_tensor() + t = pipe.recv_tensor() + latencies.append(time.time() - t.item()) + + torch.distributed.barrier() + + print('Latency test passed.') + print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + torch.distributed.init_process_group( + backend='gloo', + init_method='tcp://localhost:12398', + world_size=2, + rank=my_rank, + ) + + config = KVTransferConfig( + kv_connector='PyNcclConnector', + kv_buffer_device='cuda', + kv_buffer_size=1e9, + kv_rank=my_rank, + kv_role="kv_both", # this arg doesn't matter in this test + kv_parallel_size=2, + kv_ip="127.0.0.1", + kv_port=12345, + ) + + pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + ) + + test_run(my_rank, pipe) + stress_test(my_rank, pipe) + + # Use this function if you want to test the latency of pipe impl. + # latency_test(my_rank, pipe, 1024 * 8 * 128, 80) diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh new file mode 100644 index 0000000000000..1e89e246b4992 --- /dev/null +++ b/tests/kv_transfer/test_send_recv.sh @@ -0,0 +1,3 @@ +#!/bin/bash +RANK=0 python3 test_send_recv.py & +RANK=1 python3 test_send_recv.py & \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index da043afbe1ae7..5d9e2766c7faa 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2052,6 +2052,88 @@ def __post_init__(self): f"installed. Original error:\n{otel_import_error_traceback}") +class KVTransferConfig(BaseModel): + """Configuration for distributed KV cache transfer.""" + + # The KV connector for vLLM to transmit KV caches between vLLM instances. + kv_connector: Optional[str] = None + + # The device used by kv connector to buffer the KV cache. + # Currently only support 'cuda'. + kv_buffer_device: Optional[str] = "cuda" + + # The buffer size for TorchDistributedConnector. Measured in number of + # bytes. Recommended value: 1e9 (about 1GB). + kv_buffer_size: float = 1e9 + + # Whether this vLLM instance produces, consumes KV cache, or both. Choices + # are 'kv_producer', 'kv_consumer', and 'both'. + kv_role: Optional[str] = None + + # The rank of this vLLM instance in the KV cache transfer. Typical value: + # 0 for prefill instance, 1 for decode instance. + # Currently only 1P1D is supported. + kv_rank: Optional[int] = None + + # The number of parallel instances for KV cache transfer. For + # PyNcclConnector, this should be 2. + kv_parallel_size: int = 1 + + # The KV connector ip, used to build distributed connection + kv_ip: str = "127.0.0.1" + + # The KV connector port, used to build distributed connection + kv_port: int = 14579 + + @classmethod + def from_cli(cls, cli_value: str) -> "KVTransferConfig": + """Parse the CLI value for the compilation config.""" + return KVTransferConfig.model_validate_json(cli_value) + + def model_post_init(self, __context: Any) -> None: + if all([ + self.kv_connector is not None, + self.kv_connector != "PyNcclConnector" + ]): + raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. " + f"Supported connectors are " + f"`PyNcclConnector`.") + + if self.kv_role is not None and self.kv_role not in [ + "kv_producer", "kv_consumer", "kv_both" + ]: + raise ValueError( + f"Unsupported kv_role: {self.kv_role}. " + f"Supported roles are `kv_producer`, `kv_consumer`, " + f"and `kv_both`") + + if self.kv_connector is not None and self.kv_role is None: + raise ValueError("Please specify kv_disagg_role when kv_connector " + "is set, supported roles are `kv_producer`, " + "`kv_consumer`, and `kv_both`") + + @property + def is_kv_transfer_instance(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_producer", "kv_consumer", "kv_both"] + + @property + def need_kv_parallel_group(self) -> bool: + # for those database-based connector, vLLM does not need to create + # parallel group, and in that case the kv parallel size will be 1. + return self.kv_connector is not None and self.kv_parallel_size > 1 + + @property + def is_kv_producer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_producer", "kv_both"] + + @property + def is_kv_consumer(self) -> bool: + return self.kv_connector is not None and \ + self.kv_role in ["kv_consumer", "kv_both"] + + class CompilationLevel: # constants for the levels of the compilation process NO_COMPILATION = 0 @@ -2317,6 +2399,8 @@ class VllmConfig: quant_config: Optional[QuantizationConfig] = None compilation_config: CompilationConfig = field(default=None, init=True) # type: ignore + kv_transfer_config: KVTransferConfig = field(default=None, + init=True) # type: ignore @staticmethod def _get_quantization_config( diff --git a/vllm/distributed/kv_transfer/README.md b/vllm/distributed/kv_transfer/README.md new file mode 100644 index 0000000000000..dab2d10c4c9d0 --- /dev/null +++ b/vllm/distributed/kv_transfer/README.md @@ -0,0 +1,30 @@ + +# Distributed KV cache transfer + +This folder implements distributed KV cache transfer across vLLM instances. +Currently the main usecase is for disaggregated prefilling. + +## Abstractions + +The KV cache transfer contains three layer of abstractions: + +- KV pipe: a FIFO pipe for torch.tensor transmission. Key APIs: `send_tensor` and `recv_tensor`. +- KV lookup buffer: a lookup buffer for KV caches. Key: the tokens, value: the KV caches (and/or hidden states). Key APIs: `insert` and `drop_select` (similar to SQL semantics). +- KV connector: a connector that connects the KV pipe and KV lookup buffer to vLLM. Key APIs: `send_kv_caches_and_hidden_states` and `recv_kv_caches_and_hidden_states`. + +Why we need KV lookup buffer: FIFO pipe itself is not enough as prefill vLLM worker may process requests in a different order compared to decode vLLM worker. Say the QPS is really high, prefill worker may handle requests in order A -> B -> C, but the decode worker may process request C first. This is not the case that can be naturally handled by FIFO pipe, so we provide KV lookup buffer to help translate a FIFO pipe to a lookup buffer. + +NOTE: KV pipe layer is bypassible: you can skip this layer if your distributed +communication service already supports key-value-based lookup (like redis or +RDMA database). + +NOTE: If you want to not only transfer KV caches, but adjust the model execution flow of vLLM as well (for example, allow vLLM to receive KV caches on some tokens and do prefill on the remaining tokens), you can bypass both KV pipe layer and KV lookup buffer layer, and directly implement on KV connector layer. Bear in mind that as vLLM's model input is constantly changing, this implementation will likely be broken when vLLM has new updates. + +## Disaggregated prefilling + +The example usage is in [this file](../../../examples/disaggregated_prefill.sh). + +Here is the diagram of how we run disaggretgated prefilling. + +![Disaggregated prefill workflow](./disagg_prefill_workflow.jpg) + diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg b/vllm/distributed/kv_transfer/disagg_prefill_workflow.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a25ec5ef52491a0e3faf596669e6cf0e7c7ae175 GIT binary patch literal 142656 zcmeFZ2Ut_v)-D`+C(=7XKv5|{1nERTnuv%F604naBs!odHl%P*G7*Q~lOz z^3lQMdH^*W4g1;4nzS4y59ma_IOW2UbLqvjsyn&vjAF&F*gpwpVC3fE<>NnhUgE;V zOY#a=6_u1#w6E*v>ggNYFuiMLZee-P%E9sBBPVAUS8pF*KYw^YV8qkNsOV=gu_>u9 z($X_tzRJwYFDQIhR9y1@Lrra6eM4hYb60myZ(skX&tJyICnl$+XTG7(nB|q#??2Yo zH*kAD_YV$#;g61glZygC^;fa}F4^zoVk65%Nli^fP4}Bz6qNqtLd8Z+bM`VVyQT@< z11}CyxiET8t>oP5P6n|ncd%UcPevKJ#pTiGaKDN67s>wn1PlL{B>THy|Bwp>V5XuV z4;~d800JQVIy8pjdvhX?hv%IakF{9>D^Ppw-&9FPPD_g?F`0^_=iPRt*BI&(YHM#( zy5+k`a{{uQXAN}#=TTS3M*o$HFhdkLa;(uf3{qa&EmrenluI(%9tgcT?n%4HxEe+b3BuFNLJJh5|B+h^GMX$uZ|Y zwkSqNZGOCZj~w1R1uW+3{eA12kkEgf@b~`u-?o>yxCR$WJcLgH^(8C71MQChUrr0< z@2)vNiS57EO#WlzHl3c1clM)_QvhV(6c7?A{Etn5kI#Rd@YnA8Kc<(AyxvRZoiv4b z96MfI_^+Y6!4$cHa#MDCFxdH z*W%%Mt5d+b?*S*paS1^J`MHcJa0;ltqI(J$N;l|DUcyc@Z+LgP*A5%-PH%x>^A)jR1Y2<$MaV+5E?n!F<%2Wu;DPZPRK60H|VWVyp`oW)=Io#P!;E?w?IUDuSJe2W5 z-oIQfz}oldTFA|OR8x*zo&x>iIWkWq{}u^qKmHC=nI@?}I0bki(=@_2$kyx~-hVb{ zEnpRq>`XpI+>^kIyG9IeY(TXQajig1 zC2$?d9U=%0-yaOby3||kyeh3Ia&EZE+t1mUz-PFvdTV^KN?~8OkF>s-_-1nzA_zxf zklTi(aBZDwl5l%Bglb)7LS-S}`@L)ZFaL}-U9=#Z9)*V8_K$Pb@swBMuX^+(G0r3O zPg);hYmdL@zJos@ba)Mv-kZ5Ks<34U@BhMDz&Y@E06bVL7p`bp1+v?AoAvY$bjUQ< zHEt~o$!oigFF`vO;a_1}4n44gy}zQ=D9&ejmu^(g%o+Zev+6npNEg3(SwHopx9SNk z9TgX}&yHpk>EID_3Sd%z0@M%F&`3_07u>@dyOr@_&Pu=LfkTt#S^E^-?bhX8fF567 zr}vgA0TeX@OTt(?%V%A5yc^f`)vf4VtM70BeY?E0!*omfzrUA{gx{W{E#&(y&ar z9PbsDB74h_X1IFNP>>>Z+W;y_492a11Y7B_{$0YtZH^Hbzg!nz-*aUIMCGC5 zZ?s(!q)Kd5547mGg9yYMr+@pw<_4UHP%rJ2g;wW-3HgUGu}k6u+H~%eEG=5Z8oDi-*t_`~bTA z@8 zdw%)1*?R_rIi5VY}>KPO;JDQ_HLqyJ38-q z(-5XeLZZmPDxOe^dOiAK@5{AcTzeAR;AOmqFp&*!i-uCcGiB$t!8;le*K{oA9<#&nCD7WHjs zko4bdT$_UnyL+aqr9!37PJ}-j7Lks32PA%PC?kT4LI#m_JfS}G%mzhp$vzdqY>=UW zgny#s?43uUJBWlqKF7@zEmq?@9l@rx+Y*9-V`c6G&&%|RxqeDS`(`EagiJs<5MCFu zp+KVgsBXjqwmH!h-1IijEzZ~OX7pL1D>u695J%V(5?QqX~V)QMcYg?Bxw!o(V(f*!O0O11vBI!k@AKN;l)}rrX2n|93 z$)#QJu6DEd-pon5VFaM(XMs)wjYmt1Qom9^*Sc5xEB^Y+jZ{!2WTz%%W$rs!6-Y^^ zfWlFW(fU&WZkLCUe{^S!E)oW48?aGGe0rFBv5rNfNpJ+a4uq-F5Y^G7u^g$2E$>t~mOC(nvGcYC3>+L4+VZyOC{dI7GT(kwL|} zIV$yDS)23I0rs{Iyg+peU&oM&g>7y2HKlbs6N1Dk;Bys`{J+}Th-^4IZ}yX<;X0&q zA+v$NNcV#iZ@p0otWodaoQm{MEA_F$Ifu_qc3M|Y0a@-$6!A+FvnDy#ALN2pB@Z}r z`Klu&)Ri`&0^rd7#czs-#dwE&Jk75BDS#G!OYhlhdsA*hDsUy%rtQdbVV@YJ$#bMuJXjvZ&~i>~!E{&#qF^Sv%dL_Oex_-EX2 zI$Se<)Iptd^Ul`&PJNMM`!*1M8UqAwfP?Wyh^G zdv2d=8B#^pc?w{hQS5ZGr^b6!9$`r+^Ph;Y9BOyH2`i2Uqe=-l4aX zN83jEYiG2s#-{|bs#=V*U%1E18k!8{B8@NZJwURwwUK>@QA4QfAgS5-SJKatAm)dh zCi)L_IB!$@u2A1WR>a{-T>ok?*ZwJ@dFMmg2L16^u&xebkf6SR?8?P|#-*a`OFU__ zJY;d%&k5aCRj=)R`}{%DUGevqLuhG7EQM!r-u>)|%lHQv5AJ?@1)k9@@sHLP&wFY( z`DWYc+=D$``@11LkRD(Zi5+({z104AC&}r(!u!wP>2G4i3#snA&$g4F@+)CbF?f*i zN3tF52u)n%RB%vg$L&E~&9OtQB2`+WvzHuGxL)vV#>*{vrS1=JEuR9;Z(~bIz0zW* z0R7x|Be5EC_;JNZ4Gp3)-m_W7RvR+_BScRmeAcZa$OA7>sz--xza(_??b^MIYf1}r z#I!l!$pa`cOHe0U=BhSieNx_BUqen65xP?Oe(rAlAee#s!Tj)nZpC9ty!Y)`8!i^M8$t)Pw~|8FHrN?DVVjJ%QgI{4}i4 zreUL)prCGzkyxy2!RP10^={eU(ipfe@zaud`FoSxneO*t77w4ehr~2*KeB67DzyAY z+DJcU^p+unw^b%hwaMW1usPZ%AL>B72t9oIaa7a0{z_Yy5YUJ@$ysWj0DJQO!==72 zHw)4S1f5b7bw!-2h_v`z3`iJp2S1y)sd(AA2o8lt7>{8RY2tgCwC+X7@;%WXvR2q0 z5RhBBC^o@}pFMO)0Y)6RZ=0xrgG*6(CsXUE1fOm7uj@1~D!2+do$PtsJq?yiD$iq8 z5fFSOb_&8pygnWx0bAtob^g&3-4p1S_TE7F#m?ZWbWHq_mb`#hsAj+VMww(z0q6IG ziHB(~f@1NNJvrPVBKWLBBp!?oiksK&wv)r{RI~|A97x8QhaalKiJkIa5@|n+-@Rol z79f_bS4&hrc^)D}l)$;?V89HTrEAA87}uXTOjZc1>v@VPUifb7{J-ow-fG8B(9|Of6^`9CLotc9iU~Y-*0k^X?Qj)K!O5H4ODUG`w*% zXXvBmgQ+E*L1<2EFmL7vl7tsGc(TuqX@J(T8zfN9Z5A(2X~$TLy>rh68?}mgazz>* zqGx1+{h5u#ioV=nr%=MQiXDRSg&5;_B;{kG5vFBk_aLT@sgd7#N#DZpiQ6?UfRO4z zCHN%!6!376V;R(K%u2k8k0{$jE%t!fyE4D39JH0Iw>IC-0CM`}TDwx)=2NrHFjLq7 zJE5_eIh_#v`TBB90@tW-X|V1N>WfPT+~8r8C9&&rxS65sFdvn&AlVUm8)CgZUsr#; z9P1Y11+VN5VHiPBU2Rz!1Y=qd?lla>y_nJj;9YiCp7*o!=ggttfmE3=za_dKVj`qS zIP`9g4Yn_-#MWBj&HU;MMGk8N0Wlr1zOqkdO}ygq!tfH;4wq3K&ZytR8C)zm#CSI3 zG%d+jLEGsRAVv*6=xl!@tHOUF?>Q)T5o^_cq~gJZoOwlFzze4*O#c6g5^yRk9#PGo z0;aFrn`*~x0ts=%Qa#&Kz;pG*6K{n#~a{8NzO@Uqe5vP@07*Ni2%J84uwkuY{dA$u7GK_-f$@^fpfy55!|!P4Vs5 zU%DFvDz~rWmj$%WFutYBF7Q9rh?NoR((6!dkAZR_pcrHo+L7J+F9yVqodG&R9T(bL zZ_`UW*A#x5V^v{|(F=WmPeZ>Obyp~dX+^FYSpCp!B}jLoylX-jP(fiwqn;Y(CkZ5h zVGmplhNf%rjdf=mMf-%BGCJl(vY=gXM>ggRklEqrcGdEDxa0;jE)W@`kQ8Niew--m zmfG_xUC!Flz+Y99^N!ND?>uJqS81hWl`qlp4H{vm(Us9{Y?9Q(jfi%>L!>Im@P%AtA0v^T~1}m9JsB69k#^xVkLTofg@Iv z3T_4>mi&9#g4z#zo(O%V5L?woeY_K^)@L#zRWR-&Bsb*N=Jv7dcH4Mz)9s6$`=2U# zaK8Q9bPn(F;r%zs%DfB$LJdU5SKAvi2e)${L?&X`g&vwMD=9zDenZz{at$Q=jnuCg zXuP$I22!stoVPaShZ~#nlTs_nbjmSp>^0ka!x{!GwUu_nef9Qm-(eJ;$yDtCloRjW zfx4Q(`N>yEci6P4F&!S|U*4*E%|b%33BZhnS32dzJ;UVyiDJ_}i>UElB>N+nF<#FX z%slISxbY#?>qkL#3wxF)4`aZPH>wut0*tiE==A_t^`h6&wK%`%_g1h>q2BHnG45|) z$ljx05xaQbYs=FsVx!i?EBhWngDBpNe~A6O-+m@bwVi#F$KUGRM{j%UFWDOy-GHk# z0DX$^MB2-}0sdhj!bIQp;#Bq09#L1Yj;X9T z@m2=p#nO@|IFbK^;{6l)Y5jMvZG@)z{|)Ke+eJapZl{3Wq^R!$+UWvFPM5&Qch&jE zUS|`%+CoF6t!0h8-2B|l zAuVpe_T3AOeR8Zym0Wm#?CfDY{<8Nn6Nqsg!dfRSV2KmXE!VzUiG)iqd0kN{rJuOa zwVW6*e7>zE?@h164{!pMIph-2^CU~48&%g3S*8|(DlGqz>foz1-r6XWt|KsSxRMu- zn>Z|0F)t_F32ytMy2y%+maN?oDOf5`@SFa2F;UI3bf=DegKSy(5kVZZSTDeHr5$^yia&n{{+Wp zA88VM8IbnnH9HLghx1ZIQu}#)TI!WcG{#-ghP8t@)2Gz#$$tLT>i$^^R!7^n;04}JNMw*Nrwxz}pNsYHPY7}OqloYAv=2ez5>OvN~AU%;)pJS{3N zKw%CY=5hI5a+)c%8|pn!J9UUINQZ|H1NuNlr^=B_q=@BN>42?lTkl?#s2bPD-(5I0 zm1z`Lbp;aj$j<#&IL{#rcRr2)!snpv3Vn;-wZCV%*{#s}^>D+fF~EMVmqL(>ZdK{? zCe+YGrw37uV_M4S@rZ#|XGFJhc)we>@=x-CR(+T$Kis?*nTra&K|zr%+e97K0+AJ* z9#>(RDh$q>v4L@Wxg6I%7n?_ED!vgkW&8QQjdqnzNohUyqYf2Wt)Bl`n4isn)C}$` zEG{?&*zzpj8Tq}}^OxVyt07Z4H4SlXQm)6W?Mo~-`VFS@t3F~$MGE_o+(uVd>Fo=| z*m2eN`AKa7l?7`>Rl*K94)pcR=&kiKgfjN4A6C^_OmBS5{Bl>8!`+*&**PiJV4qu= z-jOCq6j(?!0_^k_A=OkfDnIKc$5)}iv@55Nj0?8ws@%L~Z zJy6aNPQ-O=WKrZp958af8c4B6ME(?*4k2?XrMMdx^LpgEjUaVTTXl;x$40}R>^(>1V7M6_Q+YJiI*{zRPfY0 z!FEiLGj@h9GO4~Y*CQ_Qem=t_?Dg{uEkIfNK%LXZDJ*2Tol?zsNyq`SI-KY$q-x_K z0M&+%uYVbpbZ>b0My=+{rIrs8Uk%KdJ#W#{g(85s*(KJLY9{ z8poh|!O8lLs3UJGCpw5CU}>526hN^7spT%g1Pg@FMQw&n>reZ$O#K)f6!v4ERzLH>+jMsS3xMtXr+V zC!5`(;bkcrmL+`DOU&leoyczsrvS}yVo)hjccPrAvkV3aHP59}q)ajS>gsRbpC6iF zJq29m6DKhK3cSz=0wb>Ay$RP4V%Wik?gaKgfPLlMeMCO z&0_7+!%#sK-jHx%e!cyJMkzL^Z`$vJaq(yPU_cRvXpfzVxLjxl;>(xvh9ch2m4$0X zp4C5xpDv#is(}mE+qkobJDM?8)OE%@^H7^ym=7a$3hL61OKb@m>S8bFl?v5*w;k6S zg+P4G2V}1>zXtYI416f!Fm8WlsS)F~}Q$Ki}sFU+lHx#>4Uk zO7S(lS2YC3#R$_@g>#Bt`s3Zvi*O%v;%0 znXWm96^~3BvR7q;_+6k6Ko+CWvkg7NsD|Dz*0-?WFa#BYvPs1z`{EqMj`Za7oF3EM zu~UF+OD~Hz?n+0!^vM8})!+gsPG8bYD}T*cPNO-m&EOR96nXge6p+`11`^L@k^6w` zQPGp*PnxFy9q%p>-jLk?!bJyxZz8fKrY0;tL$FDt3DCZU+FA0cHAvhB1PwYVhOAY< zwNC*Ik4^#06Tx;x_kYA3)N$<@6Ui`49o-4nFg+42wMY&CI*8OixpDq)F%DD&DfY)x2sYh=q^>u);l~Qku{ME@v+(wStV53Rw zgMSSv|3R4l>l>b2Ym7=^Ylg%wFu!SU0Y?dG60g_(ZG=8@?JAg>zV1vh_&>;))`rIbwV(fvr}P;mF!;?yEBDGlaW0UMx3* z;q>4zrGhMXB+TA%q0%zom&&bM=y$}a71%2RXU&lTiP z6v<2hLViYeD}R<7|4rl43@yg5bBDFA3&`Yu?=Pnm&nr6xH~^3E8T8vL;nrCEaFHLP`-)~S~yfQqWjPg0~Ustnnzs6?j(52#$MbJf6#1T z`J`I6wI(o)I(~&xJtX$Xevup@u1o9DB8Ngiz{lU+t(cI&K{<&kKVK>JPh4Q4D5MU0 z!Udu@1)#5x10h1i-;=z-Zmd5h?ph4AW1o>Gk^6TMbYxxo+D`OK!e2+awjW>8pdvr~ zkDw$~J^7&M`e^TsOOmULf2jMzwevI8yJxkV4!F4X$RNV+ZcJ7+5UP?Cn-qEqz`n{j z1tg9R3Zu7k=Z<4zkNp2wN!He+Z42@&q1S=L-y9(8pY!)5&%=?OU>zL$k@prYLQ)H1T!*}ZGr`nIONFj9F%hYoO-Ql zj)@6WhTH+auZ5u5PTrpa){5~!@>KK+iQg6+-4R9!4Hp+zW+4Ai3Hp8SCOZED{#Vcc zHZFhZdF1Gw-*^12&wsJ3E3?~Zi8i?T{$9zXNC>x-PotUxce+BB5Ul%ya`i;H|B*YT z%1+|-5PCJXaCPI+#hGqW$hnhC`;TJYRm4u1<^0FuiTtbHzRj!Sa{8O@g8q8!10 z_5Vpv{1d1O{hQrlcD9Zn@hsPN`}}=}^y3jRKXP#S3eNfuq$S87h)r*Slg7ViU)!1e zk;vx!2OII9=vx1+I?Aa3AT7BDLr9v3iwB>P&x`*Kz?!a~0``FQWS)~DF_O$>vivLD z+PD~(!-J5b#T8Evr>}x;3qJKgBq>OYnWF+(HF=*=`U#|%6e)X~~@g`25IsG*| zaK`P^yz>4d-eEH;7MtFjP~t;^BWx;L0pBS!6o$p{>GVgFcuSwxTv(Oy71balhl{SM zc*=p+VFChfGGCZfRpI5*BASA?wZhmHgt+c{(p6O)TRMRQAIX)tv7tjaPZAO5SZhGf zNFvY;2sh%vuPk%i-FMETjpEvFIWh{L2#pX^yIq(YOG26X)JVUZ+nHIxGjFeJuDw^H zTEAmr7w^DZs>55qfY#wOd-l&L^nYvnqFHSH-87B76f&v=jmi1Y-i?g4Ff3kNV_Vk{ zNof6~u+5N=K$P?Il7f>>v5%Q6Qmx*&%|p}s$I&Pab)szaD{&o5qQSfCS5m_wn{Pun zl9q%*V)Cj=bh#9DO5{C=LipvS$|O53<1=sx>u8G)L!U8#g2N9EFTclBjaKc8ttZ>A zB8+j?hfmrXa1ID#yn1o}QckSjvQMP0{D~Lc`Gvi+o%8VwbYg9uj0DiuFu~MiQCRvw zn#p3^5sU2kNXNFB&D9F|7}j{QMQ&nQ!B^(GOB8qp`gNwq`BxLSB=x;DyR7CxsfKuI&tMH2C&j6mNr@^^N*O zn3q3aifEtn$j)s|G3KDy?`pvWozF|q2w~RA{F)>id?aS@>en?AsE%4;b!9}X8YO4W zC1v0+!tA7QLwf5@oKw>%R0BVbDe3_}wd(I|B${1K3Vw%=!mbO8HGQb^;r{iZ*qQm} z!?ONm(7fRh0O`0U}ZMrwL zPd-wmW#{TVbg{6o@{fF6fzb1HVcF+4`CMWP-f^2Ui|2do4iTkg4p3u#*V>N;ZcCzv zWBGr@)izAlO7fAB*U$MbUgyi12Hsz_&>9*=pc&fp;e9N=z zSlb1=eok*)3r|NWU1Ft}yZ5!Sq43z@5jE{Xe5GOZol##&Bj~M{$wHIy1M;^tJ1Jfy zKZlpSvBvmE)JmThvM|9tihlUcNV1b_1r;yJnIgvJV?H1CP$Y`#x#rLXs&{#%)*F8Q znSs#R`Pt$7xT67I!S|!`@FDEdvb*w-mnA=4FPIiyX_;X#im=YzKotZv+`H$>W+*%R zE3w!9mg;gLz*oq*GD$-O-#5HWx9y@B?u0M*MFZ)a0~x=Rq8HEfKbl6Re&rbezYd6N zO?X|k*U>RxF%fbWmk}Kzw4R=XF=n$|8<9QJ>s4k~Agy|VZ}ro{{E)i$-DMc(GE9hu zOACC)#h)K^IAQhXMW*8OX$Z=|GivrTAU$=E&b@=;Tumqi*(SzY3iG%*+!)K~2{hdZ zEIW_s8U;JJ$mI8JU22ymdML&0yqE5$y5ajEjFp zsr&Spj9kp?8n#Bq3lhneNM4PLGaF>0QcjZZA{{>OOYp639T(Dg?#4x_^gAC`L!4(` zxC`)gE*+b$I+#2@43zv#4l&tc4~n|lqt4*-`&1rloSP&n7Y>bX1coz}mN?cw%eX(- zSkvMpQ6f2= zYq+aLCF>~Nx(Dx}>xtENv-jo-K3HEK;_6DR#Ujt>-F8iXg@bf9Pd@``jT7(XF2eNh zS>LLkc#wi9J{Q&TP5k}}>r3Tz-Q1FKlY6LN&-QbeT(*(ruJtPFvCmnia-TfE&alKX zbZuDd=X(+tE<0>FL6M-{-8>o95u%j_cY69vM7 zavr0GpwgCss4i{(_!>}m)ylr$boU}~KiKrJ6Cdq_%juu;w#G}|OiKzSiuts+=y&!X zCUgn&TXb9k0ssLW<+~`@oz%m(_@584T4OH2P_;`UQ>`B2Qd;w0^@D8Hs*V|WS!Z@= z)pzJc>%uRH#GDNmw_J`@U@U%n|58{7IrUUHZ80i;aQ9PRL3tiXhs)YDsCa+2zdGSj zo2AWc-wI6t<(tryYn_i-^($|8q#HiVPf=R9YassV`XfL+?JYV4V@ezE$!iT^1PDKm z&U&dK!l!_Ye=>K=H*Ko&e8ipl7dJYuU$T5)rnLm{dz~mt6KvsIyaugV>_XPURO)lR z^PI1ujpl!5Wi`=mi8_@woio!5p>g8#mfYuCFtOt)3K=^tuPDiFk0bGJhA?2Kn*7qO zLK=dDs2AH1&!#ZcM@!6}6p!PRH*Jjjz;Pi8WMa#PPXvjj%s*L7le_h zje%LRtxVEtCztvT-E+){x=cpO~p4q!wkbi#w8qfcz9GgDge=L*r9(L@znrS;bv+may z)THPswm1Ezv(0pUxbU3#A(PNN7rGu&n_m46-*9b==z$%u#L8q;&Xp%-mDxc%pCovm z0yZ$O#P}j#6W?A|OWsWWwurEa(8mwHLugFYHa%Xns7n87IC-;)%reC~HZaxNUu(%J zK^xx;z2!)U)bWI}kfdgbx29Wd-QKx;IG>Crkh3X!CsFquJ~o=Xy6#HH_%zfq+u%@X z*EO}RgVW+GIq7Y;$Cv{T?jrFvjO@QLajWp% zymE2xs4?ZaTyY(KHyd+MZuW?6cD=0QB}Uow`;G25?{(?KV}G=o60W&k#@F1#?uMyR zyhmJbA7wmWqF%Vg?N>i0JHK-D!3mKYa3NnU%X(qZSct$=oR{ck{^O&Kb~10%fMa&6F=rR+>vwh!jmYLT`I_C~&()1E#eXayo^2SdI2cgZtR2)kkSd&TbNiZpc&_oD zi~vNG{n~Ijd~qupZ4IF&QEa*gUg*L?%JJvW_JVna<)`u9FQr1IxSKTdxme1DvudvJTMc1%!XBb+sf7*9 z>jr%stE1ZM59T>n zRuSkP(<$P#ldevouw8ri%cUD%UB1mT5adEeP)^%I*j<7c%&hbZldArF;B^xo(8k~ubx}zi#PA}SevG6`j4L;pc9<=naDVmJ2DOw^N9hsA4R1f`eY-4IIZ*=?S z6$a0vb-m=NZze{Iy)QYANc6UI!6fnCBaJR7oxItP(_MVv?XKosVqHopW+`UE`Q+lK zQ-ED>0BCOz`$Q5&q9RI7kl2U{5uIcoCn;|GabdcuWLBKFVBDkri_`@z9+K&{b1W^o zv~uvAZw-fL&~IesrQO+y?>O;nG3#Or<1vTy*)J+?q1i5^;g%qeqwf;_1B>xz0GS5i zE@Jb$wFFzMFPJaKD1lkYmngpN3Ndy)smHRPzGyNRu+yuZfvxP$45N_|;!|ysu3$6^ zMIDPk|Mc_q47b9LM^y_^~_M3FlXPV7X--NO%9u;dyhonHr5D(Uy8RL(Fxn z`?BjWYB}u(j-X3>0#SP+M{Vn~t;iylIh&x18{2QEJZ)IuWF?{TBY+Gt-rtdNF zMY3PU0^;k@t05lV-OSZi7*?!rlxRQECW27K4dz6-AC%HRR{0A>l}X2Z@8Ii))DU1@qd%>wos1nUUHJa!uAxv5wIe%$==k9j((!$PJKwJ=MSU z3+o3BTVR{0z31yVmHE@Eb@J)@`{^51839AT5$iW~ zX|Df>k$e*}YgJus7IX!ia$QI0wbIEH!>3R7vX_d87d)N`E8+Z~DnMd1)W+&QUg>Hs z3+yFRJaSK-4SVxySbzC$Awr9qwMe8FKU7X_D^!;7eu^FLTBMl^S+u)ao6*HjonD2v zn>LN*8C$w2G;7QTi@fuaJ*DRr-B)t7UlJ{S=O>1)e=#mwVv@vWhq&83c2I;piGMP0 z$>eqkbq1glwV_G)e7Wg0*dbzo@fd1=SC+{jT!%+sye7x=eT`U!cwLZ>&gNZiR($n+ z_lN4`2ItMGCL|Yv0;iFH6S#}3>>B2^#%mF*&(@Vom{cid25;q)>?rB(qzZ3c$>eUGta`P8A_*tXFSZ=JYdp!T`;rbZuH~_^4BZv&CadElU}H!8QYl%E?8FrcaI44)=)3UoJ{mk_}4c9W0eQk4oqax(@F z0bcGYM$Utxw9c>N`}6_q1&)mExCrzqAd<*LkRWQ2!y&M4D@b(6EPCu0^xIW7tD~)I z;yYt$U5w&Yh9O0e)jqkYIn0mS@wCVmdKmfV;Uw? z$bJxIZyxs(#ki#_KQHOGUPH}|#(*Oy{Fpo=ojl&$=Au7yCRfA8z9eLXCu2eKBtC@E z$QP+*@vFHlT3}Ik@uT_W8(E#CL*?O0h5gz^hKH$#-X4*QsQ2eJ*vXPP5G=NSxbId6 zXASQ)_zVx$_}A|oe1f+eIyK_-I>FHk6i~K)H*loKK4EHtCHY|!LDuuG-glO)2H!ca z#Dg1Fn{%$@vfA!4%(RZIsoa{zh+)UC*I}nh0uQZjq7u^{Coc@8+^ptFdF<}r-9EIX zFLNuPQVCbo_2!00r1>!@4D#s2qAUC2*jGg>NQ}tDepmp@#N)E_DM;0?&tD_HNvNxM zK_3urU|ExJ)}1OpB7N7GQRe0DQ$H^)Klzb=aqEkx=3(Wl46=+kx`rNjy>U9bOd!-loW1o5~xwF-|n{@PIw87!r-Y=2t9b(6J^5=M|!Pj3N!% zUBUZfcG(sX()bv<$W~>HVPAXusIT-{k8~6opJHvk(855W%PV;QiB_1cM~87}28jB= zLug?R|8BSU2|oXJ^zEQ>EgHi}-#FMSK6-Ph-j{B78D(X?GQ}se->#G$s0Wgf-gx8e zy4@pvy{=sXM_tm10820odJeX@_dF?b@lXN=I=;$(Jk{(_y)lYEW1_>Q+xfd}nZNMH z{?&c=t{^`Rc#sX1ZQ`cY2Qp70Ien4zarS7O%8DFrp~E!%;gtY?d~Lur(EFU)!$>{3UUJ#MtOQI?}Y(#PeJe#>p;@A}?a^bJHx7JPj z_D%abM={o=g~wb~B5mq_f-IeXZE?kbYNVVXh5%2H6E)67-^NKQ^e497^OAVhs^cq5 zY0=WCdZ_rW_pJV!K#0mTnS~XEi;Xo9beo^MlwC(oHbLsOO@a&d9+$lhidkW_e(YTM za)~*cV#(|aJu|(S2v+AdNP~bJMa8Zbe$B=)`HlPy>2@2oG6K=po02WPYq=25HR*xl8wu8j z_ITq87!)<#4&1-!5qFe`88tcQ)>$Je!>#>X1FNvM>F_vRYtRkD9S*r*{Z!cUri1&! z%YNRn0D(Dw4d-bR>l{KKo6zhEtyL$LqUUeuJ4X9(8wz!Mm_MVGHJGni6|>D*wlw+zR$V{ErqAD;7Q(h*CU-lBXDqr@ zRwHW_6^E6_r3+fct!Mc>9Ah442yST((Y@t5u!u&0^5A;iRcrh*X|Tezs(d}6gfcCq zG;7sRlg-qlNmY+15?3wOgD7eON8S?VLw!C;ncA3|y%#4tqpNIhJTdX4Hm!yA$0umr z`E+?86Z`@U$)f<|#pnI(vGT#KM+L#9=GDdq)vi{{+-3P3J~TrYF&LP!?d2fRb7I&H zVNnAMg0NMOatL;_<^jT2-p*dvd*%I+R`Yq}GjMQ!e3)HEUdRyH`BqbXN6ulKflKJc zP7lQ{m8E0MT||}R(-GAu-D?tnm0t>zGk8VqafD$pJZjdkF4|^7OuyoQCO%+XGOm@4=MMCI^(xb0U99`r+|*Q zVbb~Xp!Z){wGTIw-AG*cTNv|i)WTQBR?~F})o*`(_!je{ciUbw z1x_~;miVF#h_^t40eHVM}YEv0cM`qvFCxB?qA@= z(QVCRRE;2J)ss62L)Y{`_40zxQW@U*>#I+%c~8XS_I~wxB5S9jF)aq|P9Fs{KQ`3k z0&VMFDE_*3Q4vzK#5-xO8t+PwWr@rKnjiH7( z-W?iel@sbqfU?;_jq`)EXO6I6XY-oOUp$Nu5mFM+(~KoB-NQ!3VT3I7CJvb&lS^#{{52!;a4!9zhhr>za)IDFC;aCK?Xj9QsB760AZq%!n)M7BG%(uhHux%p!RPmJQ2;hFaU&lSOT8axGVQlUC=GhY`~cmTe3O z%!}>uoHJ*f&Md^0QDDn<$QT&)N5yG8iVF+Zy)zd=jY( zwME#9Bww^2b_M0u+RS7X99H`)p4X^tFxzgNs0#1ZxZQ+qws_hKT&^U$;^^~nn0Y&C z4inNqP0Of97VEAPMFpmbq-Un1fqje?eJzxyPWt(1Kp_hD4#(+azHY1st@zcO4au^_1T8_NhW zAg?VMBP}y~Z(L|fx3W>4N^t^%@7 z1c9kJSvfb3&0D7pPclhdYt0FDCrP35Ox6P~+|G{@KU0a7e0tD-` VyoS(94Q7bHvy3QPzj(r^spo3ajpLn~R&DgWWN)+dLQ?s9eREqH zJW4W7V9nh5kttCso?dM8T4r0y>`tsZ1Q+ zzC?t7Ns`~$+}pgvF*czup<1vHqC=5+vybGFUxGY3+_zPB^g)pt?WPC$6Y_@<-^*PjUS-L z0Ny$C6>=Mhb)zA_{Q=%XQqb6U{{ebSy$mqfbu4~>;tPI&LLZSo!@ixH1{SDRF>=R< zum7X+KgZ+0c07jTba-2BS;B=aACI?HWTbS#^Z)qW6r+ggM<>V_Hmv*gIMph{F$pR-}vVsFq2G@3{i3(`TWqTEPNiuFiv#66R#d` zRCROd4)NO42luUwIr%**uHkZhPd-Xaj49AaQhck%Vkq{rUHs2_i2kNz?__*?Zs*Dr zz+7$w6w90d&9DDtLdLwOD=$36l_)G*i)A0#%Ir7;2Or&#&H~ub7O3hep;VfKnnrNv z5Fp$o+Ji7TvP6jCU1n=-3*ZsPBhr4@JWIf^dwo36aW{l(;xs?R+C(*crB*jJ(Z>7W zLW9T(fL~~a56BrC-LGYI@S?&uv9k@L@5T#JygcCK87Sc0WW@$tJ+5wHhO>8ZWrl9= ztxq*;+)C&h#=QIH*En8+;$-e)x&16^heE)IbrwE_vkwG9O=u0FHcW;nrg+t}yt#T_ zd?5Q z_GwN@uHFa%%9cURHI-5bU%()QsfB|WaimI+3mdvUvhj^rEm!4amg5s*#&?ZW8ip8q z7~L8z2klF*TZ+7_BoRRnXW#E5>XZ^;W@9Fu?q(f1mg2cH(VM}YChMa9o!SN&&cTgT zhOZ$vt>oIaC&6tHUfU6k7uE8QC8d*1x4#`tX}LTujeIKfzPtXZC@q{i(~IMZE{n(ttr`uHl5$Pw;rd)H zsx6)7_B#yUg{I-n$1VoMmbC1d&*JmW3tI0{XFMUI*cKaaT&h^>Azj2$=7LFZHlBk} zl6w3ifLS){;OM?q;M8}42l5`RJjWhA=7ja_sj>N`1o0xr>oxWHr7JVG0%VTinpyP0 z+pc;Ess1}J^dDYbaTF;_F0@#`ZkPA=(Z~H-y&S^TYTpJ!1M->iI8DBbKo!E>d1;hg zr~di_56Zxh-M*b??~kTx_GePpEcT;`h)il6v1oEJs5SnnpBJ>;a_e|;{c&df4r$iT z4pt9HeEf9|d_^D$H#=bTBpN>)lzELYpXmPiVuoK=ZPNXM&XdZPj-Q|28*rFa1Zc3R z5M{>9E^DEYg+WhYBh%^LlCD}?$1Guq4M3nx%>NR_u)ehg9x*Ef@CI`+leSg2cZrJlET2a02 z=Gn+FoPf>oywtW@#)3kTXf)t}eVPX!@-`{`00n76iGWw-0jD8{x`L7OgJ7~q?C0JH z9P;nqLlG@!fw=YURL2}qI_L)|;tkV3^dS@gruwUZgK{PylX&EMxQKaga;B8@0DpE> z?pBKo{)jmg*zOY>AsxL@(d46q3*YfN%1%7yg^9qglWOf^jsK8&b2LTAC5(@cQZ4P>H5) zo@qo-dw?u}(m_cN6~H|#%Ez2gf;9Kr1zrxG3n36`+)quBH`7K&7K!){Zo_8}K^Ty?*cq|}HV z2KTB)%|5H(EVhOa1QQlMT~|0}H>esZ$vH}9GZZU8vI@T9k~X;n?n0fMhQES)OR_y0 zbec+mRvI5(RsJe)dE)irX?3OIfL*ilA+)%-1lnb@%vG_!vP1g0xvRKO%NJb*lZ2R{ zREcNxY97|S#1t%q-HF}TIDElH$Ygl)>hh$SwT4&)uFOb_pOe+2%Ub%2A~@e4(|P?< zOt{@^|7^WFO~dC2eP`50=bZ;&&~ehugicuXQYfIb^~_P3tpRVIJ%@4;M> zd8y4MC+cYI9kH(~6&a`~=U?LJleBNVPpB|yh`n%i^P-{B2YwEo)LUB5Yt^2HwnCyO zeY#Pg74u?i2RC@g&U>p_2R1p1+2Lg>R)KO*to&XX%9>a`pGm*2zjD|U>1lPAN>~tS*+AS(H$F+y_uVa;*dZBi(@YDl9 z#6Zq>E^Ijjr-w3YdS?>I8jdKKP-QIceyTXw(EMy)M6?U)t(OL8;XV_tNr#`VMp z-s+Ou_l)AX>q}o$EUCzy{#XW5bf@-++Cx_7cQe{!X*#ZLDN1~ijPT;l6#P_n=7H(` z2-%D!Z`D(Fn)jU#UHcs~ydEAg^;8d7_Ou`S?smZ#o5V}j7|_T9hol1&hA|9KvH+O)tG{s35nqqIZVD#g(H+(@HRmM}>#9W_Q5hp3ji$WJ$8fGr5&AB$o zIeb!IWOOGo9*OOJlHGD8YD$Of%r{0)o!Gx}hhpWVzgadsL31Q4T+GWtbZk=D!tX`w zw|&9qX5u=I!dcIOoYr5uIk0R)aXWp-OJpPW!dYa`*W{y5q@u6?ZZT|Ncl^`8{Vzj1 ze~^ZJ*Xj?0ZU95FxtZ%bw~-F0)m7qM$(^6L0kyh&r53P2U&hQH?*52ErSF7`qmjpf z_^QHveb=QPiTN}+*BR#BMh>BfM_}-A;bO(+rYa<~$+G~)6(keF z9)~*XkT3@n8rru+XKqO37)SHT*y*39ql=RItZhJ)n_mhA#1t~*2pO45-%@KM7i*IB zViwdM>9s+UG7N%Gs?XMo_vb2v?Ldh_!7agZ8_8IC&wjxrCv;Uyp%9}$t@=~B@A-{3 zXc%^COuht)@?qONat5l<>J1g>;L$n#jCM{HQ*M3Jh@M!u7^Z=a=6t%pQ_&cJZ->y# zIhlXj4n-K^bVK*APgSL6ps-4hCdQ{5{gDL_^xVoj&`ka!rXOzBx-rZEzrf%i@d}E7Ip?DfBc3fLl3NTd=^eyG%836V6#ZoV? zvL<0!rqS+k@^H`BTK+um1pN^UNN{M)kh!+>y;NGp&E90yfYnLO z_Zv>lM290mqAT_w`W(SKdqJ0#2tdk=W2>8k52C6K3uFSN`kr#?mzTE+WxTc(Y&0c< zSK>(anF~)c4#>>3Rgb<2hJJ)q`lnxfdh_H|W5BZFR<~4_Xc%7UYE_}k$)7sH@?=v!oDCaPV zPHifKA0;6<(Jg{G2l<`ssj4gQR0ky|UeKS5J>BTz+8g6z%UhUg(iTmxLHtxox*h1Z zM4(3Jb>&X0?wqeVqkgy0XsS;3?t>)$$;hbh`TIUVHqhb-$Zw8MEJz*;vQ7l_1V9EC zj)Of<8Q!VZtY&1}(XKMROCR#^l~e-iRV-+^9BqV00trVfhU$?rbWr*z{g$>X*BX0j zQqI^}ent1`ybn}Aotu2aL`mJ_?gYd3xBGKrQSCDdt=Dgl#jn)JzBB6*pN<%M>E0RN zrqcfz89JC|gRsqAxwVF9Ej_?mnj`mJ&@juZ`b`!WB*yP;e>l*YkbQa=dA~!@@M+m%vkt3J#k7>UjR81g?xkzel#~b-J{c5oa<$& z_*Wnsp@FGObEle0YA z6VsHmyzftJ-EsOVZ&`9J$|d4@GFS(_KIGO7e^cVN_N=InxBOoEPR%`d$H(Oy zpHvi`#MO^TM{op}Mpyp?6x$zNbuF)IV4Gx=rMvhs&3wvkUPGM{6*rO`b8VreeN@^B z3H7#G`9XXnADnyW{0hwp3VhYuHL7tR+_1%maQ0txmT>+>)U1UoF)$5(fP#LHUc zJ-S~r$m-|D$M87D-KP-rwK8E$Ugj-60c%0JG_bWHKtM$x9y}H>QyyAKzquLT6->t_ zJ9)|yq`534R483V383XeM2Yl2KtF}H%@Kt|e}KZX$sFi;$zx^+4RSsULKbY!i-^k* z2hd$6cy@9)I0%E+2Sf?ZYLKL0t*}D@WhS74_!E?*+AD|QG2e+)0&%l+e^kO?SLje>&ZwZDa zpf{5Okg-$J?ld4VO|A-{IG9K&=HK7efa@#ujT7d7)XpX#|2!XcWpyFBsQ%sTgnmLNBK zk#=(-02(0_MV4`#+=_N7nKdH<%2ZREy$@qJKA17RpAsoCT`cFk26|1QN8%opXyGDB z9A{zihp*1zmoNqCBBQ(o;+TPf3J8ZD_fogi>%-Io$si=n6jnVS*Oy%w{S!RXI@z4= z!o;Gho?_*P&r+jriGFz1skUtbAFt|~xUDWBrc^fjVq+^PS|D0K%f_8Mcs=I$l5nHs zE6e!+5Q!GAOS&2meuztB)0XsFiK$QO{qBv+osSul7(Xv?41A1r;qLmCZmc zdHID1p_>#6O3Z6twFN2)c8{FP-2PKcC~3_koRe69~a?-r((p1{PCQjzC%Rp zn^|~AC*<8|9PW{wzxs|}q`%`?2`-0#zQ;nwc0EitsVM_FVnUkfJ1|r&fj92$$V) zs%O)@;nX4E_ICAI{-q3R`BlRYc{8CNy~;9hp^hIQurjblYsp*#&eHOWV>yGc4YqZ< z=Bl_d{lMq1ZE#9*a-|Z<*mX$cTums-u*+C#?a(K%s*&4Yu>Cum=Dm3Mpvf!2%ck4T zQXkr^TPfCIEXrr)I$hZi3Zu#T7H6Y z7cj(|2$OxOyg99lS<_RnDVpgl&3+TZfrfcu+@lURpQ(ZvN)7y-p!v+HngMBE!74m8yA`#=;z`{O#@w$uBY17^mj+Gq zh1BT+P}YW?&=x5R0Ps%}U8ZPQT7!~J_Ix)faE5WGd!;<A< ziF0cuMG~ix5Ok&(SqW}~o`fgwd*Sz%n%|DYll2HP*|4>SkhhNoG8VL6RbKK)`c6Ge zb>@I#K6Sn2B!Y`z(zdBA$!HBP>*(Y=RT=R}tZG%dHJL7^?WLCB#$`^sHlC2>sdNBZ zX%C1BF%2TY2qj{%TnLF(`kfYI^im4z%xL=3HG7MvX*Jq$uWjo2UstQ@>VT5>1xF>( z%FJu#Ix~CS#jXJC??|HpQx~f_WbxCyMt}J?TwIKZWLPf-*sUDrK=2PV33>3|QC&Jm zRGrN0{6}u0u`n>fZNtH(DV-LOy-X92k*dKvp@N8`6Z_i}gl%C;pnm`-R zTAkobu~wx=d`cMiDXcX11_2vTAOPNA9zbq90*3Zw^|#F)5<;6yAbSBo|FbP1V)nvv zTN;EGj_NMABnFNdp$y1Vs5_Plxi;rjvd8he3I3$Z z*=0)|!rkCcM0R3h_i6HgGB;s4ru4S!8z69OUbASwvn^!MT<-Il0n*HKvrj0qNn>mU z%}5eLf=N#Zp`8>3B_9^(=`En;-%T&SGE(21SmbRo@xSSNms z&w22w&eu;qXZfK*7ftx-GiDs+`LgQ^@xV}Ho zJ@y${8{9F6U>Rx>I-p0?T`vJc}lS4S&mN!j7~8`n@qc zEoT65jhvm=F=~PF;sOT0RL&XXFouu1DJ=R~Dp3h;+vPr2$-YV9L&(OC%*HLGAvZF^ zGtM;WaCaVB<{Il(b+d54yb+~#sqa$v)z~W_p*D+j`vFgVOQ9ZcR4TYtH^jF;8llGT z9|?p>-UFF{>O#+%8ys6(ofO23XM-GbQFA;^oOt?n$puL+r!%Wbc1?|Nl{q&biE9xY zJs-FHX>iYB5^E5^g?ogYeDFO3B1Hlbk&=gv#p_;;1l|pMOwygcC+Z4YG{m%A4LlM{ z-ALf?8X!3k`?(zXt~KdQ3Wd&z+hZe*zg`FjQ@dX_>p!=Q;a`dP=5Q~uKJT7L`kCEQ z^o9_AThG=52Y8DtB+cRJt)zo4uFcNeDa;n`!EL|O0r{oQ=#86raTZ}fY%o?yKl}&i zOY!uhvXMLVZNNfM0V1wlGkt+dkT~YJ)$7ikB$;Y+Cu{wR-iH>p)LF z?KneX0OpDlP`#h4`2pgBV#CPP$N`xFA;|Mpy{zq>VStQy~E-k_8NMd;rR8<08wyIXUb7S8^kKa4Wz{zo(j z);d9L2%R?e2tD60AS0S4fBG;w9;7?Wbx8d$UlU^472kvFX-#3<#lpHSHF;%q1kN%M zBO?RXpds8NlkeSLQ&#)M&xlTtPXMimc!|wrALi#yJ0v5)%0lLO>zTCc4`>BAzHa8U zn&IYm>aKBic~&oF^0b^-f$&tFvT3>EpnPFyZj(zcTr5C)=E^((RgdDWrxbn7BJ}r6|?3EJjbV*@IFnYZY3kIcxAtPBkumR`JC=mZR>B1BLG?w%T{oj}RZP!*dVN(fZRqIe)9Ot38GFprQa}MEV7MO~M-o zJ)m(S!8oVkVWQL1^D*OceJy}WBaB zu(W9xw$8T1Pc^yG6k0NT&4!njT3WIJM4o4woh+dwU<&HBIeL z%NHt)4O`nDpUVCC>|3hnrxqsQlf?8w_|D+6Zb%RL41fXQlM7Lu^%c^v@rB&$^3{>v zDtiXN%xwOw7{aAc{spyU{dzGm|A+t1B&K~NjsFejD!x>jP;~kM_s-aaA z8kfIg2j9%_X4%%LA6#3nGhq`!lz-iF(sA-w=OS+Jx8qy#@sd5}KAZgW>-3Sh{2sn^ zoIl$~Y_-+Za{}XSd=D6_mQgPmUW@yOTk9VEgm4;ErX||zw2W4PPdg>CA4EC{?X$fc zYd_g0wky2ys=rY28RJu-8zZ^|k2Y@%(|y9XS;3+Ec*}AP_lCt}x|$ZtsG=xKJjFw! z<+s;lcI-O=lcpOm{$<7WTqDr~pTyxs`__1dkzD^Ir(O5q>7Il+8l@a`EJ=e9q&g~b z|2Uj}rNlc?kFCS#I`hH3E1L$YEJ6-22+<$ zb|gj#vliQaUifmU*F3WdrC>ZLE7|GsRMpnE9_bOAUQSIrB@UjT+b6RU?^b{qGH+-G z7*Nb-sO$k/(Q$Jff*IbJuFlQEzVZ>WIJ)I7Ob73=o-)SB=lAiDH?c;OjygkDbJ zIc6Heh@0(6iXuB&E5e|x#FvF_R1ja;*iPF-P~2+9OVCRbZhyk{!9E;yA|5|J{h@G; z>!;LqD~Y*F7qmF(;?k$Wt*`X}aamtJb>bFanGAoCBcxn*om|#z!K-0H$k&LD)UIbq zO7#|LFT=2C&q5CIkw)L>5>tDHJGg0}WXVZj4UJNi%XOO6X`fm3&CM|xO{O2zv{PB> zSStbJtPRUnj5CD4?1{7M>B@E()Fky{rAFmXXM;aPKY`+Ppl7VtGT-H)-mV#NdVy)z zOlEmf#&?gxr)&;It_$z$TU(3B#wUAZr49A~s_|-1xL*Fc3;lmt3;i!_ZTycaKV^bR z&K0l;=>99zg7AqkEUGi*emQ~V*?r!tCi-EIrzq*-3q+QWre0;j-umt^X%v)=Etqtq z(=04M+r!WhGrGJOX-sRKoC;{$+S(ZMiDssvqD(i1`EwCIc-wBS<Hw$i~LD`&?W168}FGFY|+$oT(st%2B4^~=Bxh8zWCc`nv8%ivloV& zF5>074?m-PE`}7^=4r`P51Y<#N8JnY8;k0gca5?ZC*I5_-O!CZjLr{V8sr}p`{r`J zS7u07@xC*?%pS8@OFG;GV;Fctt!WGW+=5C#+I#0x^ffhM;AG~v!wrtDmI=i^|sqi zpl*xkWPlPvG8xC-cI?;geULdZHWn*tJ{_;36&IAZ!SFmTP=jB7Q!N+mN3k0k21qVu zoWN@n*CZ#obqR7&C8~4QaD7)7*sTWDqui>Hi+zQlL^Fz+9XvH0>fpi$NEpKaGK+os zV4$?_w&)Rx7MgB zA0+JHN)3rlK#RIuLISz<(1aXuYn+%kf9z^_oKHRj4;;E(S2y6Jg4tb^1=nXAk2s{m4G1&sJmD(rTub;Xz z(K9fh^)pNTzTQXM;e&pw*eO)AuD@A$}R?u9`%hNTneW$iG;zppi^?* z@!lYc)j;(@r!8@Ajd%Dhlx?m<&A`O!=;e#m)lD~PSv243)0Bh2?M+Vm`p2*PTcOmm z{bi1nozT-HjVE4`% z7(w*VHyX;S)8u>^&smG1q|Ed^Ol?Z*%%tBdwb)MrL{!c>-FW-xP-Zi@w<$+=sV_av zub7i# z^>x(l`)>A5a2U|e07bxc0h$A5idueHDD5ZSq_j6H#>u)2Lsd8O>gxP#I5J;}6gu1R zow|m029!ROeCKd;ApsYON*l)NwbxC;HY9SS{NMEpSlSJDW^+ zy;3uKu8D}mHlC_RfhC$9+|IW5bElLh=5eRW#F>CHK(-{KW_x zGqoprZ~dB=;K;)0!{^_YVxLCxzI#ho)cEXskUP`m%QWRgGR&`1|mK{|M^hr4w4$sc$sU5oiZoV-y9-|E+|3o=sV@Yll5Ct z1w+RD+hpntOy7K0F$O)$ju3Z(-KBSK2+~pzO zowG4}S5Nax0O3BRpH#m9#oUG3*6q`e%)y&9`73iwVoh33^Zp(>uhwL*kEE})**`rc z$rD?({B@UIL__|ZEpIdTf_*!TV+P*e4s;W5;zi~n+ud>wQQGH*lLW6Q^A03Yo0e7`tx9aA+N_5>6fQblMy9@_dKk-_ zDVwzdhjf5xW`XvQoP?tr+G*JZS~y0-jVl6M?=-}oSSa4$cz}B4$yZ&9bB#Wjy?z{{ z2z{FUnI^Kyxn`Cl+$izZY@bk^N9$4XPG3J2jqyVB&3zx@y;6jD1!35{&%AQoTBK>T zG~mOfC+Cu`vC7%0b4%5zykGtF#QMx}nxY1F4y9A5_0(~WQOqlHo7urzRQ_u2-mq{~ zDc?sbRlBL(47%6$g2|%qIC~LK2-ImBrS;l*&FJ&q zk;|w0cPk0;nz##j#2X8eV7Tl4GyHxs{&P>4aR$TK#ujeW^OHd@0YC-ck5XhmlQJCJ zUmZ&tKR)T()9o6pEHKjKfXxgGplB?wA%ujoB)Lzlr3IJe+VP+GqEK-0l)OzVpWCVx z_*peF+_$C%))l}5}p-(dko^YmbAiw4JgU6=o#g7LY-%iWa2LU+ms_Cor{=_ zuLL^N9C(hEcyE)lQRv6!Q>&KHX9(7N_;oClB;oJqH+KfU1Ete0oSw6sz4N8&;q>it z0q%$kMOo);8td-s3O4KF@&orEVRJN17T9Qk)`pD2`=k3yUwbv`-esmd6JSzewD^Kx zK;c7m@DCfprR`NB)iIhDA{`s;oT~BHA!$^o_R{X1I!|R0{8m07-o!_d(3v6DOjyb$ zKuvOsiDkcgpGYJ>%z;G)wti-`3)dm$6aJt~}%9}asTrv3?5ztISN6AQrDW4<;&zn|F9eZ>@9RjT+@20$s zS)(1gcS40}P)=%f+@F5iZk!=AD%r5($P_^xEEsn_Q3iQM#+{XL0>}Thr}-%|CV(HV zyCB!o&%K%nWSOyoE%^N42Po|!X>qo}z0I%n9h_VCP6+0|M7?%EBZMO?qW zVJ_41`Z4LETv+MW{nXJVQ_kD87u)(S+@dhh;QSuxeut*j3WD{@tH$yQbxOhWTQM9i zb`jdoi`Lji=J^Ef+L0lULbI>vy`%l~ zWJTMD*Rg5+H+SZNGNJq~lougS%qL7rf*%D)d?86~zX% z=b;meW8+-bX|CC;H8pHu1Cx`T<((xPix-}O#5xmN*R*67pd5hRUYf!shbjvbxpfEk z*3)z@STP@H3_p%8yYCT3Ge^+S-?WJO+u} zBbLwWC}WwZ-M$#LnKe`Oz0_B8{F%x9b2v(2l0-S%lt!lQxnN1UPR#cLll&|Zx5n&V z#01m#1S%DyiG{dzSTKeW0l3-SMxkm$OHC9FSBo;mq4T=cNG|&g3jESGHgh9wUO`ER z(<}PU#9rejNj`pc(yh+h2E`S(me;glJHc(Z_7}0h)W?Q^vN@_x@*wTIQExs#?}jA7 zw&iw;U;ZnpFP4Av?|;ij94$$~EZWrq@`T=G&(=|ZD^5WK*w-hE0loD+eC=^~66Vm# zUk&p6Ca`)mYx%zi)1r;gBF71s#nAvp0IZmjja+4rS^M)2*S}`&{*RYWar4jkL3g=+ z=@wN8C~|$LnL>_TB&z~#U&VmKhOjPSAQ+(BMn@h9IGNlLAQ-4QE62{|UHr(~YPFVs zOP6xbn1;ubKgc>!4TQLe1xNFV5WzTlXb6PwrG@3kS=~_bd6A7FuVILn;N#e7TTYrl z|0}uVIAf-w-kYKfAvEVu=34>hNY{ZV3r05!cLK=qdCghY4BGbaw3itl!AAEco>O_t z?64K$p*W$9%&WTi-lt?qgrs2|QNgrss>C6@pUsEr{$1X<5MPF~$!OIi!O7d*cMr2y z%td?19OMxUkb{TS#PJmhg*54NPC-LRTI}PJSmf4Aj}Yh<8|#Kx?wwQTj>fV%FNx;e zD+ea7-d~sy{{q|QS?+YDG6$rz_!_ZJQ{koJGETP)8ap#Q2&TeCrH*DLR@VWCz5TaD^Dl1j-#uf=LI@YU zTw5`90|6CYB7Nsd(vWt;g04zznN{V6=!eoKEdpl0!0R2+HESAZs>aZBmWb;d1t3>f z>QIWu5g#*?`LpZ!-p1nH(oKhEUS=(NLAjw0^tz!9)#yOd#XL-mvdCKY!xXF^W!+oo z&g9pg)1A#)_h~7I#1vo?sQtTjMB5S=;wNHgu~1Zt9?Ye@nfY z-iB?TDq^#Rg8Z?C_Bh@1DHi`VXDoGOA7V&gD-_BsSY!5auv5e-ZgS|aT31*bnGY6_ zV#2z90`j!WVxn@^z`|q&x?L2K)^AmMZ4(Kw!?n!MsnfRy(NR#Rbn4xCNH>a!{AE(D zDuL*}z=B_n*|agZHBkw!oSXNOk{uB$lVu8y_RqX4vkBHEC`bC|IF3G9SESDFUX{FF1UQSaJjc!sf~)O0w!r? zM$Fu&92FR>HuVCtlKYddBoE0HB<45b3m<-{t)on?`S2Z9J9h4v7mGgYz<1@#CJ;Ru zRF6uh<<2mEeVL{C+VArA=CDcaof1dU>?D0R>6NUA@z{SceZOSm{_L3&V7}uqTEx;^ z2k+}uwWoXX9lZ}r7@e`Y>GY3P+?h<2)Vx{rwLbz=Chxa7mA}3QogGU3zPuo`$yf}> zb)5xz)w8=mB1lgqx3;}Hzkfr6)!`2Luk6fkd>-$9$~SfUmRn#`NP-`W@?{~C^l}dv zOrjeEPd3Tt>ax}28Vf$#M$BH}qGyB7C04`@ODGWz+%I%TguOLVKT*(1IS9yfxgBSC zLMQrLOS4ypw2Pg5U*sKFj%~}=k4SPSC*2%>5L^6Ocuyc9uUyWe<=Y9^AIy{@om0(fv6O9H%(ON%B+* z+d-Q$1z}MNU-_t^YSKC0ZzILhe&>)54e3IoY8G>sH%5vpV@mB8L^)3%)7n9E{Y%xH zD_GeLo?buzXgT^I9P*f3Hw>shKg|a$QZE$y1OX_PsCa>J5Bcw(4lN5fTEb+)dl)I zy+pRp|D51aA6kHt_#A$KNbW6zS*ViXEuJ`K0K6L9M!=Ax^E;75MF2iS{0hO>Da!$R z(N#p$4-ic%@&||qT~(01E~5}Cpc{$&!{us_Bz|TTeREL;@PutknLvU~bW=q5@Y5aa zKkwj|?_fe%lH_IA39VFbfz_sZKPSfR5O;CGwJ)SUTz)V}=5xsS=>=brM6RF_iI#6xdWX!u)Y5 zWdCu2CGID(_&>~l@^85$$iIB?f3leW)F5b#I{GmEH@XF&Vf+~!`ZPlA`1OVt>a63~ z_1PjXe*?;=-cvbJJ$@ot3DYM*U>h38L&1MqnqN)(|It>?qrza-8l9M6WuEH?K=Rps z6R9ysg`Z3-^hy)rdr-zK(=M-AL7DHn@|O7gNE05{6b;!&hN{^^#OK1Av>#VU)|=j} z;G19_0qn3gW^r8b2dE+x*f<8qi`Oh%i3gF#ZKOi~`}uA=KM^OP6X1n|?ixy*T$jv1 zlboR9HVsKBy!1l4U(T3r3a78#Q<|C~m>x-rsEY*$DYMq%=K`zfq2Nb&&)TZn?9Mh# zG46x%TOYNQt49Y+Z{4}G`7*HvC@rz@Q{c3kh9e)MZ+-7tHOco?!?)O0?uzUVK&;$h({juNh4thd?tYbevcO0H7 zM~>~VF8&W)JIkLpvbzEFW%z4j;!I$tj`goSc84uq`o?b1^=G{R{y}BeKRUqx>%#1B zI7@S*UuzqlqYJ;Gk^hyP@uz{uoxa-{-(7Rbsd=#u4C z`0D>39x)LYAaI<$L82s#-68rf9CP714!xVXz)bKAGfLAnxodoO*=NW_HUVq3vm8Bo zB0Yb&b(8Z1Tb$&^0HL_G0rjHEgrM7Ma;UQq1X#a|Ss$o^UZx5$+X)!HzxV#|;WNdv zKm|0bB!D5HZ)ute>n-5B050`;UyV9-hoUE(UFZi@{djlT9!YQbZi}J;NW3i{? zr*7dqj&D*p5)b&W?;G8nMksCz;3tUhHfW(8tVJ;;X4*sxxLv$JG{f@-YzfF`k-2u~U60u$Pav@xK)lfdcx53?L?Dh)i)_s#>Q5pG zy`a5b(f2@V8?8(Ygc|qpLSz>2Op-h5zuOUe#eaaDpn-A*VoyjgyjZ+cdloU~W5PE+ zJD)>vR27)jBDXuTbak=`eDI*6A;u1>n)8L`69yo2f0w1jY}^|F0;3v{!=x#)(!vkW z6Qsy@Co&z7VH9kWJC3>zB`0;k45pAhO|UKSR~6;~wZG*iDeml&p<*RLib>C2=5 zGi1|TZ`uDAW%<9e+W+eN|8o_~)mRKT{=ua0Ka|h@t$`a&{MGWw)TQM9^;`%yg5Q1` z!Kd4m-7%8^YX5z6K~{3#^$bZZ1<|J@klDZJQ`<|r_4OwJ8rZ9y2b%#i2SaKhn82@$QPTHALcIj-y;tc%)JlTRm_j2 zUCDmF<*r*`lB-LXR4J8sr>FmPKu61okvt1Dk7Ix!F}dLJ>=>W$=)-h5pUzvxmz8S9 zTyw~Mm{Xnt^tSk^4n45HvK%yx()z8MSJ9K0_kAg^g-F7%5X`r8Z~(1xQV$8RTHAwv zMy{yRgkbPSGyq!AYKGOaxZLU=`Xq~cS)2EhO#IsbE`ZMP9E&Eg0)WF#U>VV|W6{T_ z_<*q-(8Q)%z%c-2D?3AqkeY?$nVcG=A*{*&kOyBM>NAa z;4_Ba>r|8v1gt1?f$tQ#?*;N=8NcaqNDHGQ`-yE`d_L*IVIXQ|7Vf>{M5}ig#EtqI zaZq&Wn%Y+22Jr|7*+?dM7~**V{xPbXew!5zx15@l3{~_BA8%qVDd2B5n#Ej+p?l_6 z?U&r)I@PF&GOxmic0(CvNj!w87i!@Bl0YlL?+p)Uzad`79awO@r+77%KPnW}1b&0~ z34i=KbiswM-*WIVyK_=)1?}mhsBiq045yz`*V_k2weT{ss@E!gz7!9SH%)&K-D>A0 z&j3CbG;O#(Un+s?`2i{~9hzA|7b4V&4BY;+;cn+Mjq#IbceR|CQ{GtTg3mEcSNfOE zL-`&-&m#Eq;F%p(6q2uPc+U0zr5~81Ub}o4Io4<&{JmPF<=qu}(`0 z-_e+Rx?!aNNJs-{uT9yc<(u8bMP>7Dh5gFuY`q%&16v`AFMhrfbNP-j^#DyaO${?B zV_~{o)wjYoxiqPtm?U@744?`1=E@kz+NoQX?DIka)xZ*QbV)u?YFhfa)MWE>sp(zZ z@aY@Htc0-*!-ZpAtVxWb{*gQ}V@n%N)X3sO2cfsP#h+%qA3yUT!<^>3^7CiyZ~Dip zu^+BAUM+r-6(OSj&uaGHyvJW0ntx9<`{!u=KkLEzYp%g7_tzYR&SOD`|Ha;$$3y+@ z{o^B*Eu@HSQ&B|7z8eyfkjhRfStn#43?o9;389FwW+(f;WKGs=Bl`?xXNHVnmcH+E z?)%*5oX_WT?%(;H`}=tOzTf+I|5129xaN9q*Y$e6p3m3wY1$3p-bO9uZ?$1;P9%)M z^e{oGD!PS0i{Q2I6Rv8OHy@9@F`Q>D32B|MXJilpc|b8c-+1ey_T6G2Yz}wP{8#Zw zSk3Dz-g#Lh6%o}JXHY~Aw+N){%{z1PKULX{9)A1s|9lqab^KSp0zS1~4f}y=%=g^( zO{A=~tNMC^%=%_wKiPvvv$LBULk;Wvwm-y8()TEdNJx(vm_*G;`@^v zr>nk7S6i+Ho&XFZ84CX=&-hJU?f>A7NW4ZqK4+nS?CBZ#}}apB45}FoU!PX&5fp@E=)H*;$yuPnaa{f~BR7GRIlHH`C-tMA@)b_(UZ`Gqp~C51ZZ;1nM_be) zeQKqge9oOQ^mLbe;V{D($3;IS`NUY~hDMJU*$%(g!B0CX!!vg;V9&i=U0h~PWT?Y3 zdgJAEN!N|wq-n0FwV<^G=s@f7)mkx<2|RF;ds1#3+cdV7(*By8Rk+DOs{pw4 zn3;C=a-wQ`V{LI1!VBAC9ZZwNCINq(ij;Z8RI8*+x9X zKzLwoUED(`%7=)~748p7p80*}vYdUkX-bC&^gvTR8Y1l;%DFK3H2We z^9;n(8lm@@W&)k}3|>bE*fCagr5tyO($aZc{WzF*#Izn@zgjMlz_E$2U!d;P`B26B zucq1wXw8}oHeubEil)PuZ+6!@Ro!2Vm`ajZC4usd<TX zO9^fEYCYLD;Vr=mNJg&nh|3n9-@cQEe9KjrK2t1zLjV)PkNZ|TBsFyppC;fOXjtl)9ks{&<*#_ef9E#ee>MOV{$tO#&_ANV z-Cq^Y1Gcx<9e=@$|95=-PjKgRcfUM{3AwHXS0!Bt$2BjU3*sL#7FW5tq`V)+LFKVb zCfyYKsk}@}8MARW?FOF{vs`r8n#h91Ma#7L8VpRWZDg+Qma(+OFsiK+BKYS+MY@hO})&z%{?JR)Dnl zh3J6Ehz4Y#VZ~jI{vw<{;KJ@F`a}w`(WHkC#Ouu9OZUk?+f^ixy)15YsjJ^aH=B{nzI{Elql|hR_8E;&v zcNtYMruF1KRneo{Yf<;_N+N&sI(tISHal>3qQa@d_&Y;vUV&2BeLXMn_zEqur6bvy zl_)g);tDa4#7~y1Y|>Auo1`;+UpH^-Enp?S^E}}KS@bjFnH&m`9eP1!TC2Q1u6AKV z((<971M|6ewAauc0grFI$XGU=A;{fzsx=`bR|uY)#FrRpadl)1cNFjigBW)!?lr#& z;A5Xm5dv=g%kjVAyC(fL2KqA-_m603|MIcFVY_;l{!O)Y?L{e&!qVa`sZ6FopO-$y zjdWHF@w4$L3bI&(*!7wR=n555XaFcHFUVw_F#IhZ`Ougogy<_F1B-2iv^D_pazQ5? z|K(X`M+qMbZvyo21KN&$I0r(g8ia-YKE&&SKL#$Wk#G^fvjlSD|XE(AI{4arQCP06X>RJ89aapFfn`3+~oNGX<7bF z_WXY~K>G@4(FgoeHOgbuu26@lQj~O+$AxfBBIl#jwy^#4%)_s!+^$mzZ6*z4Bpq?- zw@3WJUbI_`A~`~wURTN+_UJyfAv0KsD|y52m?na?_Q0EQ8NKCQw@pDh=(Q2lyXUhG z9|F+XrV#uLK^mviQ|bZcu zogSX0w15mj-#=z8+CBYrQmQBLeAF9oG~>P)c*(SLm`z)7v^fA1tIMkCR5GLY3zXSV z)~(QQ?_eJ{6b1Cmz1-^r|CDk96TzInKpZ+`p$wo)D=YEVzl-A+XYo{e=B_h>^Zr#3I~=Y|N?d`qj(^mjSb-hU){dEiHf^VhesdNv)Cp znUm{RtBIAaos(-7=f%m^nbSRr5^DL!XoxB0@_58nDGu`K*1|I)GG}2x!7}ii_P8}N zbC)(aDEFR6)Lh&EVR{fR6j_Ztx$5gb)KWOTdZeGdlR+RDu^Sq^G_#83!4jHdlb@9mX4x}6fs6`L zC04pcDdjaWl){xNC}|p1uFaOTq){b%gZgFe?-^Ivh~8rN-8_e}lg$Xy^I5%YvI=zP z7@XT-*V~47_U7Z3Ndc;Y%uiQi);X#7-U1aSBuxym6U+?f!>OfUBp96Bow)ha!70qm znhA)=xyGuBzH5GhXk93jIkrS@a-`SIjq#eh=9#dYcdv7(FM>{jIK=!C3q|5~mChLb zX}KPF{6Ac+-|@QsFX3vPghV#;Wx__g%!v`2W8NE*31a)ox3&dV%D*f|?dn4bJVZJy z!!&dujGsqJvhTWuAMI{TLAhskuo}P@itwinp?=;5l-$0g%uwVu1|B2_l3Yd_tAc(%@n`?l+ zTRu>{Z3S{AfO6j1f^aZAx~wEY*_ZzlfjnC{lQkXSd(5|0h&?TQ%*Yf4?dKx~ zxWuHy<9zSP(67Inpa~uM?0u@feJ8cEv6q@MXmfvr4zLdX?R5YME`NVPZEhN=gO!1J z_3tkOVAc6|7c`I!Aja(guA+Z;5s<&To=HCe#Bi_ecXtW$o9k)%72xUpGxM{C^bAk; zV#?cf1uF4Z8VJbK29U4K%DG*x`#d4q>SkvJ9VR|1ZalmkNYbQ^g3q-rqu`QXgSZ_N z0taoe%rkmE>wYY?uMx!xn=`#FR<>8+!uoj)YqhA4tebnb7ZN1GxEUUQ-aDg9w>+ff z?-m-w2v;NeUDCxMkHZie=_k(xLhgj0N9@#C_u^%D)5zSEOt@iWvp(sLvrpWd3tSRI z3Xm~;d}R|=c-IMnc^7oimy$=lvNHYSB1IHY{9sc@#bDtd5@~E`iJb&7p#PYUeT`p` zk{NVy!{0nraWHTBW8e7%IhNRv7UN)nS(6pAv{0ACks>0pjmQCP`e$_}ihNj|&Teyy zVs?o3Hm_`xC?C|WoZM=}UV>b^pF`9mX%X+7hf9)l4U5=Dhk?fK5Da)~fa@cfQzp}LM{a504={2Nj3ZgF0{?Ih>FZEH$=+*I9JEZ>sE ztq_8%lgK<+VF`&X;MLjCZ8$l9G!;jmK-U?=FC7zrR>=`2->M#)+6QX1xpx0Uto^$M z`~Q-~`nO>B-~RkB5Nj_gEaOC%z>(nU{8)se+u8M?V?q&(pzhy zFvBuO`}d4Ybks0^*4n#kZDVP6sb#RsWVK7!$x2udN2j`m=DcgsVV~-Vd3oxc;G?3G z((h5upyVj$Tb063AARxXP-I^R0qv`jLH$%VVx8Z~Cbk9~B5w0lq9HyaVqx=uFr?&a zCopJ3u&jMLstukqX z9ErSq>?n-jq3kM2)ui{1-C)+f z-G5N~jQ{4%F4I&83NrtdV$SdW_Wws^sA6ZOqz_12*)7VroXPXDHCJ&#ww~zq0Rzy{ z$ca?Zl{TKB1;jBGR@gIp{C;vxq3hJOW(d!^RK-vJTaOU5RxeR?%c)~ZFrCGSu5z|t zpkc)yV`n-z4+e&Kz&LJavCWr(4qEV#wwktH%ay^D%{~=({s<#{STifJ;6+#f^ijB`7@MKtvQa4k5?tkY(w%m;ihP zlaIi716YfI(PlomL@~v4BT94`L3C!F=A_1U8URG-bud*shsqsF0J6-S=VNWS#)w~_ z_yWRxstIp@!+tl%F68%v{mqOP{%u0z0at&4L|N=+kFZgP;NOo26vZ+rS%}qHnH{ot z7N8O3tor5`C@Df}JJ4MVj{JiG5uJB~M%sS5a{E#_fkw++S^h7GFP-mv)#|fbb$&k_ z1XBQUv^kPz2BIXU0>mLV(hl)(B*hShwE_Lt@>pB z3&i4c>6i!gx~>skfseId=uXrO&thG-LmZfetUa>Q1b;HLzHBj^NWOS^T93;>{|UbVAbExVQ6c=d$KVyeK*z^{#t8F|0G_)I{FJw6I=;W#`@6{VKQ;|O2lF1> z*(MlFOV-5_vq>h!r*5EcS5KSG+T4cMpB5%9C?C^#{-Nbbt0JYSETU_8x@obuQ-Cxl zeKUNim%HSeRO$Cb<_yGEW(t1Bm}^h>qP(4_4q&~YDDkO0j6g^&*(xA*nkUsD&p3P3 z9*Wm6wE8i)Vd)Lp172j5Kqm{bTZr9S=DYzY8nQFKHf0e^6t+(*+c*N0Ca$1t-BfnJ z^g6$A^q24v%A^%cJ%R#qXIpl-rKRu&wOCAxiQUAK=7q{}v1x|O8VB#BJ8kRf1V&q3 zhoTo+gA8FA# z>p76x-c;wWFnw(<{r36$?d2t|ANSl{4wD1#=a2z4&j!4wDG}=A_ndldw3IsG?K(MB zQ4;XvStR|mfe+lI&h&tH$YszkI+Vi4 zaNC$`h{b%H)0f_FCXjS?x6sWoVkI`TR9f~3>M>_l(E7R5d@_4g*Y32HrZ4@NP)r|F zgmnj3r?}2}x=_PQABuC=t#nT;%fdk^{MJNddK=h?_%0qFsJz~glfH~}Qc zukXLm(Q>K7kSJW&CF)R78{oKg9-Le!i{aX%n`u-=+eN6YgR4v4SZ5zY zq>nNWv_nt)x`Dc)T2gF!i@DB-Cm`=+O_G9xak(DOV)?6Q%(aiSlDVNX1+#P~?(axa z7W-6AEyi2^0-Xe;&ylonAIr~Vp`z-8NlkgBz*(^q1A4FJ*&lydeo_DQ>5?M0^$u}D zJ_<}AjBa_QNw8WupLR0(5axy*mL{npyt*nyq@Eb?3J*X0HiI-GTH+wsSk#uD9Uk2M zk*K3%@-Z2Gw*uF%667y)r=m&w`&-m?o#bI0>+Np_rj-_FZ5t*LE{0*;%n`4)%W7uA z09z-0r5~&nw=O66NP#K}X<_{L6Xs5yG!#&4!hp|Oz{WZnbzWUN^Yr6{*rk>_PKHWD z5v@~UDHS3by)AHM5+{)pF!0{Gi1##}sO}Hc-qUM(Yo7Lo1*j_VDxXkspgn&edyv=C z>{E-^I0a>w6uudIlHXn@>7 z-IEQ>dU!lVv`T3NQ1uu1XCdH!>i7QIT=)NpYV|)gM}YJAufy3QqG<)sw!YWbCtV1+ zEFoBV-B^oOFu@-re9U66Ed~%m=Is0h`bN^=u8Sk6HFL--(5+8?oYm+L<%LhbgqiIb(Taf=C(!$Z2a zj4cPw9hL4)uYIi8_-@hhKzR6(y6R+W+>oD2s_1AqKiiroo!k%<=syP*n(faPFq=+p zoruzkQ2sPKUY?ezf)mJwzOv*f(HxgQaZCTr2KNsbZhuC*PSKuXi(hrj4pt#yIX4e* zXl%8siuqzJk9krL^LTQHay)etzZH)-$??O&ce8WJFw|v*3`Fo{#3z$#rXAEPZd)!F z8-~VB53}jGzK|ETOuvqs+{Vu?*plqNO(@*uPfYNw4=tEj`~HVt%g|o()9?)$!*!|r5arO%D%^X0 zi+~*Zg$%(|2KLuV%raKpf3r_5?BmqYlX`Bb_A?+LxbWwvdl22W)Q#9ELMV-v%>Bg(_(R=UnS$&n z&41?kS2F`1IcESAo$?J3ZDYn`1e6j@~ZASFPZLC5wr>Q?P)p zy>I5dmAl4Yx*|aDpJps4KHLYW1p>K1>}o>;R5Yhvq5^TuOT>L@7wP(BYrn))&@|Vll}$C9cXjD$74JP`03(s#{GI7I zRs=BHs@2H=}lsE&aBKa);jzFosp zq;?di+C>|{BI=%V!n(X1)Um>H_L@<<@iD>Tu%Z1PZ3F3PWC)^?H6)0Rn7Y^H51)>! zD;wZe*r{#iWC-EAJPcCr0z z>Gxsm-LawK^t_}OhrtS;H$jT+r4r$7mGZ&VGvph!c;E1DO5R*+I^}(BxrmGbW0;f8 z_t{Z7DFgMsRXUC>FQY!5`Aqoyj%_2n)20&kGSXHt&2D%O_5ATyE`B;JY_EB6caRn^=3Qc&2HQkzo@3Zh0K4 z$|e-z$YTj6-db#^9p8l2On4n0_hrjclzbmj0vOIqv0jLI|CK02y1r_2yk?`tW2}q+ z5_&%%&E0X&8T~znjrMh`S-ZkkeXTOcuiQ2}<(gZ7Gp9-uMq(3>SqNsZ;ntBGZ$F+A0d z;F+;@J%0557Tu!g79(-sP8|eycR1qQTK_ar-Z<6J&hLpahz; z4*d6b#iu{7b^q$I|JI%B9HtK@uKiS%+A$gze^U9eI&!7HA-zL~-I+bl1hUq5jr1_Kw5T$FJt6j$=GvnM^?Pd7V)tIi<|0 z5o>T{pW{$?xp~k{+I&{9cXgkpXG8F=4#qh&saSdRN-G~ z2^f4#T=>Kd?m{xI1CCf%(pXBf*V(wr#2n6_Qaac2B&@1ooJhjMxaYx&mlP7fXl@aNEu zfuW|c!u3tkdCKd+u6nEEKcNI%eb ziWYKR2pwme&dm-MT{{XaOSE;fvvuJ}Pj7~@%i%8ET2jBx@!&a#FANS{vXsW{P#Ld@ zr^8zySfP*(!PxeIjjKU+{JEOn0ie&DkOVIWHp=6;Qc7KC8#DRA$d<*1H4$3eTly9Y zZcgoP^YuGTkbJ#B7%+MBy-)Vi)3OqO7mhRxifxU09k-oQwkChoFK^xC>)iCLwnwG* z`gy};Ehp1v4`6A$bF6viA+d=Pf$#5VV?k7^07ZZ~xfLVo7|CEk;0x=npC!xY=0?4( zDzpD(cqqvb)@FsX*|Ns<>~$(>Op>)cvmTl`5#Dw9jmf+!`vuy(;#*glI^^-)lv}l_ z8^RNg!rR^?24Ds`sb^w676Wrk9Jk4@ssjuUH$ z%@vOahL7#eHEokB<_ z@yTxz*GgBjk+d+ZI~t{#4+#A7*7^nk9liYMvSy6#Jw z3QCbUWe}4=(l9@y7*voYTj1)HR#JC?=m_Q zJ={hkhQP=X=kT6b!J^AyZs?AqX9Lo&T(qw+A{y<5>Tsb6k~SK~-E{>aQNqPQf%iLA z7=Cjs2ym>x(?##w8`+|~D%Wq4wtF8L&V_WQ%tzz1NpyL29C6MF~zl-OVc=vdb za1JM;9oQ)O{&o!^7T!lkt&D%^HTo}q{ckQ3iU)6TVq-N4rNlP*PhggL zT_@Z|ryB=4ZXarYVZJZ@>zez={hl>d$BOY*uo!52H15&l~sUPtIsL4L_qJ8eoXYT;ra(Li#?Et--K6Yoo zPtlb|HMp!qRl-TMCI0>QFF=NRXLR$YwH(@TYzx?(Ya$nRcqI9hL3d5LJlN$SO}SNI(S||%Wwx!oIIf;{9P14QK4|QS8utSE2BoA`J}{zZ z{sPW-YC%te5+P20LM@FDxww0IH#bAQ zd$u*>lvv{x`g z;nT@_@YJwjZJkYYoqJcYr{~h(1HCJXkE%Wzq0|}3gYFAVKNonM!PT6-Ow4pW{x%V^ zoh7~Q*OWel625Jh5ZGvYdc3-z;XLXJvkZ{y?1y=FKLoy65?+pwHE?NNYS(c4aZLqQ zPC+htlSsFG$?;9skoZbVcu120*X!pXAr)R2coSe-&}*SL$4YfdU_9n^Nk3I@Tl;oZ zI$|%6Z>4`Qx2Gyaw?(|fqSLt>xK#S z&8WPGDeq60w}5MNZ98K?vR8pw-$h$88G zF0}=ltx!e&{0mpTNWn*!nxR&dA{EIG#H#xv)#f(X?FucU9y(Z4fh3*$U>w2+WwAboI z)Kn9tviaii6m;!0U9_wr{nWsS6|Bi{l-otp+VYf%y2oebkG%f8F^7=O1~*wZK!PtU zMI*@-KybWkYZK0WhUdbIk156Mr5?0}D)qixq`W0d_zyd)fLS91Ec0HMCtoy^RB@G# zU4Tnvco_JVWX~mN%9CC2DIL_~M0*v4pZDd1@bhBd^-5)rf4KF-?t%C5807o2L+%;U z-Op2OYK2~Gd5ft~BD(YeJ>Bk)(LMTkD*lph697X~sfOdOw7in%LFl1G&4O+KKIjlK z^5fO@Pm3Oeh-EE_hXx4Kuq^5fX+MX4OQxyb}LU~>bj^h zm?ce7$o!&XIuj+Y)#^ILYxjhjlYA!fd6^#8w~NAx0f)_4qf@_^PQ$hy)mFY7boW$= zU|=_jGe@fU9}6(maVavNx3-$Gocp9YSjeOz`T0Cn@aaj=@>75GS$r`jE)j3rDaWmr zRn(VlGqcXN;KUu)RFFJRkPVEBnpI>RPEyr?SbUwIrjNiZ^t^5ME7Q-OBSN|}&P>>b zZi(`@?Ap>D|Ze&T!B&PM$HA1+nU=DZWji#;( zT@^TIRsZQw+uv3ZTLEXnD`QzhgT$OrOhFeBc2N20(wKurNTk<WjliO3@BOayT~{FDhh&>q<-tqJ&4yjC~QOR`(`wZCEhBXNt-?Bh(x1Fv3%LA!yU zw|vy1$JJC|YW)}{@}2M_YrPKWncEIkF*0wR&Z)3JsTUcLV&M9;(F2RMCG_JSA_MJr za$rSnr{=80N`l0qbr72=@voS^4;k*Tr+11#UM|Y|9dz%!tV03y4qHg%v@xNMr9~~P zK(1)daw_^(e12;RUMYj*4=EQpluKVGoFk3T#u1NT98{=XW7W3l8%yh)n zAeyWV#2K}#umFrsn2ino#8|JR^^*`LuRU?`JlmI1T)RvSn-CS#fi`Zcqy=0XAMr`~ ze6uXpjeh+#Z0B};+;CT;{UF3OUp1XC!rOY9mfZ?Pu=H9mgZ_ zDB-e@8$dr0i|7((hxOSdhfa!NN9sC9$eQLSUoOg4ddD6wK4YkKm`_{hhI7g$pf78U z=Oiq27)G)b;&nbA_vB!?WJ&<^zW=jDqug_(7LliC?qZhav-+g-e zRpuiQ9f)Si6cyPWXq^Y0olb{1!WRGwa}6SZ_^PVh2a1PV!+)Y-kyYsyk>sCvL+aT& z2w6Yy>I#rUsq#>dWc+p!!$7l3YpnY(kQ3xP8L`L{XftqFPUR29=}`S)`9Q!lgQ8wR z68%{}fe+WPD6)_%f|z=Ae>joqFHiaDxVH>c17ZOJBR8ZPwM&KIXxFI@g9zMDa0p~? ztQ$f!m<42c0qfY2AbJ4%H)ID$fyw~h+=U*^fTQ-lw(aETpoGXj-AJonDL|z(1^){a z3^v_U12|q0nPfI#^b7!7@90v(y^Tga$N;qL{6!J$gEiv(@eMhK4AX7m@!mo|@s| z>cYX-9Fp!YkOWX82dS0+fP+bj)G?b%#AY34h9cD7h=_I>%2LeOyW2A2YTs19ys4e| z=rFTvA>hX^o{W*;H_qwP004uV;Q97n9= zM%FHf56?n3C#G{a+n z+|cd)XjR5}Rk`7A2nvQRJ$do+Pgsp+p9UaPgSg4AcyOm7S2r$YDWNx) z*a(?hoxj>MsND5Bj^pbvgPLRmi&P*Hq90LG+9wd=zK+ax%#Qkb>PB-!x6P^j7E3|~ zP9n}_bas_nE!NN0Yf$|1nJ#QYfEEAtQk5iQk(Akc5Zyhe;F>*yeAk6q)2$OOi>mSm0Jx$Cp4TaW6&u~V zqx2y+;=0qS&Z&Y5@U7a6T!5Z*s_Vg;cS&F{E1Ny`rr9a^orZzd6!?Hjk&gT=-+0?kDvg% zB$#Xh6tyRQf%?W^mej6(8;W>qHj*f2Lr#|uYumh214QN<>%z4A_X!4nqL&cme}SIZ z11n$db#xIr>Gzta|E0NS0Tr6xNtuc;`%={p5c>n;M+kE0R@+9xFVIE9%*JHl?B+TG zirCn8td)j{?kV~rAv33@`Lr~&agBOQo7E=$P5dhJMU0iKA3MWBt3k?6lEIhnCQ(lklFq1P9nxOe4R zK+?6dsvV5`Q3t}E2@JdmzK;RL{pCMZ^LZyjaVzRLbc~7)HdbjcbCRqHY!6OW=a=2( z-!$2FoD+FVM^qp7UvtcU&ASj1#J37zS(;cHc{V-h?Q$c~WK(8W?}ydEOs;!N+=wnI z2jp|?BUJ#-fOG5Czdd#zEwh$lR{g5;WMTeXzk^qNVtU5YJGo_um&+coa`o(_q!K#t zBI+fT8z^I?QRuX*>*G4eP1ZcceP#A<8=qayi~ly<-@X#8(5;IB(6|h4TwABnqL|#L zMO@|d@Y8F9U2*ewx(A|vNOLZKCU1Wp=4vmijJg(#k#z$aRVG{6g*u~4C1*N}KJhjC zpd1t7&T)M7uWKn^+E06Ch)A4+pTgzGHVXh_v#gKITbZv)b*iDB*CmJzNZrgcZXWSH zTm9M-t%A-p0wmOqiEQ0fRVWw4(`^>xX1X%2Hlw#?pwR&{Yj@E2sdf2tlLtcvx~amm z0ttSKV0n0)C|dZ#XuyjB7#$`*yp?n9fS|@Tx-|N%yqy0C>i8}u2wGIkaqpH^NBNg} zT0YU?9U!^{&e9^e2%SLm(Qv!`sWI(#92Mhrm?xy1F$GEX6N;@@{e*4FhywKFc*|pe zl*OTU{se*!%KfAdQ)_mAQ*Kwav8e0I*P^#yl0Zc`)@Q~j9vGb_F>;YK%(Ov>^~<)- z){$)-z@K|yXU8)V&Lz^q>b&Zh|CCO8i8VaqJpSNiZ+NptZfB9>m#9KFIZytH>5@2> z?~jzO0~Rdbn`uZ2L{Us@8%teYXOY$V#~YKl+4a<#!G)yrJ;vvBM3lUH0yoB1^S%Jy zlvoI3E2r)UGK?6DKPGmbteU^U#Gmm||Dx2wAk)VL+=u6{W>0LZi@j(~C4ju3M!4M5i3Pq_I2k-Cm@b<)re_`!RgoCVEkVAC|CU(>7LqZ9h7**w;6A z-SnJYipHgn6w($#)!4?>xcYF^Ksj9%(M#9d99@6RFWwUD zOF_KVea@g<88C^8R^wlND2G!;zm$!ZYa9_TO&+o|PQBHCGZWM_5y}$!+S8hkeKUBO zY>O-J00Jy$n5E5V;@z(TW|ImJbxe`=a#FW)l|J^)10KFh)i&RBFlCYMd2?m4o&kkU zb%dj&+B`(Ttd&*nh}>!$Y{;M+vOB!Zdn)u{la}~{19hOX|8Lea0wSMF%8FJyPn7`b zI2wH2V(c`Sg^HvV@>QT{f4cS9&<|B+Gh%Uv@G|C^1Ki5;Lx9%RMTe|mPaAvFj<%Za z@dZZj@^BPyi4l4DIMjV+ood#XsM~yrDLC-CxjS_%bh`CNU47L0ydoyvK3t0>;Ub_> z^;%<_zMQ36@ygc;PA}cRN*evGtg?61><6dc^8esTq-Th}r<4@PW`GUl40QGAu#+(L z3zX1D5=TrC+KvR#U-uX7A-sVsvuXpvMHb6QmWYBr9;sW&qm`BKPv1C0uT}%x&oLaT z7tEtZPqghxP3d)eWx_fA{+>)N0lnChXZLhTK$YDT#prD0kyxp{n`qEyMY6j3Y|%cU znYK5B33m`C7R2T_FG+lXEiSz=(?y@^{WCE+(YR63o#X3WkNSs7G_@@}y*4}m32eb6 z(E!g@xi!87Vr8bsLL=`r;+J$&c& z?qZE&!Vfn=29D#8UYSUu6_4|KTBEk?@bz#eqH+oJTvfkIxs^I@O9n2b=NhC=9%;Ti0rF79N9+0$7h)|xMA1oy?& z^f>s2=r{6Xyk_S-?L@PD?mF6_vHVq8Yxf6p%busnexN>O9WB~~-e9Fv)wJ*Ttlfxo za;4HG+08&$3_J;RaB!MI4gO@7=z3%PrQutE7{vC`BB-lUgG=0W`Ee81=iX8aq9;DF z0|Fq!P15O1bhnp zsj7xr<4^WV%d(G)mfcc0O@ASTUL}M+U&J${IqAi1iJOC|heeP>-JW;9Ku@cFfjm6X zqcwXE5W9)7($qc&>hen204qKQN~S}5_mEA95P(M-JI_`Rr~!E?yZGKpLy14q?3%(X zxrgiUy}Kp*&B*4q-%95JioR7fsK1f}XFN=n0Ebr~ibnPWUYu;p@9phpC(bCn7`Z)AZw^3ODTP?Lf~*7*WR<5t2CNS^Cp`-SHVyh#Ci1D2t_=23 zSv@{0D7VYoRDF-#$KMCN;y=qZzw{{4cJB^ohT!gizfmySYJ8JY55R|9nnQV1}< zD(c+^$}Wux15#{w>Izw73CT<};7qZ^vn1tehhB3le-iQ2o}&}dSq`9}3s~Iz$4*iy zR+J1T$6B+7w=Jfls_MeZ&#W(ClETRAp6O4vSBy0G^&+hiw{5Jbs{s({ftt1%+jc2u z)*RwFW*cn&t4a>91$wiVk-kN@w=cL)?F&**#wrs6h=U|UgG9LHcyGsW#4}$+2P-D6 zP1>mSVV0W&sH1Szm`*@|KFFEG*%@?pgv|G$H-D8|4wY@DeLcC+yU4ODXG8>a0&1#) zKbw)pt^%oUIxgJam^t*fQ04*H!PI124@&N{!Ha%V^hOrHFZDzJ zoc5*Zb=zk}Na5qIQ}Ql!G4BZ+XMAdck2<$r2flMvAyS9hrTUv)F6`2)O6o!VPV6@h zyU6&St`il!bKYa`&AI#ONo|U{FjM=7(T{H>{v$ze}#0n;NWg<3RCmB z*wc?a+!Itc0v1buW^fS4%K8@EOM0<)zi(13MhD-tobrd_gEEJona-b5WXz3tb}1W%oL5O6XT z!uPl9U?aLzW#Q)7a&~L$&kn8euRew=zq=I21$w4a-DAcU!emQy&TofG=2Hj3{H3SH ztm$P9*>2c3zV5sB;<4TArl0)<4?_#~`Bc|)YHj9hIeWBM?>IbHvE8_FtMY=^qjhb~ ztDj|o<(2+{#HMjp*$5|PSLfnA0`q<}8X)1A+BQ4+FD)=Chm?H@bnV98Bv_FlQv z!nA}AP99!LBhK_tk2ew%ao@e(RE8+aa!Ocr&MKI6KS|zodVkZ+M!qYkIA*T7`w;;deMB>;zSB;TLVjtdr2E&XD&rcn2 zJ_EJb`VjN+aNa!T&Mm1Gx{J>hmJ~Ov$(}k=$#Giop0QsvICjxneT!fgXfY1`U|c2u z+Hzy5+dn8#tgA>*H8-y3mLbHuX<~RNTY4nP`Wc zm$G*p3Nn3i+Cd<8YM)M8f0i|OYhP5aIPyaH`uTk1iKRI8frho$tQ=2YXC^$SvpAF| zgb+_)SYOyMy21l+8+Z8G@rExaTr*p%l0vOBpYw~pUl}y%6H+yFw=r5h__{I8oV3*4 z$TRaaDf^vLapq-n!-`KQ3w7rQw5FDIU#mPTi^#e9xm*G8gBN}Xzf9CDhgIo@Pg4&J zt`~Ja*NsVecRGC9v)u#Q2l(b@FD$Y~1YIXe+_Sfops_YAUR&+D7^vjsl*RO(g-gmI z>yf`#8Myl)eLXIMeD=VmWjsN$jF1evf9y36EBDW!dE5v6&xHai zNMiETmTW+P`~*xBsIMI80F@I-pqx@=Ec6i>)LIX<}$^u&Dv{*a#l(r!m)T5 zNd==(W@C$EvMpDBT*q0`v9&Z8Se!q(?RQmJwe_xsVmWjYvdk6;m~{vOL+iseNXoOl zxP@CpMLf5YK-E|8Xy^h1v`YDUoGj;=eyFs^b4~3p29hzSVn*MjCTz+C_;%S$-`)VW z{A%A&HEt&pAK%)eNaKm^%TEMroH*v$)mg6@GqJ_d+@EhDv~rF;X`Ue4;i7vj-s|({ zES2-mZHOeY&Rc%FTs`uwgQt-`*nR85L3yKasO!N=AgTHfBU;)-V%?%1A_~kzHpPgR zW{Wx8$rw;Rb?Ex}24unp%IeJlC9vTsyj||1Kl6$C3Zeq;qPooHbIVOY_YJGdyq(`v{c5g@{!g`RMgB^G z-b?;5F#2bSsq)LjcO*0NCE|@?&M0_5uK7a=@T0mnbJmIPPhWT?^Af3Evz!91Y3nwO zD*g^XN8~J-o4LN!^-wiQCYs~Rck9QqPm_Y4s&G7!ZeItf)s(9dQ6vqx98m(g0xT!f z@)41Sl49FsZ)R^d2uOA`#2?aa7p^@#&$?NM&&Ot$v~j8_+nAewh6G~TB78axYJ1P$ z_gWhCR#;iSC-vOl_AB)~wFg3{c6N?ui|3PxyhW8le2ulOH)Td{uBo{%`m~>_$Pf1oY z0`K38p{7U&ku-_E4v{bYhQ0<~-{Yu1FHGm2Ai6#+OpA4!)@jGX0i~18c+6*6Yg5a( z&K?Y;TDG{hvFZKWZ_&D<9$g904W<7q*UZ9#4>|V@kA9~70;$X5*~Tmh&8CtP*5@R|?CajxUp2~k&LWljh#KBt^K#*o z3^LS)^=CQVs>JgngVeH*JPovc;U2x$K*QId;pwIZX;E`EM`>SFM)flUNT%lL^464e zms7iQJP0214p(s=uY|3-*KW3RaZ-kb1woe`xr5q(hyg@2k!3R5?oJ5K;kvdZRtE_UV+!OcBDxiLHtcu2O=(v`|U9RhQk5@NlSJ_x^^lFKL#0wps-=kAd#~%cwmAWsX z7B$WK{el47;DIA@(zBztj3;;0-_f6LD|?eKGV+(#4U(28WeVt_L_H*?CLhzHC`n0W z2nhM@9^^UYTDCcUB?h5m|H&XwfqeEC(;bW`c*>A3C0DEi(-u=jB6!{pk7V%R+4T>RpaW+-v9(9IG_V&9^KUU~14 z99c!|(>^i3Noaj_HWcyc)w)eqGEjG<6We&@`6(X8fe#_xRZl555-GOwaC)RXTldoF zuxZ^HBi&le`YhPsAjKUkeHI?AoyAcdX)u90{hc#ljhp@vpEeiFeMRIcExhbB(!PaS zhuZZ@DX~F zAg9^L+qsd?`L6fm>l4&4g3$y^0h+Z>*zuE_qaVloX0^M4I#w0oT|{Z*PB!-bH7_}{ z{g^vak{lgzIoPCG`jcN;xap0F%`%r}9!E^oDfcj?=D(!UCnBF4a^JltKtgawFN>Q&ZP zE&oeFV!RHFJJZq{8VInmhqK^7QSXn;k!lA;I^Re!i10k&_Ct~-xBuI>B^{yoZ}PC->If2U3DQx*CIT2&Aw;whP-xu+3v4>;Tq>K#~jJ zHUooiMUAQV&ellevXXA!rm+9|3_!1Q-~2sOm-wzOfG;q{&-WOCiOV6=dVaq_Y?hNZ zJl%Pas=`e6=FJbK82Ifn_KvS8c&P_s3xLKU+Z^%40C?QD%`1jTuC8uV{HFZJw; zCSP)EUA|R+r<{gk&AYc7mPho35k%KQPO@VHHTgJ)Z(pbx+M~bpnE(Bb{?)uR5k$^I z_qLUc?Lwe+;HBx0&u(i0WNftBIY7{1x&3SzZ!k7fYYM@qSXnD9r!Uv6Xzj#UTo*~5 z9o=v3RcWaJwz1qXO>UC$VA-~m^u;=YZH0rvVqCm@PEz-ar;1T3B2Q&rtwIgWWELP% z$KVdDP+MkN26xPu0#CL6>$C@A+e=UCt#_z28Xx*t6tk>6BdRSZbu&v~&`>5%v@?bb zX-f+jw&*H$IT(>bc~i~STy6e^rKw=TSUeSC+ShmpsZ?E?uY^X2@vH}bo?dIvXlk@t z=QS6xnr)xX&A8U4N7cxGPHwpAM;szX3O*3!H=kUwX6cyLSO(``ReaSs5&ZuB`#zib zc7D$rJQtPUz7i_urlUDCLU76?BpVU@0loUOeW0*Cb$TMbQg33@Il}5X)mWG4hA<$E z35}SCQ_{)O@`-d^XXiz8^4)BCENuLau#kpVhXq%EN|jK7CElG4671LFFACpHYQ#HZ z=erqg)HFEh4WAGtjW+y+aiKP4`=YFSH)AVs!lo1w5eG0PeHdQOuc7&l+SM7S8rvT zSXT2|bg0{cxEMuo&2u%gu4kPc+Pb2ou&Pe4EFLh5JJo+Qff3=)9cn zOqo@NG|6%rvp^=httR3Q-7YVgmTv{ML_7(TH1I9#!1c9I_kHSfAxs0p8HLjuz6`M- zF1ZSu%s@XtRrdm)ywTz|UulDCPv)MBFC1G8@|1U;`ex8^_mg6pT!bubz%g#TX)aF1 z%vewlHxMBQo4ZIf^8IM!T;Qt7(7Zb}?tYR*uL6*SX*+OU>t>SwH9X_WTBc!7osbC@P{)?dAyrVPSA(TtkZ@U_pNW z#_{%?hpPjDd|-PG7-%t70+g%hv0h`iY!621dKo50VjhsEM6>mn6D!^n$+{p}exQULPpox4@ zQR^MIRRx}YwLlk>u9(fGE^+q9{2uwnRWfp=X27t;$oM+nJQLC}`xy`-au_{BGvuz; z-r9zKrJItPS%|O<(%0>IHP)DIxqoLFEk}@R7nG+VhI9psqN!MX9b=9M#!=unS-fmOEDeItsSR(<`y^R9zP3c!F$l`eO!CPru>GdVL@ec^zeDJA5eTd?>3n~xto=AnU8cC2*uA% zAb%}}T|;87&tt410KLw^vqY+AsOrD_UY$Rb;$mc-Ufns;uz7fi&g#h+ZrVD^pW2lz|Y;Nu0E$zufdqAd&>7;9apwK$P1fVE#2B1Eifnc5~%pa?O0UPi=C89CF?ptR4 zf6WI*^ykfJl_1*;A9LQ995{esSlXj_ij=x`Gpw z4+zw(tF|q8#se7Mb7LiM%?s`-jjFbJI1M> z0WXP{#J2=k+dM5vNwf*)37D{WhhHxkgvHUut2~n|{e7MCYoOXR6RWf7TcYb&JxJk#2XHRgHPoTe{s)%= zn8*_5BkyW%?}~)J(n0{eCx*aH;4c6IWkBN3{00H!#-V?+HWqT`ePo8lK;>l=fJ?wg zk;{=Q>lkSY6%-IuPC@?bg2r>hwtm5t$=7DxNPK$OPsr{*9!L{FK%0rU3I3NA!S({X z^=4)6d_ z4`6XQ)cpRjSq_M18V`T~G8^@*g?Rc;v(M$;);A?6t4+-1jOmhr`8^< zdZ#SA&C#_O7@Htt6KJm12?RULqIXqoY!gJ!Hj*&-VcAC*#BdRYA67hr9+NAu?rh*ka(x&mZ6vzzb0Dg z>;R;=rJ-$5&UMq=HYDD7`)s$*1p+Eu67_nI*&F8d$bRd#O96#O0-;uY`!0oj_7~tJ z`*H7|b~jMIasAVM<-(P>k{@2WvIr;YR~%4_HoFl+u-oxK-Ai7{RsmlSSCxGVerwZ5 z>*H#AhU)y+LJ{}!uj5w zdxG;peC-$jPZ{7?Z9awfQ}UT|shO};wH+u5@~r;!uAV%Vb)6yW1`d#xv_h*ADiL5p zZH&4@fn;(Ff8sEwk?P_Tta}EhF`Xo%XTZ`^K`gK}VkA17jz4~0#%@%d zwwhx)WqmPCob?Iz3m~l90>=VsZgmK@FGQ(4(!4T>Ao)Gt;Hx}MhHCY0R zA+74DE!cXzP1>ug(MxP!v)_=f|KQ`9w)Mq3L1P7hDDPQd7Z_iSR>MJ9-Po=k*Sw`o z;eRxM(2B6qXWzPts)mFlzm|Jp)Mu4NMNBOt#vjBIQx~QuaiNUYt45Y9Vjr3KyN)Hi zweJfZO4!Tq520H3GwW+%1*TVJ;`&?Su|w*{IYc?5A^(%vEB@LeT1%tbg5$d5vL?b> zI@hh!9+{_%aJ1l|2NC1`tl`Zn1aP8#)Cm+X*_{I`1m-#wYM~Vit$dzd%a3vV#g(w& z1N!vF{Fo)~h`lpRkvN}^poVZ6CHuM19N^c)Ix1YeHJYkR%0AG%O*%|5Y3uM`bXG?w#6*yLLwW+_%SVTy+{$nV z+;m7UgxcD?U`qsMWm9@>u zef1G)F}v?yez4EB=3S^pO-^9UVnto6jiR#HM#Y{`D|AOc+41r{jB`dc1q+;yLTr{I z+tdO5i+iwkNb>Urb*b;>)tUXCUscq#7aXSf14YVS+)5^ryrLN_4n zv)2pFxLaqg|B_e(47E!YC+x&XJ=g3GV6#)|l9VrRzqDK=a4cPLyeP6KZ~}gyLz0`{ zWX&@WUIkZGrKfBjU7SE%fkD*wT_!bjlsBRG0j{)!-UrH03OGxP-)Ho2$F!NJjLw zv&r5w*d|6D!1a4(efiP|IM^viZTE2pBrXH*zRlO`&!UCrT3vL$G%3J}fy3a)yjFX?QAPIyNf zvC8kPyb)S#Z1b4Mlu6@8KVqdAxN{Kaev`xGXhl`utr@U>UDi-I4(*`MXfr$>w`Pbc7)?+exN z?#tBc;u4E3Y6lbesdAWSs1q}6X^%xuis1CbY~)!(G{pl${S`(6j%H-@UZWCQe@ z$#v@lS<>B+s|;7wo9?K%gOgp)f(uH6St3vLNumUGw_|8$HlwWsKXXPEwMMAb&q2C; zV~YmoSieM9y^(4w2|No!b)BBd-jQCZ_`elT9hv<#vs)6`rr)yo6L^R!qJe(<@Xz8k zD^KEKJh_cjc*^phrrG|rS_`dzP)#I5TA@)e&1feDB>(!vl{`31uiK!1;~lN=NsR1< z5kE+`ALNUyD%iC0$DeeXBALT@R~c}S_>8I$Y`J8-fhapr^D`^c#IocB*J7!I2Evh! z5I+)CS7s&~8}#CAd11Q-79@j`Ha@SfM&nVp&C*V`XiT3JBc)Puo*;_0W; z-#Y{UJMTf@uIhMSOjD0Dsvl&TJUYJ(?Q%)m>5C^zugU{S<^gE zFo>96<@#Eko|mG0@}+yeeJzN2eY4AFEQdF4-3KSe{>xHg5pa`FUp|2r4?e{Gx(#}0V2f(j4q+I|W>r+t5Fz$UXJa7X+%sE?1>p5yRKBr$kz z2Dbis%&>8ph01$TEMtv|``-KC}+JC_MII1N$ zM)#(RmAd<4vP5TOuYJ?mN4*A)HGp6?5;ibL7Y`t@5f`xDWxtf}Ppxj|yPli9_4($H zf%7T7xgNU8D$3jSbYCtRw9KrkAR->XuU#&@#mATLNU~5IxypGSKHV2-K2Tyzk^vhux!Tb-{64(S5ZQN)N!?GFa>h^A9kHn%!z&Wg`k{hZNC z&^deZJworsjd|`Z*f5Ee(5OwUFValO2k3U#Ze0?*Ug&-3#; zbFA(?SgIV<`39y76EQluM$}%xpb`)=SpA|s*mPU28?R={r5V-gs+Z`B*D{ecH-F4a z_{d(N2oSljo-t6GFd5BiJA?)n5)=4kO)NNzmD0ZwwT9Dr@j~ZJy4* zMIfv*n|Y&T&mja0X0=*9{^(*x2tq!QaR_aUYP0#K3rPUgi-RZO|;cNYU%<= zCE1k)k5?#!>KFoyg#oTxi}l(stK~meebl*z45Z)k9sLG*n@K6j*bqp*?TGPW@{v1xqDQvI-1SU7<#>xOU>e~o^x6dfZ)PSdHF?9waKi8k-!Yh(SAx#7Vb7K5Jl zqw?{+X;!d!WZmjCb?qm$M`)x$C6VBa2|=<*y$e?OKLTMXjOCe zpE|!(k+^LxOE_I?^F@Nf(X?ak>LJ&zZ-nXP$zH(~Zox%UXA=8%+e8){`87#E^Aq2R z71RQK1ODThrG7oH9g@RQVG&uAke;P9sLq4;)Je@9*A#)(96#v?OR{7$GHfmNeEfVW z^twINV^!6rEINO3wdPuBb>h1#o-|hu9TLxr7aoA9Y)!?@UM4$y%otv3_)l@yKgHx_ zm+Eu>=KzF?B{vlMh(J7*9z@f`uLX-}$iIpo?gJM)6-cp4f=T z>vx{c0E&>1$byv2pcgJRzZQ24Egk|Wvfw3Pl!{_LK+TvTXDNWXr9n067u23*I8}tY zC+TSEwIA$ld4KJ9$R>&@J*%ooLCX*z*xJZ9HA^V(usm+ z2*IZ>aCHjuaOUk8B8MUt_5`!PlO2IsFo0*1L1{Dk;|yd9&4aJ(#1}5CW~44f%B&P* zK&MeT`TOy)m_2nf=+QS$ zG!P~uJfm`7o%1A*rG*6T12lZQ<-&*4jEGmb+6_dp-0J!;EYN7yyI_aJ2;s-ZzI>WT z>rKJwB3GdgfZtBj1&P1MI=Rwmz&k}U8N>xDGs1_h>~QKwF?Gj#s9S>o2^#WzPPBD6 zOqgh7{K)&uv}H}efV!P#2>4qy;ub!$l_K>U^kO?w9a4(?c}qVctbSnr7I~R8P#RRz zc)~;i5zcl^v_lxRb~%58Zj@UY8j-!Hdus214o*S}J%x8Amy_VTi9_Y~*@Z>0B?%N! z6x@2+S2PjRZ`Q%Zm-_5ju`!4*f?oJ^p2xGWPf_G>WW^=|#YhSywqSNs0kLGe;UOF7 z8f5HC_#jo*&z?IVNx1?P!eI1TblK)6(yuq(}y9 z>=TaxZm}o!wd@0ic}a5D!4HR^KzQ8IiXF;{A?7X^b%R-Tugl8d8~e}r$i-5FT4{NN z0=wnzyF%~}wkMA_KPEpUh!$bu->O=zt1>S>q4i?8!dFQ5>K!if-95UoQL|_w@?e2@ z8v9VYl30(?4SWe2RXAU%A=jH}4^9j=il$tKD|Nk51;_hIK4z~%Is_>PkMqz^NXOups#it7bASC2J2{qmhc?*j70Oe|2KWAae9!5Gm4_W&lE>3hB* z?aeEB-*~2MP4nvyns0Nfg(?YpM^LSq;OF9lfjK7A`rTQwb%My}np`>fnurA1#3E(p zP~|lxX|_dwK?_u|pJ%+SAb`*@`;*Mqhc>=N(R?IBux<2?L_lxFJpoug2k0uV# zWa-irB_Rl`&zi+8iR|qZP-b^V$<)0uqgO?3&R-xhFMfkch4{~^tz@N2=GO^MXu;>h zfu|sLA0qIRDAqm4Fct=eXB)@qRsGs_id!2d3e9}wC!EV~U))^T%DU02`Sqro6(G&w z0Gy0>>aNOq)h`a7GPaKHV0P#Z9*+)EN>Yb7VrC6I^ONSLy0|;`6K3BmMQMQ4 zojE3kXMkqVr+*9s_`65qZ|^=Zg2Y|VxM4W;*LuoN10IDIHMb){{k!<*KYSZ~*b^MM zmm9tTKNkQ={zpco=}WGVL^Xpgfa%)F(HK{S?dagwflA(;UK7C&zh7*8dg5*uTCY*O z`JUAh7!?&?ur}ql?q4uJ-Q(}B>i1IYUT??G2v(e}G%lHOiim3q29jy%*9uvvf*JF> z^aEe7K6yqlFmL&jWOG`>D-UkqshXTZHf85*L_=3Z6ws)%;E2!3?dXz+?w6!iw3PCb z^ZwE%gH`zdb~*rD_Y2v~B&89qgCKGdA{1ZLGl`bNb|blo7Dn_5D_k{Sf4Y0uUH(jjW|>OZ3EwbumwJU62VHB zIEkOXO^Aw{cK4^hrSD-S4ZDWbg_85^bQVJOCi$IVjSW}qXCJ=Ww1GaQ$PfdukZ@1+2f2D` zQ|7U0JHzmVK*cYejd@YGA^xpBg1GU5w;G@=alVkgkO8@kLpGzO3Z0#IisJ$gaXAMl z0bgJ00|s48xR1ZJLX?cg{jd6V~`fw5uE&x=rzaGle*&2&X-4Z4 z$cK?tZ_ygA?pVgo@%*M2SpB+|i8~Ad?fx3k55o2IG@c%)MywrA^?W-q=rpA~iMULf zU~C%D;#o8LfI8Je{^C~*VbRCFL7&D+44jTtuR~}tzlJ(zuZRnnPzuEK zlk0E?j9FY&3+p-zpf?w}O7wKYw&_`Lzibe$I>O?Ue;oJg#@>}P zpK0~P%0XtS#1Y&v+_|$MI(jdg8s6A-TpA}~*r1&mW9HS{Is8S~d*GW9)Uq}oac?M0 z8o^HBijgbjZxM>Hh2$`WXL%HR5MF<6fGQr+4OCj6RWJ{ ziqBuii}+Yur3vZt=-BD-?@nlVxh%)8acbDc?X>VCf(1DywyWHQTjwMRi3i!`a{^pO zf_E>ZLdj1`;#OeP&m;%vI5eLFcMV|2(`_|hbWzMmtN3J@TXh_Y7MB)&Y%p*VE=rbB zaUOOV-vX)t_kYf{Sqnqe9&y3w$Tb#a`J-0>)cHVPB~~8A>1ym2ER_o@b{&2x+)$b7 z7+db;_00>W-E2mFNfC0HXGA^=N&6AP=1Lu2lE=gblYIriQ6%v3meKn#vpEe za%q@Q7!=c2j`AS6+ z&!ge0X(Lrw+r_l+*9BwfzV}x#B?aZa_4KM(uQJO-2opDRN!FtZWN`txhxqLn`73M+ z%v@H_FV;U(RQU3=$*rinWWgJ%_kd${qg4uNFI1@M~#`Rr2&>1Y^djs=yWeT z%R`eywN?=?wdZ4(X9}5a^`PS1T(wlIMbXRTC@lnOU?5iSXX<7(JQ!u zHnP%z1s(=lIe12FbPo!YwOY00=Y4JYemvhxriS}snymT$*Ip3+J3Q;mVK(8&k+O>v zFD@b)X^mHxikXed7pu)a(34fn4pJB2nRlNbsKD=F#9a*MB7ABGv;r)r1t;NC7o)1KzxgBvI68LHJ>s=bBPb` z5uUf9tU2T9$(n2HgNjjK-;VGm_Ao;@& zsK@uM?vV-ZlEuvGa6zjDjgU@owP!aVYvl^&*JQ;HbcAYeagLOgbvC>{jug6C7JfaE z%WY9`{i&4*N)8K+4`U;y5`4RBkZkI%rEp}2tzZkKe^}mzVfzAm!{uEmr?g4EM|z+O z>UP<84l-MwHvmFXTb~ z3oSs(&Qmb5A2Pt@s>yuKniyg>o@OuEX~y=nyw`>!`|E4ASACCn!!y=3`6)lxiMeeV z;5dXaE}I*YxUXzKy3$-R?i3;6^rEEscEE4YT01DU}m1C2#%;I>(A zEd2D(hq;tql^)-LW7<<+YSW+9Bs9A8c2g}r#%pqnX+nss?E!*k;#)1bVmzzr&UFuT z`K&)uQ?W}B7pc5arE7kf=iWCf8hmi0d;e$Zw?s0*N`Ev=y$gTR6?S!$bYY=X*dYFX zgX)DktEM~4M)mCa^inCGU-o#?B(V2#IkcxU!l_2fo`h-e#&=E&SpG%xDBYg8aU^flf#qW((LkEZ+P`pb)R!&PKltx)f%0M zkfaD)?b1|mAb`A7Wtu*bC4wGZFn87qkI==?cNPip=X#Gms@n#nrIod@>SBbKiepbU zxiBlwxBU6G86R!vOTUNz@_O*OL~mJ$Nwr)>?A4V!eczp=y9Z)tv_q0(BFbOXn%7n0TAi!=D3-CZ#YdY?}0oEPc z6S#@nSS_65WNH)hJw6+W z*VwriUxe(wJ5#yL60LILq+~OE}*Uwb3d@v(WmdEx&cVSP& z^#1At#6@8IFv(%OI#j;NM?jDE}iy|$WvZJdSarAO)adMBNPSX22>`F;&O;dX?;_QqtoHOxy!GT({R zdY^}xpP-eyX6`$%+o#M|k2ED~3CWTEtkIRk^QMPqY|2TP+EBDwUSk!E1G&wJG}C77ORj*8Ng^NoV*JTdDWcIfG-g&z)yG*|_R z*7$rcP(0uZ2DqXBAdk!P(pu8CzKav2|%-z3iWbbRRy14S~Q;F{#jDXuFsZfNh|4xT;x54^EjxcT?bu-dl}k;azRYk(nXqYkq+0LyuDC*zR{qgLO$Net zn`bOVg1{Q7uHuesIYa!ia-#hkr2q0jO}~-xM<7}&{Yeat7`#%1yVTeeY|UbZfp$ut zheu+}y2z}p$K5&D%u*_AxS8pweTs*Zp3qzhyZF}2+jif0qQ3_$CgrqLHl=r<&biHj z)9(@v<2S=D{Cu8nI=J8Ogz@mmdq`f>gUMXTWzsuM2eM=9MY)god&V|IcaUw5&~g|u zUDuP7y?$OSdCPvM&APO6==0MJC7kD(J1aS;s|2M;L@1%JTadnPJWOzsr0L1JMp#R4 zAdFpTs7p1bKT#L7jIukam)=RtU1NMgcOeLGqyr3@%g0Iz?phLV2W(?ps==xg(E2F< zlr8J?791V)(Z<|IH$-p>2O?+V;SRC-vfVjUKISiIRm6%=r-+FdX@0ocK}0;(`58jC zS+1&iP&+>8lO!p?=lZIQ-wU}SraZ-KDyIz%=~2NIFM~=-7aN{hS)<*CTg&*h7f*fT z4&C5SIP9|K$Yd7oN{Es;m8#}xwJ zB1wX0ycCbA2A_T%`ra(B)~%(2v23^G01n>+Ry;tmWC18X*-=xDGfrxgdtIX*RO`){ zgqahp*1#}&#$l50!CJ`DUrfER4XQ8II)9&) z2NZ`ZWP$3aKC18o>zL7`8pXhQVV)eDp+tGnVOwl#@~^$$e@OrTVeAc)XM|lNOR;7T)0XO$+j_noo}n5gU}P<2aTbyjiF_*YV_3x z>?XA^t450`lcDHeH`-`?=xKb!F=7n97Zm&*28PCOn`xVAQ>DI+px2-mE-a8zV%qCR z^a0r$bOQOfC5978+blAt7lP5ac{A%YA-|10LpT`OvikV_EtTGy%+G{SIony)xIB*} zJuxna8zMA9VYlmZ}y^#)-9o7}TsLmmAk zt17jrs6;NeUTYIhdB^X}ZwCE9AIvPbYqSVo*kp^f5FvfA`YI z(DiZ?mtFl5vVu9WIr~J42(r5QBDRz_NRi1iCEw?JsGdSQWq?yPpa_7UlG7^GELXWS zFA->zYT1gM;%E~}5G?6Ue zboOV8V?NP7}Osq5MNO)wYA-^S>R;<0+M+rrN=}M3dpVw%NBf3ghSt?pJ8Ot`SEPy=uDUaLV$ ziHqhj;)6p<+%ez6e21hyAn!R%V(=uG{=nTMup|bOi=L$KwoFOpnZB?(`d*rr^5}(x z$AHR!!n@EV`wa(#_PzO<^!9`V+>Nmq5-ty5CuC2nEiA?;qNlV=8EZe&PgWc~g-!0W zQvCwZu2m?GS@T$bvJ^vUsT8T3R(Yv=e~M+~eQx2WwJfT^>TDjDW^weFUH@4@75j>Y zUG5-#@l@|Be*VZAQ#6XeF(5iZ3c_V4AkU&x{4LO!V5CNsV?JMN!-LXi55cLU+B+&aq z-sT>lEm4seZ{{IM;#b-QXhXT$$6uPe`r2qMLj~Xac-rU1 z@ChA)`&fD=YW1wXr=$P!wVm7-XK^mdDonv~WoF-A2yzhEZiUT5>30C7|M|Jx-=L3M zfEF@7*M!{AELe9>0DY;jSW!ht^E%{i_V@jN5~(G&cqssPwDHc4ah{N zvPZKt-k9H6XN@8-X5Gjo zuJj=J)M2dKXF zX1DpOld!rU(84(xB6eQ{X?;{$f57e-J+-AqGQu=@lW*u7P@6Y$)lT{6p_t@!PbIkD zS1jmlxrl*x%XCRu*e)P4@wDU8r%5h&&)Dj9wLS=l_|~5u&A>d`Hlm+}#pL~5yFTTZ zAXl`xOf~9#_}&MoLpxxy1>v@=EbErmfxH^%7k#JEcL$Fo*=S`Go^eQoCxG3S{FB)b zHdSLNyJ0jNrmO6q~?C9%?z|JY3eL&)3erobnk{T zU6x$NtzKF02aVgvF$8 z&v;di<7Wi_+61YUQB#9vzo9xK@w?8Klo8;@ufp5B$qNEwHPt7E2;Z?i?lF=AFf&9i z@C;?m{igE4594UM7rsdalGzze5?G~;^#yQT#xP9vAg;&4935F`KBEUyk8M4($-{@I zrQKNY9tIL=TB6tu05Uu6&e#!b(wvxQwbEBzbD!48^pW1xo`lMChII0;ZR5+q@1dvn zvU#jXY2%m085NrhEq;H)^behVPvrQPsqFH4D8Uq(=K>8jVb>P4E3%Z133n z19&|%^L9IW#5dc6;}nvx2^?e zg?~}=2||64r0@}aCVGOi6LLq=Dd!~beDynN{1jk+m9f_CyqJfk$);_n|A_$UBDy(f zzWq+xj{eHm0<9obi{7JN(X%IBEV>}7uQ>kuOQEM$ir^1A&k`%!j}_}Z`MNxdCU1rC z`#AWN#FSC>c=1-dR@{D}ozfl851588x873)X^1x$eYW@&>^`1bID5;%9>#9q?eB#V zj#J=qastNnty{)?BWx#C0peShF|;w3uexGruZ|asbrMz<;Lr(hcurj}WJYzuPm-EI zyJJg@fcjdr^CiH5aLe-m%H-m^<7WU?OVQoD+7jAGnyuyFH|xlzTZ7co3tbe!-!R_>smnkMbPtOpeyBl1|nMilNYNhXjG{lbc-c|2n zO8tMGf68jVqOG@*=H$5*cM^=A2CgVyZrI>&5Tieo&TZ%G;|VIKiqQ|?=JJMH4OeiQ zpXYEwDJdCsdhK^~4Q$N&KccySQtDW#H7p~!-80TuCl6tPpW1j0P80AL^%mH#Sh!{< z+v=wgE#ACuyYcu_S(0|i22e08{$~ZzKll;KeM%9V9Lo_ZR8IGz81cnBAyHI$> zKa6jVgYxvR-sLggC5iFlGB$sGOnrL!W1QW;$}DYN3DdYwsd%+^f=$1>R5e!Pj>2Vo-uS@Y%%$<~C&DT3}* z@qx@0F~1yNoCeGdS#MDdb`fgk4pH3WmmTiK^1E@ETyF&d`1Yg2gWKe+{p9DtwzW72 z*t)BTWJ*XV4l13weM`rg8{AM4laQFgqP&px7R0`%08e!b(X(EUTQdDCS?ZU|<3PMm zMFfK0)5Sns@d;l`j!kPhwh7e&<1Jf`R6QAyKi>kPy8MaJdvt3B8}EOEL@V*0l+J#> z-ymIklnF|l{HkSfJd7Do%f8J;t}-dCRsGfF{-0T=`s4V0p)pVPRnW`^5|j5Os*<06Tw1~| z+$m|9dpYWM2G4m_Jw95~2ZIsH!gJYBH5l|1MJ%l6(03kjC2AdO)_$|w0>K#Q-S}W= zbf>%9>1!$-^h&*kiU^>O1CEs`>NY6<)lZ(xZn-D-M^l@Iq+hocS|}?!rqLcn(F+R; z&o#OE=RO!+er;`}4P?C^Qs!X$bCcb3O-)y-{#P0yAiY)|1#HXt7B^3T;?>CrRQnmt zXAwHYWKC06MtI1%Z;3rh=a@?98F(an_ljP>Hd$m)s#vAS6KbC0w>tuxg$Pnb?zt`p zF1E1Zf~nk}fleB)A7{0rx!Ncog4yPD$&C`?m+q5xFzCs$MTz{CO6J43w}z&$rz%O) z$`@2?*VI^}AVA?Xruh>5b03y#lPwg6T1!K%xKL;+M1CF1tnqXsheZ#cgi|K^YS$OF z5;@pzeoi=T{FE*~$iIRmY$HHbn9=Rqu~Vk&$JOz1PR=0<^QrFgHqTQuNg~G2O^jPX zOm-%pFP91b&A0dO*=#UxxjS)}^RE3jDfEArc2h4w|GHQ7M+$dZ3V62cGAZ2xU?A~- zger1x!Ock@((Qa;0QUZWI1K&?os(*~OMzT%ne$vSyZqAvQX{IxT+gbT{EJPW&nJDQ%*BK2nCd+VkjWa za-06Y9Nqu21bjcM(*W!v`T>Qz5obgeFl_-cwz1m3S)%`CW!-B-us;jXL;d%kL}wiH zQzKKOERJAAe6*bi+4HvqeA(=#^M9<||8=3XqdY`0#D)W#jsG9)y?0nsYq|#-1jGVF z1f&xc6cG^V(h?gWBA|lOi3);r0jY+BBE3XVKv4+2NQp?Vp%($^AVnZSr9*-c0wnRS zJv004;@)#-&N{iyz2E!(O6xQ~YJL&8HK->GaX+!de?MRrdWY;t zu9j$p;UnvT=0g845QQnaW+>9QCM^mo*Ov8)oIe_GX#EVIe+Tp(ge&!P777|GnK*^T zsEO=KBlJjHp@j--?8wpa^0EwzwRPzNs=9;dO(##y@g~cM?;Cc5%>Jp}{&*nJ3iih3 zv6V+g5vXP!1n{+Hya5VZ#3Z1a1dzIa9Qx}Kj?ZJcfJF8T1?VSe-2zDRcm=1w(#C%r z+nH2l`Cs)tzgZkFO9;BRRh|5$KG~MH%iQ8>v8D`t@Q_6NC|QJ3>^-3%J^re7zzkDc zjf`c^#02HPM&7Ez1K%Bhcx?K($uxv;$faNjMX8zRp0$|{CCcm-qUMrCz_x7KgxP-Z zSV<5Gyz*d5emI+mZNRddBW{!Ipk`AOjKYyQ&!}EJS_fR8dZx`~&v-)%ch!-dZrvRVyAvRRZ0HSi{nE!>FhK5_$6nIX#4V6$N!Yq<$5p6BZi4jdb(g=HGgCs~U) zdiwfRUpNdt{|3aJZ+=VSYWCOIh%f-n7jV}^4%N}+0u&fls7t2rY`%kH6Q=2P6WfD~ zra-i0AyoW8S^1IHavt&q^UjN!)Z2*L<&<}!9Sf%Rd6{_$hx zKk{X{?toio0g9eQHaWm8_89`)UFM0~2mF|aih{IN zD>eJd*$KdTRz;^|a54l-F17gko)Z(?Q-$?N_lE+WK`=X*|FMv?OcSaPF;iW7!>?TB zvun&6!}iH@4cpLr_Gp~+d{?eqW(*VgtoazKc=D$!zq-&zUIiG!6w582PF zs9|#_N>B~A53|ha13}W33;8Z~|cSQKmkHX-^J&HhE*YxG}&DFp0-<{^e1a zRJdUEJR}^uZy4u<-D{uJk^-UFWjUaDsMl)!&`qBDQnh|1CpsCpQR94Y-wiMrXqPA> z!4vj}7LgJaBXrTl{upUBc)srA557+(d5CQi^Eg&-ol8W$XGh}kEBwtD2SvUB>RQ$n z;eJg=85MdZ%}@uV3@r4YIwAK^k+f7`ItmUYIG}>24!xqegsO?0tsJ(!G<>pdH*XRm zeRQtQ4+1E|`Vm={xAD`xhsFAgziI9Kq_^{T{(cqTN6LQH)ym#m23$#S ztIH2YTtFpZ__tk5ng4(1TsVFcj^{e%p`W zH}lwd7=Ct|4L5(A5)Mo>$vqGvX)R=WLjrO6C$h`eNN9tfkp?0Ukna$t0F(H|iWYsU z=<0*t@85*XxL7UmMW5om)$FpvU4I;f7jxJ8&F<5C-$HKn!vWE^-yG`i_J73Bf2XMa z4yASXvNv{yr8Pev73S*~OOYd-OPJ6i-Fl)2(W;eUJ_}#h>=cwS0fDU9TclvW*{w1C zEUkvl4k*ht^J7908Z-LBi#sQ?7|lM7S$JRkk`d4A22NYu_ZhH|iTdd(u+t)dg&l^= z64?vi2dLi*4Hhb31oADWD3P~@ZoJYb(gKE7&E&X69u0_1o_7ynsTA25sAr{scXg4y z({2dVJk`eX?G%rXt7EsU;*(+nH79AXwqQeFZHW`UkTcu zOKCsWIe$_5{b!#0C)|AhnUDSXwFsbK`KN^yYyxg3%fDi|`4polB~lXv-l0X5BUE>< z+oQN~4X^mb%Zqx_Qo3@|sqo9k%buQ5nfp&pN4Gx(sXctaxWNAxZc(Pq zzN{C)!kGmBBN34+K##Q-2tjLU*R>0bFMbiNtB7vDWoHPPJI_2vMVg>$r{5RdmRhaJ zE*#Df((F5(Ao_9$B(n)jPE8hRs6GchK)VfS-<_PLVicfQh2qAk9LfPB>3Ksc;+ar-j zm$Z6Ds3v*TEh9I<`v%JE1m339<-P6^o7Ec&cjHDLzh2M#2q+$f!b(`0Fi%K8$!gzd zfV_iFCZO;b+*(;xBR2XW(LHKs4@;SEFqe`2RTYt==@F-YDz5)W4eTHNUm}Jy*NA1H zUa#=3HJhRFv-4MN7PUp_id90C#_8Rb5){#1KO&fLG7Eq1`AZlc zOk%?3&btfOTvV23*MoYTsO!DDF&zXef8skDwL%x9ni9WJWht?zS~HboSq42~U%Ql4 z&bMG@#FS@6KZ7q7DG}P6AO{il#Mg9Tl4VjN+38!eEVX1@XWZC&F-Bc$Hr71;Y=1&= zf0AZ~83qux6h`*q1k?EL4vr4c*W<3ggE8fGEoN0d)kT8U9e9FZ%-j z{nFh3kD5mR=-8=0UT_EfQ6=AC{!CWHsr08!|1a6z-_2Qn|K2|%3KX{Voc?dv7dQ?z zlRLuphTf@?9QR?Q(7n*^diaoao9U`dC2Lgd>rEkvFW9j;ve6)tld4@!ksv9xV-Bea z720#w%>{2tvpJ-9X^y6~cZ=wsdR_%$>*&^=VZGzL)BgK~#I>FbyPY?7>r)>mEOj1q zDb(~HsJV2Mm;DJkzOG%K;oAwo6rsIY>1nAP)tFKxuN&PlB{4HINB4Pbh#{_GLZ>&* zQLh4lruu7rE>N<#6-wVU8~4y6}BJZGgPp1EF$0o4KkPF(o|m`84Rv)ahZ_sX3mbw)5f`;tFnzR;0F| zCX(ALRxl+xG_7X!deR-P2k1FTTLPl@DBHDCGff1COw7f`p(U@+95I-sMH-IE`HJjO z5qu9GQhtUfL)*)38ChU0N>|uW_>ZW0ouw1T3A>LQ!;&3|dU+)1H+L;xC zabUC zEJ)=8r(MQ(w(Dh({nPvD2abJ{mpDXMB1t-QcsosWm4z2mrNosDX!E^yEx%aC9KlT3 zD4gg6LeNIkPxl`?4*}Pj4|(#|+s153ARk|}bbDO#;>XXLNyAck4>+GDL@g$>MKwRNmF<%-LkeVh6HFQZ|{%N zBkUtg)~1GBF*vqzW4VzNBcV#?O{VhX)00I4ma zK5p-n_c?pU>$-{^7W$ca4jqzP=u+){f@{WGhqOen_8Z`_5ABUfSzF|GfBG59t2V{` zPVc?}v=4Z>liPU{Q)Le8@OS&(`wJ|L7?LkoP0wS)9O>u0RWXgao7 zGB)qa6I#adZv9T!6CPPLS5Ap4=0w+H7uqByQP$Jpqp-tN#)85RcA z2g#uI)Lopf=8KqW;4&~{=1`pS68p<09No}#fy(=)lLopu1ZYu}~x za=lyOEP||`XOqHv7M~8lu8?2b3i|fFH{xUR`dZIJ%Pw0J22V&gA%%%rVRQY`WlmC( z88$EJy*x?pGTvLUFvn_phziPzFH2mSz4f7LmkZfjvY^5f37C^nu&us>Tq9Tv=ftx; z+fe~8t7v`A9^F-90k>9^2Jag0-p{ztcl!JmSAdSS{}t}-C6dF;92#{W_|COj0P#EB zkxd~F+lt8B71dA6N`o8Eee6cGFfli?&c+4}T9L;oWd)NI+Zj}pnnLc>1I|Sv;Oop( zUU~mx_tPu66cJ?GJ>{91odI%!-n^vWm$JY)yY?b+iEm5f2qHKJW-Wj@NYxpvKj{F# z-(DH+9?2KZk#nlF61c*3%x145$A`1iUgBt%`bQQw)m^lThwcj8s|i#d^4)vA0a@mn zC})>?=H?qY>tIpv1#Nf|@;FJ1DhebUVlndAXviU8>J^A|AnUVr-usYnn-ng(o_SQ5 z$0echIa}+C1QvNcL^)FtGU4Vs*_rsG!Y{c=-$5(tBnLAJH7w!VP>2NP{3;j+VT(w9{smXujC0A>k6Iq2`Do+*1fklCk@e^;S(sQ;>SfBChK znunPfWw3_o`uGjmJS%@$zkC8Z622ZM4HV-xXJ8w*hTVYEGisYhcDcieY(AMbV{AL_hlxa_qJ#n@;QMyL`F2$pBmgOn=%%~4dQ$a52|M*p;sDV zVN`blCX62!D&J+c_RenHGG^s-LxTi0zm}ntd%L(Qyi@I7{`eQB{s zR!ndTGui-Hs%IFEj<%@y1viup3CMh0iJowT%gk(7szV8VfWx)1RNA8DIZ9|ILfwLE zu~+sx$YJsN!tHe4$ge(nT!N2;zgF9C8<2;I(&V}DdTuy`gA%FFgFNwWfZmm}D17bE z{p9+=!z-%JdzbgiKICHc-9YgwSUAjt`XeM}PzSs*Wu*Z}`2q%){UQ+>2gS0m{>!U7j+|V%o31cq& zm^3@{A)YjW?*W=vWjqc#wULE#22WxYG2XA6RX94vKXwQi#DQUp{`&F zB{1StQQ%J*4fr%I8_Kx|;iYr;W)gzOVh=(;zv(Ha4^c7HVKqJ}eUKv|XMJgpt$&XJN#J<&TXroGtr3`;f;U?;b z*Lv(;e1PHr`}5$VQ^ z7Cp9RNm7hYr>eliko$%<#T9fF9dvddxJX)>^WA-}8`xDT!zvfxulpF&T(jb0KpgV2 zJVvY)J~WgW+hzCAqTRjtLQ{fwTTcta)2vY3-NF8g=raw7oLy4{8|iBqNwpEhUC%#p zDUX6~CLOdZ-Oe(!6EWd?noZzBPhaD8W?_{XsFdbF$WXQc=~`G58sd#@OmdFSc~IT@ z8k;i)zw~v?G^<;-Q?llB*I}-wkyRlB2ZH>O3+Y*dy66|%?gXPo6dxssJV+EnOUlDb zz4fmJiq;H4NB3-B-Y=_G?s{yut6$%A73QGtFH7&k90XV+%V$Qq%m#}z({i+rjiXOz zhZ_Sk}Jn>%Th2YQ%f6a=%A9+zm|k#0uwQY@RBOoa-HJM$=EJzOCu z$wP%@Csfh!_`Um&#_g3*r5}*Ocft2lP%~cxvX}#Vb+sLx5X(>V_!}b37{&beAGQ7l zM;XN;5VRyU_1#NQ|8a8On}v2wAJL#)8G^+Z@*L_VP<(j!sNKNiJ*IjkUuY9>OnXUe z2k?$7+5~mRrYR_KBp$#Q9eA*n{dM<7NwxO9+NXPJ`jtwGPg@^23!3ucX!XltC99Zo zn9#CMCF~nf&=$SkcphX+Y=y^m$H@s7xLcacx{O8QQ7tZqniq~5^v>mEbPx4>u5Z@%HnzMz&mkKx(5l}&Zas$+ENHMB_eS|hvt+M zyN2)Dv%CX8tRI)W(fM)y!(zvKSa@LI`3~jAEi9Pc3ENjIQM9o0;n|FzBH#%*r}N6; zMUZpEW#!v~n@mPUgRXA3?-rA$qhMD09QM=idIPBthc_3bqp1>d_D8x0I2_kiOVm`u z<{3c+=lO1=usKkbhU(=PcgW!x-ib4|Pd4&T-M^6UV8rIkk;W7vbDC%Jvm1!Xnl+w9 zVbh7LE~&21d>c%Tp_q{lPmoiHVy)79DGv0*uhbQ}0tqAH{T62}1dhG*78h@1nu&SH zEY_y)4^<`VuP}I!tnd|pss?{^Uen!SHCP_+Q1P)#D%-(^LB(cASDCDs_;<(=>PcP) zQv`_8bk1HLZEV-61mlD%U_xfq&=f_MKgFihvjS35jhlsO?07)@bo|X!; zrkSM7&Fy|1C_2ysdP zX^CtSsxd6Nu_PTPE^qoI3Bkj-ye8^A``$#EAWx6{kq2l>nVVH^#s*U zX96}ir#0S|l@-_TUcE9Jl@_~o$~}rXDsZo|v#(Lr4gEy3$ur&+E7~)V!<4RODDSY^ ziMztZg%m4`@{x>z%$Ij}@3rZDW^VyKbnl!Kt`)?FTe1N+0yeZ-5ep8@B(g+P6^gX- z?T(lhel{rhDk-SinjG*!OC&nzXb2XW{wlf!(KtJgl%Nk-LQ08>ImA`{Q2rCcEz;N} zp7E@*k1}mt$3mvgb=pk6uSA{)MeWcOY#oH6GmrG|IUb@A&hUBVI*7)sqj5lS2nf6R ze`&p7lTj{aN6=)yS1skAvor8Q4!WZ&qL#TN^GwGEa*JCB>cGzFS_AUXdhc=$NvWr8 zdf7(?vv%K;URkgx(HNteu~a8=6Ky=;hy~yz1|EC#FstuEfb`_qFK7*EyyhC^Rz3Ul z77#BAwroc`u8m|)uge0ssL>jf3Q@25-wDC-Gm_)Kvc2tG2jub+z|3i|JrZxTG)*Gl zHiu!!3Y9OjevKCSPwwOI8teh7bYO~>tJrc5BH^8+l5Ww$ad9E9)36DV@hhtgty(z< zZd(2dwFkV{@_X9tg+2?;5!u@q* zAAppdea^5c#l2pKX@-oCk7(LbdS@bAMHp;}B_MUWk>D_vVgz6i&@^szD1cDX0aDub zT*ie(iLa0!pLkP<3j(-t2hE~aNC)v*lU7GX$GJNU=8+~0G@w@A)k~qi`wsHK#rsSL z0UBrrDb0kf2s$5kT|f0co>)?j$!q>1Fmd}}WWIZxXrv^A$Wz~l^_pZBa^>!=^0gz> z(^8cyZQ*_k(_5SCm=!;~e$38jD>j&XgA&xpAKKR8f}pY0a_3l>+21Ztg(&LeS;uFX zU3eCI{Ha^RL-RvmPl{Ge6RNCzg&QyaQN&`}j()NXy-S+BP_7BlywbR34&LaYh8O@%whDLy0WnhzAj?|V)QQ!<0EU9eQ#l-$?I zWPRHyryUNVUQ)>}B#t(Wdsg7pp*~WC`r$G;C&gS++&jOQ+`LYc?0g+;0ZXxJ^OBF( z^K3wSwC9;mAYylKBFu-7M*+lf$}uASX)|RKyxnQho9Lhlu<59;IfwX`7TLCifNY+##ehRh98LHLDdN4Zf z5l_!>u=UBia_?P5(siucET8evdW*HKkeoIFe0TxMIY^%_wD(NXP1r=Q@RUeu*TrzV zwdbibBff)nZ!9JxiM64}wr=1mb#w%D;Wtx!KA$t5H9lrzwG290j6Ccxf+`2Ms$v8` zP3BW18offMFB6xX_d2BeM=k}caRz#_e{7^59qh$QUoYMmNhIisnb!_UZqTldkk|A* zV%GPaU?(p(#2-HOMH>{GQkk=ua&%xr+40_xhkzNMOOBbtmDQ7m4&nNj&+D0>Q?bl3 z-3WPFS;~W1<44#q--Aa^y%xIIS442_dqEZ??zd5Uo%E{ppi&ELLZB(>y5u9=p8549 zgCh1p5BF-`s#(ZUt4T!Z_6x{`eiO^n!_uYtY{1g~WoB2=;*pUxNq%eD0{urb)|v&= zr$kB5TD{?6^-zQk#r7n+!227X{4B>?>v~e)`Y-}0hPPR6r12>Ws- z<(v>|@#6%%AKLU|cu)tax-%=Bdc+~RSUpKH@sOF>cz1DC+_|mXE<~FCi5Hmo3u>cE zsR&QfNHi(3dFmh~bf!o3QaS0eyJD>a~zWNX~- zK5=Go|MGr?E(*ByJBYhX*ImbjgQ9eyocJ||?}eqEX|t|E#EY)fcoPtUSQV9G)@PId z(y~canw@&Nk9v#H%Smz9tMVyxVJqFAcey{_IO_chFRQm>*hJ2?+vRmwJ33FlOPQbP zsw$go+sTx#i;l|xPe{_0CNYP8qC&>tQR9r{mm>-enZ<*W)&pnGIE#MxIBV=2u7BzeJC2d@ou9Lg4XGdmiiqGP{##M=%wFeDpX;O3uy$i`li3S999UiNO z70gCnD!$95*x#|cr#%?N48YpiCvU;|=zWmGNI|oBbc}{BVju{s zJgO-P(>%@MePr5rs1{1j+10;3gzT{{?s5`rd4EwZr2CcO>DRMj+RA!? z!&Ey$;p0ZAEOF5r-Zn1jYTo1Gq+D5ERq-r1NvvU3?T`=6A%+N9IWg-JIcge=4Wxpt zBpy_^j3bM^-q^FT)w*VOzd3enf85Q;+9*)XbK4|ypoA%;8+SQGF;3G^gZbr)8IVex zLz~(KQcf%MC?%lKTMM5UQlNIh9lxGeusGiH&2-GHyVOds0q~%ELZxd|H~~7;r|2WqZ3hygK0!D>H1!mvlMh z@T=>BcRrW98zB3pj#iM2mdl9WjLQb^^sivcoc0o=pdT5yt4m3-{Qe*i`(Dt3$+vp5 z;4)|!NvXvN+{C{Xvf-phxPql#dN@+;E2{A%?ljNQ*e&FBeT6}zURbFXHDP*em)^Vu z+qts2q3|Gi0$@UM-{JmNv9gk@M3pO>3r%G%19RK&I0xAAZ^mgm-xxc;_JR;sAC65a zTT!o9K~CVnh!ZSpfKK#>Pqh{W)$2ItXP=A1QeF+wiQw~_!FFzg_cbgqcJtMazGu)@ zIW6wonH&>oWQ_x`_}j1Onr5h-Etfgd;a~MHYcCc2UuEjIFp7V%%2X?bWyUB7Aa_o* z(xv-1w!z_uTOQ)(Wj)+<4*D9JM>nu!HwSI#TB z=C&vme+T(0^RS5E)cM@Weu&{+BjgAE-e?7NF+yUq_|ZYA6valiobJ-aUSjvUF1r0% z(1#J&qm&F~f&RSZFUTiX)jC#lu@V%XpGr*vjxJJ1#!>8)E^Vk@1>WL{W9{Hq!Lz5& zPraO-~8y@l{M9TGH}xNB5NtcgrEXmMRF9)7mM z8#Bo^&sywcD`}=`u@}s{lU^&<0wAswx&TdK{-T;9@6*%mbdKtQyCs8O=HqJvlZS2f zPkNn4v@t~Hi7{w7?oVmHjAKMdWOUONM+Vz{;nB4g1fx&6i~JI8#b)W*9qHlRAK#>& z>pHCc6D-7|pDKl4+k=;2+=!s*-#{;*MJN`p1RE0$%$_!n=ZsSbY1!4(XRh8#Cs$6h9fwMu7FVQdxZVy(rSuo9$QtrkYS;@^V;Yl zTs?dGT%0xo$aRRQ3h~_$1J^ih8pg1ewxzVP28y;5N;^Q`?2YhNjsyR;SH}tAroJZ#(avH>sx6gD=4+~oCJ^I2%5p9 zXBP&$eVICig9u#*UE)rZ96K`gne{FrTZ3s+8Gc)AL83>v`Fd?hP4&Bqs_J((kL|L< zO=fI(?>#|3f6AhN)iW#GWJyuZ#QwVuUVw(b){2j_6O|c{ z4jaE~l0~#Bu`3L>`e51rLhCG(dq|8P27i|r~XP8iTK_W{k4r?lh%0KQx|O^>{~b9_l-a1C3q z<}B$nS6d?KR3sWE*JOM*;;?L$pzCs7QZ5)TN&|KSMkHhgFCyDE=&i4kA14?GH2%zJ11z9 zD6!4l4UXiWej*QJSag}Kx{Gcaz2JytZn`qRbZnyZ3A6v^-L3PYPZ>nMequ2$rl^vF z@FM}lUV`M560toD%N9KPWNygd2ESjfrE}Wz!=y}aF}v+67x(UUSXxMnA@Chbl272{ z%5Og&a@%0&koWPowJ_pV6P>FzVNHX$nwVvw)7)!oE2yU_6Ww6T_2tN?2FkjApc24y z17M~;pPXN#6zkXOczwRbB<-60dQE>1A_ zMj|Dyjl1d#pvTn8*Ah4vs(cO3nHw=S1d&BQjRyqlB7{JP+E7q>8{k! zyAPgw?m=W7nTjR%D$eGCHsNJjO@3wsSi>0N#LSA~SkcJ(z!k}nD=X<&G7xW>!)^qO z*Ikunq}cbi$Srv$?R~j7UnD`Pf2^%UFwVV_b?#ZOrJY;NM2D9IaFbJ~IEg%=kIX_o zx#lbo&8}$ocE`R+aqT@`??Y4qVw*RF+(h~0LJ)H9z3g{>LEL_8eV}P8O~e@M~;?9a4j~Z8e#ChJr)WhSU8U}&2%M`T>bA_q3|rNH`u&AO$5P4Af}xa&&qo$GsiikBpm|S=W{YxKsC5G3L%ECgEZm zw3FD3O~&jM2Nf@|z$C88rX}XKDYyD=1yaRl{G=OToMpW%-svb_-8_??BUx|sa}PER zv^u`euVp>V6_DaX4 z%lWgaQ(+E~I;ay>@Zbbm(vR8Ci#;IQY#ukpVMi`rurvu>mN*=SD0yAfZN(yn_H}9x z*?3jQI$yGo@XT(n4EUoZ_t|A39vhi}1+9|_>jzU9b~WwGuTxEE7zOb&c(k6%a`%~+ z4Uu5`M)p;VjEHa*_eQYQG_VpKme&E&0_eF{?$+x2r-3n za&mI~-5!*WyCc+_T_z4>o&s8cKe}PsRFFrNOyrj8eF-Sd_CeU_QkMiMI&Ia^VtN02 z1FBg%v75mpF}kYdla8+8D?1TAl;qtbR1Qj`hr(PqUGPB#e^g6pBH2|yHPMQyFM5Kn zceiEq^#x#Z7^S#pC0C0@Aw|92!1#)R+FSBBuy2|5zZJDcu13{iUM_AJC8C;Z5i)?2 zsTpC5egs%wkhp#r;f9-A1dY|t<@Q?w-Ewvkv7N^45;;vV22%Ex8$XV0wgL?ARo`gt zFVn;|2rH`pHU(%I1V`bhfJVme2mTd4TGoXkh6{GW$C*GS5thcYH@l~aFWx@{bv-_MMOHT4r5-B`Tb1Y|Ilz*VGk}##C zqp_%lE;Nfr@0_0I!4ZoS0a=HPrO3CBl_od&CAx6!63Q*7w8#U|JYT=1h4+(W_m`O}-kdCP_wB9vB z5mDqbmB8}g!yrMcZTgUO3mP&(-SK-G+10-RbvrV`9@UJVI@inBhOeY)+1k3~j5j+t zFnNyjT;IhMCcyB{eTUCd+PUazy3^fEzD9^{3`1cN~CTnm_gSk2FH`4^%_O zO@lw6jE}}6JEzwjV0!^A=WG4AjS&7#(|?*k@N1$$S`k1U6kelOC1C)x?o+=Cnj8G{ z=|AF@>!(&H0Pno?k3!@Bq1XK@D?j~(A(Q_1=>u7Xs1ANW7NhR-cTk_M#T(O6Y5*1x z^O@48G4-+Zz=+pewq>S6a9fAJg9ej_{j}g`P>nLU&+SOyU6VTBL3&=dHiiBnyURc2 zOq~7Kcij1}`W^f%k^7hJFP(0?U$?*q&BFgzcq{yku+6OcA6;ehH@|3){ki1>cNapB zl8o;Ss%IXqg%f%-cfJpOS|V}#)Hn^qHDR1@m#5lCON}~!cIh|@pt#PDK#!tkE<^BI zR7DI>>)TO4he0SDEEL1X_NeaA_q!1D8*mYT0BT1Cf=Pxk1t@w196|*{e*gCJ9BPL$ z+-fOeTa_F_sUk@LGNI9rXW5W8B(DYw8{U;Ocgu>SUDPC$&!>2PzCL(Cr!AV|kOmZc1}T{DAkAVDBcOYC z6li&?m%+YX_)lCLiO(pC6D%B1rE`EBf)S5{i5YYOeFBQc-`)bI?hB%xA`g8By=0MC z_H)%hL<7={`6LA&moZxcdw=JIyrk>0OeYz8pN7FbSUW)PPk{_cKjPhFlQtZ1hPLO= ztMn9C;0A5mJazbb%>~bj3UV|u9f+Eh4xYU2;64mQ`r_aGYJSFS{>e(|8k;>pH-bb= z|6V%{NGpG@uLeTC`EeFo$NpXq`p<9mpO5A5Y`Oke=J~=OO?ToOJU3ha8?+OCBir)t zC)2?49zwi+%dCfUW-9sIH%Dot$b{{7?4l^ydA^96IyI3x8BnJ%dg{T<^+xHeaM?GP!0#X@1`=m8(IJJOqBGlLdnNg$ zRc(1$#i>?3Ue<0E{sH+f@{_k)s!px?iD>vZ=aXDo85@y;qw+2xDl40-Id4uxPocde z6-QMbiN1As3AJI?>~Dazs2Js?>~B|)jf|Pql{uCEoUMgnB=g=4^J#zN@dqeQc@v=2 zH(B;6N-6OOn~0{tR$qtJQ_$nRH|3_e2Mp|5$0Gdz*l^}qY%-m1ycwXcHeN!Wpd~2; ztxT~DtQH}P(JD9GMej}w2c9nS17Xu8=6V4P(fZRh;qRa^*dS`-o&5%Y2bx91_5xxe?3$Ge+^bM1@iSj6A-=5R0>Ei{tVSOsQtxH z^H-8~&wq0&zq%X@>xnA>j2*ad7;(#)^T3QMVAQwyt^E%MXKY}CddtvmATY1%rS*RY z6~J?`L4cRKbd;qO5wkptUy;+MXb=E@ol8=U7B`y~M^0Ivym;k?NXTrqE8JW2;iCSv z(Lj1Losq=StUuP?JOv>^6BW?~6Fm_qAkJSGMu-XC0S^17;nX7X;$nW0nl-p_EtcEaW(f z!wlwtcWmiFFvo&Q+#>)Gl6rgSMxETPin{u1T!Gq5NYe`C z?YFLN=L+pAvv{%xzJvV5EmwR6A?>+T1ENtA^k6><4<$XZ++REF3~#kmnINxRZ95rO zdX?{b=?%WOvO!;19?XJvc$(pi8gCG=Q34Uy)_cgi=F+g0#^9#s&{15?ppct+R)G-Ac7Rx*l*Ys>dvZamsk|0S-Jz>J7CHAH8mIn#6NG_?_Z%;5Ery&1L_w59eBa zV9RsFS(9?gr&*Nnr5Q7H@D%$E}A#JB4Rc~4qgGAaKodwOWgxt2M36k0V0G|3< z-;Pa0+3-?riI%%JDYk{ciHHrpg4frUlJ4W4kNcD*X~vp#I5k!XI(`23Eb3l;dO@-7 z`}{<$;z#uYz#MO$od$;@VBbN{L%p}?o%C0L=IJw|K96s6K#_U-iWeHSj9I|#m?-$3 z1OGavUHQM5#`R}E?0;zde{oE^?TIBwi8aro-`FSk+ABAeeQCPPyTjIEn?}u#7>3zF zo>YrQ3#XATHJ0DZm9)aZOSKwAAiiKzDKqjGEoI~KCwCIOd0Ck-Zd9pe zRchZ}=Qau_8t9L$I0K9~O52=yrHI6l#7v9s58h1yUjns7!G$0{T@6GDgvuq+IG!o1 z7(m=2goX>bU0j#~SKn#yP!oO)=Yxr zb3GtGIiK$!NCJFKX+1xFJ$`hBkF<7U=(q)pY%}E4RBq)2_(5I|5%6KLq4Jj~wP45F zjYEzM(%C5@m+l#%$T<;~Gcz&XF(=w&RZ~Lzp_%FR=26Ho{tT5;9LvV3vGc4>n*xc~ z=0rJ8F~z7v@n_@qqa2(}1>p!E!1j2zdY;Z1f!sg$vhg!RyU>Tn{Ggrsh;69Ln9giM zmD{@|T$ydSsp8Fd8W|MdUj2@95DCqP>?{^lOmj7f z3mmf$_pUqVI<|l_b1>csjVfh3@zSNNN3kQ1A-HwYbtc9C^Jq6v$7T9J(en8s+zcpnAZKVV1YBM~S|_aaY;ST`i9|axnXD5H0Nu+yeVWa+zi36yuPG z%eWlgGL*m5j|6FYD6H!vS3 zW^RY`pTIUj_6pTpt!s;9THPloG$0{Jhi*;)jn{otHaz+E0F@6a3A0HacCLa8kI}6;VzC~Ff$!SKOU84 z{Y;M--DlJ=&l;dgT8M>TYt1x1FC!FRYejqU9Cfw)4c8r3usvw|KH#11U>Jg029He; zK4iHZuLbm3e)Ws?BvUh4xaazwQK%oYj zEU~zFiV^UwtT88Pg+n3qUSD(+RgNd9R!6BNT6!f(QY8?cRb}$o@c1JJGdE@=OBc?r zlMkD}G>4w}Mpvh^pvJU;DSAT@o3_-@jXan9(GLJnbAL|0pmBy>E=K$(VcRboxNsz7 z1bWN?xxcK|mLweQ#P;b+{^bVIo580Vt0A@GnNv;GT0pk&E8)go4EQ_f%dsi-@1S?6 zY=Bs24LFM-wuwNA>88-^^sd;B3q3*CGKP^dEs}z3@G}4s?g3%%U!@%X5t;r+)qJd% zrlk6)XNt>93Kn**c5bn44qU2PsEq2{TgKgiYMLR6(b!WUK<1qZq27WJE7kUY2VKem zvTULq3xJ3{FXZ=c`7(!j=-EERO>2I|iX8hksJZbcEg5*xfa`gIpEJy#x7Ytj{Ed5W z&5wphIeiv7z`Op-o}s^~2K-YsGy{_ng~@JiKkbH8mK!EK$TNMVo1Z)i?MAvr+#^|A_Xx}yoKszTopLr_1-H7c0JmB3I5O*>Gii^ z#2e_)(Q8W?6jg#&nCEL-U1BChgi+KdSjC#%XO5I>MxH0J=e?z%$c~hnT#?vt>C&)5 z9sxJKHHG$UejTcHTs6 zThreIB=M6U3U{wl%9{5GJ*qoC(EX^PeM%&zc5^P`F|rBkCsLkI8YN6+ResJR?Kv+C z0?`*wECAG%#UrsIyWRPl=iYcFeSMX4@qW#al+!(4=C3brXzHy8_>$PPhGekIM>-e9;z0b-&O32}*lP(NP=%hp`VG^eDhDO(>KTZwk>X4W z^=Rklr*7;QZG+AW^W~!&^>Mc9O5KMjQ%eT14DUbAd#%0pWW9U9ecAPv8U29Hj6|zJ zDusm*8gbfh0O#5p*{|{i&I7F5_1rAm@0tt7w-Gp_)B#{u~u3S~avL7$-C2 z)2OfaqFrIq4pt0nu;31;plINEc5$T5EDhJeoSlYg8gg^x(RCJ%Gz2@v74JQwE`SHx z&!u6*LBYlSPN@t5qJpQ{-L$C=@5;+U-ZO_x@y-MzmtAvj3C{VSHvNlw{GMO*M{oVA zqW-6HoZ#oUAFEd>uph@5f2nB2c|iE6nj~Dy1Z=|n0??l4%UR@6Oc5S3(kpYaeIK*a zpN*y+zi6~?sg&&n%=?>V4@q@)HEl+ka*?WFg%2vCUMF1^7Znz$sM;go^d_62ZQIVF z>E%zEbFs%mPdFq9gK0JO!uGzOCVeemdEtdL-wSkQyyJOsEyloSMyPiEG1@er#MzPr zZ^lg=1e|;}Yd$_}03foZ<5ES{1Dv)lZr`*deI6bNI3BgooUZwXj?h>Aar0-?s1DvdCR%M_VBgfFcWDoMF1&Do2u+k;}aTgjJt zA?!E4E#q@z&xIzRS0;5AXX^})2`JJg+TnGa;;LIE(3g}8r1$y}NTE^ceMK~oaIX1F zC20qiglf9yxpdEMR>U3(yl(@{BW(9Qj59>{&FIO?fC7g!{&>^ zw~O6va`t{zbqe;Jl`bv6n%Aby*W5?dz#?pD@kmjGbWNbDbOe%bU{(I$r!x}=$lZ-6EpEO8hN)GrES!cH)>n<>lgrfC2)LCC*#P$d=%Anv*_qyNkC2D+!Q1Ve zM@~=qXdkl^c`(~%5`Mi#Pi2$vwh$1*(HHb~CwM)P2ZzB}QFxAiH4{}6yXq*F0xvAv z$iNaSLB_v@)wRv-!cjq27@wM02IZhOEgk{G?^a2ds>50p9~xU+#V0+`C>{90>R*Qk^659)(f#v~@9SdN!UpJnA{{MfnS$^^1{+~){wx~cP7cCowm!cGt z46X*XtE^^W7y137%Zpe?s_(~_H8%^MZPM#QUTiIX7CZQw23!!2`uKI1<%8crqU@+$ zzOr80IbBOnm7CELtrG_65qhS{fO7-70lOng2l{4>=5U~8_S?snJsJQxuoUx6t|s17 z&6dvFK4U~Xi~{o9nIdVR)AQTMSet?#gI3d%p%35Mn|{FL=yiUTH1cw7Y1I~a$abgXp6MR(|oXTrLZ z^K3j(CHYOuY?4bb!Ya*sR@YCfxwTKfhQi(Bt*h!dyd48M=1s^8dr$ zdq*|drt70YL<9t+cY=bV(nNX<7Mh5NNRbv5kR~9~YXAk577+!d1u243A~iG{b_V1j1_HXvuf3V0}#w70(-sidR>%Pi$saK0(cO864 z-5Mbg4OhxbGJ^5qs_&PRfObU87()95-0I(T?UBWrO}TNuA3DD==KfE@NB?wB{L{Zb z2-bXh=W>1zo1N!cs2dg9T{nEHspGm8+;lH))~^?~nuB_0E+ z%#K~EV)&^DE#5eSu?=bS00B@cUC5QVF6Ej5l1++tk3wgvkobf*wvGisKD)8HTq&(= zuUtDc@6hHisD!UMZ>TEIcH|8G@&~DTE7(k~{js6bn^khL2<}FGIet5p7C!b7%|{n| z=(HfEYA_EIzU8ZkF-Rq}UE^I?PFRCUQrjVej?iI#{*5FJpf)ecMcY8CEZ68i+*yKE zo;ql}uB{J5lsT|5Niw`LI?vw`2-qW+!`x-uAlq_J!jF9Ck7*tXHIX3C(Is=+ztMuV z{0`4pZvqnSFf;KrFq5K$uh-sPnEzt%F_C4a6X(`YvP2wOYEedpl9}6sIysL{4yT3 zid>}I%65++2CRJ}uo$CU-jPdj!>KA3XBnDb6MKc|*6fbNQ~0Sz3AE{o&=UloSBoRT znko`LnLo@OGI!Vd>uunVrw$Pg5*&;O`mu`N%Xq@O^2ggFY$RBVR8PN#pU9vw8xvT3 zO4H)Ys$(ukDSkS+KC|5uZnb|5{`Hmza04ABn9mlwOdTP=LcSFXO4a75AKwjdmd>kz zsPFqdSi@63>S3-NDllhh;ujr^39U!epfMlhjfF@7%fj zIc-b`AfZMe_!SZ$NQiBvuTz8_)m0U~1G>MEUH4fmTeVOhh@^KLL}mGqN{Hqi_y*c3 zL7p=#3hIQrNcp_^?|rL;3cq;kp4C9B49qjbW&G9!#ne+k)OoD zM(?M1*zaj5hNrib+@}QtpAugUX_^OL?0_v~yq(M3*>_0aPX}szbT`mOGha%sRrCLd1$JoWMMF(e+pw>$=z9Pz z9dT`2Dlolx8)4W7RfV-=ACy2AE6I|;*9fo@XMsC2T8m-@>vJZU&^W*S1htPK4iGk< zr^lOi<@f^GTf-scNxf}Ywk#x8k9O~J(YVBN=`ub2TXr!ApIS%u5L$iGM6LCgtQ%cp(o>p5)l}h<;ndpiDECbz) z-8wWTA^5ZiHBI3=`HS)l!XmP$UCeZ3sY*yMa6S1Q@V$x-sykQk#GzisPZ0_mzoEHOMj$$>3Fv8rA9EP^USjKl+L}^143lZtBHiiTbYM$g`2li`N$}ou^j0R(WDK`9LUDbvWPuDewDoE)g$bG-%?go^q^p zDR&x_SLrr;0z5veT4*Nb1I{Y+)vOA72d}9@LB(AR=wvTktz*Lryy(rA6Wyt#4j-s` zAa;F(Qmhz zuvXe3PgBDhUxc8xOsGL0%pDF`!$VrvE?WQ&Rf7^A2WE*P)m7G5emv{L%Od6c&!lRg zuXn8X3tQ4t3rhmMOwhx=7KHU;&5P^4h4o2uShUG!C-{8$1X6gU7c5XM<;6zBg6Vhc zZaRJ(@bt}sniw_2jPYRwc!pR+=}uf%n5KWvvt12Dj=go~`Ib+#v$?9GugE4M92_Ie zv0YMZ^Mcn7VqvR&*3B50be#BT`E8nDDz~Qy18DanJ#T)33WePE_0XOGgX0;jE05es zL;C`~+PK9aEFOI83XM0+44~|N<^nnxB#>OO&0+cBugiL?%0|?nOdF}P(}#sK*T7c& z1=C^2;N$v(g8N%vPXCAXzF(7#K?BHf?|wn~UC}I0t2GwR7d8nGe`B0|2xCfdXi;KVCGof9|cvInEpwELw_n^GlGxVqKZ17x~O?6T@aZDxu6zW{&P4D|gy6eP$xeXz>I;a|HwfR5U*$rle>KI%FpE7l~bpO8*jTi-dX@(2)d?D znqcBNhTwucuyYfDRnm@;xmq7(zV4}EOGuhTyFtvv`}I}qqG(K7557B+tmxvMnjI9(j-OaAmB zmCg>@Cf`=tB@w>PAjTQyv!Mev={k3*F5s4EiT5o}!0^W^?^92qvu9Dkn}RHEAVncF z`_)z$VB=kkvrWwvaWEyKp4L&YhZihn87Nix>TbIz0cEUE9iyrzR!3ZXvf%@6hXd=8}c6tKS?$-zU5Kkxl`;?lj+Rplk`w3aTwSM zBh-a6WW!|xZ(e?mSqzS|P0|Z)uXUIvtXL>wMjK`<;@V4S+=150Vo~1;#WOkCEf~LB zx)z3VkOT71$)rVUD!Cm${!Zrf#Bp|e{~L_YreV$3AkOXQ6rCoOQ}^yWl7A z*{gC55?O)*Zi|=XL+h&GO9XJ+2KTc#z*NgI&_Zv%PWba8Ti{Z5F1oP%`}Ug5v&w-r z5B8GdA42>^cY-6djp3vp#OviJ>=;ST^`cx0 z@2lSyTY^-S?!0|4Yi!d#d?Rwp;gpqBFhOlyYDN7F{k%Alh14`G0iYFRV*vNkdG`Fd z5|qPGBhOdhe3TY>Z9*{JLz2`rq&ei_gAtuM6UgPd?#=$@0wx~7iD1x8a!fqmmnJ|Gzaw@U3!m#)yXiQ*qy zfntVSwycAfc&;NZ_TrJ$G7wSvgB8HI3n*$*KHO{ z-ppCnp%#Cv7+CWi4b3oe@wmG(-^#DM$ary;ao*w|1Y7-A2Lil;%@q8xv8(y!2$j2m zW*p^3;rE3d$i$%wfCvY8AZ7ymCF0btTjhTelRxxp=)rq;=;jd&uw4g1wz(|bQmX+F z#vLHe?WO>jWGC8XxwP(6+!|x&y2$iK_GyRGUW2TrXZ$ z|EF{_|GMSXY1q-l0By+{q6Ahq0WDUIayL-YM_u;emNs-5zUIknDCwd)H(bqaR~lix z0JN5|;k^B!XM=F@a%`^t*}{#tUo05L^e9%paXNfeR80BfLxTrNKHq#5%^Gazj7<6z z4 zHm=X}j$ONIA|KRqT7H7V4M+b;eBA%?JPN~ zkVFN`fnlj{LT!RRK>_vV*rKzFa)XbKSibGNM<)obTu&oHT4}6x{n}@j)h!x|T}M1T z97>noChtfonQty+UpSv4{QOIufWQXSK1l!NcsO?I)bOdzZP-}v?5pwfr%yM0jQS>& zdF;(8p^NyUC2X!27K%Th!75=+Ylf;0S!V6LsrV=m+c2+Y(DRhALu10H z>p}L@h#_EX*JVryA=M8n3t_N~t?G0W4Qcv>ms}C@J;#=PdcxG%vd*%v#uF>eYm;{{ z*w%KRPH`#jeqSj&%>+MC-g6CKdbCgaJ~j?SrrbD)kf`sQAg7R`$ls8!VL|g?&XOA_ zE#}vm+nsvU2D^UPOgiGIEbs8j?FM4nGuzHIJIl7+D~7e}YVs9e(+>6hE$GdaKh8!4 z?O}UqOuZEKu?3n;^`|Kx1Zp{81Pb49_6*w}ffj~$LJn;g|jIi>@)X5elDqKvnmC5-QSHR$49_oKpaSsvT) zOJ#lkuOtos=LC%ZOg{R*Jr@ilb^ODbZt*0?^AVW_nRz|E>40H{tpj)b@B9A0FV6dufbTDQ z|GzTsdM!BkUz#-}_=kNm^fqMDs~|LkatH{Tq({&?JbVBZAeZHv*D!|QEJbbc#pN@( zXx|OZZc-fiTOpN|Y>5nv@h~!~8g?kJ4HNA(%hZ!A>p!`~EIVWSg7Hh^mSs2_)s2Aa zkYgLSUCb8m^UJy^r7AhNrdZsTIo5S@!Tm$|*)2x;fL)Vw^tCYCme+D>t%L<$H{ z0Qg{sgj6~}Q!jo9^dbeNQ}l=Ad^392615nH5swH))dkIVix-Dm6yTNzSp<@C#PNlvKhR2Q@6YL_jvuHQsM*I$x8`gYY6I|@xZ|LpE zbC7umm$Z<|@0qPAIT)>jri8h>DdN_|&uqT{AWVBbpOu99M8(w{LKyfque2{ zV5ff!n=$?*w)YLZThLS7N3_Z3Rw$6H&;pF6wyOq1Es;w!(TZehe#VVjtH+pKOUWPF zEoeJ3WEM`gnXbNt1qAkCf)s{{;MJ|~q^8W32w|c?{$1gwEGxu4Gv#xnN z>%|?8#T+j?N%m%MyRCVuaP;<)asal9_r_A633Io^a^+dJO@;Fl>TGO*ZDDpkk5Cm2 z;{++JtIrDb2zjLybG+)xu%<%k=nB?G{-R%DPo;jX-_2VD{f9w3U)^C<`oITfT~fBr zE+o>oruy?WW!OAZI#bg_&wWbU>-CXiZ0dLREz_yT2;jsjXi~Nc@qXcq1|*gyG_u`b zo?=`*e4_ET+-v}=s#S!!jLcHs6;D*vsPfxGX z;ea!zoQ|q2xJ8aVR7e>dtiOd1Qpv3uUbfWCdZAt0zhk83Wdg|1%5doN+KUn@%Kc2@ z$tEXW#U2KmTDULBjUI7@;4&2{l6U%C>ucUS9XTS;X@8a5K@-FVS_L8V+_UqwGAA?y zb+>fSBQr2FxzJ4)nmkY)W+?!7jq=cT;vY=*KIBoY_TQS|oQKrG)JUM%*xv=Ryt+n=b<${;wkD|M6L0{{p4G5bat34tdaq*jftGn1S+Y-rG6$-(Psts``I&H$#oG|asCV&R47MwL z!I;=a!VJcYv zfCH8YBkKYb3{gNZ+G_wz5afuz?a}~W7(awNut05QWeK z=gw3c5tBi_I&@EZ$_ygi(d>6!=p`E zYYVApjoQ77E4F-8mF#4ojfdi)G!En6+_V8OX{#Ty7*7S-vzD_M@{V=h(}bW~zNdOJ z0&1tp4ls^Z+*l}+D-qFV0b|aW0aL~M#%vDCWir&NZTeN7?sX{Rg*`fa7Qq0S;r=Aw ztV*+%1}-%taEh-_aY_1H@kK@~^Qk(9TQN=(HVs@a3?(H-=z;BRAs^PqM+QoTCuT6> zG6dJS4c|Wr1Oo&sOWkqGutkjAR zd}2c9CrCQNVP0bXQ7|!y{qNR(5Y`&B1m4!A0Zk~5dzu>jPO)6RIE7oU1lhmVXaAPCJ&Kn~Fa$H|viEsXE0 zmdVO$k4YTj3>}e6e4FvkdnA1mPWI9gr#;1VV5ld7xe*)~p8n0Mr0E|Yux*ISQX1Sr z03r4`vdD}DVNkFny+!$L0F3lT?09Cc!c|L``;>le$=29r^=mg7q(jGU=b1)4Quy3Fp_mu*;e&4N&S`V8*?#9q!N1Y!C098VmEN> zFY!G8TrK^-h8MrcLH`xW`u{q;BG0!_12P;p{RGW2qyYpjU_!R9jh*Is*}8fm>XPp@TVO7-v`QWF5p41O_%x2-B> zjP3)Y$q<^5Q5$006TuB5@b&^HH89kR=_g7a2o<)2sSE(FH5UhWH`z;@(!TAl{WceE z|M&k6!BsUH0xl3sME##2I$)H&ckYA_jr?;RWHv_#8}oJs*WYloGd zT447)N-s7Ls%TahA)ggwDzmLJUrL8K{Kn|}oA*b;D0kh13liI3ZLE$FFRb^>D#y78 zm6T*j=sM3L+6RGV=;u;as)~!vY&C!Q-TT5=scs2q`>mpW*$@AWKUV-G zkmS^9+Nbfo;p$&X$ahEn*Cp^b&f}kdN+v^SnwGTs!mZMCf3bLS_g1+2o3FyJ&-_1l zOg|6($M91qFc0{L(dao$F2Ce$7Y6-BAbNd^29d^0dCgn?u|2v&Fx5QBVIe^vd>(FY=GX0)Mu@FdeC$gtY9-(2yS{^&Q?> zHSMaXTv&Kunj`(ydlclpPz3WhIOnc_lxsD4$`#1;>^!+ z`L6b$=LPEN?LuF!Y5+VX^Quo*r>zHMlvq?QVC`B`T2`Z3#LqZsSRGOokD@D9dIOkl z+@SmOPmtv^^m#Wy_BEjyOe|m%TCs#(hX=hk4xVpZ%;u#t*U@+wGI#%z#RpUG>iuVm z8+U$p!Duf8@W9L3r{Z0P>B45hWY-rZ54ooPFla>;A2zq$XWl>g_w?^FDh5y?G6VbH z?-m&UD;L-Or-^od7HR$}-2A^-F8)i+^Z$bd#vEEERO|69rd5uMto~!Z1UJ0BSCaTe z7SG`)rmZnEgGWm($g$AAiz*+y_Vo~NfeD84c?10~4Z%6ujGt!v3CbVs;>3w7v>jPQ zn|AfUMn&A0pwG+#E2f+FtFjp%U#}zH>|jk$ue>nc;3@Q&XF(&tp=Q-NVWgL)4!Bi8 zW+ht>pv?|5r$0H(irnuv);2_ztx5EkevPmC;xuI>cFbT;;gvA+eBOqBs-f!ir}Qnw z_j&ab*n>it3qWxy(H3>J7bV(f;L7pt&48TMxGTxv!S)XP>kHrM%~Ypz$FnbVuh{$K zhuZhk1VU}eK2A83#9b(Nl^mB_>y@GHEazLO#@eXcX~%6_x&&W|-dACZjgYT($R~EL z_Rd0%Q6w(A`27f3!73wo9$G^nu zKnfqylsS58y}Iv%E7af_gSw*U#TQXMUBxXo1{d~XYtD&KLI>8KG6B?UJ+Na5=HcSO<^*6_o z?pXi}cqS_T;y`*^O{vy_lQC=P#Ivs<*D|gf-V2h<5w>7nL~9;ohO3h|+SFAZ%QeDg zWrP?4%vo*YHgvorb!%RQqp5uMjA5+rR0{Mgxz2o!rGGP3HZ zA5(7jjqj^3Bs1=wdxGAPN(Ir4q}y*OdU+aagi;Um`*zjXpF&w^QdAMLLl9Bu8o3oo zhPPuG@|Nq%UX>ip@7uVmE9cGq@Gz_3hmCUqTmZM6NvsrsWyC|`l)TuepkbPb+w`bH z;DBxQQ~iK9Vp5t$PuF=K+L>lNKiXwoN@VE-3!*Pjtf;34HKk_l?hRE<+Sm?y z0Zi$NoamS_EU%^Wb9G*Gp^J3zhYFQMhnet72M>zMVz|J-!Z0wAWgw@Rq}-@Cel~A& z>A2=9lQ1V!Q`f>LTaQ4Bjr~T=@aER&8~ffg2gc4}ZS8|hniy4qNKZ8xY4moQg&o(; zaksXwI*~M%?lR|ns>-zE@QKixmiebMjtL{iM4z^71#$-EvK19fKs;M-6e7WnlLy-UKT9 zcriG9I4=Ms8WI7~%TYjgfUknJc2H#iz%K>QPUBbu06+J|gS7+vUf(B7BTRh@QdK}L zG#MK4SzspzPDf)FUxeNTa6l~#c@0b;Vpw2(5@di&^CSdD#O^~vV1YDWFRu-74}9$` z>`a(lfc8Uxl8{KYp*@rA0FXg|NuLCu7VJU<)B&eS0)(s&ke5V|dAq1x_<|mZMx-v% z`qMUJ|Fz$6#EgI%$^f zB`Jf!%tHf3v?{vrxCNy439{w&#O#%}dfke%SZkZbmq4JSUkS?wmJ1h?+8>X38qQa9 zKCz$4FObir<54*5DrlQo>T}@%63u8`dplf~d`?+hOu4~hfJQ0vr;zp!va5cWr_Eww z=VUU&vhTn4UN*fBDFuZb@NAh+Q@I030aYFt!pGqSX>K7IQQDlmc8n@NxD_MfuD(lS zzD>9={Y^O?`P9!`_p>|Xm>a)C!kQ4yKD-tX{_H66Y(52;Oz^-%OUb?)^weuZZ(i0> zz|p5PEEBlu(jI4z(aeP*gd5;B1JiBz*cSf&+u9TaSx?*?%f@v0`XqTt2YlNUDuG-p zC$itrS9e}T5ta2^HEUzua$DT3qnByW&|~xYyt{D~Qw{B;bmjAcE9`5CVqm3(_y=pR zQx*jkva_C1&m00S@d2{XU_Y7Yl(iMq?I#c; zFZ_Vt^C5Spu(KpTGRJ{9y%rxPEagtX;A&o0+=;=AEWEx_+cUhmpZ!=W8Q})n?}*#G zu6IzaBo!p6s?Fjm+sO>~dEanazL(>?+QL?W*HPLSU(XK{QL4-U_Zemx-;BCl(MQ(_ z^5Y{^9qr;=v^9#XPcer|8-)>9TJ~W=StX%@70*W(Rj$^FQp0}3cYF`t%5KU*T*vwx zB`~9!C)}BY=MKwAIiPHb>7f18xIJ%(lU%%t;mKui)lX2O#57P5yj`lXn30u-*FFs1 zJbV8jPHCkZuDk+_<&p9%Rfh*lbLlgI@i4vCBK?v(VCD({M?gfVjWpl;L+Bf@QK{=T zpET`59mv(8bSF^c3dafmojp(|&Jq9_qWr1KgF!=$xoHgkzn?(X1vigi^W65^?4dp1UcVErS zgOJx+cf(S_=FN4cUSdl6qT`9@0dZ%u zDa1MTxvs-(sgXri=7-BpIa8IjpchSwmlN2L)ts>AtKt0JLWYHb&a=GLqS}Vino=6l zi3dY6s7zvy)birpq$#B&2`jvPTm%Th90m$401hNjP2(1%Qu{CNt+o9bjYNS zQSDLhmt3pHOW}?K^W^)`xJr$(O*zH6YC7-b1!Q?MvbZRR{S{CNy7eaP=K|ya49SXU zC3pJsH&UIG$F#tmIS>AgfvJ2z6-1<5vu(^aS?%Mg@N5Y~c323AK3R3>>_@-LjnF=O zP+Se;UYTb9je5KxxX3%jPt?S4`eJHaEdrv}&lqIu8dpT99MMS}NE}^*T3PmjQ47=% zD$oN}qyRwLq6hTyNaSm(x5h^bfXhgAsYk`x3Oftm1P}wB9snUbPSyGlxL`r>E6Yy_ zOuJeWw@zvjkBK=eUmnjW0~)*Ukzsd&W{LsapVt-ixy`fiJNLGRK_7i zQ{Au=3(>!lfBy9N7nJSSV2=N4yV-v*s70)!`q9P*k+8W)02p>q2I}2HrsA((AyJ>M zr-M1R{en!|*|9dw(=4Hv`C>iO%tsoHBu>sP(!c|A=-RlG4<(+9wcR02=!IAoTXtou zqKF%-3ltr4?`(L+%s3oOmTI>kJ-%Ugs@u)_ZqzCGTs#@CVRSqXBCp1eWi4NlmYApG>|itD}m~6x@*d8qP%x zB!>`vTJ}-#no>oV75!baEYwbHdalJ4pTGTb>$@_Wf$9TMHcg{upuNo?c3({Ctaol- zW5l&5l9qhIOvcX6zDVX5;vrp*Z8=|#%ogc=Zv}CfuiEC*z5rw(Ty(EvJ;DwEZEa~I zf;9yc)j+QHbhczCA3R3DO@2|Cmy3_WZ8_zmD_-~6;rJOV(8Uw6TJ8FzND6V@l-o3;4{f#h#I2GuYA>C>Q>xXu80I`lnfnrTy4r+2cQ$|{Zs zZO1MoHF^^|XT_@PiC#~fB1bGMo?aGx?|gnz_=kLpGP9!`So{3ZlNoO%GV1Fm{rIZ% z8%&`>p-e={P9)Z0<`ews8tvrL6k{r1ZIni_ohVCPwb~1d6Ynf_o}Wy7U(FNGAaO7& zRYHKQ*}b=+b?i>_gqjsz4ww}Y`Zln>CPSB`kug}y`&qT;`=+{QhfbF`J?PfQ&G-t} zhEbj?J6c<_k_lWm0N2CXBpZ3QqH{_WdEU`B9s*?%$>d*orOEX3U32~n04n(#=xTWoo+iGQAn&PsNpej@} zUiw~=Z(GBlJYCWwoykngJ`4yp7agh&EM9=gPf)+h()G-{EQJ9N6EX|jMdts2KZDG7 z|FJHc#7^j~gj%0Bv#$#lg{IGFT zZ}Z|($GOwwJ<8QdX{0MfX}l!|$4lwp{n8ov;*$6=zYXl~(tT*ktjZd+NIGU<1~7Zy z{wmAk6>2U~7@$dgo{*I3z4KwE6e!LT`Jt>-$$=x2UkB``dQ&?1SWqH+KYo7!Yq-fl zg`d#F7b}pXifQ-faRb#%w~vI}zsIJJ3QnM^9h4#S{WI{45!>xPC?|M~{f+ntm9xp{ zU9-o@2v5L;AYE$S35d76F==4jxu)dqm=l`Y+bkI3sZANAy|TkNPABp8G6Mah+Xh#a z_v2R;gq^H93HXMT&u5W~8w0?%km&6G!MaOFK-3r)pg38 zyU4`{%?qkG5CKW5s-!jZYB*VPCx5sMTiA(6On-n}I`Q*)qEFbB$Ge^E42y2L#G&MnZT=+@1W$oGNz%qZPmBiYWPUuw|F!|K(>W*8v zOlS0t-4(HBipf)C$4`(?h10@PoKD!;0K6C563w-Kq^kQaM_abvoO>lo%~MKn1#j8f zeO9vpE*f~BNj|G$g?HI|C29xn4BSsKnPU)=NodEl~I zu)FMIyC%*qXil%O-xnYR;o*a{)jWdPbrzD>6Z<1iK0KK~i&RbMiYRbM zmdbTez!8lKu5m_ut?XZPinMf-VtWZQa#75|rTG>sA7p$w_BuL5_xV zhguGbWVlJ&b0@3#jL3e0C_v35T*1t7nDqp+BKrBNgF3klP9Fz~cBp;5Jp5%%yq92V zCuqI)v%tn3}ljgT0!%esOg$B?(87AUVVHMxj zt>%QL1+1G6;WZJbJ_dX54+YUK(1!WCGr(gjt@Yu1+Gb&RRj17n)xD_PluqwG|KE$l z6h~s=T1IF^1~eANvRxv};Z+@y@wkEU$HBO){&iLD?F@*=q~Q&L@S{um_YJM8_0&{n z<`Kzjox|u#pfJ3KBCxNeg)r*}5sh#)ufJZ{99=ZK{w0j@HAs9S(ilDiUAqgztYNA; zlJ7{Qyeuc^=0|dgQ48WSlL94C_Uu#TqEFKLv9u=|7*Jt0y|vsdmsYe^Bi#2(twI(%Si` zT!@)2XZLO|Za2pWzO`_v!RyVv#+7V!8|m5XfHrHD!bVT#DH}Y-&^WZ=Oh3&ozc7tF zjZOs?0S46GcOYXM#+y`3wDqx$3kE9ZtJ)SIAtAy$8#ct8t;OOAc#8aqN1WU`Ra;n{Fwk%k~s#I_<&MRu05H;SS&Vz2pqU|%7 zXM!JA51Q%-qynkxB}0b=brVjo-O_&$U$vmdFYIH@(f0g3M{Z8f!#wW&KMxoG+h&+@ zf9Fs7A6b?Hxkw!0XQ&1P7!NzPfop`r`YdaQUy54h*|L+53- zMBPWj&xZF^P*vc~^%C008;R^~2=4J`GMeHJ0it)j`CUVoKDTtl2Jc1MXfV->s6wAk zi48QKrG$iPG!A>+?O#P|o+!HA>=z^!C;9qo;zy0+Dh@Rva8q+_iNHK#vNy#wkiAn| z3>_uMK#px0SgcW>=$9KUEwjFo#owLyLE=i2$LNWz=XFX;k2*rLH^wk_0lzz(Wj~%V z!SY!lW_e;GftNZ7WGzb$<)(&--_K2tPR%XfI;pRIZdUg&XTGm^woaKsvwCz{3dKLv z z$-fuFg-#|+et9dh{Pwy!YJEz5&+XaEtPvXppbvWjpq*>9vw_ZSqH^kHE}{(OsiuW1 z^@$z5YeIorKkSZvVV9MD9A%Y~6xFyv6Kv>6o8qV?N9I$724!jd<1M=|EK0ELhMO6- z%*a;%-i;SIXHOVKH9hV)T9U{JC9-$G#>{5k1%fT_@e?2L?#SBuTkA4YX?ZNj$Im zu)4}j8q2KH)in{EWRf1L zKE=KN?qvCr!d=jx-G&l^ZY z7>wJG8P#rob6&fZ=PE_oOY3s>?S0WGRJshV;7bgJ<+^J#5tZv^dO6&E0%HwELX`@u z_pjW)vgNU$tCslqgjjY%(vc(j<`ktsl{T~(xf%Zxlt4Sv=cx&dP?#=_FYW8a7In@~ zjgRT@T%M4lRyuDE)i{f1665xzIn{-4n!O4se24TmsoeU$H$b8%OqJqH#t`}gJD+5C z^)WUpoysWE#^JZu3K`u&58b;P>4B(T2nXaYLV&D7QO92+igaL#+|>QhZ@F|FUufVo z#TPnwjWYs7&+n~4J^M0u3U{9QTy=HB!XAbwIlduI$HQFMc>qsFt`f3r?+j2vcXGMB z0HVNm1zl0)m3Ia7y*0p@A_=DEhM{*X`w;XG))hCR&~(-HQAX}XiY!M9VjRy9Yf(Dp zFE@{h9~N9!5mn1C%F;cv*`d6A>)t>^z(1|by!D$?gBn$OKgGB{rnA)_rE;Ou%ZoMF zP@@0iE#0F`_e?-^J;wJ!r2)Guc%nAj<92)YK^u$jvZvFJda*V>FL`%XQ2CsM`Kf@j zt#pr}03W5<2-rwBF2iw%$XUYNR;bhn{xQY*My!RglHb|gp+?k;BbgrsYcxTbV-7pjshV(xySGNdoQ20KUiG$b{t$|1n>>9(L{7La(u1(#yfSCN<#$ z!=xl=X!0moaN>?5q0Ez!e+SyjO}C@Pvsq00f)U)f@@^M!MNrH~6hQSQ5IkH`rpD}%F=&r%iQDt}`LK{UDi?lO|n5@{04xjOKZlY!_B z_awG{o`_D7y29`JgT_A05BqZw{ybc6sfN2qnUNJhXqXt$yzXJ!RV=g2VDKGO)43W}5cbC-d0gBygiQ9LhJg?6s*K@=o) z+GX$roVEAMqsc(c5KFE1;r(DH-}`rqLT?b9p50(!K^(W%j|k>^qcOthM4=1swKxQk z!|6rLD?C;-Iqj3(@Ol9mJXwB_NMm#@FYyQLmm1H-6+Z0A5b`0%fsFM@U&-tRPKZl{D=pSwDlsh$ zT2m&iUQDOgF+_FTtF7As2`quQ|=x(76{!3$;ci5#xgLmG`gxXQ=ESt7Ka8RmTzetz#NbuTVB&heW zzK|dOT`=u;=VF;EXpI(L+mU?vU2!J6%ns96hmqGK0R!;Tb7Xzer$)}8>Eqf0K*1!- zwYtyqwN7)6+JB6rvouT}XeR1Zny0aGn^)wBl-8Q3Idj%zr3=$`1A>VVXBhV6!rg|0 zLfP5w_2;((Z?+$~5NOx@QGO-+)s7Z$?US9W8ALOS+Yn&--|_!Eu^RZ zAmI*iyi^`@&*b=&6tJBGNfY<+336jQuWFjKU_(W{o+19o_lr5~sTf?OVtm^(KY*OT zuVgAnkkxedPMmOxExd{ZQ@NnHE0Hk;AINwc%X|H&^Je4IZZoNMgY+H-?Z@rc06rDh z*~zB>o#Zl#U28gO-OteWy-Y~)WF0=i(SZFLdy~aC1eGh6rhCYWpw&WkrBc78rlE)dLVy&vCB$RrLi?|hpD>+&&OV~&!|Sc zi4J^EJ;itj+>)D$y`^>L#8n0EXwcjK_(b5tq2fX>A2gHzE`9ac<(LqAJJ)iJ&!Zhs zsV&5ArM(%j-KUWNpfiQI=#MvOlC8aRg0kbgmc`s?~@`)3u ztnk%mi2BxYo*CF*y*j{a`K$MqD;i#hc!+lPEB-xgOy$#hTEu-B0bT&@RAWlMrPjclu+gDjmU*}%&bFXgU9YcULJNvpX z+`lkB|Md93FMw9*kNO#O#UF|-70%O`chp@%_4@`*bwih?xGkU3Mll@K$QTSA)hr}u zslIA=U}?luwz%MGxroH;r#6N6njUUTF)L9iHeCeRDU$!{g$KvO=FW@2I0)bD1+1UhNRL zDVe=(Sd-~TlC@Ma*@$>qm;8;e zp-W7QopbKrF$f*JEss_{T--5=e^gO{@V4`Fl9{Ee_2-MAU`3vG~DaDy7T4(QC^yx#_- zL5$R@2J?$VM}(x^o&Vl zILCyW0-(el-dBL;B>6qdO1X2Y=3My+OUdUs^pQ(;ipyu(W#3PHG4vbML8U;>cfo{rYwJP4S1g*9+F?dOna2H}=h4opEispXagP_unan zJXKwv{EWXn{iyxs?Qc21o$nUkRkz;!sKzCYKb`_Nc7=Y?igQc*QG){@;05-^BCT@%}J> zsJr)0oXCZ>UnW=FUYF){Ugt@H`EH@J4GfYtVe6Y}*ca4Te)v-*-}dE1$yzl_xzMfc z*3Oo9j!(Wb=UULJrIPM@_iWplWtKTl{x{=BU=>gkQT%&_>EAeu>%UijyXj|dRB!dm zYQrDt2esS}-}RKt=X*IJ)2eV@^yK_Jj(8(?edStjp?ax1YhIlVnX_cqbkWlyO_;}T zhF>y8-W(2Wq5yY?&jqa@;Hu}gTX_Mv#gY>^o^X9n{V$HySKWbg;1l$K+X~}80#__4 ztq;FGF92y{45Kq}5y}MH`+ti~?SP6De&_#svj0Q~)-ra!W2p{MYa6W1LJeAU_WJtc zT|n1{{3~6Gxc1pMPc4%gR@JP=v!iYOwEeh_;|pQbg)O22(jthB+$-)WdDNoL;t`34EO&td@MoZmwr7f zdj4ya{i)2i>34ogm%p6|Jgri{rT#lmJ@9Dm#pnX-*JUhzwWWTF?V8{7i7;EFHmv_s z`_}ozPjc^XbN|b*PY~#g-@t4A^m@_w`*tqsUv=?cz_-osYI?zw84JV@*#ni<1J5JB zh%Rvbx=A3&)pFNr&rjT5cT@fg(+}VrdHwtU3_o_6qw#+|>#|$z{?qx^`JJE4-rrXK z*AUO~pW*#qkPpQ}@_>id^~G-y+qy8SChOWB^@Ts!J@>Z<%|E=4zprd^_$;-%LjF2y z%8p1dg;qUY=8*bZu4~1^m9C#PUa1H9Ms5^!TEURO#~`vLFjftDOCl?8`9Fv6^>1AA9Mv;9Ca$8>I zOkA^ZlT|x#i+17!$2S?wPv-y7n74b`kLllhV*Nd?=$6KsJw763?Od5`be5+f-i-)@((&z0a*G%Wsm~}?aqs?h|toxl4dCl5ip1-;M z+v-QN`$yONN6&Xp|M>d8*w^YprlXu^=eNH(uXO5aaby3Z`bU;ep3ZErn~2n$BK2M^ zCZy&sN$jDx=i20FKtwdtEB+~a|F-(ua29jBD!p@)P`lBKE;0j8RjScXQ-9`xPX`6(th9mxpGDIwe?oO1**uBi~k9pwfH zC;xu^E2%vB_v@e6_DC}QKio~;|NJKZ{UY$d9EJ;@C;xu^^CH-pYk&m?lAJw~E0{+e zhzQ%!5Tr#KyjHeX$#l}gDgQoJ|Goqqkwau7U_#}%Hy3zm@8Qq(j~#l6%mIzFAejW5 z4@!ZpMviA-GyeUuf9(E8c~<$K-~7K{0H>xHGQgSS1vn>Mt4EfSN6N5_MCN6-c5p84 z2ZvEQhDQ*NaF7~xAlA}wGz96F28HgPn4h!R`ucnG`mF8nTEOiKsD_vctRRppfzjG* zw7P}1N=DlQ1X@(rVtS;HOnV~We)IP7Yrs8T$nCV1YfGxVC2LM3a{M?}lp$CK9Q!pU rdkm7o6Gw>YK8*l1j6c=_hhTuiFq4-7cb|7o+y5)J7I>Zx`~RB&kn;59 literal 0 HcmV?d00001 diff --git a/vllm/distributed/kv_transfer/kv_connector/__init__.py b/vllm/distributed/kv_transfer/kv_connector/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/kv_connector/base.py b/vllm/distributed/kv_transfer/kv_connector/base.py new file mode 100644 index 0000000000000..6089e3babac3e --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/base.py @@ -0,0 +1,122 @@ +""" +KVConnectorBase Class for Distributed KV Cache & Hidden State communication + +The class provides two primary abstract methods: +1. send_kv_caches_and_hidden_states(): Send KV caches and hidden states +2. recv_kv_caches_and_hidden_states(): Recv KV caches and hidden states +""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Tuple, Union + +import torch + +from vllm.sequence import IntermediateTensors + +if TYPE_CHECKING: + from vllm.config import VllmConfig + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + + +class KVConnectorBase(ABC): + """ + Abstract base class for a KV connector. + + The class provides two primary abstract methods: + 1. send_kv_caches_and_hidden_states(): Send KV caches and hidden states + 2. recv_kv_caches_and_hidden_states(): Recv KV caches and hidden states + """ + + @abstractmethod + def __init__( + self, + rank: int, + local_rank: int, + config: "VllmConfig", + ): + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + connector when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + """ + Send KV caches and hidden states to the connector. + + This method processes the input tokens, KV caches, and + hidden/intermediate states for a given model and sends the data to the + decode instance. + + Args: + model_executable (torch.nn.Module): The model executable containing + start and end layer information. + model_input (ModelInputForGPUWithSamplingMetadata): The input + metadata from vLLM. + kv_caches (List[torch.Tensor]): List of KV caches (keys and values) + for each layer. + hidden_or_intermediate_states (Union[torch.Tensor, + IntermediateTensors]): + The hidden or intermediate states associated with the tokens. + + Returns: + None + + """ + + raise NotImplementedError + + @abstractmethod + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + """ + Receive KV caches and hidden states from the connector. + + This method attempts to retrieve KV caches and hidden states for input + tokens. If all required KV caches and hidden states are received, it + will bypass model input, else it will fall back to normal vLLM model + forwarding. + + Args: + model_executable (torch.nn.Module): + The model executable from vLLM modelrunner. + model_input (ModelInputForGPUWithSamplingMetadata): + The model input from vLLM modelrunner. + kv_caches (List[torch.Tensor]): + List of KV caches for each layer. + + Returns: + - hidden_or_intermediate_states (torch.Tensor or + IntermediateTensors): + Concatenated hidden states if all required data is retrieved, + otherwise `None`. + - bypass_model_exec (bool): + Indicates whether the model execution can be skipped (True) or + needs to be redone (False). + - model_input (ModelInputForGPUWithSamplingMetadata): + Optionally adjusted input metadata for re-execution when + `bypass_model_exec=False`. + + """ + + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py new file mode 100644 index 0000000000000..015f892cec933 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from .base import KVConnectorBase + +if TYPE_CHECKING: + from vllm.config import VllmConfig + + +class KVConnectorFactory: + + @staticmethod + def create_connector(rank: int, local_rank: int, + config: "VllmConfig") -> KVConnectorBase: + if config.kv_transfer_config.kv_connector == 'PyNcclConnector': + from .simple_connector import SimpleConnector + return SimpleConnector(rank, local_rank, config) + else: + raise ValueError(f"Unsupported connector type: " + f"{config.kv_connector}") diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py new file mode 100644 index 0000000000000..5870070a54c75 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -0,0 +1,261 @@ +""" +Simple KV Cache Connector for Distributed Machine Learning Inference + +The SimpleConnector transfers KV caches between prefill vLLM worker (KV cache +producer) and decode vLLM worker (KV cache consumer) using PyNcclPipe. + +But the logic can be extended to support other pipe and lookup buffer. +""" +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +import torch + +from vllm import _custom_ops as ops +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( + SimpleBuffer) +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + +logger = init_logger(__name__) + + +class SimpleConnector(KVConnectorBase): + + def __init__( + self, + rank: int, + local_rank: int, + config: VllmConfig, + ): + + self.config = config.kv_transfer_config + + logger.info("Initializing PyNcclConfig under kv_transfer_config %s", + self.config) + + self.lookup_buffer_size = self.config.kv_buffer_size + + self.producer_buffer: Optional[SimpleBuffer] = None + self.consumer_buffer: Optional[SimpleBuffer] = None + + # 2 pipes for every rank in the world + port_offset_base = 2 * rank + + # In disaggregated prefill, the prefill vLLM only uses send pipe + # and the decode vLLM only uses recv pipe + if self.config.is_kv_producer: + + self.producer_data_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base, + ) + self.producer_signal_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base + 1, + device="cpu", + ) + self.producer_buffer = SimpleBuffer(self.producer_signal_pipe, + self.producer_data_pipe, + self.config.kv_buffer_size) + + else: + + # the current vLLM instance is KV consumer, so it needs to connect + # its recv pipe to the send pipe of KV producder + self.consumer_data_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base, + ) + self.consumer_signal_pipe = PyNcclPipe( + local_rank=local_rank, + config=self.config, + port_offset=port_offset_base + 1, + device="cpu", + ) + self.consumer_buffer = SimpleBuffer( + self.consumer_signal_pipe, + self.consumer_data_pipe, + self.config.kv_buffer_size, + ) + + def select(self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + + assert self.consumer_buffer is not None, "Please initialize the "\ + "consumer buffer before calling select." + return self.consumer_buffer.drop_select(input_tokens, roi) + + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: + + assert self.producer_buffer is not None, "Please initialize the "\ + "producer buffer before calling insert." + + self.producer_buffer.insert(input_tokens, roi, key, value, hidden) + + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer + + # query_lens contains new KV caches that are added to vLLM. + # so we will send them to decode instance + # FIXME(Kuntai): This assume that all requests are prefill. + for idx, slen in enumerate(seq_lens): + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + current_tokens = input_tokens_tensor[start_pos:end_pos] + + keys, values = [], [] + + for layer_id in range(start_layer, end_layer): + kv_cache = kv_caches[layer_id - start_layer] + + _, _, num_heads, head_size = kv_cache[0].shape + + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + + current_slot_mapping = slot_mapping_flat[start_pos:end_pos] + + keys.append(key_cache[current_slot_mapping].unsqueeze(0)) + values.append(value_cache[current_slot_mapping].unsqueeze(0)) + + keys = torch.cat(keys, dim=0) + values = torch.cat(values, dim=0) + + self.insert(current_tokens, + torch.ones_like(current_tokens, + dtype=bool), keys, values, + hidden_or_intermediate_states[start_pos:end_pos]) + + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) + + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + + # When bypass_model_exec is set to False, it means that at least for one + # request its corresponding KV cache or hidden state is missing. + # In this case we need to do prefilling to recompute missing KV cache + # and hidden states. + bypass_model_exec = True + + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + + hidden_or_intermediate_states_for_one_req = [] + + input_tokens_list = [] + num_computed_tokens_list = [] + start_pos_list = [] + + # enumerate different requests + # FIXME(Kuntai): This impl assumes that all requests are prefill. + for idx, slen in enumerate(seq_lens): + + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + current_tokens = input_tokens_tensor[start_pos:end_pos] + num_tokens = slen + + # collecting data for rebuilding the input + input_tokens_list.append(current_tokens) + start_pos_list.append(start_pos) + + ret = self.select(current_tokens, + torch.ones_like(current_tokens, dtype=bool)) + if ret[0] is None: + # didn't find any match. + bypass_model_exec = False + num_computed_tokens_list.append(0) + continue + + roi: torch.Tensor = ret[1] + keys: torch.Tensor = ret[2] + values: torch.Tensor = ret[3] + hidden: torch.Tensor = ret[4] + + num_computed_tokens = roi.shape[0] + num_computed_tokens_list.append(num_computed_tokens) + + # check if both KV cache and the hidden states are received + # If not, need to redo the forwarding to compute missing states + if not all([(num_computed_tokens == num_tokens), hidden is not None + ]): + bypass_model_exec = False + + # update the end position based on how many tokens are cached. + end_pos = start_pos + num_computed_tokens + + # put received KV caches into paged memory + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + kv_cache = kv_caches[i - model_executable.model.start_layer] + layer = model_executable.model.layers[i] + + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + keys[i - model_executable.model.start_layer].to( + key_cache.device), + values[i - model_executable.model.start_layer].to( + value_cache.device), + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + hidden_or_intermediate_states_for_one_req.append(hidden) + + if not bypass_model_exec: + # Some of the KV cache is not retrieved + # Here we will fall back to normal model forwarding + # But optionally you can adjust model_input so that you only do + # prefilling on those tokens that are missing KV caches. + logger.debug( + "[rank%d]: Failed to receive all KVs and hidden " + "states, redo model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = None + + else: + logger.debug( + "[rank%d]: Successfully received all KVs and hidden " + "states, skip model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) + + return hidden_or_intermediate_states, bypass_model_exec, model_input + + def close(self): + self.producer_data_pipe.close() + self.producer_signal_pipe.close() + self.consumer_data_pipe.close() + self.consumer_signal_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py new file mode 100644 index 0000000000000..bad119a1aa929 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -0,0 +1,108 @@ +""" +This file contains a new class `KVLookupBufferBase` that allows developers to +think of KV cache operations as inserting new KV cache entries (`insert`) +into the lookup buffer and querying existing KV caches (`drop_select`) +from the lookup buffer. + +All distributed communications are abstracted behind this class. +""" + +from abc import ABC, abstractmethod +from typing import List, Optional + +import torch + + +class KVLookupBufferBase(ABC): + """ + Abstract base class for a lookup buffer. + + This class provides an abstraction for a key-value (KV) cache lookup buffer. + + The key of the lookup buffer: + - input_tokens: token IDs of the request + - roi: a binary mask on top of input_tokens. + - Purpose of roi: Since KV cache may only be available for a subset of + tokens in the input (for example, when vLLM is connected to an external + KV cache service), roi specifies the subset of tokens that the KV cache + is associated with. + - NOTE: roi can be further extended to describe which part of KV the + current process is holding (each process may only hold a part of KV + due to TP and PP). This is not implemented for now. + + The value of the lookup buffer: + - key: the key tensor in the KV cache + - value: the value tensor in the KV cache + - hidden: the final hidden state generated by model forwarding. This allows + vLLM to bypass further model forwarding by transmitting the hidden state. + """ + + @abstractmethod + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: + """Insert into the lookup buffer. + + The functionality is similar to the following python statement + ``` + buffer[input_tokens, roi] = [key, value, hidden] + ``` + + FIXME: in the future, we should only have two arguments, key and value, + where key is a tensor dict and value is a tensor dict. + + FIXME: we should transmit both sampler outputs and the hidden states. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + key (torch.Tensor): The key tensor in the KV cache. + value (torch.Tensor): The value tensor in the KV cache. + hidden (torch.Tensor): The final hidden state tensor generated + during model forwarding to bypass model + forwarding. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def drop_select( + self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + """Select and *drop* KV cache entries from the lookup buffer. + + The functionality is similar to the following python statements + ``` + ret = buffer.pop(input_tokens, roi) + return ret + ``` + + If `input_tokens` and `roi` is `None`, it means selecting any of the + KV caches in the buffer, return, and remove it from the buffer, useful + when offloading KV cache to KV cache storage service. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + + Returns: + List[Optional[torch.Tensor]]: A list of tensors. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + lookup buffer when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py new file mode 100644 index 0000000000000..fe8d8d7375f36 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -0,0 +1,242 @@ +""" + Implements a distributed key-value (KV) cache transfer mechanism. + + Key Features: + - Distributed KV cache transmission using PyNccl pipes. + - Non-blocking `insert`, blocking `drop_select`. + - Use CPU signal pipe to avoid racing condition + - Handles buffer size constraints and provide backpressure mechanism to + stop the prefill instance when the decode instance is slow. +""" +import threading +import time +from collections import deque +from typing import Deque, List, Optional, Union + +import torch + +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class SimpleBuffer(KVLookupBufferBase): + + def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, + buffer_size_thresh: float): + """ + signal_pipe: on CPU + + NOTE: on-device recv will block all threads in the process, making the + KV cache producer unable to listen to new request while transmitting + KV cache. Luckily CPU recv only blocks the current thread so we use + CPU recv to listen to new request. + + data_pipe: on device (e.g. GPU) + """ + + self.buffer: Deque[List[torch.Tensor]] = deque() + + self.buffer_size = 0 + self.buffer_size_threshold = buffer_size_thresh + self.buffer_lock = threading.Lock() + self.signal_pipe = signal_pipe + self.data_pipe = data_pipe + self.request_handling_thread: Optional[threading.Thread] = None + + self.normal_signal = torch.tensor([0], device="cpu") + self.end_signal = None + + def _matches(self, tokens_roi_sender: List[torch.Tensor], + tokens_roi_recver: List[torch.Tensor]): + + # tokens_roi_sender: tokens and roi of the producer (in the buffer) + # tokens_roi_recver: tokens and roi of the consumer (query) + + tokens_sender = tokens_roi_sender[0] + tokens_recver = tokens_roi_recver[0] + roi_sender = tokens_roi_sender[1] + roi_recver = tokens_roi_recver[1] + + if tokens_recver is None: + # consumer sends an empty request + # semantics: DROP SELECT * LIMIT 1 + # so any of the data in the buffer can be drop-selected + return True + + # Assuming that roi is a binary mask on tokens + tokens_sender = tokens_sender[roi_sender] + tokens_recver = tokens_recver[roi_recver] + + # simple common prefix matching + min_length = min(len(tokens_sender), len(tokens_recver)) + if torch.allclose(tokens_sender[:min_length], + tokens_recver[:min_length]): + return min_length + + return 0 + + def _send_tensor_and_dec_size(self, + tensor: Optional[torch.Tensor]) -> None: + + assert tensor is not None, "Use self.data_pipe.send(None) instead" + self.buffer_size -= tensor.element_size() * tensor.numel() + if tensor.dtype == torch.bool: + tensor = tensor.float() + self.data_pipe.send_tensor(tensor) + + def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): + + if isinstance(data, torch.Tensor): + return data.element_size() * data.numel() + if not data: + # cannot perform `not data` on a tensor + # so this check needs to go after the check above + return 0 + + raise AssertionError(f"Unknown data type {type(data)}") + + def _add_to_buffer(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor): + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone() + if isinstance(key, torch.Tensor): + key = key.clone() + if isinstance(value, torch.Tensor): + value = value.clone() + if isinstance(hidden, torch.Tensor): + hidden = hidden.clone() + + buffer_item = [input_tokens, roi, key, value, hidden] + + with self.buffer_lock: + for data in buffer_item: + self.buffer_size += self._get_element_size(data) + self.buffer.append(buffer_item) + + def _is_end_signal(self, signal): + return signal is None + + def drop_select_handler(self): + + try: + + while True: + signal = self.signal_pipe.recv_tensor() + if self._is_end_signal(signal): + logger.info("Received end signal!") + break + + input_tokens = self.data_pipe.recv_tensor() + + roi = self.data_pipe.recv_tensor() + assert roi is not None, "Please provide the roi when sending "\ + "drop-select request" + roi = (roi > 0.5) + tokens_roi_recver = [input_tokens, roi] + + matched_length = 0 + + # perform input tokens and roi matching + # FIXME: this matching is O(n), ideally it should be O(1) + # but this buffer size won't (and shouldn't) be too large so + # the fix is not urgent. + with self.buffer_lock: + + for _ in range(len(self.buffer)): + + temp_length = self._matches(self.buffer[0], + tokens_roi_recver) + if temp_length > 0: + matched_length = temp_length + break + # rotate the element we just accessed to the end + self.buffer.rotate(-1) + + if matched_length > 0: + # need to clone the tensor + # in case the tensor is freed before sending finishes + matched_item = self.buffer.popleft() + for tensor in matched_item: + self._send_tensor_and_dec_size(tensor) + + else: + # no match, just send None + for _ in range(5): + self.data_pipe.send_tensor(None) + + except RuntimeError as e: + if 'Connection closed by peer' not in str(e): + raise e + + logger.debug("Closing drop_select_handler") + + def drop_select( + self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + + assert self.request_handling_thread is None, \ + "drop_select should be called by the KV cache consumer "\ + "(e.g. the decode vLLM instance)" + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone().float() + + self.signal_pipe.send_tensor(self.normal_signal) + self.data_pipe.send_tensor(input_tokens) + self.data_pipe.send_tensor(roi) + + input_tokens = self.data_pipe.recv_tensor() + roi = self.data_pipe.recv_tensor() + if roi is not None: + # convert from float tensor to bool tensor + # as PyNccl does not support sending bool tensor + roi = (roi > 0.5) + key = self.data_pipe.recv_tensor() + value = self.data_pipe.recv_tensor() + hidden = self.data_pipe.recv_tensor() + + return [input_tokens, roi, key, value, hidden] + + def full_handler(self): + time.sleep(0.001) + + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: + + if self.buffer_size > self.buffer_size_threshold: + # log outside the while loop to avoid this message being logged + # repeatedly. + logger.debug("KV transfer buffer is full. Handling...") + while self.buffer_size > self.buffer_size_threshold: + self.full_handler() + + self._add_to_buffer(input_tokens, roi, key, value, hidden) + + # when calling the insert, the current process is a sender + # need to launch the request handler and start listening to request. + if self.request_handling_thread is None: + self.request_handling_thread = threading.Thread( + target=self.drop_select_handler) + self.request_handling_thread.start() + + def close(self): + + if hasattr(self, "request_handling_thread" + ) and self.request_handling_thread is not None: + self.request_handling_thread.join() + + else: + # TODO: have a explicit close signal and have a explicit way to + # check if it's requester + self.signal_pipe.send_tensor(self.end_signal) diff --git a/vllm/distributed/kv_transfer/kv_pipe/__init__.py b/vllm/distributed/kv_transfer/kv_pipe/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py new file mode 100644 index 0000000000000..4b0cb44cc5b81 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -0,0 +1,65 @@ +""" +This file defines an interface `KVPipeBase` +that provides an abstraction for sending and receiving tensors, or None, via +distributed communications. + +All classes instantiated from this interface are assumed to be a FIFO pipe. + +If your distributed communication platform already supports key-value lookup, +you can bypass this interface and directly start from `kv_lookup_buffer`. +""" + +from abc import ABC, abstractmethod +from typing import Optional + +import torch + + +class KVPipeBase(ABC): + """ + This class provides an interface for sending and receiving tensors, or + None, by distributed communications. + """ + + @abstractmethod + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """Send a tensor, or None, via the pipe. + + Need to support sending None -- important for error handling. + + TODO: add a `key` argument so that we can use traditional + key-value database as the distributed communication mechanism behind + the pipe. + + Args: + tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def recv_tensor(self) -> Optional[torch.Tensor]: + """Receive a tensor (can be None) from the pipeline. + + Returns: + Optional[torch.Tensor]: The tensor received from the pipeline. Can + be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError + + @abstractmethod + def close(self) -> None: + """Close the pipeline and release resources. + + This method is responsible for closing the communication pipeline + and releasing any resources associated with it. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py new file mode 100644 index 0000000000000..98222fa67e492 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py @@ -0,0 +1,276 @@ +""" + This module implements a PyNccl pipe for sending and receiving + Optional[torch.Tensor] between distributed ranks with advanced + communication features. + + Key Features: + - Supports sending and receiving tensors with metadata + - Handles both CUDA and CPU device communications + - Implements a non-blocking tensor transfer mechanism + - Manages buffer size and provides backpressure control + - Supports distributed process groups with configurable parameters +""" + +import threading +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Callable, Dict, Optional, Tuple + +import torch + +from vllm.config import KVTransferConfig +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.distributed.utils import StatelessProcessGroup +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class BrokenPipeException(Exception): + + def __init__(self, message): + self.message = message + super().__init__(self.message) + + +Metadata = Dict[str, Optional[torch.Tensor]] + + +class PyNcclPipe(KVPipeBase): + + METADATA_LENGTH = 16 + MAX_TENSOR_DIMENSIONS = 14 + METADATA_DTYPE = torch.int64 + + def __init__(self, + local_rank: int, + config: KVTransferConfig, + device: Optional[str] = None, + port_offset: int = 0): + self.config = config + self.local_rank = local_rank + self.kv_rank = self.config.kv_rank + self.kv_parallel_size = self.config.kv_parallel_size + if device is None: + self.device = self._select_device(self.config.kv_buffer_device) + else: + self.device = self._select_device(device) + + # build distributed connection and send/recv implementation + self.group = StatelessProcessGroup.create( + host=self.config.kv_ip, + port=self.config.kv_port + port_offset, + rank=self.kv_rank, + world_size=self.kv_parallel_size, + ) + # add a barrier to make sure the connection is initiated properly + self.group.barrier() + impl = self._get_device_send_recv_impl(self.group) + self.device_send_func, self.device_recv_func = impl + # set target rank + self.target_rank_for_send = (self.kv_rank + 1) % self.kv_parallel_size + self.target_rank_for_recv = (self.kv_rank - 1) % self.kv_parallel_size + + # transportation-related variables + self.transport_thread: Optional[ThreadPoolExecutor] = None + self.buffer_size = 0 + self.buffer_size_lock = threading.Lock() + self.buffer_size_thresh = self.config.kv_buffer_size + + def _get_device_send_recv_impl( + self, group: StatelessProcessGroup + ) -> Tuple[Callable[[torch.Tensor, int], None], Callable[ + [torch.Tensor, int], None]]: + + send: Callable[[torch.Tensor, int], None] + recv: Callable[[torch.Tensor, int], None] + if self.device.type == "cuda": + # use PyNCCL for send / recv + comm = PyNcclCommunicator(group, device=self.local_rank) + comm.disabled = False + send, recv = comm.send, comm.recv # type: ignore + else: + # This send / recv implementation here is NOT intended to transfer + # KV caches (and should NOT be repurposed to transfer KV caches). + # Currently it is only used to transmit control-plane messages + # for PyNcclBuffer. + send = group.send_obj + + def my_recv(x, src): + x[...] = group.recv_obj(src) + + recv = my_recv + + return send, recv + + def _select_device(self, device: str): + logger.info("Selecting device: %s", device) + if device == "cuda": + return torch.device(f"cuda:{self.local_rank}") + else: + return torch.device("cpu") + + def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata: + """ + Create the metadata as a dictionary based on the input tensor. + + Parameters: + - tensor: The input tensor or None if no tensor is provided. + + Returns: + - metadata: A dictionary with the following keys: + - "dtype": The data type of the tensor or None. + - "shape": The shape of the tensor or None. + """ + if tensor is None: + return {"dtype": None, "shape": None} + else: + return {"dtype": tensor.dtype, "shape": tensor.shape} + + def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor: + """ + Create a buffer to receive the tensor based on the provided metadata. + + Parameters: + - metadata: A dictionary with keys "dtype" and "shape", describing + the tensor's data type and shape. + + Returns: + - buffer: A tensor of the specified type and shape, allocated on + self.device. + """ + return torch.empty(metadata["shape"], + dtype=metadata["dtype"], + device=self.device) + + def _send_metadata(self, metadata: Metadata): + """ + Send the metadata dictionary to the target rank. + + Parameters: + - metadata: A dictionary with keys "dtype" and "shape". + """ + self.group.send_obj(metadata, self.target_rank_for_send) + + def _recv_metadata(self) -> Metadata: + """ + Receive the metadata dictionary from the target rank. + + Returns: + - metadata: A dictionary with keys "dtype" and "shape" describing + the tensor. + """ + return self.group.recv_obj(self.target_rank_for_recv) + + def _send_impl(self, tensor: Optional[torch.Tensor]) -> None: + """ + The actual implementation of sending the tensor and its metadata to the + target rank. + + Parameters: + - tensor: The input tensor to be sent, or None if no tensor is + being sent. + """ + metadata = self._make_metadata(tensor) + self._send_metadata(metadata) + if tensor is not None: + self.device_send_func(tensor.to(self.device), + self.target_rank_for_send) + + def _recv_impl(self) -> Optional[torch.Tensor]: + """ + The actual implementation of receiving a tensor and its metadata from + the target rank. + + Returns: + - buffer: The received tensor, or None if no tensor is received. + """ + metadata = self._recv_metadata() + if metadata["dtype"] is None: + return None + buffer = self._prepare_recv_buffer(metadata) + self.device_recv_func(buffer, self.target_rank_for_recv) + + return buffer + + def send_tensor_wrapper(self, tensor: Optional[torch.Tensor], + tensor_size: int) -> None: + """ + Wrapper for _send_impl to handle exceptions and update buffer size. + """ + try: + self._send_impl(tensor) + + with self.buffer_size_lock: + self.buffer_size -= tensor_size + except Exception as e: + logger.error("[rank%d]: Exception when trying to send %s, msg: %s", + torch.distributed.get_rank(), str(tensor), str(e)) + import traceback + traceback.print_exc() + + def block_if_full(self): + """ + Block the current thread if the buffer size is larger than the + threshold. + """ + while self.buffer_size > self.buffer_size_thresh: + logger.debug("KV cache transfer pipe is full. Waiting...") + time.sleep(0.05) + + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """ + Sends a tensor and its metadata to the destination rank in a + non-blocking way. + + Parameters: + - tensor: The tensor to send, or None if no tensor is being sent. + """ + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + if tensor is not None: + tensor_size = tensor.element_size() * tensor.numel() + else: + tensor_size = 0 + + self.block_if_full() + + with self.buffer_size_lock: + self.buffer_size += tensor_size + + self.transport_thread.submit(self.send_tensor_wrapper, tensor, + tensor_size) + + def recv_tensor(self) -> Optional[torch.Tensor]: + """ + Receives a tensor and its metadata from the source rank. Blocking call. + + Returns: + - tensor: The received tensor, or None if no tensor is received. + """ + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + future = self.transport_thread.submit(self._recv_impl) + + try: + tensor = future.result() + except Exception as e: + logger.error("Encountering exception in KV receiving thread") + logger.error("%s", e) + logger.error("My device: %s", self.device) + import traceback + traceback.print_exc() + raise e + + return tensor + + def close(self): + """ + Close the pipe and release associated resources. + """ + if hasattr(self, + "transport_thread") and self.transport_thread is not None: + self.transport_thread.shutdown() diff --git a/vllm/distributed/kv_transfer/kv_transfer_agent.py b/vllm/distributed/kv_transfer/kv_transfer_agent.py new file mode 100644 index 0000000000000..9ce97851dc849 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_transfer_agent.py @@ -0,0 +1,75 @@ +"""A centralized entrypoint to perform distributed KV cache transfer. + +This implementation is a shim wrapper on two APIs exposed by `kv_connector`: +1. `send_kv_caches_and_hidden_states` +2. `recv_kv_caches_and_hidden_states +""" +from typing import TYPE_CHECKING, List, Tuple, Union + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + from vllm.config import VllmConfig + +import torch + +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +logger = init_logger(__name__) + + +class KVTransferAgent: + """ + A class designated for distributed KV transfer + + Target use cases: + 1. Disaggregated prefill + 2. Remote KV cache storage + """ + + def __init__( + self, + rank: int, + local_rank: int, + config: "VllmConfig", + ): + + self.config = config + + if config.kv_transfer_config is None: + raise ValueError("KVTransferConfig is not set in the VllmConfig," + " cannot initialize KVConnector.") + + assert self.config.kv_transfer_config.is_kv_transfer_instance, "KV"\ + "TransferAgent should only be used when kv_connector is set." + + self.connector = KVConnectorFactory.create_connector( + rank, local_rank, config) + + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + + self.connector.send_kv_caches_and_hidden_states( + model_executable, model_input, kv_caches, + hidden_or_intermediate_states) + + def close(self) -> None: + self.connector.close() + + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + + return self.connector.recv_kv_caches_and_hidden_states( + model_executable, model_input, kv_caches) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ccbe00386c5da..34815d7f0aa78 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -27,18 +27,23 @@ from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Union) from unittest.mock import patch import torch import torch.distributed from torch.distributed import Backend, ProcessGroup +import vllm.distributed.kv_transfer.kv_transfer_agent as kv_transfer import vllm.envs as envs from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op, supports_custom_op +if TYPE_CHECKING: + from vllm.config import VllmConfig + @dataclass class GraphCaptureContext: @@ -904,6 +909,14 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group +_KV_TRANSFER: Optional[kv_transfer.KVTransferAgent] = None + + +def get_kv_transfer_group() -> kv_transfer.KVTransferAgent: + assert _KV_TRANSFER is not None, ( + "disaggregated KV cache transfer parallel group is not initialized") + return _KV_TRANSFER + @contextmanager def graph_capture(): @@ -1052,6 +1065,26 @@ def initialize_model_parallel( group_name="pp") +def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: + """ + Initialize KV cache transfer parallel group. + """ + + global _KV_TRANSFER + + if vllm_config.kv_transfer_config is None: + return + + if all([ + vllm_config.kv_transfer_config.need_kv_parallel_group, + _KV_TRANSFER is None + ]): + _KV_TRANSFER = kv_transfer.KVTransferAgent( + rank=get_world_group().rank, + local_rank=get_world_group().local_rank, + config=vllm_config) + + def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f0020562c3c3a..4aa0eebd976c9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,10 +9,10 @@ import vllm.envs as envs from vllm.config import (CacheConfig, CompilationConfig, ConfigFormat, - DecodingConfig, DeviceConfig, HfOverrides, LoadConfig, - LoadFormat, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, PoolerConfig, - PromptAdapterConfig, SchedulerConfig, + DecodingConfig, DeviceConfig, HfOverrides, + KVTransferConfig, LoadConfig, LoadFormat, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PoolerConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, TaskOption, TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase @@ -108,6 +108,7 @@ class EngineArgs: # notice. distributed_executor_backend: Optional[Union[str, Type[ExecutorBase]]] = None + # number of P/D disaggregation (or other disaggregation) workers pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None @@ -194,6 +195,8 @@ class EngineArgs: compilation_config: Optional[CompilationConfig] = None worker_cls: str = "auto" + kv_transfer_config: Optional[KVTransferConfig] = None + def __post_init__(self): if not self.tokenizer: self.tokenizer = self.model @@ -908,6 +911,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'compilers, using -O without space is also ' 'supported. -O3 is equivalent to -O 3.') + parser.add_argument('--kv-transfer-config', + type=KVTransferConfig.from_cli, + default=None, + help='The configurations for distributed KV cache ' + 'transfer. Should be a JSON string.') + parser.add_argument( '--worker-cls', type=str, @@ -1201,6 +1210,7 @@ def create_engine_config(self, observability_config=observability_config, prompt_adapter_config=prompt_adapter_config, compilation_config=self.compilation_config, + kv_transfer_config=self.kv_transfer_config, ) if envs.VLLM_USE_V1: diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1f654a9cce465..c9f06eef3f907 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -21,7 +21,7 @@ from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_pp_group +from vllm.distributed import get_kv_transfer_group, get_pp_group from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry @@ -1666,6 +1666,24 @@ def execute_model( else: model_executable = self.model + # Receive KV cache in distributed KV cache transfer setting + # In disagg prefill setting, it will also recv hidden states and bypass + # model forwarding + # In KV cache database setting, it will change the model input so that + # we can skip prefilling on tokens that successfully received KV caches + # NOTE: The receive operation is blocking + bypass_model_exec = False + if self.need_recv_kv(model_input, kv_caches): + hidden_or_intermediate_states, bypass_model_exec, model_input = \ + get_kv_transfer_group().recv_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can receive KV for only those + # layers. + model_executable, + model_input, + kv_caches=kv_caches + ) + multi_modal_kwargs = model_input.multi_modal_kwargs or {} seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, @@ -1677,21 +1695,36 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - with set_forward_context(model_input.attn_metadata, self.vllm_config): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + if not bypass_model_exec: + with set_forward_context(model_input.attn_metadata, + self.vllm_config): + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() + # Sending KV cache in distributed KV cache transfer setting + # NOTE: the send operation is non-blocking + if self.need_send_kv(model_input, kv_caches): + get_kv_transfer_group().send_kv_caches_and_hidden_states( + # model_executable is used to know which layer the current + # worker is working on, so that we can send KV for only those + # layers. + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) + # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: if (self.is_driver_worker @@ -1759,6 +1792,56 @@ def execute_model( return [output] + def need_recv_kv(self, model_input, kv_caches) -> bool: + """Check if we need to receive kv-cache from the other worker. + We need to receive KV when + 1. current vLLM instance is KV cache consumer/decode vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + if self.vllm_config.kv_transfer_config is None: + return False + + return self.vllm_config.kv_transfer_config.is_kv_consumer and ( + not is_profile_run) and is_prefill_run + + def need_send_kv(self, model_input, kv_caches) -> bool: + """Check if we need to send kv-cache to the other worker. + We need to send KV when + 1. current vLLM instance is KV cache producer/prefill vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + if self.vllm_config.kv_transfer_config is None: + return False + + return self.vllm_config.kv_transfer_config.is_kv_producer and ( + not is_profile_run) and is_prefill_run + # NOTE: this is nn.Module so the profiler can properly capture/group # kernels calls made within the graph diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d58cb029618e9..094dd5a5d08b3 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -8,8 +8,9 @@ import torch.distributed import vllm.envs as envs -from vllm.config import ParallelConfig, VllmConfig -from vllm.distributed import (ensure_model_parallel_initialized, +from vllm.config import VllmConfig +from vllm.distributed import (ensure_kv_transfer_initialized, + ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.logger import init_logger @@ -144,7 +145,7 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - init_worker_distributed_environment(self.parallel_config, self.rank, + init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank) # Set random seed. @@ -457,20 +458,22 @@ def get_cache_block_size_bytes(self) -> int: def init_worker_distributed_environment( - parallel_config: ParallelConfig, + vllm_config: VllmConfig, rank: int, distributed_init_method: Optional[str] = None, local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + parallel_config = vllm_config.parallel_config set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank) - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + ensure_kv_transfer_initialized(vllm_config) + def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): # Check if the GPU supports the dtype. diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7aaa8b453cff1..7c0bc5a678956 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -43,6 +43,7 @@ def __init__( self.speculative_config = vllm_config.speculative_config self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config + self.kv_transfer_config = vllm_config.kv_transfer_config @abstractmethod def init_device(self) -> None: From b18c9bbaba6e1c6dfb92fe52e5a6cb22dd6bfa81 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Mon, 2 Dec 2024 09:31:09 +0800 Subject: [PATCH 208/255] [Model] Add BNB support to Llava and Pixtral-HF (#10795) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/llava.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 7fd4b32774798..db7fa82ceb9b7 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -287,6 +287,15 @@ def init_vision_tower_for_llava( @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava) @INPUT_REGISTRY.register_input_processor(input_processor_for_llava) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() From b7954776fd338cab442a8004d240f7fe74e4e51b Mon Sep 17 00:00:00 2001 From: cduk <19917266+cduk@users.noreply.github.com> Date: Mon, 2 Dec 2024 02:49:48 +0100 Subject: [PATCH 209/255] =?UTF-8?q?[core]=20Avoid=20metrics=20log=20noise?= =?UTF-8?q?=20when=20idle=20-=20include=20speculative=20decodi=E2=80=A6=20?= =?UTF-8?q?(#10809)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm/engine/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 5bfd6a9f4b386..4869557ba9b44 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -473,13 +473,13 @@ def log(self, stats: Stats) -> None: ) if (stats.cpu_prefix_cache_hit_rate >= 0 or stats.gpu_prefix_cache_hit_rate >= 0): - logger.info( + log_fn( "Prefix cache hit rate: GPU: %.2f%%, CPU: %.2f%%", stats.gpu_prefix_cache_hit_rate * 100, stats.cpu_prefix_cache_hit_rate * 100, ) if self.spec_decode_metrics is not None: - logger.info( + log_fn( self._format_spec_decode_metrics_str( self.spec_decode_metrics)) From 073a4bd1c04164af29843cb5478740e9839d2d8a Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 1 Dec 2024 17:55:39 -0800 Subject: [PATCH 210/255] [Kernel] Use `out` arg in flash_attn_varlen_func (#10811) Signed-off-by: Woosuk Kwon --- CMakeLists.txt | 2 +- tests/kernels/test_flash_attn.py | 20 +++++++++++++++++--- vllm/v1/attention/backends/flash_attn.py | 6 +++--- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f43bf8143458b..c78cdc77a7e42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -522,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG fdf6d72b48aea41f4ae6a89139a453dae554abc8 + GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index a20c73345218f..1ae78d7b46c5b 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -71,6 +71,7 @@ def ref_paged_attn( return torch.cat(outputs, dim=0) +@pytest.mark.parametrize("use_out", [True, False]) @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -81,6 +82,7 @@ def ref_paged_attn( @pytest.mark.parametrize("sliding_window", [None, 256]) @torch.inference_mode() def test_flash_attn_with_paged_kv( + use_out: bool, kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, @@ -116,17 +118,22 @@ def test_flash_attn_with_paged_kv( (num_seqs, max_num_blocks_per_seq), dtype=torch.int32) + q = query.unsqueeze(1) + out = torch.empty_like(q) if use_out else None output = flash_attn_with_kvcache( - q=query.unsqueeze(1), + q=q, k_cache=key_cache, v_cache=value_cache, + out=out, softmax_scale=scale, causal=True, block_table=block_tables, cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, window_size=window_size, - ).squeeze(1) + ) + output = output if not use_out else out + output = output.squeeze(1) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -141,7 +148,10 @@ def test_flash_attn_with_paged_kv( f"{torch.max(torch.abs(output - ref_output))}" -@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]]) +@pytest.mark.parametrize("use_out", [True, False]) +@pytest.mark.parametrize("seq_lens", + [[(1, 1328), (5, 18), + (129, 463)], [(1, 523), (1, 37), (1, 2011)]]) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @@ -151,6 +161,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @torch.inference_mode() def test_varlen_with_paged_kv( + use_out: bool, seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int], head_size: int, @@ -197,10 +208,12 @@ def test_varlen_with_paged_kv( (num_seqs, max_num_blocks_per_seq), dtype=torch.int32) + out = torch.empty_like(query) if use_out else None output = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, + out=out, cu_seqlens_q=cu_query_lens, cu_seqlens_k=cu_kv_lens, max_seqlen_q=max_query_len, @@ -211,6 +224,7 @@ def test_varlen_with_paged_kv( block_table=block_tables, softcap=soft_cap if soft_cap is not None else 0, ) + output = output if not use_out else out ref_output = ref_paged_attn( query=query, diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e618edf7d35bf..4aa4b296f0efc 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -205,10 +205,12 @@ def unified_v1_flash_attention( v_scale, ) - attn_output = flash_attn_varlen_func( + # Compute attention and update output up to `num_actual_tokens`. + flash_attn_varlen_func( q=query[:num_actual_tokens], k=key_cache, v=value_cache, + out=output[:num_actual_tokens], cu_seqlens_q=attn_metadata.query_start_loc, max_seqlen_q=attn_metadata.max_query_len, cu_seqlens_k=attn_metadata.seq_start_loc, @@ -220,8 +222,6 @@ def unified_v1_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - # TODO(woosuk): Remove this unnecessary copy. - output[:num_actual_tokens].copy_(attn_output) def unified_v1_flash_attention_fake( From e25810ae29058299b7bf845c7ed572f2474a1d85 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Sun, 1 Dec 2024 23:05:32 -0300 Subject: [PATCH 211/255] Fill TorchSDPAAttentionMetadata seq_lens_field for prefill (#10799) Signed-off-by: Max de Bayser --- vllm/attention/backends/torch_sdpa.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 16e044b618c40..dafa5bb56acda 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -341,7 +341,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], ) else: block_tables = torch.tensor([]) - seq_lens_tensor = torch.tensor([]) + seq_lens_tensor = torch.tensor( + input_data.seq_lens[:input_data.num_prefills], + dtype=torch.int32, + device="cpu", + ) # For multi-modal models placeholder_index_maps = None From 63a164172dbcc43857dbcf6443a7594faa143151 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 1 Dec 2024 19:27:13 -0800 Subject: [PATCH 212/255] [misc] remove xverse modeling file (#10814) Signed-off-by: youkaichao --- vllm/model_executor/models/registry.py | 2 +- vllm/model_executor/models/xverse.py | 423 ------------------------- 2 files changed, 1 insertion(+), 424 deletions(-) delete mode 100644 vllm/model_executor/models/xverse.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2b7b69e8c3a95..c66fbce018a62 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,7 +94,7 @@ "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "SolarForCausalLM": ("solar", "SolarForCausalLM"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), - "XverseForCausalLM": ("xverse", "XverseForCausalLM"), + "XverseForCausalLM": ("llama", "LlamaForCausalLM"), # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py deleted file mode 100644 index 25a0d474e2863..0000000000000 --- a/vllm/model_executor/models/xverse.py +++ /dev/null @@ -1,423 +0,0 @@ -# Adapted from -# https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py -# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Xverse model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union - -import torch -from torch import nn -from transformers import PretrainedConfig - -from vllm.attention import Attention, AttentionMetadata -from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsLoRA, SupportsPP -from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) - - -class XverseMLP(nn.Module): - - def __init__( - self, - hidden_size: int, - intermediate_size: int, - hidden_act: str, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, - bias=False, - quant_config=quant_config) - self.down_proj = RowParallelLinear(intermediate_size, - hidden_size, - bias=False, - quant_config=quant_config) - if hidden_act != "silu": - raise ValueError(f"Unsupported activation: {hidden_act}. " - "Only silu is supported for now.") - self.act_fn = SiluAndMul() - - def forward(self, x): - gate, _ = self.gate_up_proj(x) - x = self.act_fn(gate) - x, _ = self.down_proj(x) - return x - - -class XverseAttention(nn.Module): - - def __init__( - self, - hidden_size: int, - num_heads: int, - num_kv_heads: int, - rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, - max_position_embeddings: int = 8192, - quant_config: Optional[QuantizationConfig] = None, - bias: bool = False, - cache_config: Optional[CacheConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = hidden_size - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = num_heads - assert self.total_num_heads % tp_size == 0 - self.num_heads = self.total_num_heads // tp_size - self.total_num_kv_heads = num_kv_heads - # partition the KV heads across multiple tensor parallel GPUs. - assert self.total_num_kv_heads % tp_size == 0 - self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) - self.head_dim = hidden_size // self.total_num_heads - self.q_size = self.num_heads * self.head_dim - self.kv_size = self.num_kv_heads * self.head_dim - self.scaling = self.head_dim**-0.5 - self.rope_theta = rope_theta - self.max_position_embeddings = max_position_embeddings - - self.qkv_proj = QKVParallelLinear( - hidden_size, - self.head_dim, - self.total_num_heads, - self.total_num_kv_heads, - bias=bias, - quant_config=quant_config, - ) - self.o_proj = RowParallelLinear( - self.total_num_heads * self.head_dim, - hidden_size, - bias=bias, - quant_config=quant_config, - ) - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=max_position_embeddings, - base=rope_theta, - rope_scaling=rope_scaling, - ) - self.attn = Attention(self.num_heads, - self.head_dim, - self.scaling, - num_kv_heads=self.num_kv_heads, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.attn") - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - qkv, _ = self.qkv_proj(hidden_states) - q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v, kv_cache, attn_metadata) - output, _ = self.o_proj(attn_output) - return output - - -class XverseDecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.hidden_size = config.hidden_size - rope_theta = getattr(config, "rope_theta", 10000) - rope_scaling = getattr(config, "rope_scaling", None) - max_position_embeddings = getattr(config, "max_position_embeddings", - 8192) - self.self_attn = XverseAttention( - hidden_size=self.hidden_size, - num_heads=config.num_attention_heads, - num_kv_heads=getattr(config, "num_key_value_heads", - config.num_attention_heads), - rope_theta=rope_theta, - rope_scaling=rope_scaling, - max_position_embeddings=max_position_embeddings, - quant_config=quant_config, - bias=getattr(config, "bias", False), - cache_config=cache_config, - prefix=f"{prefix}.self_attn", - ) - self.mlp = XverseMLP( - hidden_size=self.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - ) - self.input_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - self.post_attention_layernorm = RMSNorm(config.hidden_size, - eps=config.rms_norm_eps) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: AttentionMetadata, - residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: - # Self Attention - if residual is None: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - kv_cache=kv_cache, - attn_metadata=attn_metadata, - ) - - # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - hidden_states = self.mlp(hidden_states) - return hidden_states, residual - - -@support_torch_compile -class XverseModel(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - self.config = config - self.padding_idx = config.pad_token_id - lora_vocab = (lora_config.lora_extra_vocab_size * - (lora_config.max_loras or 1)) if lora_config else 0 - self.vocab_size = config.vocab_size + lora_vocab - self.org_vocab_size = config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - ) - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: XverseDecoderLayer( - config, cache_config, quant_config, prefix=prefix), - prefix=f"{prefix}.layers", - ) - self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - residual = None - else: - hidden_states = intermediate_tensors["hidden_states"] - residual = intermediate_tensors["residual"] - for i in range(self.start_layer, self.end_layer): - layer = self.layers[i] - hidden_states, residual = layer( - positions, - hidden_states, - kv_caches[i - self.start_layer], - attn_metadata, - residual, - ) - if not get_pp_group().is_last_rank: - return IntermediateTensors({ - "hidden_states": hidden_states, - "residual": residual - }) - hidden_states, _ = self.norm(hidden_states, residual) - return hidden_states - - -class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - # LoRA specific attributes - supported_lora_modules = [ - "qkv_proj", - "o_proj", - "gate_up_proj", - "down_proj", - "embed_tokens", - "lm_head", - ] - embedding_modules = { - "embed_tokens": "input_embeddings", - "lm_head": "output_embeddings", - } - embedding_padding_modules = ["lm_head"] - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - - self.config = config - self.lora_config = lora_config - - self.quant_config = quant_config - self.model = XverseModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def sample( - self, - logits: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if ("rotary_emb.inv_freq" in name - or "rotary_emb.cos_cached" in name - or "rotary_emb.sin_cached" in name): - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params From 995a148575aaacc7889ff0d29a96195c329422ab Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Mon, 2 Dec 2024 12:14:45 +0800 Subject: [PATCH 213/255] [doc]Update config docstring (#10732) Signed-off-by: wangxiyuan --- vllm/config.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 5d9e2766c7faa..510bd81d66217 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -91,6 +91,8 @@ class ModelConfig: the default version. max_model_len: Maximum length of a sequence (including prompt and output). If None, will be derived from the model. + spec_target_max_model_len: Specify the the maximum length for spec + decoding draft models. quantization: Quantization method that was used to quantize the model weights. If None, we assume the model weights are not quantized. quantization_param_path: Path to JSON file containing scaling factors. @@ -107,6 +109,7 @@ class ModelConfig: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. + max_logprobs: Maximum number of log probabilities. Defaults to 20. disable_sliding_window: Whether to disable sliding window. If True, we will disable the sliding window functionality of the model. If the model does not support sliding window, this argument is @@ -119,6 +122,8 @@ class ModelConfig: the model name will be the same as `model`. limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. + use_async_output_proc: Whether to use async output processor. + Defaults to True. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. hf_overrides: If a dictionary, contains arguments to be forwarded to the @@ -130,7 +135,7 @@ class ModelConfig: override default neuron config that are specific to Neuron devices, this argument will be used to configure the neuron config that can not be gathered from the vllm arguments. - override_pooling_config: Initialize non default pooling config or + override_pooler_config: Initialize non default pooling config or override default pooling config for the embedding model. """ @@ -734,8 +739,13 @@ class CacheConfig: vLLM execution. swap_space: Size of the CPU swap space per GPU (in GiB). cache_dtype: Data type for kv cache storage. + is_attention_free: Whether the model is attention-free. num_gpu_blocks_override: Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified. Does nothing if None. + sliding_window: Sliding window size for the KV cache. Can not work with + prefix caching enabled. + enable_prefix_caching: Whether to enable prefix caching. + cpu_offload_gb: Size of the CPU offload buffer in GiB. """ def __init__( @@ -904,6 +914,7 @@ class LoadConfig: "tensorizer" will use CoreWeave's tensorizer library for fast weight loading. "bitsandbytes" will load nf4 type weights. + model_loader_extra_config: The extra config for the model loader. ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. From ef31eabc68099ff2f64bbe5f42dc06101451a18d Mon Sep 17 00:00:00 2001 From: zhou fan <1247714429@qq.com> Date: Mon, 2 Dec 2024 13:36:36 +0800 Subject: [PATCH 214/255] [Model]: add some tests for aria model (#10770) Signed-off-by: xffxff <1247714429@qq.com> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- tests/conftest.py | 6 +++- .../vision_language/test_models.py | 30 +++++++++++++++++++ .../vision_language/vlm_utils/core.py | 11 +++++-- .../vision_language/vlm_utils/types.py | 7 +++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 36f1d477fab59..d6be8f5b00af8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -656,6 +656,7 @@ def __init__( model_name: str, task: TaskOption = "auto", tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. max_model_len: int = 1024, @@ -672,6 +673,7 @@ def __init__( model=model_name, task=task, tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, trust_remote_code=True, dtype=dtype, swap_space=swap_space, @@ -842,6 +844,7 @@ def generate_greedy_logprobs( audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -849,7 +852,8 @@ def generate_greedy_logprobs( max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=num_prompt_logprobs, - stop_token_ids=stop_token_ids) + stop_token_ids=stop_token_ids, + stop=stop) return self.generate_w_logprobs(prompts, greedy_logprobs_params, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3457ec6b8e73b..dbb0b4d350d10 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -8,6 +8,7 @@ import pytest import transformers from transformers import AutoModelForVision2Seq +from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform from vllm.utils import cuda_device_count_stateless, identity @@ -134,6 +135,35 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), #### Extended model tests + "aria": VLMTestInfo( + models=["rhymes-ai/Aria"], + tokenizer_mode="slow", + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + ), + dtype="bfloat16", + prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|img|>\n", + max_model_len=4096, + max_num_seqs=2, + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "Please describe the image shortly.", + "cherry_blossom": "Please infer the season with reason.", + }), + multi_image_prompt="Describe the two images shortly.", # noqa: E501 + postprocess_inputs=model_utils.get_key_type_post_processor("pixel_values"), + stop_str=["<|im_end|>"], + image_size_factors=[(0.10, 0.15)], + max_tokens=64, + marks=[ + pytest.mark.skipif( + not is_flash_attn_2_available(), + reason="Model needs flash-attn for numeric convergence.", + ), + large_gpu_mark(min_gb=64), + ], + ), "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], test_type=VLMTestType.IMAGE, diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 7e8c6dabb15af..88349ef9a3a69 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -29,6 +29,8 @@ def run_test( postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]], + stop_str: Optional[List[str]], + tokenizer_mode: str, limit_mm_per_prompt: Dict[str, int], model_kwargs: Optional[Dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], @@ -50,11 +52,14 @@ def run_test( # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - vllm_kwargs = {} + vllm_kwargs: Dict[str, Any] = {} if get_stop_token_ids is not None: vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer) + if stop_str: + vllm_kwargs["stop"] = stop_str with vllm_runner(model, + tokenizer_mode=tokenizer_mode, max_model_len=max_model_len, max_num_seqs=max_num_seqs, dtype=dtype, @@ -85,6 +90,8 @@ def run_test( hf_kwargs = {} if use_tokenizer_eos: hf_kwargs["eos_token_id"] = tokenizer.eos_token_id + if stop_str: + hf_kwargs["stop_strings"] = stop_str with hf_model, torch.no_grad(): for prompts, media in inputs: @@ -138,4 +145,4 @@ def process_runner_outputs( def process_outputs(output_processor, model, outputs_per_image): """Applies a model specific post-processor function to a runner's output""" return [[output_processor(res, model) for res in outputs] - for outputs in outputs_per_image] + for outputs in outputs_per_image] \ No newline at end of file diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index 8459476dc2d07..d410fa8c653ce 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -97,6 +97,9 @@ class VLMTestInfo(NamedTuple): # Optional callable which gets a list of token IDs from the model tokenizer get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None + # Optional list of strings to stop generation, useful when stop tokens are + # not special tokens in the tokenizer + stop_str: Optional[List[str]] = None # Exposed options for HF runner model_kwargs: Optional[Dict[str, Any]] = None @@ -148,6 +151,8 @@ class VLMTestInfo(NamedTuple): marks: Optional[List[MarkDecorator]] = None + tokenizer_mode: str = "auto" + def get_non_parametrized_runner_kwargs(self): """Returns a dictionary of expandable kwargs for items that are used in all test types, which are NOT used when creating the parametrized @@ -166,8 +171,10 @@ def get_non_parametrized_runner_kwargs(self): "postprocess_inputs": self.postprocess_inputs, "comparator": self.comparator, "get_stop_token_ids": self.get_stop_token_ids, + "stop_str": self.stop_str, "model_kwargs": self.model_kwargs, "patch_hf_runner": self.patch_hf_runner, + "tokenizer_mode": self.tokenizer_mode } From e95f275f57bcff44b43e1b4300ae6ea4ee871211 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 2 Dec 2024 18:26:10 +0800 Subject: [PATCH 215/255] [CI/Build] Update `mistral_common` version for tests and docs (#10825) Signed-off-by: DarkLight1337 --- docs/requirements-docs.txt | 2 +- requirements-test.in | 2 +- requirements-test.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index e3e35844405ac..8ea240f59c38f 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -12,7 +12,7 @@ pydantic >= 2.8 torch py-cpuinfo transformers -mistral_common >= 1.3.4 +mistral_common >= 1.5.0 aiohttp starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args diff --git a/requirements-test.in b/requirements-test.in index 76f6de2f77c34..44972866ddc4b 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -20,7 +20,7 @@ timm # required for internvl test torch==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[opencv] >= 1.4.4 # required for pixtral test +mistral_common[opencv] >= 1.5.0 # required for pixtral test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test diff --git a/requirements-test.txt b/requirements-test.txt index 65695111e4dc5..a59b85023948b 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -217,7 +217,7 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy -mistral-common[opencv]==1.4.4 +mistral-common[opencv]==1.5.1 # via # -r requirements-test.in # mistral-common From a4c4daf3642ae2629608d5181487739b044fabe8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 2 Dec 2024 02:50:10 -0800 Subject: [PATCH 216/255] [misc] use out argument for flash attention (#10822) Signed-off-by: youkaichao --- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/blocksparse_attn.py | 2 + vllm/attention/backends/flash_attn.py | 55 +++---- vllm/attention/backends/flashinfer.py | 4 + vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/backends/ipex_attn.py | 1 + vllm/attention/backends/pallas.py | 1 + vllm/attention/backends/rocm_flash_attn.py | 1 + vllm/attention/backends/torch_sdpa.py | 1 + vllm/attention/backends/xformers.py | 1 + vllm/attention/layer.py | 76 +++++++++- vllm/config.py | 2 +- vllm/v1/attention/backends/flash_attn.py | 155 +++++--------------- 13 files changed, 144 insertions(+), 157 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 5be2d83346d00..aed04361e5fb4 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -247,5 +247,6 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 9e54c3b40c54e..99cb84346d84e 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -360,6 +360,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -448,5 +449,6 @@ def forward( blocksparse_head_sliding_step=self.head_sliding_step, ) + assert output is not None # Reshape the output tensor. return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 32738d1043b1d..c69e12ad78c44 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -638,24 +638,27 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + output: shape = [num_tokens, num_heads, head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] NOTE: kv_cache will be an empty tensor with shape [0] for profiling run. attn_metadata: Metadata for attention. - Returns: - shape = [num_tokens, num_heads * head_size] + NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + assert output is not None, "Output tensor must be provided." + if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " @@ -666,23 +669,12 @@ def forward( "requires setting cross-attention " "metadata attributes.") - num_heads: int = self.num_heads - head_size: int = self.head_size - num_kv_heads: int = self.num_kv_heads kv_cache_dtype: str = self.kv_cache_dtype softmax_scale: float = self.scale window_size = self.sliding_window alibi_slopes: Optional[torch.Tensor] = self.alibi_slopes logits_soft_cap: Optional[float] = self.logits_soft_cap - num_tokens, hidden_size = query.shape - - # Reshape the query, key, and value tensors. - query = query.view(-1, num_heads, head_size) - if (key is not None) and (value is not None): - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - if kv_cache.numel() > 0: key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -721,13 +713,13 @@ def forward( num_decode_query_tokens) = \ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) decode_query = query[num_prefill_query_tokens:] + decode_output = output[num_prefill_query_tokens:] # QKV for prefill. query = query[:num_prefill_query_tokens] + prefill_output = output[:num_prefill_query_tokens] assert query.shape[0] == num_prefill_query_tokens assert decode_query.shape[0] == num_decode_query_tokens - prefill_output: Optional[torch.Tensor] = None - decode_output: Optional[torch.Tensor] = None if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if (kv_cache.numel() == 0 or prefill_meta.block_tables is None @@ -741,7 +733,7 @@ def forward( key = key[:num_prefill_kv_tokens] value = value[:num_prefill_kv_tokens] - prefill_output = flash_attn_varlen_func( + flash_attn_varlen_func( q=query, k=key, v=value, @@ -754,6 +746,7 @@ def forward( window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, + out=prefill_output, ) else: # prefix-enabled attention @@ -761,7 +754,7 @@ def forward( "Only decoder-only models support prefix caching") assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - prefill_output = flash_attn_varlen_func( # noqa + flash_attn_varlen_func( # noqa q=query, k=key_cache, v=value_cache, @@ -775,6 +768,7 @@ def forward( alibi_slopes=alibi_slopes, block_table=prefill_meta.block_tables, softcap=logits_soft_cap, + out=prefill_output, ) if decode_meta := attn_metadata.decode_metadata: @@ -788,7 +782,7 @@ def forward( assert attn_type == AttentionType.DECODER, ( "Only decoder-only models support max_decode_query_len > 1" ) - decode_output = flash_attn_varlen_func( + flash_attn_varlen_func( q=decode_query, k=key_cache, v=value_cache, @@ -802,6 +796,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, block_table=decode_meta.block_tables, + out=decode_output, ) else: # Use flash_attn_with_kvcache for normal decoding. @@ -810,7 +805,7 @@ def forward( _, block_tables_arg, ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - decode_output = flash_attn_with_kvcache( + flash_attn_with_kvcache( q=decode_query.unsqueeze(1), k_cache=key_cache, v_cache=value_cache, @@ -821,20 +816,8 @@ def forward( window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, - ).squeeze(1) - - if prefill_output is None: - assert decode_output is not None - return decode_output.view(num_decode_query_tokens, hidden_size) - if decode_output is None: - assert prefill_output is not None - return prefill_output.view(num_prefill_query_tokens, hidden_size) - - assert decode_meta is not None - decode_output = decode_output.squeeze(1) - output = torch.cat([prefill_output, decode_output], dim=0) - return output.view(num_tokens, hidden_size) - + out=decode_output.unsqueeze(1), + ) return output diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 1a2024705eb04..e367468d05d26 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -774,7 +774,11 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: + + # TODO: directly write to output tensor + if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 5359941d41fde..2c62e565c04c7 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -145,6 +145,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 3b0d51ea4a3d8..21949874bea47 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -173,6 +173,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 5988be0e6b687..9809aed0e66f9 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -151,6 +151,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 6a494f4e73cb4..9139c3c1314d8 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -415,6 +415,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index dafa5bb56acda..86e952a903f36 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -431,6 +431,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 292575a8736bc..e2e989efb020c 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -417,6 +417,7 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 17157617248f7..e024eef286f05 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -4,7 +4,6 @@ import torch import torch.nn as nn -import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config @@ -12,7 +11,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.platforms import current_platform +from vllm.platforms import _Backend, current_platform from vllm.utils import direct_register_custom_op @@ -97,14 +96,23 @@ def __init__( self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, blocksparse_params, logits_soft_cap) + self.num_heads = num_heads + self.head_size = head_size + self.num_kv_heads = num_kv_heads self.backend = backend_name_to_enum(attn_backend.get_name()) # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant # opaque custom op. For other platforms, we directly call them # and let torch.compile handle them. - self.use_direct_call = envs.VLLM_USE_V1 or not ( - current_platform.is_cuda_alike() or current_platform.is_cpu()) + self.use_direct_call = not current_platform.is_cuda_alike( + ) and not current_platform.is_cpu() + + # For some attention backends, we allocate an output tensor before + # calling the custom op. When piecewise cudagraph is enabled, this + # makes sure the output tensor is allocated inside the cudagraph. + self.use_output = self.backend == _Backend.FLASH_ATTN or \ + self.backend == _Backend.FLASH_ATTN_VLLM_V1 compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: raise ValueError(f"Duplicate layer name: {prefix}") @@ -130,6 +138,22 @@ def forward( self._k_scale, self._v_scale, attn_type=attn_type) + elif self.use_output: + output = torch.empty_like(query) + hidden_size = query.size(-1) + # Reshape the query, key, and value tensors. + # NOTE(woosuk): We do this outside the custom op to minimize the + # CPU overheads from the non-CUDA-graph regions. + query = query.view(-1, self.num_heads, self.head_size) + output = output.view(-1, self.num_heads, self.head_size) + if key is not None: + key = key.view(-1, self.num_kv_heads, self.head_size) + if value is not None: + value = value.view(-1, self.num_kv_heads, self.head_size) + torch.ops.vllm.unified_attention_with_output( + query, key, value, output, kv_cache, attn_type, + self.layer_name) + return output.view(-1, hidden_size) else: return torch.ops.vllm.unified_attention(query, key, value, kv_cache, attn_type, @@ -183,3 +207,47 @@ def unified_attention_fake( fake_impl=unified_attention_fake, dispatch_key=current_platform.dispatch_key, ) + + +def unified_attention_with_output( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + output: torch.Tensor, + kv_cache: torch.Tensor, + attn_type: str, + layer_name: str, +) -> None: + forward_context: ForwardContext = get_forward_context() + attn_metadata = forward_context.dynamic_forward_context + self = forward_context.static_forward_context[layer_name] + self.impl.forward(query, + key, + value, + kv_cache, + attn_metadata, + self._k_scale, + self._v_scale, + attn_type=attn_type, + output=output) + + +def unified_attention_with_output_fake( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + output: torch.Tensor, + kv_cache: torch.Tensor, + attn_type: str, + layer_name: str, +) -> None: + return + + +direct_register_custom_op( + op_name="unified_attention_with_output", + op_func=unified_attention_with_output, + mutates_args=["kv_cache", "output"], + fake_impl=unified_attention_with_output_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/config.py b/vllm/config.py index 510bd81d66217..5f50d65ec87e1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2238,7 +2238,7 @@ class CompilationConfig(BaseModel): custom_ops: List[str] = Field(default_factory=list) splitting_ops: List[str] = Field(default_factory=lambda: [ "vllm.unified_attention", - "vllm.unified_v1_flash_attention", + "vllm.unified_attention_with_output", ]) use_inductor: bool = True diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 4aa4b296f0efc..d37989055c2e5 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -6,8 +6,6 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.forward_context import get_forward_context -from vllm.utils import direct_register_custom_op from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -113,13 +111,14 @@ def forward( k_scale: float = 1.0, v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] attn_metadata: Metadata for attention. Returns: @@ -135,118 +134,42 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") - # Reshape the query, key, and value tensors. - # NOTE(woosuk): We do this outside the custom op to minimize the CPU - # overheads from the non-CUDA-graph regions. - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - output = torch.empty_like(query) - torch.ops.vllm.unified_v1_flash_attention( - output, - query, - key, - value, - self.num_heads, - self.head_size, - self.num_kv_heads, - kv_cache, + if attn_metadata is None: + # Profiling run. + return output + + num_actual_tokens = attn_metadata.num_actual_tokens + + # Reshape the input keys and values and store them in the cache. + key_cache = kv_cache[0] + value_cache = kv_cache[1] + torch.ops._C_cache_ops.reshape_and_cache_flash( + key[:num_actual_tokens], + value[:num_actual_tokens], + key_cache, + value_cache, + attn_metadata.slot_mapping, self.kv_cache_dtype, k_scale, v_scale, - self.scale, - self.sliding_window, - self.alibi_slopes, - self.logits_soft_cap, ) - return output.view(-1, self.num_heads * self.head_size) - - -def unified_v1_flash_attention( - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> None: - context = get_forward_context() - current_metadata = context.dynamic_forward_context - if current_metadata is None: - # Profiling run. - return - - assert current_metadata is not None - assert isinstance(current_metadata, FlashAttentionMetadata) - attn_metadata: FlashAttentionMetadata = current_metadata - num_actual_tokens = attn_metadata.num_actual_tokens - - # Reshape the input keys and values and store them in the cache. - key_cache = kv_cache[0] - value_cache = kv_cache[1] - torch.ops._C_cache_ops.reshape_and_cache_flash( - key[:num_actual_tokens], - value[:num_actual_tokens], - key_cache, - value_cache, - attn_metadata.slot_mapping, - kv_cache_dtype, - k_scale, - v_scale, - ) - - # Compute attention and update output up to `num_actual_tokens`. - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - cu_seqlens_k=attn_metadata.seq_start_loc, - max_seqlen_k=attn_metadata.max_seq_len, - softmax_scale=softmax_scale, - causal=True, - alibi_slopes=alibi_slopes, - window_size=window_size, - block_table=attn_metadata.block_table, - softcap=logits_soft_cap, - ) - - -def unified_v1_flash_attention_fake( - output: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - head_size: int, - num_kv_heads: int, - kv_cache: torch.Tensor, - kv_cache_dtype: str, - k_scale: float, - v_scale: float, - softmax_scale: float, - window_size: Optional[List[int]] = None, - alibi_slopes: Optional[torch.Tensor] = None, - logits_soft_cap: Optional[float] = None, -) -> None: - return - - -direct_register_custom_op( - op_name="unified_v1_flash_attention", - op_func=unified_v1_flash_attention, - mutates_args=["kv_cache", "output"], - fake_impl=unified_v1_flash_attention_fake, -) + + # Compute attention and update output up to `num_actual_tokens`. + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + cu_seqlens_k=attn_metadata.seq_start_loc, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=attn_metadata.block_table, + softcap=self.logits_soft_cap, + ) + + return output From b45f0d79469f583736052b80bfc8b3bab29f50d8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Dec 2024 01:53:36 +0800 Subject: [PATCH 217/255] [Misc][LoRA] Move the implementation of lora bias to punica.py (#10829) Signed-off-by: Jee Jee Li --- tests/lora/test_llama_tp.py | 60 +++++++-------- vllm/lora/fully_sharded_layers.py | 41 +++-------- vllm/lora/layers.py | 113 +++-------------------------- vllm/lora/punica.py | 117 +++++++++++++++++++++++++++--- 4 files changed, 156 insertions(+), 175 deletions(-) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index aae6310a2a213..d3ca7f878191a 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -55,15 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@fork_new_process_for_each_test -def test_llama_lora(sql_lora_files): - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - +def generate_and_test(llm, sql_lora_files): print("lora adapter created") assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT @@ -79,6 +71,17 @@ def test_llama_lora(sql_lora_files): print("removing lora") +@fork_new_process_for_each_test +def test_llama_lora(sql_lora_files): + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1) + generate_and_test(llm, sql_lora_files) + + @fork_new_process_for_each_test def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and @@ -118,20 +121,7 @@ def test_llama_lora_tp4(sql_lora_files): max_loras=4, tensor_parallel_size=4, ) - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT - - print("removing lora") + generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -146,16 +136,20 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, ) - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + generate_and_test(llm, sql_lora_files) - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): - print("removing lora") + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + enable_lora_bias=True, + ) + generate_and_test(llm, sql_lora_files) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index f5c2eced9d2bb..5f2d32defe030 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -73,6 +73,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, + self.bias_stacked, add_input=True) # now have column partitioned output @@ -131,27 +132,14 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): layer.lora_a_stacked[idx], 1.0) buffers = tensor_model_parallel_all_gather(buffers) - left_offset = 0 - for idx in range(n): - shard_size = layer.lora_b_stacked[idx].shape[2] - - if layer.bias_stacked is not None: - bias = layer.bias_stacked[idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[layer.punica_wrapper.token_lora_indices] - bias[layer.punica_wrapper.token_lora_indices == -1] = 0 - output[:, left_offset:left_offset + shard_size] += bias - - layer.punica_wrapper.add_expand_slice( - output, - buffers[idx], - layer.lora_b_stacked[idx], - left_offset, - shard_size, - add_input=True, - ) - left_offset += shard_size + layer.punica_wrapper.add_expand_packed_nslice( + output, + buffers, + layer.lora_b_stacked, + layer.bias_stacked, + 1.0, + layer.output_slices, + ) output = output.view(*out_orig_shape) # now have column partitioned and packed output @@ -234,6 +222,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, + self.bias_all, add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) @@ -350,15 +339,9 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size - - if self.bias_stacked is not None: - bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) - bias = bias[self.punica_wrapper.token_lora_indices] - bias[self.punica_wrapper.token_lora_indices == -1] = 0 - output += bias - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, start_idx, + self.lora_b_stacked, + self.bias_stacked, start_idx, shard_size) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3701988ff692f..73748b5ce511e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,63 +67,6 @@ def dec(*args, **kwargs): return dec -def apply_bias( - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, -): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - bias_stacked[indices == -1] = 0 - output += bias_stacked - - return output.view_as(org_output) - - -def apply_bias_packed_nslice( - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], - bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], -): - """Applies bias to output - - Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - offset_left = 0 - for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[indices] - bias[indices == -1] = 0 - output[:, offset_left:offset_left + slice] += bias - - offset_left += slice - - return output.view_as(org_output) - - @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -311,6 +254,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, self.lora_b_stacked, + bias_all=None, add_input=True) return full_output.view_as(full_output_org) @@ -399,15 +343,9 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): @@ -576,15 +514,9 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): @@ -687,8 +619,8 @@ def create_lora_weights( ) for _ in range(n_slices)) else: self.bias_stacked = None - self.output_dim = self.lora_b_stacked[0].shape[2] + self.output_slices = (self.output_dim, self.output_dim) def reset_lora(self, index: int): self.lora_a_stacked[0][index] = 0 @@ -772,17 +704,9 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - (self.output_dim, self.output_dim), - self.bias_stacked, - ) self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, - (self.output_dim, self.output_dim)) + output, x, self.lora_a_stacked, self.lora_b_stacked, + self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) return output @classmethod @@ -1129,17 +1053,10 @@ def set_lora( def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - self.output_slices, - self.bias_stacked, - ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0, + self.lora_b_stacked, + self.bias_stacked, 1.0, self.output_slices) return output @@ -1264,15 +1181,9 @@ def set_lora( def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.lora_b_stacked, self.bias_stacked, + 1.0) return output def forward(self, input_): diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 082041f390750..3f775b7ba363e 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -450,6 +450,62 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) + def apply_bias( + self, + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, + ): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + def apply_bias_packed_nslice( + self, + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[Optional[torch.Tensor], ...], + ): + """Applies bias to output + + Input shapes: + bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + offset_left += slice + + return output.view_as(org_output) + def add_shrink( self, y: torch.Tensor, @@ -474,16 +530,19 @@ def add_expand( y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], add_input: bool = True, ): """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the + Perform the ` y+=x@w_t_all+bias` computation, which is suitable for the GEMM of lora'b. When `is_prefill` is true, it indicates that it is currently the prefill stage, and the `expand_prefill` function should be called. Otherwise, it is the decode stage, and the expand_decode function should be called. """ + if bias_all is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_all) expand_fun: Callable = (self.expand_prefill if self.is_prefill else self.expand_decode) @@ -493,23 +552,54 @@ def add_expand_slice(self, y: torch.Tensor, x: torch.Tensor, w_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], y_offset: Optional[int], y_slice_size: Optional[int], add_input: bool = True): """ Similar to `add_expand` """ + if bias_all is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_all) expand_slice_fun: Callable = (self.expand_slice_prefill if self.is_prefill else self.expand_slice_decode) expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_stacked: Optional[Tuple[torch.Tensor, + ...]], + scale: float, + output_slices: Tuple[int, ...]) -> None: + """ + Similar to `add_expand` + """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = 0 + if bias_stacked is not None: + self.apply_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_stacked) + for slice_idx in range(len(lora_b_stacked)): + self.add_expand_slice(y, + x[slice_idx], + lora_b_stacked[slice_idx], + None, + offset_left, + output_slices[slice_idx], + add_input=True) + offset_left += output_slices[slice_idx] + + y = y.view_as(y_org) + def add_lora(self, y: torch.Tensor, x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, + bias_all: Optional[torch.Tensor], scale: float, y_offset: Optional[int] = None, y_slice_size: Optional[int] = None, @@ -522,12 +612,13 @@ def add_lora(self, @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) * scale - ).squeeze(0) + ).squeeze(0)+bias[i] Args: y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor wa_t_all (torch.Tensor): lora_a's weight wb_t_all (torch.Tensor): lora_b's weight + bias_all: (torch.Tensor): lora's bias scale (float): Scaling factor. y_offset (Optional[int], optional): Offset to apply to the starting column of y. @@ -544,27 +635,26 @@ def add_lora(self, buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - + if bias_all is not None: + y = self.apply_bias(self.token_lora_indices, y, bias_all) self.add_shrink(buffer, x, wa_t_all, scale) if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, add_input=True) + self.add_expand(y, buffer, wb_t_all, bias_all=None, add_input=True) else: self.add_expand_slice(y, buffer, wb_t_all, + None, y_offset, y_slice_size, add_input=True) y = y.view_as(y_org) def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - scale: float, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + bias_all: Tuple[Optional[torch.Tensor], + ...], scale: float, output_slices: Tuple[int, ...]) -> None: """ Applies lora to each input. Similar to add_lora, This method is @@ -575,10 +665,13 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, x = x.view(-1, x.shape[-1]) y = y.view(-1, y.shape[-1]) offset_left = 0 + if bias_all is not None: + y = self.apply_bias_packed_nslice(self.token_lora_indices, y, + output_slices, bias_all) # TODO fuse these kernels for slice_idx in range(len(output_slices)): self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], scale, offset_left, + lora_b_stacked[slice_idx], None, scale, offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] From 519cc6ca12dc89eec35bc2579494e399da33c31a Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Tue, 3 Dec 2024 01:53:55 +0800 Subject: [PATCH 218/255] [Misc][XPU] Avoid torch compile for XPU platform (#10747) Signed-off-by: yan ma Co-authored-by: youkaichao --- .buildkite/run-xpu-test.sh | 6 ++++-- vllm/plugins/__init__.py | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index faeac8e2ded36..50f58f7d70430 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -12,5 +12,7 @@ remove_docker_container() { docker rm -f xpu-test || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image and launch offline inference -docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py +# Run the image and test offline inference/tensor parallel +docker run -it -d --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test /bin/bash +docker exec xpu-test bash -c "python3 examples/offline_inference.py" +docker exec xpu-test bash -c "python3 examples/offline_inference_cli.py -tp 2" diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 3c64726ca3344..81ee9975cdc4a 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -4,6 +4,7 @@ import torch import vllm.envs as envs +from vllm.platforms import current_platform logger = logging.getLogger(__name__) @@ -25,6 +26,9 @@ def load_general_plugins(): os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' # see https://github.com/vllm-project/vllm/issues/10619 torch._inductor.config.compile_threads = 1 + if current_platform.is_xpu(): + # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa + os.environ['TORCH_COMPILE_DISABLE'] = 'True' global plugins_loaded if plugins_loaded: return From 9b14d978aa8c286b738f107fab4626273f4fc088 Mon Sep 17 00:00:00 2001 From: Jani Monoses Date: Mon, 2 Dec 2024 20:52:19 +0200 Subject: [PATCH 219/255] Fix openvino on GPU (#10793) --- vllm/worker/openvino_worker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 205f8a337ce6c..0bf522d5333ed 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -489,7 +489,7 @@ def model_profile_run(): block_size = cache_config.block_size seq_num_blocks = (seq_len + block_size - 1) // block_size - seq_data, dummy_multi_modal_data = input_registry \ + dummy_data = input_registry \ .dummy_data_for_profiling(model_config, seq_len, mm_registry) @@ -498,11 +498,11 @@ def model_profile_run(): seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=block_tables, lora_request=None, - multi_modal_data=dummy_multi_modal_data) + multi_modal_data=dummy_data.multi_modal_data) seqs.append(seq) self.model_runner.block_size = tmp_cache_config.block_size From 4c05edb33ae4ae279421ddf981816d070e8ec37a Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Tue, 3 Dec 2024 07:06:09 +0800 Subject: [PATCH 220/255] [Model] Add TP and BNB quantization support to LlavaMultiModalProjector (#10834) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- vllm/model_executor/model_loader/loader.py | 14 +++++++-- vllm/model_executor/models/llava.py | 35 ++++++++++++++-------- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 0e12bc5691538..b4921cc80797f 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1120,7 +1120,14 @@ def _load_weights(self, model_config: ModelConfig, model_config.revision, pre_quant, load_8bit)) - model.load_weights(qweight_iterator) + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights(qweight_iterator) + # Some models may have weights loading tracker unimplemented. + if loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError("Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") torch.cuda.empty_cache() @@ -1152,9 +1159,10 @@ def _load_weights(self, model_config: ModelConfig, shard_name, weight_name) break + # Models like Clip/Siglip may skip some layers in initialization, + # causing unused quant_param_name in state_dict. if quant_param_name not in param_dict: - raise ValueError( - f"Parameter {quant_param_name} not found in the model.") + continue if quant_param_name not in stacked_quant_state_dict: stacked_quant_state_dict[quant_param_name] = {} diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index db7fa82ceb9b7..d375c1c9da2a9 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -13,6 +13,8 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext) from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -59,25 +61,32 @@ class LlavaImageEmbeddingInputs(TypedDict): LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] -# TODO(xwjiang): Run benchmark and decide if TP. class LlavaMultiModalProjector(nn.Module): - def __init__(self, vision_hidden_size: int, text_hidden_size: int, - projector_hidden_act: str): + def __init__(self, + vision_hidden_size: int, + text_hidden_size: int, + projector_hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() - self.linear_1 = nn.Linear(vision_hidden_size, - text_hidden_size, - bias=True) + self.linear_1 = ColumnParallelLinear(vision_hidden_size, + text_hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_1") self.act = get_act_fn(projector_hidden_act) - self.linear_2 = nn.Linear(text_hidden_size, - text_hidden_size, - bias=True) + self.linear_2 = RowParallelLinear(text_hidden_size, + text_hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_2") def forward(self, image_features: torch.Tensor) -> torch.Tensor: - hidden_states = self.linear_1(image_features) + hidden_states, _ = self.linear_1(image_features) hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) return hidden_states @@ -325,7 +334,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act) + projector_hidden_act=config.projector_hidden_act, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, From 4433195ab75e2bb367303ba5f34c97521c5677ce Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 2 Dec 2024 21:26:15 -0500 Subject: [PATCH 221/255] [Bugfix] Prevent benchmark_throughput.py from using duplicated random prompts (#10753) --- benchmarks/benchmark_throughput.py | 47 +++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 159cf055737ce..1e5967bd9bf8b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -294,23 +294,36 @@ def main(args: argparse.Namespace): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: - # Synthesize a prompt with the given input length. - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. - for i in range(-10, 10): - prompt = "hi " * (args.input_len + i) - tokenized_prompt = tokenizer(prompt).input_ids - if len(tokenized_prompt) == args.input_len: - break - else: - raise ValueError( - f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=args.input_len, - expected_output_len=args.output_len) - for _ in range(args.num_prompts) - ] + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) + for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for _ in range(5): # Max attempts to correct + candidate_prompt = tokenizer.decode(candidate_ids) + tokenized_len = len(tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend([ + random.randint(100, vocab_size - 100) + for _ in range(diff) + ]) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest(prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len)) else: requests = sample_requests(tokenizer, args) From d746268e92dc97d3a816c70637e20073eeac5103 Mon Sep 17 00:00:00 2001 From: zixuanzhang226 Date: Mon, 2 Dec 2024 19:06:41 -0800 Subject: [PATCH 222/255] [Model] support bitsandbytes quantization with minicpm model (#10842) Signed-off-by: Ubuntu --- vllm/model_executor/models/minicpm.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6254d26c7060d..5a0f202364f26 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -534,6 +534,16 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config From a4cf2561599448d4a5c3de4d79c73ca37cb8d647 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 3 Dec 2024 12:10:29 +0800 Subject: [PATCH 223/255] [Bugfix] Fix QKVParallelLinearWithShardedLora bias bug (#10844) Signed-off-by: Jee Jee Li --- .buildkite/test-pipeline.yaml | 1 - vllm/lora/fully_sharded_layers.py | 9 +-------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f5591f1098534..455f02a2062f1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -481,7 +481,6 @@ steps: - label: LoRA TP Test (Distributed) num_gpus: 4 - soft_fail: true source_file_dependencies: - vllm/lora - tests/lora diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 5f2d32defe030..e25e453201f01 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -77,13 +77,6 @@ def apply(self, x: torch.Tensor, add_input=True) # now have column partitioned output - if self.bias_stacked is not None: - self.bias_stacked = self.bias_stacked.view( - -1, self.bias_stacked.shape[-1]) - self.bias_stacked = self.bias_stacked[ - self.punica_wrapper.token_lora_indices] - output += self.bias_stacked - output = output.view(*out_orig_shape) return output @@ -222,7 +215,7 @@ def apply(self, x: torch.Tensor, self.punica_wrapper.add_expand(output, buffer, self.lora_b_stacked, - self.bias_all, + self.bias_stacked, add_input=True) # now have column partitioned output output = output.view(*out_orig_shape) From 21fe7b481a3a84dc9ebe2497ec89a17002ad52c5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 2 Dec 2024 20:53:23 -0800 Subject: [PATCH 224/255] [core][distributed] add pynccl broadcast (#10843) Signed-off-by: youkaichao --- tests/distributed/test_pynccl.py | 45 ++++++++++++++++++- .../device_communicators/pynccl.py | 19 ++++++++ .../device_communicators/pynccl_wrapper.py | 16 +++++++ 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index fb24d6bc2c100..4e27babf12cc3 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -61,6 +61,7 @@ def worker_fn(): dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() result = tensor.mean().cpu().item() assert result == pynccl_comm.world_size @@ -86,10 +87,12 @@ def multiple_allreduce_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = pynccl_comm.all_reduce(tensor) tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() result = tensor.mean().cpu().item() assert result == 4 else: tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() result = tensor.mean().cpu().item() assert result == 2 @@ -112,10 +115,12 @@ def multiple_allreduce_with_vllm_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor) + torch.cuda.synchronize() result = tensor.mean().cpu().item() assert result == 4 else: tensor = tensor_model_parallel_all_reduce(tensor) + torch.cuda.synchronize() result = tensor.mean().cpu().item() assert result == 2 @@ -141,9 +146,9 @@ def worker_fn_with_cudagraph(): graph, stream=pynccl_comm.stream), pynccl_comm.change_state( enable=True): a_out = pynccl_comm.all_reduce(a) - pynccl_comm.stream.synchronize() + torch.cuda.synchronize() graph.replay() - pynccl_comm.stream.synchronize() + torch.cuda.synchronize() assert a_out.mean().cpu().item() == pynccl_comm.world_size**1 @@ -170,6 +175,7 @@ def all_gather_worker_fn(): with pynccl_comm.change_state(enable=True): pynccl_comm.all_gather(result, tensor) + torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -207,6 +213,7 @@ def reduce_scatter_worker_fn(): with pynccl_comm.change_state(enable=True): pynccl_comm.reduce_scatter(result, tensor) + torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -241,6 +248,7 @@ def send_recv_worker_fn(): pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) + torch.cuda.synchronize() result = tensor.mean().cpu().item() assert result == 1 @@ -280,6 +288,7 @@ def multiple_send_recv_worker_fn(): pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) + torch.cuda.synchronize() result = tensor.mean().cpu().item() if torch.distributed.get_rank() in [0, 2]: assert result == 1 @@ -293,6 +302,38 @@ def test_pynccl_multiple_send_recv(): distributed_run(multiple_send_recv_worker_fn, 4) +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +def test_pynccl_broadcast(): + distributed_run(broadcast_worker_fn, 4) + + +@worker_fn_wrapper +def broadcast_worker_fn(): + # Test broadcast for every root rank. + # Essentially this is an all-gather operation. + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + recv_tensors = [ + torch.empty(16, + 1024, + 1024, + dtype=torch.float32, + device=pynccl_comm.device) + for i in range(pynccl_comm.world_size) + ] + recv_tensors[pynccl_comm.rank] = torch.ones( + 16, 1024, 1024, dtype=torch.float32, + device=pynccl_comm.device) * pynccl_comm.rank + + for i in range(pynccl_comm.world_size): + pynccl_comm.broadcast(recv_tensors[i], src=i) + # the broadcast op might be launched in a different stream + # need to synchronize to make sure the tensor is ready + torch.cuda.synchronize() + assert torch.all(recv_tensors[i] == i).cpu().item() + + def test_ncclGetUniqueId(): lib = NCCLLibrary() unique_id = lib.ncclGetUniqueId() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index d4e3f81747038..a6800f93f167b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -197,6 +197,25 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None): ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) + def broadcast(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = self.stream + if src == self.rank: + sendbuff = buffer_type(tensor.data_ptr()) + # NCCL requires the sender also to have a receive buffer + recvbuff = buffer_type(tensor.data_ptr()) + else: + sendbuff = buffer_type() + recvbuff = buffer_type(tensor.data_ptr()) + self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + self.comm, cudaStream_t(stream.cuda_stream)) + @contextmanager def change_state(self, enable: Optional[bool] = None, diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index ff88f72470b27..7dea61b6a09f1 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -189,6 +189,15 @@ class NCCLLibrary: ncclComm_t, cudaStream_t ]), + # ncclResult_t ncclBroadcast( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, int root, ncclComm_t comm, + # cudaStream_t stream); + Function("ncclBroadcast", ncclResult_t, [ + buffer_type, buffer_type, ctypes.c_size_t, ncclDataType_t, + ctypes.c_int, ncclComm_t, cudaStream_t + ]), + # be cautious! this is a collective call, it will block until all # processes in the communicator have called this function. # because Python object destruction can happen in random order, @@ -312,6 +321,13 @@ def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int, self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)) + def ncclBroadcast(self, sendbuff: buffer_type, recvbuff: buffer_type, + count: int, datatype: int, root: int, comm: ncclComm_t, + stream: cudaStream_t) -> None: + self.NCCL_CHECK(self._funcs["ncclBroadcast"](sendbuff, recvbuff, count, + datatype, root, comm, + stream)) + def ncclCommDestroy(self, comm: ncclComm_t) -> None: self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm)) From dc5ce861bf0e10fc002384859b93b1eebbd70933 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 2 Dec 2024 22:19:02 -0800 Subject: [PATCH 225/255] [torch.compile] remove compilation_context and simplify code (#10838) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 9 +- tests/compile/piecewise/test_toy_llama.py | 33 ++++---- .../decoder_only/language/test_jamba.py | 5 +- .../decoder_only/language/test_mamba.py | 5 +- .../test_encoder_decoder_model_runner.py | 4 +- tests/worker/test_model_runner.py | 5 +- vllm/compilation/backends.py | 4 - vllm/compilation/compile_context.py | 23 ----- vllm/config.py | 83 +++++++++++++++++-- vllm/model_executor/models/jamba.py | 6 +- vllm/model_executor/models/mamba.py | 6 +- vllm/v1/worker/gpu_model_runner.py | 14 ++-- vllm/worker/enc_dec_model_runner.py | 6 +- vllm/worker/model_runner.py | 68 ++------------- 14 files changed, 128 insertions(+), 143 deletions(-) delete mode 100644 vllm/compilation/compile_context.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 7ef502abee345..aa11524812cdd 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -7,7 +7,6 @@ from torch import nn from torch.library import Library -from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, @@ -81,6 +80,7 @@ def test_simple_piecewise_compile(): use_cudagraph=True, splitting_ops=["silly.attention"], cudagraph_copy_inputs=True, + cudagraph_capture_sizes=[1, 2], )) with set_current_vllm_config(vllm_config): model = SillyModel(vllm_config=vllm_config, prefix='') @@ -96,11 +96,10 @@ def test_simple_piecewise_compile(): 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): - with set_compile_context([1, 2]): - model(inputs) + model(inputs) - model(torch.randn(2).cuda()) - model(torch.randn(1).cuda()) + model(torch.randn(2).cuda()) + model(torch.randn(1).cuda()) input = torch.zeros(2).cuda() global global_counter diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index dbd5a3bbffeab..07c10a3a18c55 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -13,7 +13,6 @@ from torch import nn from torch.library import Library -from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, @@ -256,6 +255,7 @@ def run_model(llama_config, compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, + cudagraph_capture_sizes=[1, 2], ) if split_attn: compilation_config.splitting_ops = ["silly.attention"] @@ -273,10 +273,9 @@ def run_model(llama_config, input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() positions = torch.arange(B).cuda() - with set_compile_context([1, 2]): - model(input_ids, positions) - model(input_ids[:2], positions[:2]) - model(input_ids[:1], positions[:1]) + model(input_ids, positions) + model(input_ids[:2], positions[:2]) + model(input_ids[:1], positions[:1]) input_ids[:2].zero_() output = model(input_ids[:2], positions[:2]) @@ -379,10 +378,13 @@ def benchmark(): level=CompilationLevel.PIECEWISE, use_cudagraph=True, splitting_ops=["silly.attention"], + cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, ) + level=CompilationLevel.PIECEWISE, + cudagraph_capture_sizes=cudagraph_sizes, + ) vllm_config = VllmConfig(compilation_config=compilation_config) with set_current_vllm_config(vllm_config): @@ -396,17 +398,16 @@ def benchmark(): graphs = {} - with set_compile_context(cudagraph_sizes): - model(input_ids, positions) - for b in cudagraph_sizes[::-1]: - if not piecewise: - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, pool=pool): - output = model(input_ids[:b], positions[:b]) - graphs[b] = (graph, output) - else: + model(input_ids, positions) + for b in cudagraph_sizes[::-1]: + if not piecewise: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=pool): output = model(input_ids[:b], positions[:b]) - graphs[b] = (model, output) + graphs[b] = (graph, output) + else: + output = model(input_ids[:b], positions[:b]) + graphs[b] = (model, output) for b in cudagraph_sizes: if piecewise: # noqa is for `Function definition does not bind loop variable` diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 87a05b3011393..cae25ae9fa2c8 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,8 +1,8 @@ import pytest from tests.utils import multi_gpu_test +from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams -from vllm.worker.model_runner import _get_graph_batch_size from ...utils import check_outputs_equal @@ -189,7 +189,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + while len(example_prompts) == VllmConfig.get_graph_batch_size( + len(example_prompts)): example_prompts.append(example_prompts[0]) try: diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 01e208347bff4..35018c3c14dee 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -5,8 +5,8 @@ import pytest from transformers import AutoModelForCausalLM, AutoTokenizer +from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams -from vllm.worker.model_runner import _get_graph_batch_size from ...utils import check_outputs_equal @@ -200,7 +200,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + while len(example_prompts) == VllmConfig.get_graph_batch_size( + len(example_prompts)): example_prompts.append(example_prompts[0]) try: diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 9e166ae64dbfb..5289c91f201cd 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -4,12 +4,12 @@ import pytest import torch +from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -from vllm.worker.model_runner import _get_graph_batch_size BATCH_SIZES = [1, 4, 16, 64, 256] @@ -548,7 +548,7 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): # With CUDA Graph capture and replay enabled, the decoder and encoder # input sequences will be padded. Create the expected padded tensors # accordingly. - graph_batch_size = _get_graph_batch_size(expanded_batch_size) + graph_batch_size = VllmConfig.get_graph_batch_size(expanded_batch_size) cuda_graph_pad_size = graph_batch_size - expanded_batch_size padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) padded_encoder_seq_lens = encoder_seq_lens + list( diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 433a9b30ba57a..4055524f3e0c7 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -3,13 +3,14 @@ import pytest import torch +from vllm.config import VllmConfig from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size +from vllm.worker.model_runner import ModelRunner def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: @@ -176,7 +177,7 @@ def test_prepare_decode_cuda_graph(batch_size): model_input.attn_metadata, model_input.attn_metadata.slot_mapping) assert len(slot_mapping) == len(input_tokens) - expected_bs = _get_graph_batch_size(len(seq_group_metadata_list)) + expected_bs = VllmConfig.get_graph_batch_size(len(seq_group_metadata_list)) # Verify input metadata is correct for prompts. device = model_runner.device assert attn_metadata.num_prefills == 0 diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 464bc2af8fd6d..d49a83fe3981f 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -242,10 +242,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: assert not self._called, "VllmBackend can only be called once" self.graph = graph - # config is updated now, because only here can - # we get the sizes to capture for cudagraph - # from compilation context - self.compilation_configs.init_during_runtime() self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( diff --git a/vllm/compilation/compile_context.py b/vllm/compilation/compile_context.py deleted file mode 100644 index 29db3d4c637b9..0000000000000 --- a/vllm/compilation/compile_context.py +++ /dev/null @@ -1,23 +0,0 @@ -from contextlib import contextmanager -from typing import Any - -_compile_context: Any = None - - -def get_compile_context() -> Any: - """Get the current compile context.""" - return _compile_context - - -@contextmanager -def set_compile_context(context: Any): - """A context manager that stores the current compile context, - usually it is a list of sizes to specialize. - """ - global _compile_context - prev_context = _compile_context - _compile_context = context - try: - yield - finally: - _compile_context = prev_context diff --git a/vllm/config.py b/vllm/config.py index 5f50d65ec87e1..326340d3fa655 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2357,15 +2357,10 @@ def init_backend(self) -> Union[str, Callable]: from vllm.compilation.backends import VllmBackend return VllmBackend(self) - def init_during_runtime(self): + def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): """To complete the initialization of config, - we need to know the compile context, which is only available - during the first run of the model. - """ - from vllm.compilation.compile_context import get_compile_context - context = get_compile_context() - context = copy.deepcopy(context) if context is not None else [] - sizes_to_specialize: List[int] = context + we need to know the cudagraph sizes.""" + if self.cudagraph_capture_sizes is None: self.capture_sizes = sizes_to_specialize else: @@ -2386,6 +2381,21 @@ def init_during_runtime(self): self.inductor_compile_sizes = [] self.compile_sizes = self.inductor_compile_sizes + # sort to make sure cudagraph capture sizes are in descending order + self.capture_sizes.sort(reverse=True) + + +_BATCH_SIZE_ALIGNMENT = 8 +# all the token sizes that **can** be captured by cudagraph. +# they can be arbitrarily large. +# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192. +# the actual sizes to capture will be determined by the model, +# depending on the model's max_num_seqs. +# NOTE: get_graph_batch_size needs to be updated if this list is changed. +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025) +] + @dataclass class VllmConfig: @@ -2413,6 +2423,41 @@ class VllmConfig: kv_transfer_config: KVTransferConfig = field(default=None, init=True) # type: ignore + @staticmethod + def get_graph_batch_size(batch_size: int) -> int: + """Returns the padded batch size given actual batch size. + + Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, + 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... + """ + if batch_size <= 2: + return batch_size + elif batch_size <= 4: + return 4 + else: + return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // + _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) + + @staticmethod + def get_max_graph_batch_size(max_num_seqs: int) -> int: + """ + max_num_seqs: Maximum number of sequences in a batch. + _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture. + + pad the max_num_seqs if necessary by calling get_graph_batch_size, + which will deal with some edge cases like 1, 2, 4. + + if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded + size. if not, it means the padded size is larger than the largest size + in _BATCH_SIZES_TO_CAPTURE, return the largest size in + _BATCH_SIZES_TO_CAPTURE. + """ + padded_size = VllmConfig.get_graph_batch_size(max_num_seqs) + if padded_size in _BATCH_SIZES_TO_CAPTURE: + return padded_size + assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] + return _BATCH_SIZES_TO_CAPTURE[-1] + @staticmethod def _get_quantization_config( model_config: ModelConfig, @@ -2496,6 +2541,28 @@ def __post_init__(self): self.compilation_config.pass_config.enable_reshape = False self.compilation_config.level = CompilationLevel.PIECEWISE + if not envs.VLLM_USE_V1: + max_batchsize_to_capture = 0 + if self.scheduler_config is not None and \ + self.model_config is not None and \ + not self.model_config.enforce_eager: + max_batchsize_to_capture = \ + self.get_max_graph_batch_size( + self.scheduler_config.max_num_seqs) + batch_size_capture_list = [ + size for size in _BATCH_SIZES_TO_CAPTURE + if size <= max_batchsize_to_capture + ] + else: + batch_size_capture_list = [] + if self.model_config is not None and \ + not self.model_config.enforce_eager: + batch_size_capture_list = [1, 2, 4 + ] + [i for i in range(8, 513, 8)] + + self.compilation_config.init_with_cudagraph_sizes( + batch_size_capture_list) + if self.cache_config is not None and \ self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION: diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 099ca7e12b288..5d5e8ae1ee532 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention -from vllm.config import CacheConfig, VllmConfig +from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -25,8 +25,6 @@ MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, - _get_graph_batch_size) from .interfaces import HasInnerState, SupportsLoRA from .utils import maybe_prefix @@ -404,7 +402,7 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - max_batch_size = (_get_graph_batch_size( + max_batch_size = (VllmConfig.get_graph_batch_size( self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ac0d265a961f0..b32032e411b0a 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -6,7 +6,7 @@ from transformers import MambaConfig from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -23,8 +23,6 @@ MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, - _get_graph_batch_size) from .utils import maybe_prefix @@ -187,7 +185,7 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - max_batch_size = (_get_graph_batch_size( + max_batch_size = (VllmConfig.get_graph_batch_size( self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) self.mamba_cache = MambaCacheManager( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1fa47f553dfd6..4692762493f00 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -8,7 +8,6 @@ import torch.distributed import torch.nn as nn -from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context @@ -100,7 +99,11 @@ def __init__( == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. - self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] + # The convention is different. + # self.cudagraph_batch_sizes sorts in ascending order. + # The batch sizes in the config are in descending order. + self.cudagraph_batch_sizes = list( + reversed(self.vllm_config.compilation_config.capture_sizes)) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -548,10 +551,9 @@ def profile_run(self) -> None: torch.tensor([], dtype=torch.float32, device=self.device) for _ in range(self.num_attn_layers) ] - with set_compile_context(self.cudagraph_batch_sizes): - # Trigger compilation for general shape. - hidden_states = self._dummy_run(self.model, self.max_num_tokens, - dummy_kv_caches) + # Trigger compilation for general shape. + hidden_states = self._dummy_run(self.model, self.max_num_tokens, + dummy_kv_caches) logits = self.model.compute_logits(hidden_states, None) logits = logits[:self.max_num_tokens] # TODO(woosuk): Consider the memory usage of the sampler. diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index ae18c79c980c8..5697fbbaa2041 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -25,8 +25,7 @@ from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata, - _get_graph_batch_size) + ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( _add_attn_metadata_broadcastable_dict, _add_sampling_metadata_broadcastable_dict) @@ -465,7 +464,8 @@ def _prepare_encoder_model_input_tensors( # We will be using CUDA graph replay for this decode. max_len_of_block_table = self.get_max_block_per_batch() batch_size = len(encoder_seq_lens) - graph_batch_size = _get_graph_batch_size(batch_size) + graph_batch_size = self.vllm_config.get_graph_batch_size( + batch_size) assert graph_batch_size >= batch_size cuda_graph_pad_size = graph_batch_size - batch_size # extend the cross_block_tables and encoder_seq_lens to match diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index c9f06eef3f907..4388b3c1ee164 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,7 +18,6 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState -from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_kv_transfer_group, get_pp_group @@ -63,16 +62,7 @@ logger = init_logger(__name__) LORA_WARMUP_RANK = 8 -_BATCH_SIZE_ALIGNMENT = 8 -# all the token sizes that **can** be captured by cudagraph. -# they can be arbitrarily large. -# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192. -# the actual sizes to capture will be determined by the model, -# depending on the model's max_num_seqs. -# NOTE: _get_graph_batch_size needs to be updated if this list is changed. -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025) -] + _NUM_WARMUP_ITERS = 2 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") @@ -763,7 +753,6 @@ def _use_captured_graph(self, max_decode_seq_len: int, max_encoder_seq_len: int = 0) -> bool: return (decode_only and not self.runner.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] and max_decode_seq_len <= self.runner.max_seq_len_to_capture and max_encoder_seq_len <= self.runner.max_seq_len_to_capture and batch_size <= self.runner.max_batchsize_to_capture) @@ -811,7 +800,7 @@ def _get_cuda_graph_pad_size(self, max_encoder_seq_len): return -1 - graph_batch_size = _get_graph_batch_size(batch_size) + graph_batch_size = VllmConfig.get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size return graph_batch_size - batch_size @@ -1023,7 +1012,7 @@ def __init__( self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = _get_max_graph_batch_size( + self.max_batchsize_to_capture = VllmConfig.get_max_graph_batch_size( self.scheduler_config.max_num_seqs) self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ @@ -1333,14 +1322,7 @@ def profile_run(self) -> None: dtype=self.model_config.dtype, device=self.device) - graph_batch_size = self.max_batchsize_to_capture - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - if self.model_config.enforce_eager: - batch_size_capture_list = [] - with set_compile_context(batch_size_capture_list): - self.execute_model(model_input, kv_caches, intermediate_tensors) + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return @@ -1459,18 +1441,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: dtype=self.model_config.dtype, device=self.device) - graph_batch_size = self.max_batchsize_to_capture - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - with self.attn_state.graph_capture( max_batch_size), graph_capture() as graph_capture_context: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. for virtual_engine in range( self.parallel_config.pipeline_parallel_size): - for batch_size in reversed(batch_size_capture_list): + for batch_size in \ + self.vllm_config.compilation_config.capture_sizes: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, @@ -1993,37 +1971,3 @@ def forward( return self.output_buffers["hidden_states"] return self.output_buffers - - -def _get_graph_batch_size(batch_size: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... - """ - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - else: - return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // - _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) - - -def _get_max_graph_batch_size(max_num_seqs: int) -> int: - """ - max_num_seqs: Maximum number of sequences in a batch. - _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture. - - pad the max_num_seqs if necessary by calling _get_graph_batch_size, - which will deal with some edge cases like 1, 2, 4. - - if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded size. - if not, it means the padded size is larger than the largest size in - _BATCH_SIZES_TO_CAPTURE, return the largest size in _BATCH_SIZES_TO_CAPTURE. - """ - padded_size = _get_graph_batch_size(max_num_seqs) - if padded_size in _BATCH_SIZES_TO_CAPTURE: - return padded_size - assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] From ef51831ee8dbd64833b25e042d4e984d169202f9 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Tue, 3 Dec 2024 01:46:07 -0500 Subject: [PATCH 226/255] [Doc] Add github links for source code references (#10672) Signed-off-by: Russell Bryant Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- docs/requirements-docs.txt | 3 +- docs/source/conf.py | 66 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 8ea240f59c38f..5c80645b405ae 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -16,4 +16,5 @@ mistral_common >= 1.5.0 aiohttp starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file +partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args +requests diff --git a/docs/source/conf.py b/docs/source/conf.py index 96ad9a4c26b09..4a1a5fb455ff3 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,11 +10,13 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. +import inspect import logging import os import sys from typing import List +import requests from sphinx.ext import autodoc logger = logging.getLogger(__name__) @@ -34,6 +36,7 @@ extensions = [ "sphinx.ext.napoleon", "sphinx.ext.viewcode", + "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", "sphinx.ext.autodoc", @@ -94,6 +97,69 @@ def setup(app): generate_examples() +_cached_base: str = "" +_cached_branch: str = "" + + +def get_repo_base_and_branch(pr_number): + global _cached_base, _cached_branch + if _cached_base and _cached_branch: + return _cached_base, _cached_branch + + url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}" + response = requests.get(url) + if response.status_code == 200: + data = response.json() + _cached_base = data['head']['repo']['full_name'] + _cached_branch = data['head']['ref'] + return _cached_base, _cached_branch + else: + logger.error("Failed to fetch PR details: %s", response) + return None, None + + +def linkcode_resolve(domain, info): + if domain != 'py': + return None + if not info['module']: + return None + filename = info['module'].replace('.', '/') + module = info['module'] + + # try to determine the correct file and line number to link to + obj = sys.modules[module] + + # get as specific as we can + lineno: int = 0 + filename: str = "" + try: + for part in info['fullname'].split('.'): + obj = getattr(obj, part) + + if not (inspect.isclass(obj) or inspect.isfunction(obj) + or inspect.ismethod(obj)): + obj = obj.__class__ # Get the class of the instance + + lineno = inspect.getsourcelines(obj)[1] + filename = (inspect.getsourcefile(obj) + or f"{filename}.py").split("vllm/", 1)[1] + except Exception: + # For some things, like a class member, won't work, so + # we'll use the line number of the parent (the class) + pass + + if filename.startswith("checkouts/"): + # a PR build on readthedocs + pr_number = filename.split("/")[1] + filename = filename.split("/", 2)[2] + base, branch = get_repo_base_and_branch(pr_number) + if base and branch: + return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" + + # Otherwise, link to the source file on the main branch + return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" + + # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ "compressed_tensors", From 3257d449fa0fd3e05aa20cc8c5fff79ad101984f Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Tue, 3 Dec 2024 14:52:57 +0800 Subject: [PATCH 227/255] [Misc] Remove deprecated names (#10817) Signed-off-by: DarkLight1337 --- vllm/engine/async_llm_engine.py | 8 +++++-- vllm/engine/llm_engine.py | 5 ++-- vllm/engine/multiprocessing/__init__.py | 5 +++- vllm/engine/multiprocessing/client.py | 7 ++++-- vllm/entrypoints/llm.py | 11 +++++++++ vllm/inputs/__init__.py | 31 ------------------------- vllm/inputs/data.py | 31 ------------------------- vllm/model_executor/models/aria.py | 5 ++-- vllm/multimodal/__init__.py | 15 ------------ vllm/multimodal/base.py | 15 ------------ 10 files changed, 31 insertions(+), 102 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7b1bb7b05708d..4395588d29cda 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -6,6 +6,8 @@ List, Mapping, Optional, Set, Tuple, Type, Union, overload) from weakref import ReferenceType +from typing_extensions import deprecated + import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) @@ -422,7 +424,8 @@ async def get_tokenizer_async(self, return await ( self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") async def add_request_async( self, request_id: str, @@ -894,7 +897,8 @@ async def run_engine_loop(engine_ref: ReferenceType): # This method does not need to be async, but kept that way # for backwards compatibility. - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def add_request( self, request_id: str, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7911dc8d04500..dd55aa2818621 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, @@ -719,7 +719,8 @@ def _add_processed_request( def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def add_request( self, request_id: str, diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 34c161e9395ae..7020012e8bb86 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -2,6 +2,8 @@ from enum import Enum from typing import List, Mapping, Optional, Union, overload +from typing_extensions import deprecated + from vllm import PoolingParams from vllm.inputs import PromptType from vllm.lora.request import LoRARequest @@ -32,7 +34,8 @@ class RPCProcessRequest: prompt_adapter_request: Optional[PromptAdapterRequest] = None priority: int = 0 - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def __init__( self, *, diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index d26728e8c6e67..8383e774db20f 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -9,6 +9,7 @@ import psutil import zmq import zmq.asyncio +from typing_extensions import deprecated from zmq import Frame # type: ignore[attr-defined] from zmq.asyncio import Socket @@ -414,7 +415,8 @@ def errored(self) -> bool: def dead_error(self) -> BaseException: return ENGINE_DEAD_ERROR(self._errored_with) - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def generate( self, *, @@ -485,7 +487,8 @@ def generate( lora_request, trace_headers, prompt_adapter_request, priority) - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def encode( self, *, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a25c401b4ea10..65fa9873df28c 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -6,6 +6,7 @@ Union, cast, overload) from tqdm import tqdm +from typing_extensions import deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -256,6 +257,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer) @overload # LEGACY: single (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: str, @@ -268,6 +270,7 @@ def generate( ... @overload # LEGACY: multi (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: List[str], @@ -280,6 +283,7 @@ def generate( ... @overload # LEGACY: single (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: Optional[str] = None, @@ -293,6 +297,7 @@ def generate( ... @overload # LEGACY: multi (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: Optional[List[str]] = None, @@ -306,6 +311,7 @@ def generate( ... @overload # LEGACY: single or multi token ids [pos-only] + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: None, @@ -671,6 +677,7 @@ def chat( ) @overload # LEGACY: single (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: str, @@ -683,6 +690,7 @@ def encode( ... @overload # LEGACY: multi (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: List[str], @@ -695,6 +703,7 @@ def encode( ... @overload # LEGACY: single (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: Optional[str] = None, @@ -708,6 +717,7 @@ def encode( ... @overload # LEGACY: multi (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: Optional[List[str]] = None, @@ -721,6 +731,7 @@ def encode( ... @overload # LEGACY: single or multi token ids [pos-only] + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: None, diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 54fbd7a321a6f..d4402e77a3886 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -38,34 +38,3 @@ "InputProcessingContext", "InputRegistry", ] - - -def __getattr__(name: str): - import warnings - - if name == "PromptInput": - msg = ("PromptInput has been renamed to PromptType. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PromptType - - if name == "LLMInputs": - msg = ("LLMInputs has been renamed to DecoderOnlyInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return DecoderOnlyInputs - - if name == "EncoderDecoderLLMInputs": - msg = ( - "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return EncoderDecoderInputs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index fb7dbbebd7b90..e8fc78f1a66f6 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -358,34 +358,3 @@ def to_enc_dec_tuple_list( return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] - - -def __getattr__(name: str): - import warnings - - if name == "PromptInput": - msg = ("PromptInput has been renamed to PromptType. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PromptType - - if name == "LLMInputs": - msg = ("LLMInputs has been renamed to DecoderOnlyInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return DecoderOnlyInputs - - if name == "EncoderDecoderLLMInputs": - msg = ( - "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return EncoderDecoderInputs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index fa6b95f5481ad..dd4b0c75cb84d 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -32,9 +32,8 @@ maybe_prefix, merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -451,7 +450,7 @@ def get_max_multimodal_tokens(ctx): def input_mapper_for_aria(ctx, data): - return MultiModalInputs(data) + return MultiModalKwargs(data) def input_processor(ctx, llm_inputs): diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 03a5f3a91f7a1..928c31a2f2843 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -27,18 +27,3 @@ "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] - - -def __getattr__(name: str): - import warnings - - if name == "MultiModalInputs": - msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " - "The original name will take another meaning in an upcoming " - "version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return MultiModalKwargs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index bbb8fb4bc1cd1..f93722523728d 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -433,18 +433,3 @@ def index_map(self) -> "IndexMap": return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) - - -def __getattr__(name: str): - import warnings - - if name == "MultiModalInputs": - msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " - "The original name will take another meaning in an upcoming " - "version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return MultiModalKwargs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") From 9323a3153b20d4a2ca7ac04a2784609d6ce656e0 Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Tue, 3 Dec 2024 02:17:00 -0500 Subject: [PATCH 228/255] [Core][Performance] Add XGrammar support for guided decoding and set it as default (#10785) Signed-off-by: Aaron Pham Signed-off-by: mgoin Co-authored-by: mgoin --- docs/source/conf.py | 1 + requirements-common.txt | 1 + tests/entrypoints/llm/test_guided_generate.py | 27 ++ .../model_executor/test_guided_processors.py | 3 +- vllm/config.py | 15 +- vllm/engine/arg_utils.py | 9 +- vllm/engine/async_llm_engine.py | 18 +- vllm/engine/llm_engine.py | 15 +- vllm/engine/multiprocessing/client.py | 5 +- .../guided_decoding/__init__.py | 73 ++++- .../guided_decoding/xgrammar_decoding.py | 251 ++++++++++++++++++ 11 files changed, 385 insertions(+), 33 deletions(-) create mode 100644 vllm/model_executor/guided_decoding/xgrammar_decoding.py diff --git a/docs/source/conf.py b/docs/source/conf.py index 4a1a5fb455ff3..e9d9ac68c9560 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -178,6 +178,7 @@ def linkcode_resolve(domain, info): "tensorizer", "pynvml", "outlines", + "xgrammar," "librosa", "soundfile", "gguf", diff --git a/requirements-common.txt b/requirements-common.txt index 02e3d65fb774c..818f72e14be96 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,6 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 +xgrammar typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 67c79415f322a..c3706f696b264 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -159,3 +159,30 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): sampling_params=sampling_params, use_tqdm=True, guided_options_request=dict(guided_regex=sample_regex)) + + +@pytest.mark.skip_global_cleanup +def test_guided_json_object(llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=100, + guided_decoding=GuidedDecodingParams(json_object=True)) + + outputs = llm.generate( + prompts=("Generate a JSON object describing a person with name " + "and age for John Smith who is 31 years old."), + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + + # Parse to verify it is valid JSON + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index 45fab8e96b968..9f4d81b583141 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -36,7 +36,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema): @pytest.mark.asyncio -@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("backend", + ["outlines", "lm-format-enforcer", "xgrammar"]) async def test_guided_logits_processor_black_box(backend: str, sample_regex, sample_json_schema): tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') diff --git a/vllm/config.py b/vllm/config.py index 326340d3fa655..971eb36d677b8 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1789,15 +1789,15 @@ class PoolerConfig: step_tag_id: Optional[int] = None """ - If set, only the score corresponding to the ``step_tag_id`` in the + If set, only the score corresponding to the ``step_tag_id`` in the generated sentence should be returned. Otherwise, the scores for all tokens are returned. """ returned_token_ids: Optional[List[int]] = None """ - A list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of ``good_token`` and ``bad_token`` in the + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the ``math-shepherd-mistral-7b-prm`` model. """ @@ -2031,11 +2031,12 @@ def get_served_model_name(model: str, class DecodingConfig: """Dataclass which contains the decoding strategy of the engine""" - # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer' - guided_decoding_backend: str = 'outlines' + # Which guided decoding algo to use. + # 'outlines' / 'lm-format-enforcer' / 'xgrammar' + guided_decoding_backend: str = 'xgrammar' def __post_init__(self): - valid_guided_backends = ['outlines', 'lm-format-enforcer'] + valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar'] backend = self.guided_decoding_backend if backend not in valid_guided_backends: raise ValueError(f"Invalid guided_decoding_backend '{backend}," @@ -2222,7 +2223,7 @@ class CompilationConfig(BaseModel): from Python, functions can also be passed directly via Python object constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` - custom inductor passes: see PassConfig for more details - + Why we have different sizes for cudagraph and inductor: - cudagraph: a cudagraph captured for a specific size can only be used for the same size. We need to capture all the sizes we want to use. diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4aa0eebd976c9..3b776c1d9d39f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -168,7 +168,7 @@ class EngineArgs: scheduler_delay_factor: float = 0.0 enable_chunked_prefill: Optional[bool] = None - guided_decoding_backend: str = 'outlines' + guided_decoding_backend: str = 'xgrammar' # Speculative decoding configuration. speculative_model: Optional[str] = None speculative_model_quantization: Optional[str] = None @@ -364,11 +364,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--guided-decoding-backend', type=str, - default='outlines', - choices=['outlines', 'lm-format-enforcer'], + default='xgrammar', + choices=['outlines', 'lm-format-enforcer', 'xgrammar'], help='Which engine will be used for guided decoding' ' (JSON schema / regex etc) by default. Currently support ' - 'https://github.com/outlines-dev/outlines and ' + 'https://github.com/outlines-dev/outlines,' + 'https://github.com/mlc-ai/xgrammar, and ' 'https://github.com/noamgat/lm-format-enforcer.' ' Can be overridden per request via guided_decoding_backend' ' parameter.') diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 4395588d29cda..60dccd7a0812c 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,4 +1,5 @@ import asyncio +import copy import time import weakref from functools import partial @@ -507,7 +508,8 @@ async def add_request_async( sampling_params=params, tokenizer=await self.get_tokenizer_async(lora_request), default_guided_backend=self.decoding_config. - guided_decoding_backend) + guided_decoding_backend, + model_config=self.model_config) self._add_processed_request( request_id=request_id, @@ -528,22 +530,30 @@ async def check_health_async(self) -> None: async def build_guided_decoding_logits_processor_async( sampling_params: SamplingParams, tokenizer: AnyTokenizer, - default_guided_backend: str) -> SamplingParams: + default_guided_backend: str, + model_config: ModelConfig) -> SamplingParams: """Constructs logits processors based on the guided_decoding, logits_bias, and allowed_token_ids fields in sampling_params. Deletes those fields and adds the constructed logits processors to the logits_processors field. Modifies sampling params in-place and returns the modified sampling params.""" - if (guided_decoding := sampling_params.guided_decoding) is None: + if sampling_params.guided_decoding is None: return sampling_params + # Defensively copy sampling params since guided decoding logits + # processors can have different state for each request + sampling_params = copy.copy(sampling_params) + guided_decoding = sampling_params.guided_decoding + logger.debug("Building guided decoding logits processor. " "Params: %s", guided_decoding) guided_decoding.backend = guided_decoding.backend or default_guided_backend processor = await get_guided_decoding_logits_processor( - guided_params=guided_decoding, tokenizer=tokenizer) + guided_params=guided_decoding, + tokenizer=tokenizer, + model_config=model_config) if processor: if sampling_params.logits_processors is None: diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index dd55aa2818621..af66b307028cf 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,3 +1,4 @@ +import copy import time from collections import Counter as collectionsCounter from collections import deque @@ -1024,9 +1025,9 @@ def _update_num_computed_tokens_for_multi_step_prefill( This function updates num_computed_tokens for prompt sequences when Multi-Step is enabled. - seq_group: SequenceGroup to update the num_computed_tokens for. + seq_group: SequenceGroup to update the num_computed_tokens for. seq_group_meta: Metadata of the given SequenceGroup. - is_first_step_output: Optional[bool] - + is_first_step_output: Optional[bool] - When available, is_first_step_output indicates if the appended output token is the output of the first-step in multi-step. A value of None indicates that outputs from all steps in @@ -2036,7 +2037,11 @@ def _build_logits_processors( logits_processors = [] - if (guided_decoding := sampling_params.guided_decoding) is not None: + if sampling_params.guided_decoding is not None: + # Defensively copy sampling params since guided decoding logits + # processors can have different state for each request + sampling_params = copy.copy(sampling_params) + guided_decoding = sampling_params.guided_decoding logger.debug( "Building guided decoding logits processor in " @@ -2047,7 +2052,9 @@ def _build_logits_processors( self.decoding_config.guided_decoding_backend processor = get_local_guided_decoding_logits_processor( - guided_params=guided_decoding, tokenizer=tokenizer) + guided_params=guided_decoding, + tokenizer=tokenizer, + model_config=self.model_config) if processor: logits_processors.append(processor) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 8383e774db20f..d21136c03d7d2 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -474,8 +474,8 @@ def generate( trace_headers: OpenTelemetry trace headers. prompt_adapter_request: Prompt Adapter request to use for generation, if any. - priority: Priority of the request (lower means earlier handling). - Any priority other than 0 will lead to an error if the + priority: Priority of the request (lower means earlier handling). + Any priority other than 0 will lead to an error if the scheduling policy is not "priority". """ if inputs is not None: @@ -589,6 +589,7 @@ async def _process_request( default_guided_backend=(self.decoding_config.guided_decoding_backend if self.decoding_config else DecodingConfig.guided_decoding_backend), + model_config=self.model_config ) # 1) Create output queue for this requests. diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index d7b67425fcbc0..23c31fcfd7f05 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,14 +1,54 @@ -from typing import Optional +from __future__ import annotations -from vllm.logits_process import LogitsProcessor -from vllm.sampling_params import GuidedDecodingParams +from typing import TYPE_CHECKING + +from vllm.logger import init_logger + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + + from vllm.config import ModelConfig + from vllm.logits_process import LogitsProcessor + from vllm.sampling_params import GuidedDecodingParams + +logger = init_logger(__name__) + + +def maybe_backend_fallback( + guided_params: GuidedDecodingParams) -> GuidedDecodingParams: + # lm-format-enforce doesn't support grammar, fallback to xgrammar + if (guided_params.backend == "lm-format-enforcer" + and guided_params.grammar is not None): + logger.warning( + "lm-format-enforcer does not support grammar guided decoding. " + "Falling back to use xgrammar instead.") + guided_params.backend = "xgrammar" + + if guided_params.backend == "xgrammar": + # xgrammar doesn't support regex or choice, fallback to outlines + if guided_params.regex is not None or guided_params.choice is not None: + logger.warning( + "xgrammar only supports json or grammar guided decoding. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + + # xgrammar only supports EBNF grammars and uses the GBNF format + # https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + elif (guided_params.grammar is not None + and "::=" not in guided_params.grammar): + logger.warning("xgrammar only supports EBNF grammars. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + + return guided_params async def get_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer) -> Optional[LogitsProcessor]: + guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer, + model_config: ModelConfig) -> LogitsProcessor | None: + guided_params = maybe_backend_fallback(guided_params) # CFG grammar not supported by LMFE, so we use outlines instead - if guided_params.backend == 'outlines' or guided_params.grammar: + if guided_params.backend == 'outlines': # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa get_outlines_guided_decoding_logits_processor) @@ -19,17 +59,23 @@ async def get_guided_decoding_logits_processor( get_local_lm_format_enforcer_guided_decoding_logits_processor) return get_local_lm_format_enforcer_guided_decoding_logits_processor( guided_params, tokenizer) + if guided_params.backend == 'xgrammar': + from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa + get_local_xgrammar_guided_decoding_logits_processor) + return get_local_xgrammar_guided_decoding_logits_processor( + guided_params, tokenizer, model_config) raise ValueError( f"Unknown guided decoding backend '{guided_params.backend}'. " - "Must be one of 'outlines, 'lm-format-enforcer'") + "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'") def get_local_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer) -> Optional[LogitsProcessor]: + guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer, + model_config: ModelConfig) -> LogitsProcessor | None: + guided_params = maybe_backend_fallback(guided_params) # CFG grammar not supported by LMFE, so we use outlines instead - if guided_params.backend == 'outlines' or guided_params.grammar: + if guided_params.backend == 'outlines': # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa get_local_outlines_guided_decoding_logits_processor) @@ -40,7 +86,12 @@ def get_local_guided_decoding_logits_processor( get_local_lm_format_enforcer_guided_decoding_logits_processor) return get_local_lm_format_enforcer_guided_decoding_logits_processor( guided_params, tokenizer) + if guided_params.backend == 'xgrammar': + from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa + get_local_xgrammar_guided_decoding_logits_processor) + return get_local_xgrammar_guided_decoding_logits_processor( + guided_params, tokenizer, model_config) raise ValueError( f"Unknown guided decoding backend '{guided_params.backend}'. " - "Must be one of 'outlines, 'lm-format-enforcer'") + "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'") diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py new file mode 100644 index 0000000000000..8287cd6cf3aa0 --- /dev/null +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -0,0 +1,251 @@ +# noqa: UP007 +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, NamedTuple + +import torch +from transformers import PreTrainedTokenizerFast + +try: + import xgrammar as xgr + from xgrammar.base import _core as xgr_core +except ImportError: + pass + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + + from vllm.config import ModelConfig + from vllm.sampling_params import GuidedDecodingParams + + +# TODO: passing batch size to max threads here +def get_local_xgrammar_guided_decoding_logits_processor( + guided_params: GuidedDecodingParams, + tokenizer: PreTrainedTokenizer, + model_config: ModelConfig, + max_threads: int = 8): + config = GrammarConfig.from_guided_params(guided_params=guided_params, + model_config=model_config, + tokenizer=tokenizer, + max_threads=max_threads) + return XGrammarLogitsProcessor(config) + + +class TokenizerData(NamedTuple): + """Immutable container for cached tokenizer data.""" + encoded_vocab: list[str] + stop_token_ids: list[int] | None + backend_str: str + + +class TokenizerDataCache: + """Cache manager for tokenizer data to avoid repeated processing.""" + _cache: dict[int, TokenizerData] = {} + + @classmethod + def get_tokenizer_data(cls, + tokenizer: PreTrainedTokenizer) -> TokenizerData: + tokenizer_hash = hash(tokenizer) + + if tokenizer_hash not in cls._cache: + # Vendored from xgrammar logic since we cannot pickle the tokenizer + # https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501 + try: + encoded_vocab = [ + token for token, _ in sorted(tokenizer.get_vocab().items(), + key=lambda x: x[1]) + ] + except AttributeError as e: + raise ValueError( + f"Cannot get the vocabulary of the tokenizer " + f"{type(tokenizer)}. The tokenizer should have a " + "get_vocab method.") from e + + stop_token_ids = None + backend_str = xgr.VocabType.RAW + if isinstance(tokenizer, PreTrainedTokenizerFast): + backend_str = tokenizer.backend_tokenizer.to_str() + if stop_token_ids is None and hasattr( + tokenizer, + "eos_token_id") and tokenizer.eos_token_id is not None: + stop_token_ids = [tokenizer.eos_token_id] + + cls._cache[tokenizer_hash] = TokenizerData( + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str) + + return cls._cache[tokenizer_hash] + + +class GrammarCompilerCache: + """ + Cache for GrammarCompiler instances based on tokenizer. + + This cache reduces the overhead of creating new compiler instances when + using the same tokenizer configuration. + """ + _cache: dict[str, xgr.GrammarCompiler] = {} + + @classmethod + def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler: + cache_key = str(config.tokenizer_hash) + + if cache_key not in cls._cache: + assert config.encoded_vocab is not None + tokenizer_info = xgr.TokenizerInfo._create_from_handle( + xgr_core.TokenizerInfo.from_huggingface( + config.encoded_vocab, config.backend_str, + config.vocab_size, config.stop_token_ids)) + cls._cache[cache_key] = xgr.GrammarCompiler( + tokenizer_info, max_threads=config.max_threads) + + return cls._cache[cache_key] + + +@dataclass +class GrammarConfig: + """Serializable configuration for grammar compilation""" + tokenizer_hash: int + vocab_size: int + json_str: str | None = None + grammar_str: str | None = None + json_object: bool | None = None + max_threads: int = 8 + # Only populated if tokenizer_hash not in cache + encoded_vocab: list[str] | None = None + stop_token_ids: list[int] | None = None + backend_str: str | None = None + + @classmethod + def from_guided_params(cls, + guided_params: GuidedDecodingParams, + model_config: ModelConfig, + tokenizer: PreTrainedTokenizer, + max_threads: int = 8) -> GrammarConfig: + + tokenizer_hash = hash(tokenizer) + # Only get tokenizer data if not already cached + if tokenizer_hash in TokenizerDataCache._cache: + encoded_vocab = None + stop_token_ids = None + backend_str = None + else: + tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer) + encoded_vocab = tokenizer_data.encoded_vocab + stop_token_ids = tokenizer_data.stop_token_ids + backend_str = tokenizer_data.backend_str + + if guided_params.json: + if not isinstance(guided_params.json, str): + json_str = json.dumps(guided_params.json) + else: + json_str = guided_params.json + return cls(json_str=json_str, + vocab_size=model_config.hf_config.vocab_size, + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str, + tokenizer_hash=tokenizer_hash, + max_threads=max_threads) + elif guided_params.grammar: + return cls(grammar_str=guided_params.grammar, + vocab_size=model_config.hf_config.vocab_size, + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str, + tokenizer_hash=tokenizer_hash, + max_threads=max_threads) + elif guided_params.json_object: + return cls(json_object=True, + vocab_size=model_config.hf_config.vocab_size, + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str, + tokenizer_hash=tokenizer_hash, + max_threads=max_threads) + else: + raise ValueError( + "Currently only support JSON and EBNF grammar mode for xgrammar" + ) + + +@dataclass +class XGrammarLogitsProcessor: + """Wrapper class to support pickle protocol""" + config: GrammarConfig + + ctx: xgr.CompiledGrammar | None = None + token_bitmask: torch.Tensor = None # type: ignore[assignment] + matchers: list[xgr.GrammarMatcher] = field(default_factory=list) + batch_size: int = field(default=1) + prefilled: bool = field(default=False) + + def __getstate__(self) -> dict[str, Any]: + return {'config': self.config} + + def __setstate__(self, state: dict[str, Any]): + self.config = state['config'] + + self.ctx = None + self.matchers = [] + self.batch_size = 1 + self.token_bitmask = None # type: ignore[assignment] + self.prefilled = False + + def _ensure_ctx(self): + """Lazily initialize the processor in the worker process""" + if self.ctx is None: + compiler = GrammarCompilerCache.get_compiler(self.config) + if self.config.json_str is not None: + self.ctx = compiler.compile_json_schema(self.config.json_str) + elif self.config.grammar_str is not None: + self.ctx = compiler.compile_grammar(self.config.grammar_str) + elif self.config.json_object: + self.ctx = compiler.compile_builtin_json_grammar() + else: + raise ValueError( + "Invalid configuration for xgrammar logits processor") + + def __call__(self, input_ids: list[int], + scores: torch.Tensor) -> torch.Tensor: + if self.ctx is None: + self._ensure_ctx() + + if len(self.matchers) == 0: + self.matchers = [ + xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size) + ] + self.token_bitmask = xgr.allocate_token_bitmask( + self.batch_size, self.config.vocab_size) + + if not self.prefilled: + # Have not sampled a token yet + self.prefilled = True + else: + for i, matcher in enumerate(self.matchers): + if not matcher.is_terminated(): + sampled_token = input_ids[-1] + assert self.matchers[i].accept_token(sampled_token) + + for i, matcher in enumerate(self.matchers): + if not matcher.is_terminated(): + # @ubospica: ideally, fill_next_token_bitmask should be + # parallelized with model decoding + # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303 + matcher.fill_next_token_bitmask(self.token_bitmask, i) + + # token_bitmask is a CPU tensor for use with accept_token and + # fill_next_token_bitmask so we move it to the device of scores + device_type = scores.device.type + if device_type != "cuda": + scores = scores.to("cpu") + xgr.apply_token_bitmask_inplace(scores, + self.token_bitmask.to(scores.device)) + if device_type != "cuda": + scores = scores.to(device_type) + + return scores From f6084f63248a89df52bed9d9c24d6604f87e51f3 Mon Sep 17 00:00:00 2001 From: Yang Zheng <50227060+zhengy001@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:01:39 +0800 Subject: [PATCH 229/255] [Speculative Decoding] Move indices to device before filtering output (#10850) Co-authored-by: Yang Zheng(SW)(Alex) --- vllm/spec_decode/multi_step_worker.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index d249b37c780e4..676ac5eb3609d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -120,6 +120,9 @@ def sampler_output( indices_of_seq_with_bonus_tokens) model_outputs.append(model_output) + # move indices to device to avoid stream sync + indices_of_seq_with_bonus_tokens = torch.tensor( + indices_of_seq_with_bonus_tokens, device=self.device) filtered_model_outputs = self._filter_model_output( model_outputs, indices_of_seq_with_bonus_tokens) return filtered_model_outputs, True @@ -189,7 +192,7 @@ def _expand_execute_model_request( @staticmethod def _filter_model_output( expanded_batch_outputs: List[SamplerOutput], - output_indices_to_retain: List[int]) -> List[SamplerOutput]: + output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]: """ Filters the model output to include only the specified sequence outputs. This method contracts the expanded batch output from the @@ -199,8 +202,8 @@ def _filter_model_output( Args: expanded_batch_output (List[SamplerOutput]): The expanded output batch from the model. - output_indices_to_retain (List[int]): Indices of the model outputs - to retain. + output_indices_to_retain (torch.Tensor): Indices of the model + outputs to retain. Returns: List[SamplerOutput]: A list containing the filtered model From 3bc94cab695387eb16be90b6368029f56ce5dbc7 Mon Sep 17 00:00:00 2001 From: Alexander Matveev <59768536+alexm-neuralmagic@users.noreply.github.com> Date: Tue, 3 Dec 2024 05:33:10 -0500 Subject: [PATCH 230/255] [V1] VLM - Run the mm_mapper preprocessor in the frontend process (#10640) Signed-off-by: Roger Wang Co-authored-by: Michael Goin Co-authored-by: Roger Wang --- tests/v1/engine/test_engine_core.py | 3 +-- tests/v1/engine/test_engine_core_client.py | 3 +-- vllm/inputs/data.py | 24 +++++++++++++++++++++- vllm/v1/engine/__init__.py | 7 +++---- vllm/v1/engine/core.py | 7 ------- vllm/v1/engine/processor.py | 13 ++++++++++-- vllm/v1/request.py | 15 +++++++------- 7 files changed, 47 insertions(+), 25 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index bd11ff1877064..fef44ac29c41f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -27,9 +27,8 @@ def make_request() -> EngineCoreRequest: request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, - mm_data=None, + mm_inputs=None, mm_placeholders=None, - mm_processor_kwargs=None, sampling_params=SamplingParams(), eos_token_id=None, arrival_time=time.time(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 582192196aaf9..4e003a25e91d2 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -29,9 +29,8 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, - mm_data=None, + mm_inputs=None, mm_placeholders=None, - mm_processor_kwargs=None, sampling_params=params, eos_token_id=None, arrival_time=time.time(), diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index e8fc78f1a66f6..85aaaa776907f 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -7,7 +7,8 @@ from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict + from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, + MultiModalPlaceholderDict) from vllm.multimodal.inputs import MultiModalInputsV2 @@ -150,6 +151,12 @@ class TokenInputs(TypedDict): if the model supports it. """ + multi_modal_inputs: NotRequired["MultiModalKwargs"] + """ + Optional multi-modal inputs to pass to the model, + if the model supports it. + """ + multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"] """ Placeholder ranges for the multi-modal data. @@ -169,6 +176,7 @@ def token_inputs( token_type_ids: Optional[List[int]] = None, prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, + multi_modal_inputs: Optional["MultiModalKwargs"] = None, multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: @@ -181,6 +189,8 @@ def token_inputs( inputs["token_type_ids"] = token_type_ids if multi_modal_data is not None: inputs["multi_modal_data"] = multi_modal_data + if multi_modal_inputs is not None: + inputs["multi_modal_inputs"] = multi_modal_inputs if multi_modal_placeholders is not None: inputs["multi_modal_placeholders"] = multi_modal_placeholders if mm_processor_kwargs is not None: @@ -273,6 +283,18 @@ def multi_modal_data(self) -> "MultiModalDataDict": assert_never(inputs) + @cached_property + def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_inputs", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + @cached_property def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": inputs = self.inputs diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 967124fd850ea..3cf0e610ae7af 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,11 +1,11 @@ import enum from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Union import msgspec from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict +from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -35,9 +35,8 @@ class EngineCoreRequest: # always be tokenized? prompt: Optional[str] prompt_token_ids: List[int] - mm_data: Optional[MultiModalDataDict] + mm_inputs: Optional[List[MultiModalKwargs]] mm_placeholders: Optional[MultiModalPlaceholderDict] - mm_processor_kwargs: Optional[Dict[str, Any]] sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 34f99dd30ef2e..397a33eed3896 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -84,14 +84,7 @@ def _initialize_kv_caches(self, def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" - req = Request.from_engine_core_request(request) - # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may - # take 10-50 ms, which can cause a spike in the latency. We should - # consider moving this to a separate thread. - if req.mm_data: - req.mm_inputs = self.mm_input_mapper.process_inputs( - req.mm_data, req.mm_processor_kwargs) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c1577190c75a..7a1ea2530abda 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -14,6 +14,7 @@ from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest +from vllm.v1.engine.mm_input_mapper import MMInputMapper class Processor: @@ -39,6 +40,9 @@ def __init__( self.input_processor = input_registry.create_input_processor( model_config) + # Multi-modal (huggingface) input mapper + self.mm_input_mapper = MMInputMapper(model_config) + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the # asyncio loop while this is running. @@ -96,6 +100,12 @@ def process_inputs( sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) + # Preprocess multi-modal data + mm_inputs = self.mm_input_mapper.process_inputs( + decoder_inputs.multi_modal_data, + decoder_inputs.mm_processor_kwargs) if len( + decoder_inputs.multi_modal_data) > 0 else None + # Make Request for Detokenizer. detokenizer_request = DetokenizerRequest( request_id, @@ -113,9 +123,8 @@ def process_inputs( request_id, decoder_inputs.prompt, decoder_inputs.prompt_token_ids, - decoder_inputs.multi_modal_data, + mm_inputs, decoder_inputs.multi_modal_placeholders, - decoder_inputs.mm_processor_kwargs, sampling_params, eos_token_id, arrival_time, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 51fb4003e5fe0..6bc1e4d5c769f 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -45,9 +45,6 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 - # Raw multimodal data before the mm input mapper (e.g., PIL images). - self.mm_data = self.inputs.multi_modal_data - self.mm_processor_kwargs = self.inputs.mm_processor_kwargs mm_positions = self.inputs.multi_modal_placeholders if mm_positions: # FIXME(woosuk): Support other modalities. @@ -55,7 +52,10 @@ def __init__( else: self.mm_positions = [] # Output of the mm input mapper (e.g., image tensors). - self.mm_inputs: List[MultiModalKwargs] = [] + if self.inputs.multi_modal_inputs: + self.mm_inputs = self.inputs.multi_modal_inputs + else: + self.mm_inputs: List[MultiModalKwargs] = [] @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": @@ -64,9 +64,10 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": inputs=token_inputs( prompt_token_ids=request.prompt_token_ids, prompt=request.prompt, - multi_modal_data=request.mm_data, + multi_modal_data=None, + multi_modal_inputs=request.mm_inputs, multi_modal_placeholders=request.mm_placeholders, - mm_processor_kwargs=request.mm_processor_kwargs, + mm_processor_kwargs=None, ), sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, @@ -110,7 +111,7 @@ def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: - return len(self.mm_data) > 0 + return len(self.mm_inputs) > 0 @property def num_encoder_inputs(self) -> int: From 2f2cdc745a7a569637c58cfd5f6789c1d0741c84 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Wed, 4 Dec 2024 01:16:31 +0800 Subject: [PATCH 231/255] [MISC][XPU] quick fix for XPU CI (#10859) Signed-off-by: yan ma --- .buildkite/run-xpu-test.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 50f58f7d70430..e0a12afbe7320 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -13,6 +13,7 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and test offline inference/tensor parallel -docker run -it -d --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test /bin/bash -docker exec xpu-test bash -c "python3 examples/offline_inference.py" -docker exec xpu-test bash -c "python3 examples/offline_inference_cli.py -tp 2" +docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' + python3 examples/offline_inference.py + python3 examples/offline_inference_cli.py -tp 2 +' From 7090c27bb2cb0d9c4e0acd644e484291df3aff2a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Tue, 3 Dec 2024 13:32:21 -0500 Subject: [PATCH 232/255] [Bugfix] Only require XGrammar on x86 (#10865) Signed-off-by: mgoin --- requirements-common.txt | 2 +- .../guided_decoding/__init__.py | 7 +++++ vllm/platforms/__init__.py | 4 +-- vllm/platforms/interface.py | 26 +++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 818f72e14be96..72fb020a82c4e 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 -xgrammar +xgrammar >= 0.1.5; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 23c31fcfd7f05..3340bad38ab73 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from vllm.logger import init_logger +from vllm.platforms import CpuArchEnum, current_platform if TYPE_CHECKING: from transformers import PreTrainedTokenizer @@ -25,6 +26,12 @@ def maybe_backend_fallback( guided_params.backend = "xgrammar" if guided_params.backend == "xgrammar": + # xgrammar only has x86 wheels for linux, fallback to outlines + if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: + logger.warning("xgrammar is only supported on x86 CPUs. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + # xgrammar doesn't support regex or choice, fallback to outlines if guided_params.regex is not None or guided_params.choice is not None: logger.warning( diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 7cb8ac4b0a1e0..419237c252ffd 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,5 +1,5 @@ from .interface import _Backend # noqa: F401 -from .interface import Platform, PlatformEnum, UnspecifiedPlatform +from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform current_platform: Platform @@ -120,4 +120,4 @@ def cuda_is_jetson() -> bool: else: current_platform = UnspecifiedPlatform() -__all__ = ['Platform', 'PlatformEnum', 'current_platform'] +__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index eac2b413f9271..0be7df7941b8b 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,4 +1,5 @@ import enum +import platform import random from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union @@ -37,6 +38,14 @@ class PlatformEnum(enum.Enum): UNSPECIFIED = enum.auto() +class CpuArchEnum(enum.Enum): + X86 = enum.auto() + ARM = enum.auto() + POWERPC = enum.auto() + OTHER = enum.auto() + UNKNOWN = enum.auto() + + class DeviceCapability(NamedTuple): major: int minor: int @@ -184,6 +193,23 @@ def verify_quantization(cls, quant: str) -> None: f"{quant} quantization is currently not supported in " f"{cls.device_name}.") + @classmethod + def get_cpu_architecture(cls) -> CpuArchEnum: + """ + Determine the CPU architecture of the current system. + Returns CpuArchEnum indicating the architecture type. + """ + machine = platform.machine().lower() + + if machine in ("x86_64", "amd64", "i386", "i686"): + return CpuArchEnum.X86 + elif machine.startswith("arm") or machine.startswith("aarch"): + return CpuArchEnum.ARM + elif machine.startswith("ppc"): + return CpuArchEnum.POWERPC + + return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED From 7c32b6861e20b6521959b6cc1ce7ccc84614974d Mon Sep 17 00:00:00 2001 From: tomeras91 <57313761+tomeras91@users.noreply.github.com> Date: Tue, 3 Dec 2024 21:13:31 +0200 Subject: [PATCH 233/255] [Frontend] correctly record prefill and decode time metrics (#10853) Signed-off-by: Tomer Asida --- vllm/engine/metrics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 4869557ba9b44..a5ae21c3966a7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -599,9 +599,9 @@ def _log_prometheus(self, stats: Stats) -> None: stats.time_queue_requests) self._log_histogram(self.metrics.histogram_inference_time_request, stats.time_inference_requests) - self._log_histogram(self.metrics.histogram_decode_time_request, - stats.time_prefill_requests) self._log_histogram(self.metrics.histogram_prefill_time_request, + stats.time_prefill_requests) + self._log_histogram(self.metrics.histogram_decode_time_request, stats.time_decode_requests) self._log_histogram(self.metrics.histogram_time_in_queue_request, stats.time_in_queue_requests) From a061fe601eb165f11a4808b3ab1ac57d99e0d84e Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Tue, 3 Dec 2024 15:47:55 -0500 Subject: [PATCH 234/255] [Build][Bugfix] Using the correct type hint (#10866) Signed-off-by: Gregory Shtrasberg --- vllm/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 0165a22582e7b..07bf82e24cbe6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1540,9 +1540,9 @@ def __len__(self): return len(self._factory) -class ClassRegistry(UserDict[type[T], _V]): +class ClassRegistry(UserDict[Type[T], _V]): - def __getitem__(self, key: type[T]) -> _V: + def __getitem__(self, key: Type[T]) -> _V: for cls in key.mro(): if cls in self.data: return self.data[cls] From 381ac93bb5a41347a025367bc58119cb45357095 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 3 Dec 2024 18:21:06 -0600 Subject: [PATCH 235/255] [Benchmark] Benchmark structured output with datasets (#10557) Signed-off-by: Aaron Pham Signed-off-by: Chendi Xue Co-authored-by: Aaron Pham --- benchmarks/benchmark_guided.py | 494 ++++++++++++++++++ .../structured_schema_1.json | 113 ++++ 2 files changed, 607 insertions(+) create mode 100644 benchmarks/benchmark_guided.py create mode 100644 benchmarks/structured_schemas/structured_schema_1.json diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py new file mode 100644 index 0000000000000..1a0e62598bfcb --- /dev/null +++ b/benchmarks/benchmark_guided.py @@ -0,0 +1,494 @@ +"""Benchmark guided decoding throughput.""" +import argparse +import dataclasses +import json +import os +import random +import time +from typing import List + +import datasets +import pandas as pd +import uvloop +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) +from vllm.sampling_params import GuidedDecodingParams +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str = 'json' + completion: str = None + + +def run_vllm(requests: List[SampleRequest], + engine_args: EngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False) -> float: + from vllm import LLM, SamplingParams + llm = LLM(**vars(engine_args)) + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + # create a list containing random selected true or false + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if guided_decoding_rate > 0 else None, + )) + llm.generate(prompts, sampling_params, use_tqdm=False) + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + **{request.structure_type: request.schema}) + if i in guided_decoding_req_idx else None, + )) + + start = time.perf_counter() + outputs = llm.generate(prompts, sampling_params, use_tqdm=False) + ret = [] + for output, request in zip(outputs, requests): + generated_text = output.outputs[0].text + ret.append({ + "generated": generated_text, + "expected": request.completion + }) + end = time.perf_counter() + return end - start, ret + + +async def run_vllm_async( + requests: List[SampleRequest], + engine_args: AsyncEngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False, + disable_frontend_multiprocessing: bool = False) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing) as llm: + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + json=request.schema) + if guided_decoding_rate > 0 else None, + )) + generators = [] + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if i in guided_decoding_req_idx else None, + )) + + generators = [] + start_time = [] + latencies = [] + start = time.perf_counter() + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + start_time.append(time.perf_counter()) + latencies.append([]) + all_gens = merge_async_iterators(*generators) + generated_texts = [''] * len(requests) + async for i, res in all_gens: + generated_texts[i] = res.outputs[0].text + lat = time.perf_counter() - start_time[i] + latencies[i].append(lat) + ret = [{ + 'generated': gt, + 'expected': req.completion + } for gt, req in zip(generated_texts, requests)] + end = time.perf_counter() + first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) + next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 + for lat in latencies]) + return end - start, ret, (first_latency, next_latency) + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + args.warmup = False + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + completion=completion)) + + return requests + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # async engine is working for 'regex', 'choice' and 'grammar' + if args.dataset == 'grammar': + args.structure_type = 'grammar' + args.async_engine = False + elif args.dataset == 'regex': + args.structure_type = 'regex' + args.async_engine = False + elif args.dataset == 'choice': + args.structure_type = 'choice' + args.async_engine = False + else: + args.structure_type = 'json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += f"_async{args.async_engine}" + result_file_name += f"_warmup{args.warmup}" + result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" + result_file_name += ".txt" + else: + result_file_name = None + + # Synthesize a prompt with the given input length. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + requests = sample_requests(tokenizer, args) + + if args.async_engine: + engine_args = AsyncEngineArgs.from_cli_args(args) + elapsed_time, ret, (first_latency, next_latency) = uvloop.run( + run_vllm_async(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup, + args.disable_frontend_multiprocessing)) + else: + engine_args = EngineArgs.from_cli_args(args) + elapsed_time, ret = run_vllm(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup) + first_latency, next_latency = None, None + + score = evaluate(ret, args) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if first_latency is not None: + latency_breakdown = "\nFirst token latency(msecs):\n" + latency_breakdown += f"{first_latency.describe()}" + latency_breakdown += "\nNext token latency(msecs):\n" + latency_breakdown += f"{next_latency.describe()}" + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s", + f"Correct rate is {score} %", + f"{latency_breakdown if first_latency is not None else ''}") + + # Output JSON results if specified + if args.output_json or result_file_name: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "total_output_tokens": total_output_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", + "output_tokens_per_second": + f"{total_output_tokens / elapsed_time:.2f}", + "correct_rate(%)": score + } + results = {"outputs": ret, **results} + if first_latency is not None: + results["first_token_latency(msecs)"] = first_latency.describe( + ).to_dict() + results["next_token_latency(msecs)"] = next_latency.describe( + ).to_dict() + if args.output_json: + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + elif result_file_name: + with open(result_file_name, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark guided decoding.") + parser = AsyncEngineArgs.add_cli_args(parser) + + parser.add_argument("--output-len", + type=int, + default=512, + help="Output length for each request. Overrides the " + "output length from the dataset.") + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", + type=int, + default=10, + help="Number of prompts to process.") + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the throughput results in JSON format.') + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") + parser.add_argument("--warmup", + action="store_true", + default=False, + help="Run warmup prompts before benchmark.") + parser.add_argument("--save-results", + action="store_true", + default=False, + help="save output results.") + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + main(args) diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json new file mode 100644 index 0000000000000..6003698469e8d --- /dev/null +++ b/benchmarks/structured_schemas/structured_schema_1.json @@ -0,0 +1,113 @@ +{ + "$schema": + "https://json-schema.org/draft/2020-12/schema", + "title": + "User Profile", + "type": + "object", + "properties": { + "userId": { + "type": "string", + "description": "Unique identifier for the user." + }, + "personalInfo": { + "type": "object", + "properties": { + "firstName": { + "type": "string", + "description": "The user's first name." + }, + "lastName": { + "type": "string", + "description": "The user's last name." + }, + "age": { + "type": "integer", + "minimum": 0, + "description": "The user's age." + }, + "phoneNumbers": { + "type": + "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["home", "work", "mobile"], + "description": "Type of phone number." + }, + "number": { + "type": "string", + "pattern": "^\\+?[1-9]\\d{1,14}$", + "description": "Phone number in E.164 format." + } + }, + "required": ["type", "number"] + }, + "description": + "List of phone numbers associated with the user." + } + }, + "required": ["firstName", "lastName"] + }, + "address": { + "type": "object", + "properties": { + "street": { + "type": "string", + "description": "Street address." + }, + "city": { + "type": "string", + "description": "City name." + }, + "state": { + "type": "string", + "description": "State or province." + }, + "postalCode": { + "type": "string", + "pattern": "^\\d{5}(-\\d{4})?$", + "description": "Postal code." + }, + "country": { + "type": "string", + "description": "Country name." + } + }, + "required": ["street", "city", "state", "postalCode", "country"] + }, + "preferences": { + "type": "object", + "properties": { + "newsletterSubscribed": { + "type": + "boolean", + "description": + "Indicates if the user is subscribed to the newsletter." + }, + "favoriteCategories": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of user's favorite categories." + } + }, + "required": ["newsletterSubscribed"] + }, + "accountStatus": { + "type": "string", + "enum": ["active", "inactive", "suspended"], + "description": "Current status of the user's account." + }, + "registrationDate": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 formatted date-time of user registration." + } + }, + "required": + ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] +} \ No newline at end of file From d2bd88b1226fc93ba42cdcba51daff5e026343f0 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Tue, 3 Dec 2024 22:23:21 -0500 Subject: [PATCH 236/255] [CI/Build] Replace mean with torch.all in test_pynccl.py (#10876) Signed-off-by: Tyler Michael Smith --- tests/distributed/test_pynccl.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 4e27babf12cc3..3e9b0e10a11d8 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -62,8 +62,7 @@ def worker_fn(): with pynccl_comm.change_state(enable=True): tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() - result = tensor.mean().cpu().item() - assert result == pynccl_comm.world_size + assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -88,13 +87,11 @@ def multiple_allreduce_worker_fn(): tensor = pynccl_comm.all_reduce(tensor) tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() - result = tensor.mean().cpu().item() - assert result == 4 + assert torch.all(tensor == 4).cpu().item() else: tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() - result = tensor.mean().cpu().item() - assert result == 2 + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -116,13 +113,11 @@ def multiple_allreduce_with_vllm_worker_fn(): tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor) torch.cuda.synchronize() - result = tensor.mean().cpu().item() - assert result == 4 + assert torch.all(tensor == 4).cpu().item() else: tensor = tensor_model_parallel_all_reduce(tensor) torch.cuda.synchronize() - result = tensor.mean().cpu().item() - assert result == 2 + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -149,7 +144,7 @@ def worker_fn_with_cudagraph(): torch.cuda.synchronize() graph.replay() torch.cuda.synchronize() - assert a_out.mean().cpu().item() == pynccl_comm.world_size**1 + assert torch.all(a_out == pynccl_comm.world_size).cpu().item() @worker_fn_wrapper @@ -249,8 +244,7 @@ def send_recv_worker_fn(): src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() - result = tensor.mean().cpu().item() - assert result == 1 + assert torch.all(tensor == 1).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -289,11 +283,10 @@ def multiple_send_recv_worker_fn(): src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() - result = tensor.mean().cpu().item() if torch.distributed.get_rank() in [0, 2]: - assert result == 1 + assert torch.all(tensor == 1).cpu().item() else: - assert result == 2 + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, From b5b647b084de3a5a29d35ca527c9901f8e6a4e7e Mon Sep 17 00:00:00 2001 From: wangxiyuan Date: Wed, 4 Dec 2024 12:32:21 +0800 Subject: [PATCH 237/255] Drop ROCm load format check (#10767) Signed-off-by: wangxiyuan --- vllm/config.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 971eb36d677b8..1cbab8ea30249 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -931,7 +931,9 @@ def __post_init__(self): if isinstance(model_loader_extra_config, str): self.model_loader_extra_config = json.loads( model_loader_extra_config) - self._verify_load_format() + if isinstance(self.load_format, str): + load_format = self.load_format.lower() + self.load_format = LoadFormat(load_format) if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: logger.info( @@ -940,25 +942,6 @@ def __post_init__(self): else: self.ignore_patterns = ["original/**/*"] - def _verify_load_format(self) -> None: - if not isinstance(self.load_format, str): - return - - load_format = self.load_format.lower() - self.load_format = LoadFormat(load_format) - - rocm_not_supported_load_format: List[str] = [] - if current_platform.is_rocm( - ) and load_format in rocm_not_supported_load_format: - rocm_supported_load_format = [ - f for f in LoadFormat.__members__ - if (f not in rocm_not_supported_load_format) - ] - raise ValueError( - f"load format '{load_format}' is not supported in ROCm. " - f"Supported load formats are " - f"{rocm_supported_load_format}") - @dataclass class ParallelConfig: From fa2dea61df9bb3fa3dbd081f42f464c45e3db5b2 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 3 Dec 2024 23:02:16 -0800 Subject: [PATCH 238/255] [ci/build] Change queue name for Release jobs (#10875) --- .buildkite/release-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index f78e360b7afd3..173b52f072502 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,7 +1,7 @@ steps: - label: "Build wheel - CUDA 12.1" agents: - queue: cpu_queue + queue: cpu_queue_postmerge commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" @@ -18,7 +18,7 @@ steps: - label: "Build wheel - CUDA 11.8" # depends_on: block-build-cu118-wheel agents: - queue: cpu_queue + queue: cpu_queue_postmerge commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" From c9ca4fce3f48e27801e1bad03d4bc0b963567d24 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Tue, 3 Dec 2024 23:02:40 -0800 Subject: [PATCH 239/255] [ci/build] Job to build and push release image (#10877) --- .buildkite/release-pipeline.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 173b52f072502..93e118fb3eab8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -26,3 +26,16 @@ steps: - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" + + - block: "Build release image" + depends_on: ~ + key: block-release-image-build + + - label: "Build release image" + depends_on: block-release-image-build + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" From 8db957ee3a8234574430d9e570e520501d8539e9 Mon Sep 17 00:00:00 2001 From: jianzheng <57654625+o2363286@users.noreply.github.com> Date: Wed, 4 Dec 2024 16:48:22 +0800 Subject: [PATCH 240/255] =?UTF-8?q?[bugfix]=20fixed=20parameter=20?= =?UTF-8?q?=E2=80=9Cn=E2=80=9D=20when=20set=20parameter=20=E2=80=9Cbestof?= =?UTF-8?q?=E2=80=9D=20>=201=20(#10854)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jianzheng <57654625+o2363286@users.noreply.github.com> --- vllm/sampling_params.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5c6df5aaf5446..fc77f3ca529b2 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -293,8 +293,9 @@ def __post_init__(self) -> None: raise ValueError( f"best_of must be greater than or equal to n, " f"got n={self.n} and best_of={self.best_of}.") - self._real_n = self.n - self.n = self.best_of + if not self._real_n: + self._real_n = self.n + self.n = self.best_of if 0 < self.temperature < _MAX_TEMP: logger.warning( From c92acb9693c0504d7dabed2a0251b9f5d4ddaebb Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 4 Dec 2024 01:01:20 -0800 Subject: [PATCH 241/255] [ci/build] Update vLLM postmerge ECR repo (#10887) --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 6 +++--- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh | 4 ++-- docs/source/getting_started/installation.rst | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 3db77d5f16022..dd2ce454ecb2d 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -21,7 +21,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh resources: @@ -51,7 +51,7 @@ steps: queue: H200 plugins: - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -71,7 +71,7 @@ steps: queue: H100 plugins: - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index 19f7160e68a4d..aa0f7ade808e0 100644 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -1,6 +1,6 @@ #!/bin/sh -TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) -URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" +TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" TIMEOUT_SECONDS=10 diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index e3dbbc9affe66..52412fa8437b9 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -73,7 +73,7 @@ Another way to access the latest code is to use the docker images: .. code-block:: console $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT} + $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. From 01d079fd8e65ed9a243ebbf6b771393607942907 Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Wed, 4 Dec 2024 09:40:16 -0800 Subject: [PATCH 242/255] [LoRA] Change lora_tokenizers capacity (#10796) Signed-off-by: Xin Yang --- tests/lora/test_tokenizer_group.py | 20 +++++++++++++++++++ vllm/engine/llm_engine.py | 2 +- vllm/engine/multiprocessing/client.py | 3 +-- .../tokenizer_group/__init__.py | 9 +++++---- .../tokenizer_group/tokenizer_group.py | 3 ++- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- 7 files changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index daa39b2a3dba1..d225a3f7d6c06 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -17,6 +17,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): tokenizer_id="gpt2", enable_lora=True, max_num_seqs=1, + max_loras=1, max_input_length=None, ) lora_request = LoRARequest("1", 1, sql_lora_files) @@ -53,3 +54,22 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path): lora_request = LoRARequest("1", 1, str(tmp_path)) tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer + + +@pytest.mark.parametrize("enable_lora", [True, False]) +@pytest.mark.parametrize("max_num_seqs", [1, 2]) +@pytest.mark.parametrize("max_loras", [1, 2]) +def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): + tokenizer_group = get_tokenizer_group( + get_tokenizer_pool_config(None), + tokenizer_id="gpt2", + enable_lora=enable_lora, + max_num_seqs=max_num_seqs, + max_loras=max_loras, + max_input_length=None, + ) + if enable_lora: + assert tokenizer_group.lora_tokenizers.capacity == max( + max_num_seqs, max_loras) + else: + assert tokenizer_group.lora_tokenizers.capacity == 0 diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index af66b307028cf..1f3c6197ba1a8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -620,7 +620,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup: model_config=self.model_config, scheduler_config=self.scheduler_config, parallel_config=self.parallel_config, - enable_lora=bool(self.lora_config)) + lora_config=self.lora_config) def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index d21136c03d7d2..7e4f81b2cf8e2 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -94,8 +94,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, model_config=self.model_config, scheduler_config=engine_config.scheduler_config, parallel_config=engine_config.parallel_config, - enable_lora=bool(engine_config.lora_config), - ) + lora_config=engine_config.lora_config) self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer) diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 6a114b513f382..c0b3d2585a962 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -1,7 +1,7 @@ from typing import Optional, Type -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - TokenizerPoolConfig) +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig, TokenizerPoolConfig) from vllm.executor.ray_utils import ray from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup @@ -16,10 +16,11 @@ def init_tokenizer_from_configs(model_config: ModelConfig, scheduler_config: SchedulerConfig, parallel_config: ParallelConfig, - enable_lora: bool): + lora_config: LoRAConfig): init_kwargs = dict(tokenizer_id=model_config.tokenizer, - enable_lora=enable_lora, + enable_lora=bool(lora_config), max_num_seqs=scheduler_config.max_num_seqs, + max_loras=lora_config.max_loras if lora_config else 0, max_input_length=None, tokenizer_mode=model_config.tokenizer_mode, trust_remote_code=model_config.trust_remote_code, diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index e516eeabaadef..761b07f34d2f9 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -21,8 +21,9 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, self.enable_lora = enable_lora self.max_input_length = max_input_length self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) + max_loras = tokenizer_config.get("max_loras", 0) self.lora_tokenizers = LRUCache[AnyTokenizer]( - capacity=max_num_seqs if enable_lora else 0) + capacity=max(max_loras, max_num_seqs) if enable_lora else 0) @classmethod def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig], diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7335c637f0f79..4ef372fd8464b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -51,7 +51,7 @@ def __init__( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, - enable_lora=bool(vllm_config.lora_config)) + lora_config=vllm_config.lora_config) self.tokenizer.ping() # Request streams (map of request_id -> AsyncStream). diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bd19d998a4adb..312c0242a45dd 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -46,7 +46,7 @@ def __init__( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, - enable_lora=bool(vllm_config.lora_config)) + lora_config=vllm_config.lora_config) self.tokenizer.ping() # Processor (convert Inputs --> EngineCoreRequests) From 10398b4706ee71d0bddc32c1d33b11e73df12a27 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Thu, 5 Dec 2024 02:11:08 +0800 Subject: [PATCH 243/255] [Model] Consolidate ViTs attention implementation without mask (#10893) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/attention/layer.py | 63 +++++++++++++++++++ vllm/model_executor/models/blip.py | 45 ++----------- vllm/model_executor/models/clip.py | 46 ++------------ .../models/glm4_vision_encoder.py | 22 ++----- .../models/idefics2_vision_model.py | 25 ++------ vllm/model_executor/models/intern_vit.py | 28 ++------- vllm/model_executor/models/internvl.py | 23 ++++--- vllm/model_executor/models/molmo.py | 38 +++-------- vllm/model_executor/models/siglip.py | 45 ++----------- 9 files changed, 109 insertions(+), 226 deletions(-) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index e024eef286f05..05d997279893b 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend @@ -168,6 +169,68 @@ def extra_repr(self) -> str: return s +class MultiHeadAttention(nn.Module): + """Multi-headed attention without any cache, used for ViT.""" + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + ): + super().__init__() + self.num_heads = num_heads + self.head_size = head_size + self.scale = scale + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + + dtype = torch.get_default_dtype() + attn_backend = get_attn_backend(head_size, + dtype, + kv_cache_dtype=None, + block_size=16, + is_attention_free=False) + if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: + attn_backend = _Backend.XFORMERS + + self.attn_backend = attn_backend if attn_backend in { + _Backend.TORCH_SDPA, _Backend.XFORMERS + } else _Backend.TORCH_SDPA + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + ) -> torch.Tensor: + """Input shape: batch_size x seq_len x hidden_size""" + # TODO(Isotr0py): Use existing backend implementations and support FA2 + bsz, q_len, _ = query.size() + kv_len = key.size(1) + + query = query.view(bsz, q_len, self.num_heads, self.head_size) + key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) + value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + + if self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + + out = xops.memory_efficient_attention_forward(query, + key, + value, + scale=self.scale) + elif self.attn_backend == _Backend.TORCH_SDPA: + query, key, value = (x.transpose(1, 2) + for x in (query, key, value)) + out = F.scaled_dot_product_attention(query, + key, + value, + scale=self.scale) + out = out.transpose(1, 2) + return out.view(bsz, q_len, -1) + + def unified_attention( query: torch.Tensor, key: torch.Tensor, diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 6af59697160a0..42a239cadac46 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,11 +4,10 @@ import torch import torch.nn as nn -import torch.nn.functional as F from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -22,8 +21,6 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .utils import get_vit_attn_backend - def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -205,11 +202,8 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - # Detect attention implementation. - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"BLIP does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -220,41 +214,10 @@ def forward( hidden_states: torch.Tensor, ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, _ = hidden_states.size() qkv_states, _ = self.qkv(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - query_states, key_states, value_states = (x.transpose(1, 2) - for x in (query_states, - key_states, - value_states)) - out = F.scaled_dot_product_attention(query_states, - key_states, - value_states, - dropout_p=self.dropout, - scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(bsz, tgt_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.projection(out) return attn_output, None diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index cd89519e95986..a5300dfd986f3 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -5,11 +5,10 @@ import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F from PIL import Image from transformers import CLIPVisionConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -25,8 +24,6 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData -from .utils import get_vit_attn_backend - def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -235,11 +232,8 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - # Detect attention implementation. - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"CLIP does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -250,42 +244,10 @@ def forward( hidden_states: torch.Tensor, ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - - query_states = query_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - query_states, key_states, value_states = (x.transpose(1, 2) - for x in (query_states, - key_states, - value_states)) - out = F.scaled_dot_product_attention(query_states, - key_states, - value_states, - dropout_p=self.dropout, - scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(bsz, tgt_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output, None diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index f37ab0f82d52a..39a5736eb199b 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -8,6 +8,7 @@ from torch import nn from torch.nn import LayerNorm +from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -77,27 +78,16 @@ def __init__( quant_config=quant_config, ) + self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, + self.scale) self.output_dropout = torch.nn.Dropout(config.dropout_prob) def forward(self, x: torch.Tensor) -> torch.Tensor: - B, L, _ = x.shape qkv, _ = self.query_key_value(x) # B, L, 3 * H * D q, k, v = qkv.chunk(3, dim=-1) - q = q.reshape(B, L, self.num_heads_per_rank, - self.head_dim).permute(0, 2, 1, 3) # B, H, L, D - k = k.reshape(B, L, self.num_heads_per_rank, - self.head_dim).permute(0, 2, 1, 3) # B, H, L, D - v = v.reshape(B, L, self.num_heads_per_rank, - self.head_dim).permute(0, 2, 1, 3) # B, H, L, D - - out = torch.nn.functional.scaled_dot_product_attention(q, - k, - v, - attn_mask=None, - dropout_p=0., - is_causal=False) - - output, _ = self.dense(out.transpose(1, 2).view(B, L, -1)) + + out = self.attn(q, k, v) + output, _ = self.dense(out) output = self.output_dropout(output) return output diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 16192928beb1f..e430a158d869a 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -21,8 +21,8 @@ from torch import nn from transformers.models.idefics2.configuration_idefics2 import ( Idefics2Config, Idefics2VisionConfig) -from xformers import ops as xops +from vllm.attention.layer import MultiHeadAttention from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -141,35 +141,18 @@ def __init__( ) self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - self.is_causal = False + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def forward( self, hidden_states: torch.Tensor, ) -> torch.Tensor: - batch_size, q_len, _ = hidden_states.size() qkv, _ = self.qkv_proj( hidden_states ) # batch_size, q_len, 3 * num_heads_per_partition * head_dim query_states, key_states, value_states = qkv.chunk(3, dim=-1) - query_states = query_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - # see: https://facebookresearch.github.io/xformers/components/ops.html - out = xops.memory_efficient_attention_forward( - query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale, - ) - out = out.view(batch_size, q_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index c4346fcb3bd2a..7ff68bd60e8ad 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -12,7 +12,7 @@ import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -25,8 +25,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from .utils import get_vit_attn_backend - NORM2FN = { 'rms_norm': RMSNorm, 'layer_norm': nn.LayerNorm, @@ -183,10 +181,8 @@ def __init__( prefix=f"{prefix}.proj", ) - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"InternViT does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): if self.tp_size > 1: @@ -209,23 +205,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.qk_normalization: q, k = self._apply_qk_norm(q, k) - q = q.view(B, N, self.num_heads_per_partition, self.head_dim) - k = k.view(B, N, self.num_heads_per_partition, self.head_dim) - v = v.view(B, N, self.num_heads_per_partition, self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(q, - k, - v, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - q, k, v = (x.transpose(1, 2) for x in (q, k, v)) - out = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(B, N, -1) + out = self.attn(q, k, v) out, _ = self.proj(out) return out diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 86aab38032450..d5a7781fecfc3 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -482,6 +482,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.mlp1 = self._init_mlp1(config) self.img_context_token_id = None + self.visual_token_mask = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -635,13 +636,12 @@ def _process_image_input( return image_embeds - def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: if self.is_mono: - visual_token_mask = ( + self.visual_token_mask = ( input_ids == self.img_context_token_id).reshape(-1, 1) else: - visual_token_mask = None - return visual_token_mask + self.visual_token_mask = None def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) @@ -658,6 +658,7 @@ def get_input_embeddings( inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: assert self.img_context_token_id is not None + self._set_visual_token_mask(input_ids) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, self.img_context_token_id) @@ -674,7 +675,6 @@ def forward( **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: - visual_token_mask = None if intermediate_tensors is not None: input_ids = None inputs_embeds = None @@ -695,16 +695,15 @@ def forward( "intermediate_tensors": intermediate_tensors, "inputs_embeds": inputs_embeds, } - if self.img_context_token_id is not None: - visual_token_mask = self._get_visual_token_mask(input_ids) - # We always overwrite it back to None after computing visual token - # mask so that this doesn't need to depend on encoder output + if self.visual_token_mask is not None: + # overwrite visual_token_mask and img_context_token_id back to None, + # so that this doesn't need to depend on encoder output + forward_kwargs.update( + {"visual_token_mask": self.visual_token_mask}) + self.visual_token_mask = None self.img_context_token_id = None - if self.is_mono: - forward_kwargs.update({"visual_token_mask": visual_token_mask}) - hidden_states = self.language_model.model(**forward_kwargs) return hidden_states diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 98caa6857e211..d1fcbd167c199 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -13,6 +13,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.attention.layer import MultiHeadAttention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, @@ -38,14 +39,12 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer -from vllm.platforms import _Backend from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) from vllm.transformers_utils.processor import get_processor from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, - is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -188,13 +187,11 @@ def __init__( quant_config=quant_config, ) - # Detect attention implementation. - self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) - if self.attn_backend not in { - _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS - }: - raise RuntimeError( - f"Molmo does not support {self.attn_backend} backend now.") + self.scale = self.head_dim**-0.5 + self.attn = MultiHeadAttention(self.num_heads, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads) def forward(self, inputs_q: torch.Tensor, @@ -210,25 +207,8 @@ def forward(self, xq, _ = self.wq(inputs_q) xk, _ = self.wk(inputs_k) xv, _ = self.wv(inputs_v) - q_shape = xq.size()[:-1] + (self.num_heads, self.head_dim) - kv_shape = xk.size()[:-1] + (self.num_kv_heads, self.head_dim) - xq = xq.view(*q_shape) - xk = xk.view(*kv_shape) - xv = xv.view(*kv_shape) - - if self.attn_backend == _Backend.FLASH_ATTN: - from flash_attn import flash_attn_func - output = flash_attn_func(xq, xk, xv, dropout_p=0.0, causal=False) - elif self.attn_backend == _Backend.TORCH_SDPA: - xq, xk, xv = (rearrange(x, "b s h d -> b h s d") - for x in (xq, xk, xv)) - output = F.scaled_dot_product_attention(xq, xk, xv) - output = rearrange(output, "b h s d -> b s h d ") - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - output = xops.memory_efficient_attention_forward(xq, xk, xv, p=0) - - output = rearrange(output, "b s h d -> b s (h d)").contiguous() + + output = self.attn(xq, xk, xv) output, _ = self.wo(output) return output diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index deaed0ba7e4ce..6fb9e2cc4584f 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -6,12 +6,11 @@ import numpy as np import torch -import torch.nn.functional as F from PIL import Image from torch import nn from transformers import SiglipVisionConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -29,8 +28,6 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData -from .utils import get_vit_attn_backend - def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: # Since interpolation is applied, the image size need not be divisible @@ -291,52 +288,18 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"SIGLIP does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def forward( self, hidden_states: torch.Tensor, ) -> torch.Tensor: """Input shape: Batch x Time x Channel""" - batch_size, q_len, _ = hidden_states.size() - qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - query_states, key_states, value_states = (x.transpose(1, 2) - for x in (query_states, - key_states, - value_states)) - out = F.scaled_dot_product_attention(query_states, - key_states, - value_states, - dropout_p=self.dropout, - scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(batch_size, q_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output, None From 82eb5ea8f3bd3aabbe5c2fd43e37d263768603c5 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Wed, 4 Dec 2024 15:28:21 -0600 Subject: [PATCH 244/255] Benchmark serving structured output (#10880) Signed-off-by: Chendi Xue Co-authored-by: Michael Goin --- benchmarks/backend_request_func.py | 6 + benchmarks/benchmark_serving_guided.py | 881 +++++++++++++++++++++++++ 2 files changed, 887 insertions(+) create mode 100644 benchmarks/benchmark_serving_guided.py diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index c3fed56e8a956..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -24,6 +24,7 @@ class RequestFuncInput: model: str best_of: int = 1 logprobs: Optional[int] = None + extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False @@ -36,6 +37,7 @@ class RequestFuncOutput: ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies + tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" @@ -242,6 +244,8 @@ async def async_request_openai_completions( "stream": True, "ignore_eos": request_func_input.ignore_eos, } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } @@ -336,6 +340,8 @@ async def async_request_openai_chat_completions( "stream": True, "ignore_eos": request_func_input.ignore_eos, } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py new file mode 100644 index 0000000000000..4435d87e18a8a --- /dev/null +++ b/benchmarks/benchmark_serving_guided.py @@ -0,0 +1,881 @@ +r"""Benchmark online serving throughput with guided decoding. + +On the server side, run one of the following commands: + (vLLM OpenAI API server) + vllm serve --disable-log-requests + + (TGI backend) + ./launch_tgi_server.sh + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset json \ + --guided-decoding-ratio 1.0 \ + --guided-decoding-backend xgrammar \ + --request-rate 10 \ + --num-prompts 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" +import argparse +import asyncio +import dataclasses +import json +import os +import random +import time +import warnings +from dataclasses import dataclass +from typing import AsyncGenerator, List, Optional, Tuple + +import datasets +import numpy as np +import pandas as pd +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: List[Tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: List[Tuple[float, float]] + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str + completion: str = None + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + completion=completion)) + + return requests + + +async def get_request( + input_requests: List[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[Tuple[int, SampleRequest], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ + input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + + for i, request in enumerate(input_requests): + yield i, request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + all_tpots: List[float] = [] + ttfts: List[float] = [] + e2els: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - + 1) + tpots.append(tpot) + outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0 + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[SampleRequest], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: List[str], + selected_percentiles: List[str], + ignore_eos: bool, + max_concurrency: Optional[int], + guided_decoding_ratio: float, + guided_decoding_backend: str, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + def prepare_extra_body(request) -> dict: + extra_body = {} + # Add the schema to the extra_body + extra_body[request.structure_type] = request.schema + # Add the specific guided_decoding_backend + extra_body["guided_decoding_backend"] = guided_decoding_backend + return extra_body + + print("Starting initial single prompt test run...") + guided_decoding_req_idx = random.sample( + range(len(input_requests)), + int(len(input_requests) * guided_decoding_ratio)) + + test_request = input_requests[0] + test_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=api_url, + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=prepare_extra_body(test_request), + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}") + else: + print("Initial test run completed. Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/start_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=prepare_extra_body(test_request), + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: List[asyncio.Task] = [] + expected: List[str] = [] + async for i, request in get_request(input_requests, request_rate, + burstiness): + extra_body = prepare_extra_body( + request) if i in guided_decoding_req_idx else None + request_func_input = RequestFuncInput( + model=model_id, + prompt=request.prompt, + api_url=api_url, + prompt_len=request.prompt_len, + output_len=request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + expected.append(request.completion) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": + benchmark_duration, + "completed": + metrics.completed, + "total_input_tokens": + metrics.total_input, + "total_output_tokens": + metrics.total_output, + "request_throughput": + metrics.request_throughput, + "output_throughput": + metrics.output_throughput, + "total_token_throughput": + metrics.total_token_throughput, + "ttft_description": + pd.Series([output.ttft for output in outputs]).describe().to_dict(), + "tpot_description": + pd.Series([output.tpot for output in outputs]).describe().to_dict(), + "input_lens": [output.prompt_len for output in outputs], + "output_lens": + actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "errors": [output.error for output in outputs], + } + + ret = [{ + 'generated': output.generated_text, + 'expected': gt + } for output, gt in zip(outputs, expected)] + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result, ret + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'guided_json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'guided_regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'guided_choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer(tokenizer_id, + trust_remote_code=args.trust_remote_code) + + if args.dataset == 'grammar': + args.structure_type = 'guided_grammar' + elif args.dataset == 'regex': + args.structure_type = 'guided_regex' + elif args.dataset == 'choice': + args.structure_type = 'guided_choice' + else: + args.structure_type = 'guided_json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{backend}" + result_file_name += f"_{args.request_rate}qps" + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += ".txt" + else: + result_file_name = None + + input_requests = sample_requests(tokenizer, args) + + benchmark_result, ret = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + max_concurrency=args.max_concurrency, + guided_decoding_ratio=args.guided_decoding_ratio, + guided_decoding_backend=args.guided_decoding_backend, + )) + + # Save config and results to json + score = evaluate(ret, args) + print("correct_rate(%)", score, '\n') + if args.save_results: + results = { + "backend": + backend, + "model_id": + model_id, + "tokenizer_id": + tokenizer_id, + "num_prompts": + args.num_prompts, + "request_rate": + args.request_rate if args.request_rate < float("inf") else "inf", + "burstiness": + args.burstiness, + "max_concurrency": + args.max_concurrency, + "correct_rate(%)": + score + } + results = {"outputs": ret, **results, **benchmark_result} + + # Save to file + if args.result_filename: + result_file_name = args.result_filename + if args.result_dir: + result_file_name = os.path.join(args.result_dir, result_file_name) + with open(result_file_name, "w", encoding='utf-8') as outfile: + json.dump(results, outfile, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--output-len", + type=int, + default=128, + help="Number of output tokens.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--save-results", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". " + "Use \"--percentile-metrics\" to select metrics.", + ) + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--guided-decoding-backend", + type=str, + choices=["outlines", "lm-format-enforcer", "xgrammar"], + default="xgrammar", + help="Backend to use for guided decoding") + + args = parser.parse_args() + main(args) From e4c34c23de2a90ab837772ac182638ac3bc1636d Mon Sep 17 00:00:00 2001 From: Daniele <36171005+dtrifiro@users.noreply.github.com> Date: Wed, 4 Dec 2024 22:48:13 +0100 Subject: [PATCH 245/255] [CI/Build] improve python-only dev setup (#9621) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniele Trifirò Signed-off-by: youkaichao Co-authored-by: youkaichao --- docs/source/getting_started/installation.rst | 41 +++------ python_only_dev.py | 96 ++------------------ setup.py | 83 ++++++++++++++++- vllm/envs.py | 3 +- 4 files changed, 102 insertions(+), 121 deletions(-) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 52412fa8437b9..9b6cb0e80d60e 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -21,7 +21,7 @@ You can install vLLM using pip: .. code-block:: console $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y + $ conda create -n myenv python=3.12 -y $ conda activate myenv $ # Install vLLM with CUDA 12.1. @@ -89,45 +89,24 @@ Build from source Python-only build (without compilation) --------------------------------------- -If you only need to change Python code, you can simply build vLLM without compilation. - -The first step is to install the latest vLLM wheel: - -.. code-block:: console - - pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. - -After verifying that the installation is successful, you can use `the following script `_: +If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM: .. code-block:: console $ git clone https://github.com/vllm-project/vllm.git $ cd vllm - $ python python_only_dev.py + $ VLLM_USE_PRECOMPILED=1 pip install --editable . -The script will: +This will download the latest nightly wheel and use the compiled libraries from there in the install. -* Find the installed vLLM package in the current environment. -* Copy built files to the current directory. -* Rename the installed vLLM package. -* Symbolically link the current directory to the installed vLLM package. - -Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. - -Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: +The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel `_: .. code-block:: console - $ python python_only_dev.py --quit-dev - -The ``--quit-dev`` flag will: - -* Remove the symbolic link from the current directory to the vLLM package. -* Restore the original vLLM package from the backup. + $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl + $ pip install --editable . -If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again. +You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. .. note:: @@ -148,9 +127,13 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments. + The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``. + Use an existing PyTorch installation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/python_only_dev.py b/python_only_dev.py index 1ca0f5c30b741..f70b4984025b3 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -1,92 +1,14 @@ -# enable python only development -# copy compiled files to the current directory directly +msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). -import argparse -import os -import shutil -import subprocess -import sys -import warnings +TL;DR: -parser = argparse.ArgumentParser( - description="Development mode for python-only code") -parser.add_argument('-q', - '--quit-dev', - action='store_true', - help='Set the flag to quit development mode') -args = parser.parse_args() +VLLM_USE_PRECOMPILED=1 pip install -e . -# cannot directly `import vllm` , because it will try to -# import from the current directory -output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"], - capture_output=True) +or -assert output.returncode == 0, "vllm is not installed" +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +pip install -e . +""" # noqa -text = output.stdout.decode("utf-8") - -package_path = None -for line in text.split("\n"): - if line.startswith("Location: "): - package_path = line.split(": ")[1] - break - -assert package_path is not None, "could not find package path" - -cwd = os.getcwd() - -assert cwd != package_path, "should not import from the current directory" - -files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", - "vllm/vllm_flash_attn/flash_attn_interface.py", - "vllm/vllm_flash_attn/__init__.py", - # "vllm/_version.py", # not available in nightly wheels yet -] - -# Try to create _version.py to avoid version related warning -# Refer to https://github.com/vllm-project/vllm/pull/8771 -try: - from setuptools_scm import get_version - get_version(write_to="vllm/_version.py") -except ImportError: - warnings.warn( - "To avoid warnings related to vllm._version, " - "you should install setuptools-scm by `pip install setuptools-scm`", - stacklevel=2) - -if not args.quit_dev: - for file in files_to_copy: - src = os.path.join(package_path, file) - dst = file - print(f"Copying {src} to {dst}") - shutil.copyfile(src, dst) - - pre_built_vllm_path = os.path.join(package_path, "vllm") - tmp_path = os.path.join(package_path, "vllm_pre_built") - current_vllm_path = os.path.join(cwd, "vllm") - - print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") - shutil.copytree(pre_built_vllm_path, tmp_path) - shutil.rmtree(pre_built_vllm_path) - - print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") - os.symlink(current_vllm_path, pre_built_vllm_path) -else: - vllm_symlink_path = os.path.join(package_path, "vllm") - vllm_backup_path = os.path.join(package_path, "vllm_pre_built") - current_vllm_path = os.path.join(cwd, "vllm") - - print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}") - assert os.path.islink( - vllm_symlink_path - ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link" - assert current_vllm_path == os.readlink( - vllm_symlink_path - ), "current directory is not the source code of package" - os.unlink(vllm_symlink_path) - - print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}") - os.rename(vllm_backup_path, vllm_symlink_path) +print(msg) diff --git a/setup.py b/setup.py index b936589869e76..182dabe449674 100644 --- a/setup.py +++ b/setup.py @@ -249,6 +249,74 @@ def run(self): self.copy_file(file, dst_file) +class repackage_wheel(build_ext): + """Extracts libraries and other files from an existing wheel.""" + default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + def run(self) -> None: + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", + self.default_wheel) + + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + import zipfile + + if os.path.isfile(wheel_location): + wheel_path = wheel_location + print(f"Using existing wheel={wheel_path}") + else: + # Download the wheel from a given URL, assume + # the filename is the last part of the URL + wheel_filename = wheel_location.split("/")[-1] + + import tempfile + + # create a temporary directory to store the wheel + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + + print(f"Downloading wheel from {wheel_location} to {wheel_path}") + + from urllib.request import urlretrieve + + try: + urlretrieve(wheel_location, filename=wheel_path) + except Exception as e: + from setuptools.errors import SetupError + + raise SetupError( + f"Failed to get vLLM wheel from {wheel_location}") from e + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/flash_attn_interface.py", + "vllm/vllm_flash_attn/__init__.py", + # "vllm/_version.py", # not available in nightly wheels yet + ] + file_members = filter(lambda x: x.filename in files_to_copy, + wheel.filelist) + + for file in file_members: + print(f"Extracting and including {file.filename} " + "from existing wheel") + package_name = os.path.dirname(file.filename).replace("/", ".") + file_name = os.path.basename(file.filename) + + if package_name not in package_data: + package_data[package_name] = [] + + wheel.extract(file) + if file_name.endswith(".py"): + # python files shouldn't be added to package_data + continue + + package_data[package_name].append(file_name) + + def _is_hpu() -> bool: is_hpu_available = True try: @@ -403,6 +471,8 @@ def get_vllm_version() -> str: # skip this for source tarball, required for pypi if "sdist" not in sys.argv: version += f"{sep}cu{cuda_version_str}" + if envs.VLLM_USE_PRECOMPILED: + version += ".precompiled" elif _is_hip(): # Get the HIP version hipcc_version = get_hipcc_rocm_version() @@ -514,13 +584,18 @@ def _read_requirements(filename: str) -> List[str]: package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } -if envs.VLLM_USE_PRECOMPILED: - ext_modules = [] - package_data["vllm"].append("*.so") if _no_device(): ext_modules = [] +if not ext_modules: + cmdclass = {} +else: + cmdclass = { + "build_ext": + repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + } + setup( name="vllm", version=get_vllm_version(), @@ -557,7 +632,7 @@ def _read_requirements(filename: str) -> List[str]: "audio": ["librosa", "soundfile"], # Required for audio processing "video": ["decord"] # Required for video processing }, - cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, + cmdclass=cmdclass, package_data=package_data, entry_points={ "console_scripts": [ diff --git a/vllm/envs.py b/vllm/envs.py index c896770e5f6bc..28797ac1e4af2 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -113,7 +113,8 @@ def get_default_config_root(): # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")), + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( + os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), # CMake build type # If not set, defaults to "Debug" or "RelWithDebInfo" From 2a56e1264f3f0f32e25de42c32eac67cbc86a098 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 4 Dec 2024 16:54:05 -0800 Subject: [PATCH 246/255] [V1] Fix when max_model_len is not divisible by block_size (#10903) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4692762493f00..e8d964a722f60 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -260,7 +260,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] # where M is the max_model_len. - token_indices = positions_np + req_indices * self.max_model_len + token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) token_indices = torch.from_numpy(token_indices) input_ids = torch.empty((total_num_scheduled_tokens, ), dtype=torch.int32, @@ -273,9 +274,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): out=input_ids) # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. block_numbers = self.input_batch.block_table_cpu_tensor.flatten()[ - token_indices // self.block_size] - block_offsets = token_indices % self.block_size + req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size] + block_offsets = torch.from_numpy(positions_np % self.block_size) slot_mapping = torch.empty((total_num_scheduled_tokens, ), dtype=torch.int32, device="cpu", From 7883c2bbe7d0ab47160d205822f7b188a5a2771b Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 4 Dec 2024 17:02:17 -0800 Subject: [PATCH 247/255] [benchmark] Make H100 benchmark optional (#10908) --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index dd2ce454ecb2d..64ba1b32fb074 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -65,10 +65,15 @@ steps: - VLLM_USAGE_SOURCE - HF_TOKEN + - block: "Run H100 Benchmark" + key: block-h100 + depends_on: ~ + - label: "H100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H100 + depends_on: block-h100 plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT From 8d370e91cb0049dc150c85710a08e85952504bfc Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 4 Dec 2024 22:14:06 -0500 Subject: [PATCH 248/255] [Bugfix] Fallback to outlines for complex json schemas (#10899) Signed-off-by: mgoin --- tests/entrypoints/conftest.py | 31 +++++++++++++ tests/entrypoints/llm/test_guided_generate.py | 28 ++++++++++++ .../guided_decoding/__init__.py | 43 +++++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py index e7ef5637c8ccb..0f7d15e1d85aa 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -69,6 +69,37 @@ def sample_json_schema(): } +@pytest.fixture +def sample_complex_json_schema(): + return { + "type": "object", + "properties": { + "score": { + "type": "integer", + "minimum": 0, + "maximum": 100 # Numeric range + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$" # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "pattern": + "^[a-z]{1,10}$" # Combining length and pattern restrictions + } + } + }, + "required": ["score", "grade", "email", "tags"] + } + + @pytest.fixture def sample_guided_choice(): return [ diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index c3706f696b264..de6257cfc551c 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -76,6 +76,34 @@ def test_guided_json_completion(sample_json_schema, llm): jsonschema.validate(instance=output_json, schema=sample_json_schema) +@pytest.mark.skip_global_cleanup +def test_guided_complex_json_completion(sample_complex_json_schema, llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema)) + outputs = llm.generate(prompts=[ + f"Give an example JSON for an assignment grade " + f"that fits this schema: {sample_complex_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, + schema=sample_complex_json_schema) + + @pytest.mark.skip_global_cleanup def test_guided_choice_completion(sample_guided_choice, llm): sampling_params = SamplingParams( diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 3340bad38ab73..a81377341e095 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -15,6 +15,40 @@ logger = init_logger(__name__) +def has_xgrammar_unsupported_json_features(schema: dict) -> bool: + """Check if JSON schema contains features unsupported by xgrammar.""" + + def check_object(obj: dict) -> bool: + if not isinstance(obj, dict): + return False + + # Check for pattern restrictions + if "pattern" in obj: + return True + + # Check for numeric ranges + if obj.get("type") in ("integer", "number") and any( + key in obj for key in [ + "minimum", "maximum", "exclusiveMinimum", + "exclusiveMaximum", "multipleOf" + ]): + return True + + # Recursively check all nested objects and arrays + for value in obj.values(): + if isinstance(value, dict): + if check_object(value): + return True + elif isinstance(value, list): + for item in value: + if isinstance(item, dict) and check_object(item): + return True + + return False + + return check_object(schema) + + def maybe_backend_fallback( guided_params: GuidedDecodingParams) -> GuidedDecodingParams: # lm-format-enforce doesn't support grammar, fallback to xgrammar @@ -47,6 +81,15 @@ def maybe_backend_fallback( "Falling back to use outlines instead.") guided_params.backend = "outlines" + # xgrammar doesn't support some JSON schema features + elif (guided_params.json is not None + and has_xgrammar_unsupported_json_features(guided_params.json)): + logger.warning( + "xgrammar does not support advanced JSON schema features like " + "patterns or numeric ranges. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + return guided_params From aa39a8e17537f9127b3da65dba6b33067bfd2f78 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 5 Dec 2024 11:19:35 +0800 Subject: [PATCH 249/255] [Doc] Create a new "Usage" section (#10827) Signed-off-by: DarkLight1337 --- .../design/multimodal/multimodal_index.rst | 5 +- docs/source/index.rst | 25 +- .../models/enabling_multimodal_inputs.rst | 2 +- docs/source/models/supported_models.rst | 19 +- .../serving/openai_compatible_server.md | 4 +- .../compatibility_matrix.rst | 0 docs/source/{models => usage}/engine_args.rst | 0 docs/source/{serving => usage}/env_vars.rst | 0 docs/source/{serving => usage}/faq.rst | 2 + docs/source/{models => usage}/lora.rst | 4 +- .../vlm.rst => usage/multimodal_inputs.rst} | 248 ++++++++++++------ docs/source/{models => usage}/performance.rst | 0 docs/source/{models => usage}/spec_decode.rst | 8 +- .../{models => usage}/structured_outputs.rst | 0 docs/source/{serving => usage}/usage_stats.md | 0 vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/config.py | 8 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/output_processor/multi_step.py | 2 +- vllm/executor/cpu_executor.py | 2 +- vllm/platforms/cpu.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/utils.py | 2 +- vllm/worker/multi_step_model_runner.py | 2 +- vllm/worker/utils.py | 2 +- 25 files changed, 218 insertions(+), 125 deletions(-) rename docs/source/{serving => usage}/compatibility_matrix.rst (100%) rename docs/source/{models => usage}/engine_args.rst (100%) rename docs/source/{serving => usage}/env_vars.rst (100%) rename docs/source/{serving => usage}/faq.rst (99%) rename docs/source/{models => usage}/lora.rst (99%) rename docs/source/{models/vlm.rst => usage/multimodal_inputs.rst} (62%) rename docs/source/{models => usage}/performance.rst (100%) rename docs/source/{models => usage}/spec_decode.rst (98%) rename docs/source/{models => usage}/structured_outputs.rst (100%) rename docs/source/{serving => usage}/usage_stats.md (100%) diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst index 30f543abc20c7..c6d47f90b62d5 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.rst @@ -7,7 +7,7 @@ Multi-Modality vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. -Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` +Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities @@ -15,9 +15,6 @@ by following :ref:`this guide `. Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. -.. - TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported - Guides ++++++ diff --git a/docs/source/index.rst b/docs/source/index.rst index 0692e949f1c77..86b1eed2d26ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,12 +85,8 @@ Documentation serving/deploying_with_nginx serving/distributed_serving serving/metrics - serving/env_vars - serving/usage_stats serving/integrations serving/tensorizer - serving/compatibility_matrix - serving/faq .. toctree:: :maxdepth: 1 @@ -99,12 +95,21 @@ Documentation models/supported_models models/adding_model models/enabling_multimodal_inputs - models/engine_args - models/lora - models/vlm - models/structured_outputs - models/spec_decode - models/performance + +.. toctree:: + :maxdepth: 1 + :caption: Usage + + usage/lora + usage/multimodal_inputs + usage/structured_outputs + usage/spec_decode + usage/compatibility_matrix + usage/performance + usage/faq + usage/engine_args + usage/env_vars + usage/usage_stats .. toctree:: :maxdepth: 1 diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst index 49b5285c45590..5c1236e1a8972 100644 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -3,7 +3,7 @@ Enabling Multimodal Inputs ========================== -This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal ` inputs. +This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs `. .. seealso:: :ref:`adding_a_new_model` diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9f3b6f59068e2..5b416e04da745 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -471,6 +471,8 @@ Sentence Pair Scoring .. note:: These models are supported in both offline and online inference via Score API. +.. _supported_mm_models: + Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -489,8 +491,6 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive. - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -.. _supported_vlms: - Text Generation --------------- @@ -646,6 +646,21 @@ Text Generation | :sup:`E` Pre-computed embeddings can be inputted for this modality. | :sup:`+` Multiple items can be inputted per text prompt for this modality. +.. important:: + To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference) + or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + + .. code-block:: python + + llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, + ) + + .. code-block:: bash + + vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 + .. note:: vLLM currently only supports adding LoRA to the language backbone of multimodal models. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index c39cef85897ed..d75e90807ca1d 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -32,7 +32,7 @@ We currently support the following OpenAI APIs: - [Completions API](https://platform.openai.com/docs/api-reference/completions) - *Note: `suffix` parameter is not supported.* - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) - - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst). + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). - *Note: `image_url.detail` parameter is not supported.* - We also support `audio_url` content type for audio files. - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. @@ -41,7 +41,7 @@ We currently support the following OpenAI APIs: - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), which will be treated as a single prompt to the model according to its chat template. - - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). + - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Score API for Cross Encoder Models diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst similarity index 100% rename from docs/source/serving/compatibility_matrix.rst rename to docs/source/usage/compatibility_matrix.rst diff --git a/docs/source/models/engine_args.rst b/docs/source/usage/engine_args.rst similarity index 100% rename from docs/source/models/engine_args.rst rename to docs/source/usage/engine_args.rst diff --git a/docs/source/serving/env_vars.rst b/docs/source/usage/env_vars.rst similarity index 100% rename from docs/source/serving/env_vars.rst rename to docs/source/usage/env_vars.rst diff --git a/docs/source/serving/faq.rst b/docs/source/usage/faq.rst similarity index 99% rename from docs/source/serving/faq.rst rename to docs/source/usage/faq.rst index 9e858e612c8bf..ce327abd5fa20 100644 --- a/docs/source/serving/faq.rst +++ b/docs/source/usage/faq.rst @@ -1,3 +1,5 @@ +.. _faq: + Frequently Asked Questions =========================== diff --git a/docs/source/models/lora.rst b/docs/source/usage/lora.rst similarity index 99% rename from docs/source/models/lora.rst rename to docs/source/usage/lora.rst index ef0177eaf2162..c2c6fa2aebfaf 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/usage/lora.rst @@ -1,7 +1,7 @@ .. _lora: -Using LoRA adapters -=================== +LoRA Adapters +============= This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. diff --git a/docs/source/models/vlm.rst b/docs/source/usage/multimodal_inputs.rst similarity index 62% rename from docs/source/models/vlm.rst rename to docs/source/usage/multimodal_inputs.rst index bcbe50a25fa09..c93f65327e31b 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/usage/multimodal_inputs.rst @@ -1,34 +1,31 @@ -.. _vlm: +.. _multimodal_inputs: -Using VLMs -========== +Multimodal Inputs +================= -vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here `. -This document shows you how to run and serve these models using vLLM. +This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models ` in vLLM. .. note:: - We are actively iterating on VLM support. See `this RFC `_ for upcoming changes, + We are actively iterating on multi-modal support. See `this RFC `_ for upcoming changes, and `open an issue on GitHub `_ if you have any feedback or feature requests. Offline Inference ----------------- -Single-image input -^^^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models. - -.. code-block:: python - - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - -To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`: +To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`: * ``prompt``: The prompt should follow the format that is documented on HuggingFace. * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. +Image +^^^^^ + +You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples: + .. code-block:: python + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + # Refer to the HuggingFace repo for the correct format to use prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" @@ -41,41 +38,6 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT "multi_modal_data": {"image": image}, }) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Inference with image embeddings as input - image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Inference with image embeddings as input with additional parameters - # Specifically, we are conducting a trial run of Qwen2VL and MiniCPM-V with the new input format, which utilizes additional parameters. - mm_data = {} - - image_embeds = torch.load(...) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) - # For Qwen2VL, image_grid_thw is needed to calculate positional encoding. - mm_data['image'] = { - "image_embeds": image_embeds, - "image_grid_thw": torch.load(...) # torch.Tensor of shape (1, 3), - } - # For MiniCPM-V, image_size_list is needed to calculate details of the sliced image. - mm_data['image'] = { - "image_embeds": image_embeds, - "image_size_list": [image.size] # list of image sizes - } - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, - }) - for o in outputs: generated_text = o.outputs[0].text print(generated_text) @@ -102,12 +64,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT A code example can be found in `examples/offline_inference_vision_language.py `_. -Multi-image input -^^^^^^^^^^^^^^^^^ - -Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. - -To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class. +To substitute multiple images inside the same text prompt, you can pass in a list of images instead: .. code-block:: python @@ -118,10 +75,6 @@ To enable multiple multi-modal items per text prompt, you have to set ``limit_mm limit_mm_per_prompt={"image": 2}, # The maximum number to accept ) -Instead of passing in a single image, you can pass in a list of images. - -.. code-block:: python - # Refer to the HuggingFace repo for the correct format to use prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" @@ -169,30 +122,114 @@ Multi-image input can be extended to perform video captioning. We show this with generated_text = o.outputs[0].text print(generated_text) +Video +^^^^^ + +You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary +instead of using multi-image input. + +Please refer to `examples/offline_inference_vision_language.py `_ for more details. + +Audio +^^^^^ + +You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary. + +Please refer to `examples/offline_inference_audio_language.py `_ for more details. + +Embedding +^^^^^^^^^ + +To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, +pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. + +.. code-block:: python + + # Inference with image embeddings as input + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + + # Embeddings for single image + # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: + +.. code-block:: python + + # Construct the prompt based on your model + prompt = ... + + # Embeddings for multiple images + # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) + + # Qwen2-VL + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } + } + + # MiniCPM-V + llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_size_list is needed to calculate details of the sliced image. + "image_size_list": [image.size for image in images], # list of image sizes + } + } + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + Online Inference ---------------- -OpenAI Vision API -^^^^^^^^^^^^^^^^^ +Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API `_. + +.. important:: + A chat template is **required** to use Chat Completions API. + + Although most models come with a chat template, for others you have to define one yourself. + The chat template can be inferred based on the documentation on the model's HuggingFace repo. + For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `__. + +Image +^^^^^ -You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. +Image input is supported according to `OpenAI Vision API `_. +Here is a simple example using Phi-3.5-Vision. -Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server. +First, launch the OpenAI-compatible server: .. code-block:: bash vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 -.. important:: - Since OpenAI Vision API is based on `Chat Completions API `_, - a chat template is **required** to launch the API server. - - Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. - The chat template can be inferred based on the documentation on the model's HuggingFace repo. - For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `_. - -To consume the server, you can use the OpenAI client like in the example below: +Then, you can use the OpenAI client as follows: .. code-block:: python @@ -252,22 +289,59 @@ A full code example can be found in `examples/openai_chat_completion_client_for_ .. note:: - By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: + By default, the timeout for fetching images through HTTP URL is ``5`` seconds. + You can override this by setting the environment variable: .. code-block:: console $ export VLLM_IMAGE_FETCH_TIMEOUT= -Chat Embeddings API -^^^^^^^^^^^^^^^^^^^ +Video +^^^^^ + +Instead of :code:`image_url`, you can pass a video file via :code:`video_url`. + +You can use `these tests `_ as reference. + +.. note:: + + By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_VIDEO_FETCH_TIMEOUT= -vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, -where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. +Audio +^^^^^ + +Instead of :code:`image_url`, you can pass an audio file via :code:`audio_url`. + +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. + +.. note:: + + By default, the timeout for fetching audios through HTTP URL is ``10`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_AUDIO_FETCH_TIMEOUT= + +Embedding +^^^^^^^^^ + +vLLM's Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. .. tip:: The schema of ``messages`` is exactly the same as in Chat Completions API. + You can refer to the above tutorials for more details on how to pass each type of multi-modal data. -In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. +Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. +Refer to the examples below for illustration. + +Here is an end-to-end example using VLM2Vec. To serve the model: .. code-block:: bash @@ -279,10 +353,8 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` to run this model in embedding mode instead of text generation mode. -.. important:: - - VLM2Vec does not expect chat-based input. We use a `custom chat template `_ - to combine the text and images together. + The custom chat template is completely different from the original one for this model, + and can be found `here `__. Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: @@ -310,7 +382,7 @@ Since the request schema is not defined by OpenAI client, we post a request to t response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) -Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. +Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model. .. code-block:: bash @@ -319,8 +391,10 @@ Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. .. important:: - Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, - which is handled by the jinja template. + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. + + Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled + by `this custom chat template `__. .. important:: diff --git a/docs/source/models/performance.rst b/docs/source/usage/performance.rst similarity index 100% rename from docs/source/models/performance.rst rename to docs/source/usage/performance.rst diff --git a/docs/source/models/spec_decode.rst b/docs/source/usage/spec_decode.rst similarity index 98% rename from docs/source/models/spec_decode.rst rename to docs/source/usage/spec_decode.rst index d57ffec53215d..67e8ede7654b7 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/usage/spec_decode.rst @@ -1,7 +1,7 @@ .. _spec_decode: -Speculative decoding in vLLM -============================ +Speculative decoding +==================== .. warning:: Please note that speculative decoding in vLLM is not yet optimized and does @@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. + titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. **Conclusion** @@ -197,7 +197,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. Resources for vLLM contributors ------------------------------- diff --git a/docs/source/models/structured_outputs.rst b/docs/source/usage/structured_outputs.rst similarity index 100% rename from docs/source/models/structured_outputs.rst rename to docs/source/usage/structured_outputs.rst diff --git a/docs/source/serving/usage_stats.md b/docs/source/usage/usage_stats.md similarity index 100% rename from docs/source/serving/usage_stats.md rename to docs/source/usage/usage_stats.md diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 9139c3c1314d8..19daeb729ee61 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -430,7 +430,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/config.py b/vllm/config.py index 1cbab8ea30249..5c904914a71cf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -509,7 +509,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"): logger.warning( @@ -525,7 +525,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if device_config.device_type == "cuda" and self.enforce_eager: logger.warning( @@ -540,7 +540,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.task == "embedding": self.use_async_output_proc = False - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -1704,7 +1704,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3b776c1d9d39f..0b304658f012c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1111,7 +1111,7 @@ def create_engine_config(self, disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 7a6ebb430541f..a9b638ed02a1e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 336f9bc8efb20..6b4cb5a9a1d61 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -23,7 +23,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index b5333fbd6f502..680ee74129739 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -46,7 +46,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 53634f7b0b366..ced7f53827665 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -104,7 +104,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index 07bf82e24cbe6..6cee4847e57b4 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -47,7 +47,7 @@ # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 3ee0fb4dc943e..3ca0d88a42183 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -817,7 +817,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index f43635464ef00..5f71ec0c14df8 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: From 1f958a7d52b24314e41c4bb56c51b1dce5405e05 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 13:20:26 +0800 Subject: [PATCH 250/255] [Bugfix] Fix BNB loader target_modules (#10720) Signed-off-by: Jee Jee Li --- vllm/model_executor/model_loader/loader.py | 64 ++-------------------- 1 file changed, 6 insertions(+), 58 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b4921cc80797f..a0ea0e5fad3c2 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -6,7 +6,6 @@ import glob import inspect import itertools -import json import math import os import warnings @@ -18,7 +17,7 @@ import huggingface_hub import numpy as np import torch -from huggingface_hub import HfApi, hf_hub_download +from huggingface_hub import HfApi from torch import nn from transformers import AutoModelForCausalLM from transformers.utils import SAFE_WEIGHTS_INDEX_NAME @@ -704,51 +703,9 @@ def __init__(self, load_config: LoadConfig): self.unsharded_weights_modules: List[str] = [] # Save the module names that are sharded by column. self.column_sharded_weights_modules: List[str] = [] - # we don't need to quantize the whole model, only the target modules - # that are specified in the adapter config file. If the adapter config - # file is not provided, we will quantize the default modules. - if (not load_config.model_loader_extra_config - or "qlora_adapter_name_or_path" - not in load_config.model_loader_extra_config): - self.target_modules = [] - return - - qlora_adapter = load_config.model_loader_extra_config[ - "qlora_adapter_name_or_path"] - - config_file_path = self._get_config_file(qlora_adapter) - - with open(config_file_path) as f: - config = json.load(f) - self.target_modules = config["target_modules"] - # TODO: target_modules could be either a list or a regex string. - # We need to handle both cases. - assert isinstance(self.target_modules, - list), "Unsupported target_modules: " - f"{self.target_modules}" - - def _get_config_file(self, qlora_adapter: str) -> str: - is_local = os.path.isdir(qlora_adapter) - config_file_path = None - if is_local: - for file in self.possible_config_file_names: - config_file_path = os.path.join(qlora_adapter, file) - if os.path.exists(config_file_path): - break - else: - hf_api = HfApi() - repo_files = hf_api.list_repo_files(repo_id=qlora_adapter) - for file in self.possible_config_file_names: - if file in repo_files: - config_file_path = hf_hub_download(repo_id=qlora_adapter, - filename=file) - break - - if not config_file_path: - raise ValueError( - f"Cannot find adapter config file in {qlora_adapter}") - - return config_file_path + # Store all module names (from transformers) that support + # BNB quantization. + self.target_modules: List[str] = [] def _get_weight_files( self, @@ -1030,25 +987,16 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: inverse_stacked_mapping[packed] = [] inverse_stacked_mapping[packed].insert(idx, orig) - linear_module_lst = [] for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): last_name = name.split(".")[-1] if sub_modules := inverse_stacked_mapping.get(last_name, []): # Map vllm's names to transformers' names. for sub_name in sub_modules: - linear_module_lst.append( + self.target_modules.append( name.replace(last_name, sub_name)) else: - linear_module_lst.append(name) - if self.target_modules: - # Update self.target_modules - self.target_modules = [ - qual_name for qual_name in linear_module_lst - if any(t in qual_name for t in self.target_modules) - ] - else: - self.target_modules = linear_module_lst + self.target_modules.append(name) assert (self.target_modules ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" From 39c89e71a84779c0758ec603efcded7a48bb5fc0 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Wed, 4 Dec 2024 22:54:06 -0700 Subject: [PATCH 251/255] [Misc] Update llama 3.2 template to support system prompt with images (#10901) Signed-off-by: Travis Johnson --- examples/tool_chat_template_llama3.2_json.jinja | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/examples/tool_chat_template_llama3.2_json.jinja b/examples/tool_chat_template_llama3.2_json.jinja index 39f902c1c3c40..2b290c0eede03 100644 --- a/examples/tool_chat_template_llama3.2_json.jinja +++ b/examples/tool_chat_template_llama3.2_json.jinja @@ -26,13 +26,11 @@ {%- endfor %} {%- endfor %} - {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} {%- if messages[0]['content'] is string %} {%- set system_message = messages[0]['content']|trim %} {%- else %} - {#- Support vLLM's transforming of a content string to JSON. #} {%- set system_message = messages[0]['content'][0]['text']|trim %} {%- endif %} {%- set messages = messages[1:] %} @@ -44,14 +42,8 @@ {%- endif %} {%- endif %} -{#- Including an image is not compatible with a system message #} -{%- if image_ns.has_images and not system_message == "" %} - {{- raise_exception("Prompting with images is incompatible with system messages and tool use.") }} -{%- endif %} - - -{#- System message, if there are no images #} -{%- if not image_ns.has_images %} +{#- System message if there are no images, if the user supplied one, or if tools are used (default tool system message) #} +{%- if system_message or not image_ns.has_images %} {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} {%- if tools is not none %} {{- "Environment: ipython\n" }} From 571da8fc431ec36427ee1034a7779b23229b015e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Thu, 5 Dec 2024 21:22:28 +0800 Subject: [PATCH 252/255] [Misc][LoRA] Clean up the function interface of Punica (#10917) Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 42 ++- vllm/lora/fully_sharded_layers.py | 175 +++++----- vllm/lora/layers.py | 538 +++++++++++------------------- vllm/lora/models.py | 8 +- vllm/lora/punica.py | 365 ++++++++++---------- 5 files changed, 497 insertions(+), 631 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 15e576cb065c7..a113e3f7abc1e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -565,7 +565,9 @@ def _pretest(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) -def test_linear_replicated(dist_init, num_loras, device, stage) -> None: +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_linear_replicated(dist_init, num_loras, device, stage, + bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -573,7 +575,8 @@ def test_linear_replicated(dist_init, num_loras, device, stage) -> None: max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_replicated_layer(): @@ -585,7 +588,12 @@ def create_random_linear_replicated_layer(): lora_linear = ReplicatedLinearWithLoRA(linear) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -669,8 +677,9 @@ def create_random_linear_replicated_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -679,7 +688,8 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_parallel_layer(): if orientation == "row": @@ -700,7 +710,12 @@ def create_random_linear_parallel_layer(): if not fully_shard else ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -784,8 +799,9 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) @@ -794,7 +810,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_column_parallel_packed_layer(): if repeats == 2: @@ -832,10 +849,16 @@ class FakeConfig: num_key_value_heads = 32 num_attention_heads = 32 + n_slices = repeats lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == n_slices) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -911,7 +934,6 @@ class FakeConfig: 512, lora_config.lora_extra_vocab_size, ) - # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index e25e453201f01..545ec21ca74c1 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,5 +1,5 @@ # pylint: disable=unused-argument -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -32,6 +32,44 @@ def dec(*args, **kwargs): return dec +def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): + """ + For `ColumnParallelLinearWithLoRA` or classes that inherit from + `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. + """ + assert (layer.n_slices == len(layer.lora_a_stacked) == len( + layer.lora_b_stacked) == len(layer.output_slices)) + if layer.lora_bias_stacked is not None: + assert layer.n_slices == len(layer.lora_bias_stacked) + + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + + # Since communication is needed, the buffer is directly initialized as a + # tensor rather than a tuple of tensor. + buffers = torch.zeros( + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + layer.punica_wrapper.add_shrink(buffers, x, layer.lora_a_stacked, 1.0) + buffers = tensor_model_parallel_all_gather(buffers) + layer.punica_wrapper.add_expand(output, + buffers, + layer.lora_b_stacked, + layer.lora_bias_stacked, + layer.output_slices, + offset_start=0, + add_input=True) + + output = output.view(*out_orig_shape) + # now have column partitioned and packed output + return output + + # these layers are based on the tensor parallelism strategy given in # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, # https://arxiv.org/abs/2311.03285. @@ -51,34 +89,15 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): # gather operation. def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device, - ) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - add_input=True) - # now have column partitioned output - - output = output.view(*out_orig_shape) - return output + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) @classmethod @_fully_sharded_can_replace @@ -99,46 +118,6 @@ def can_replace_layer( ) -def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): - """ - MergedColumnParallelLinearWithShardedLoRA and - MergedQKVParallelLinearWithShardedLora share the same - LoRa weight application method. - - The main difference is the step by shard_size for lora_b which can - vary for MergedQKVParallelLinearWithShardedLora but is constant for - MergedColumnParallelLinearWithShardedLoRA. - """ - # expecting 2 for column parallel and 3 for qkv - n = len(layer.lora_a_stacked) - output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffers = torch.zeros( - (n, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - for idx in range(n): - layer.punica_wrapper.add_shrink(buffers[idx], x, - layer.lora_a_stacked[idx], 1.0) - - buffers = tensor_model_parallel_all_gather(buffers) - layer.punica_wrapper.add_expand_packed_nslice( - output, - buffers, - layer.lora_b_stacked, - layer.bias_stacked, - 1.0, - layer.output_slices, - ) - - output = output.view(*out_orig_shape) - # now have column partitioned and packed output - return output - - class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ @@ -162,8 +141,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -195,31 +175,15 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - self.bias_stacked, - add_input=True) - # now have column partitioned output - output = output.view(*out_orig_shape) - return output + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) @classmethod @_fully_sharded_can_replace @@ -260,8 +224,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -294,7 +259,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): """ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - shard_size = self.lora_b_stacked.shape[2] + shard_size = self.lora_b_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = lora_b[:, start_idx:end_idx] @@ -303,20 +268,24 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - shard_size = self.bias_stacked.shape[2] + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + shard_size = self.lora_bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] return bias - def apply(self, x: torch.Tensor) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) @@ -330,12 +299,18 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # remains is a standard all_reduce. User should be aware though that # the output is not the same as a normal row_parallel, it should be # reduced before being used - shard_size = self.lora_b_stacked.shape[2] - start_idx = self.tp_rank * shard_size - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, - self.bias_stacked, start_idx, - shard_size) + # NOTE offset are based on the rank. + shard_size = self.lora_b_stacked[0].shape[2] + offset_start = self.tp_rank * shard_size + self.punica_wrapper.add_expand( + output, + buffer, + self.lora_b_stacked, + self.lora_bias_stacked, + self.output_slices, + offset_start=offset_start, + add_input=True, + ) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 73748b5ce511e..473e4bedf3d60 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,7 +1,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -18,11 +18,14 @@ tensor_model_parallel_gather) from vllm.distributed.utils import divide from vllm.lora.punica import PunicaWrapper +# yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) @@ -249,13 +252,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - - # Embedding layer only need expand op - self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - bias_all=None, - add_input=True) + self.punica_wrapper.add_lora_embedding(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod @@ -269,14 +269,19 @@ def can_replace_layer( return type(source_layer) is VocabParallelEmbedding -class ReplicatedLinearWithLoRA(BaseLayerWithLoRA): +class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): - def __init__(self, base_layer: ReplicatedLinear) -> None: + def __init__(self, base_layer: LinearBase): super().__init__() self.base_layer = base_layer self.input_size = self.base_layer.input_size - self.output_size = self.base_layer.output_size self.device = _get_lora_device(self.base_layer) + self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + + self.output_slices: Tuple[int, ...] + self.tp_size: int + self.output_size: int + self.n_slices: int def create_lora_weights( self, @@ -285,39 +290,64 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - lora_a_output_size = lora_config.max_lora_rank - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( + # + if isinstance(self.base_layer, ReplicatedLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, ColumnParallelLinear): + lora_a_out_size = (lora_config.max_lora_rank if + not lora_config.fully_sharded_loras else divide( + lora_config.max_lora_rank, self.tp_size)) + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, RowParallelLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = (self.output_size if + not lora_config.fully_sharded_loras else divide( + self.output_size, self.tp_size)) + else: + raise NotImplementedError + + self.lora_a_stacked = tuple( + torch.zeros( max_loras, 1, - self.output_size, + lora_a_out_size, + self.input_size, dtype=lora_config.lora_dtype, device=self.device, - ) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_b_out_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + if lora_config.bias_enabled: + lora_bias_out_size = lora_b_out_size + self.lora_bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_bias_out_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + for s_index in range(self.n_slices): + self.lora_a_stacked[s_index][index] = 0 + self.lora_b_stacked[s_index][index] = 0 + if self.lora_config.bias_enabled: + # Make mypy happy + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[s_index][index] = 0 def set_lora( self, @@ -325,29 +355,56 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): - self.reset_lora(index) + # Except for QKVParallelLinearWithLora and + # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers + # store weights in a tuple of size 1. These two layers will + # override this function. + assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == + self.n_slices == 1) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + self.reset_lora(index) + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + if lora_bias is not None: + + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( + lora_bias.T, non_blocking=True) + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.lora_bias_stacked, 1.0, + self.output_slices) return output + +class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): + + def __init__(self, base_layer: ReplicatedLinear) -> None: + super().__init__(base_layer, ) + # To ensure interface compatibility, set to 1 always. + self.tp_size = 1 + self.output_size = self.base_layer.output_size + self.n_slices = 1 + def forward(self, input_): """Forward of ReplicatedLinearWithLoRA @@ -380,73 +437,26 @@ def can_replace_layer( return type(source_layer) is ReplicatedLinear -class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): +class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. - LoRA B is sliced for tensor parallelism. + There are two types for the `base_layer`: + 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. + 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. """ def __init__(self, base_layer: ColumnParallelLinear) -> None: - super().__init__() + super().__init__(base_layer) # The base_layer type is ColumnParallelLinear or # MergedColumnParallelLinear, their weight sharding logic is # inconsistent when TP is greater than 1. self.is_merged_col_linear = type( base_layer) is MergedColumnParallelLinear - - self.base_layer = base_layer self.tp_size = get_tensor_model_parallel_world_size() - self.input_size = self.base_layer.input_size self.output_size = self.base_layer.output_size_per_partition - self.device = _get_lora_device(self.base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - - self.output_dim = self.lora_b_stacked.shape[2] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + # There is only one LoRA layer + self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -485,40 +495,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) - return output - def forward(self, input_): """Forward of ColumnParallelLinear @@ -568,6 +544,8 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: MergedColumnParallelLinear) -> None: super().__init__(base_layer) + # There are two LoRA layers + self.n_slices = len(self.base_layer.output_sizes) def create_lora_weights( self, @@ -575,9 +553,13 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overriding this function is to enhance code + maintainability. + """ self.lora_config = lora_config - n_slices = 2 - if not (len(self.base_layer.output_sizes) == n_slices + + if not (len(self.base_layer.output_sizes) == self.n_slices == 2 and self.base_layer.output_sizes[0] == self.base_layer.output_sizes[1]): raise ValueError( @@ -598,7 +580,7 @@ def create_lora_weights( self.input_size, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) self.lora_b_stacked = tuple( torch.zeros( max_loras, @@ -607,30 +589,19 @@ def create_lora_weights( lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) if lora_config.bias_enabled: - self.bias_stacked = tuple( + self.lora_bias_stacked = tuple( torch.zeros( max_loras, 1, self.output_size // 2, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) self.output_dim = self.lora_b_stacked[0].shape[2] self.output_slices = (self.output_dim, self.output_dim) - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_b_stacked[1][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 - def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: @@ -668,15 +639,15 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -685,10 +656,11 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) - if bias is not None and bias[0] is not None: - self.bias_stacked[0][index, - 0, :bias[0].shape[0]].copy_(bias[0].T, - non_blocking=True) + if lora_bias is not None and lora_bias[0] is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias[0].shape[0]].copy_( + lora_bias[0].T, non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -696,18 +668,11 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) - if bias is not None and bias[1] is not None: - self.bias_stacked[1][index, - 0, :bias[1].shape[0]].copy_(bias[1].T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, - self.bias_stacked, 1.0, (self.output_dim, self.output_dim)) - return output + if lora_bias is not None and lora_bias[1] is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[1][index, 0, :lora_bias[1].shape[0]].copy_( + lora_bias[1].T, non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -737,7 +702,6 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) - self.tp_size = get_tensor_model_parallel_world_size() self.q_proj_total_size = (self.base_layer.total_num_heads * self.base_layer.head_size) self.q_proj_shard_size = (self.base_layer.num_heads * @@ -746,6 +710,8 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.base_layer.head_size) self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + # There is only one LoRA layer + self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() @@ -780,32 +746,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, @@ -828,6 +768,10 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) + # There are three LoRA layer. + self.n_slices = len(self.base_layer.output_sizes) + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() def create_lora_weights( self, @@ -835,9 +779,16 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overloading this function is to handle inconsistent + weight dimensions in qkv lora. + """ self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() + + if not (len(self.base_layer.output_sizes) == self.n_slices == 3): + raise ValueError( + "LoRAColumnParallelLinear3Slice requires 3 slices.") + self.q_proj_shard_size = (self.base_layer.num_heads * self.base_layer.head_size) self.kv_proj_shard_size = (self.base_layer.num_kv_heads * @@ -902,7 +853,7 @@ def create_lora_weights( ), ) if lora_config.bias_enabled: - self.bias_stacked = ( + self.lora_bias_stacked = ( torch.zeros( max_loras, 1, @@ -925,9 +876,6 @@ def create_lora_weights( device=self.device, ), ) - else: - self.bias_stacked = None - self.output_slices = ( self.q_proj_shard_size, self.kv_proj_shard_size, @@ -939,18 +887,6 @@ def create_lora_weights( self.indices: torch.Tensor self.indices_len: List[int] - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[1][index] = 0 - self.lora_a_stacked[2][index] = 0 - self.lora_b_stacked[2][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 - self.bias_stacked[2][index] = 0 - def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: @@ -1000,15 +936,15 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -1039,26 +975,24 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) - if bias is not None: - if bias[0] is not None: - self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( - bias[0].T, non_blocking=True) - if bias[1] is not None: - self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( - bias[1].T, non_blocking=True) - if bias[2] is not None: - self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( - bias[2].T, non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - self.punica_wrapper.add_lora_packed_nslice(output, x, - self.lora_a_stacked, - self.lora_b_stacked, - self.bias_stacked, 1.0, - self.output_slices) - return output + if lora_bias is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + if lora_bias[0] is not None: + self.lora_bias_stacked[0][index, + 0, :lora_bias[0].shape[0]].copy_( + lora_bias[0].T, + non_blocking=True) + if lora_bias[1] is not None: + self.lora_bias_stacked[1][index, + 0, :lora_bias[1].shape[0]].copy_( + lora_bias[1].T, + non_blocking=True) + if lora_bias[2] is not None: + self.lora_bias_stacked[2][index, + 0, :lora_bias[2].shape[0]].copy_( + lora_bias[2].T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -1073,76 +1007,25 @@ def can_replace_layer( and len(packed_modules_list) == 3) -class RowParallelLinearWithLoRA(BaseLayerWithLoRA): +class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): def __init__(self, base_layer: RowParallelLinear) -> None: - super().__init__() - self.base_layer = base_layer + super().__init__(base_layer) + + self.tp_size = get_tensor_model_parallel_world_size() + # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size - self.device = _get_lora_device(self.base_layer) - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config self.tp_rank = get_tensor_model_parallel_rank() - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.input_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - tp_size = get_tensor_model_parallel_world_size() - lora_b_output_size_per_partition = ( - self.output_size if not lora_config.fully_sharded_loras else - divide(self.output_size, tp_size)) - - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - lora_b_output_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - ( - max_loras, - 1, - self.output_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - # Lazily initialized - self.indices: torch.Tensor - self.indices_len: List[int] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + # There is only one LoRA layer. + self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.input_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size lora_a = lora_a[start_idx:end_idx, :] return lora_a @@ -1152,40 +1035,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.base_layer.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, self.bias_stacked, - 1.0) - return output - def forward(self, input_): """Forward of RowParallelLinear @@ -1203,10 +1052,9 @@ def forward(self, input_): input_parallel = input_ else: # TODO: simplify code below - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.base_layer.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. output_parallel = self.apply(input_parallel) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 2ffefe61427e3..9855b57d0c9c9 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -555,17 +555,17 @@ def create_dummy_lora( input_dim, output_dim, rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", embeddings_tensor_dim=embeddings_tensor_dim, bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, - module.lora_a_stacked.shape[-1], - module.lora_b_stacked.shape[-2], + module.lora_a_stacked[0].shape[-1], + module.lora_b_stacked[0].shape[-2], rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", bias_enabled=bias_enabled, ) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 3f775b7ba363e..563d1181d6fcb 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -362,7 +362,7 @@ def long_lora_indices(self) -> torch.Tensor: long_lora_len = self.indices_len[4] return self._long_lora_indices[:long_lora_len] - def shrink_prefill( + def _shrink_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -380,7 +380,7 @@ def shrink_prefill( scale, ) - def shrink_decode( + def _shrink_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -389,7 +389,7 @@ def shrink_decode( ): bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) - def expand_prefill( + def _expand_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -407,7 +407,7 @@ def expand_prefill( add_input, ) - def expand_decode( + def _expand_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -416,7 +416,7 @@ def expand_decode( ): bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) - def expand_slice_prefill( + def _expand_slice_prefill( self, y: torch.Tensor, x: torch.Tensor, @@ -438,7 +438,7 @@ def expand_slice_prefill( add_input, ) - def expand_slice_decode( + def _expand_slice_decode( self, y: torch.Tensor, x: torch.Tensor, @@ -450,41 +450,35 @@ def expand_slice_decode( bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, y_slice_size, add_input) - def apply_bias( - self, - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, - ): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) + def _apply_expand(self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool = True): + """ + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` + computation, which is suitable for the + GEMM of lora'b. """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - bias_stacked[indices == -1] = 0 - output += bias_stacked - return output.view_as(org_output) + expand_slice_fun: Callable = (self._expand_slice_prefill + if self.is_prefill else + self._expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) - def apply_bias_packed_nslice( + def _apply_bias( self, indices: torch.Tensor, output: torch.Tensor, output_slices: Tuple[int, ...], - bias_stacked: Tuple[Optional[torch.Tensor], ...], + lora_bias_stacked: Tuple[Optional[torch.Tensor], ...], ): """Applies bias to output Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) + lora_bias_stacked: 3 element tuple of (num_loras, output_dim) indices: (batch_size) output: (batch_size, q_slice_size + 2*kv_slice_size) output_slices: n-1 element tuple of (slice_size...), @@ -496,7 +490,7 @@ def apply_bias_packed_nslice( offset_left = 0 for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] + bias = lora_bias_stacked[slice_idx] if bias is not None: bias = bias.view(-1, bias.shape[-1]) bias = bias[indices] @@ -506,7 +500,7 @@ def apply_bias_packed_nslice( return output.view_as(org_output) - def add_shrink( + def _apply_shrink( self, y: torch.Tensor, x: torch.Tensor, @@ -517,188 +511,215 @@ def add_shrink( Perform the ` y+=x@w_t_all` computation, which is suitable for the GEMM of lora'a. When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. - Otherwise, it is the decode stage, and the shrink_decode function + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function should be called. """ - shrink_fun: Callable = (self.shrink_prefill - if self.is_prefill else self.shrink_decode) + y_org = y + y = y.view(-1, y.shape[-1]) + shrink_fun: Callable = (self._shrink_prefill + if self.is_prefill else self._shrink_decode) shrink_fun(y, x, w_t_all, scale) + y = y.view_as(y_org) - def add_expand( + def add_shrink( self, - y: torch.Tensor, + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], x: torch.Tensor, - w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], - add_input: bool = True, + lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, ): """ - Perform the ` y+=x@w_t_all+bias` computation, which is suitable for the - GEMM of lora'b. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function + Performs GEMM for multiple slices of lora_a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function should be called. - """ - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) - - expand_fun: Callable = (self.expand_prefill - if self.is_prefill else self.expand_decode) - expand_fun(y, x, w_t_all, add_input) - - def add_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): - """ - Similar to `add_expand` - """ - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ - expand_slice_fun: Callable = (self.expand_slice_prefill - if self.is_prefill else - self.expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) - def add_expand_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_b_stacked: Tuple[torch.Tensor, ...], - bias_stacked: Optional[Tuple[torch.Tensor, - ...]], - scale: float, - output_slices: Tuple[int, ...]) -> None: - """ - Similar to `add_expand` + def add_expand( + self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + offset_start: int = 0, + add_input=True, + ) -> None: """ + Performs GEMM and bias addition for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + lora_bias_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + bias's weight + output_slices (Tuple[int, ...]): Every slice's size + add_input (bool): Defaults to True. + """ y_org = y y = y.view(-1, y.shape[-1]) - offset_left = 0 - if bias_stacked is not None: - self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_stacked) + offset_left = offset_start + if lora_bias_stacked is not None: + self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) for slice_idx in range(len(lora_b_stacked)): - self.add_expand_slice(y, - x[slice_idx], - lora_b_stacked[slice_idx], - None, - offset_left, - output_slices[slice_idx], - add_input=True) + self._apply_expand( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_input=add_input, + ) offset_left += output_slices[slice_idx] - y = y.view_as(y_org) - def add_lora(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - bias_all: Optional[torch.Tensor], - scale: float, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None) -> None: + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_input: bool = True, + ): + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_input (bool): Default to True. + + """ + + # Embedding layer only need expand op + expand_fun: Callable = (self._expand_prefill + if self.is_prefill else self._expand_decode) + expand_fun(y, x, lora_b_stacked, add_input) + + def add_lora_linear( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None) -> None: """ + Applicable to linear-related lora. + Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0)+bias[i] + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0)+lora_bias_stacked[i] + Args: - y (torch.Tensor): Output tensor. Will be changed in-place. + y (torch.Tensor): Output tensor. Will be changed in-place. x (torch.Tensor): Input tensor - wa_t_all (torch.Tensor): lora_a's weight - wb_t_all (torch.Tensor): lora_b's weight - bias_all: (torch.Tensor): lora's bias + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. scale (float): Scaling factor. - y_offset (Optional[int], optional): Offset to apply to the starting - column of y. - y_slice_size (Optional[int], optional): Size of the y column slice. - buffer (Optional[torch.Tensor], optional): Defaults to None. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. """ - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + if lora_bias_stacked is not None: + assert len(lora_bias_stacked) == len(output_slices) + y = self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) + if buffer is None: + r = lora_b_stacked[0].size(-1) # We set the buffer to be float32 by default ,refer to: # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - if bias_all is not None: - y = self.apply_bias(self.token_lora_indices, y, bias_all) - self.add_shrink(buffer, x, wa_t_all, scale) - if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, bias_all=None, add_input=True) - else: - self.add_expand_slice(y, - buffer, - wb_t_all, - None, - y_offset, - y_slice_size, - add_input=True) - y = y.view_as(y_org) - - def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, ...], - lora_b_stacked: Tuple[torch.Tensor, ...], - bias_all: Tuple[Optional[torch.Tensor], - ...], scale: float, - output_slices: Tuple[int, ...]) -> None: - """ - Applies lora to each input. Similar to add_lora, This method is - used for layers that are composed of multiple sublayers - (slices) packed together. - """ - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - if bias_all is not None: - y = self.apply_bias_packed_nslice(self.token_lora_indices, y, - output_slices, bias_all) - # TODO fuse these kernels - for slice_idx in range(len(output_slices)): - self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], None, scale, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - - y = y.view_as(y_org) + buffer = tuple( + torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device) + for _ in range(len(output_slices))) + self.add_shrink(buffer, x, lora_a_stacked, scale) + self.add_expand(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_input=True) def add_lora_logits(self, y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, scale, *, buffer: Optional[torch.Tensor] = None) -> None: """ - LogitsProcessorWithLoRA always using bgmv - """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ y_org = y y = y.view(-1, y.shape[-1]) x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) + r = lora_b_stacked.size(-1) if buffer is None: # We set the buffer to be float32 by default ,refer to: # https://github.com/triton-lang/triton/issues/1387 buffer = torch.zeros((x.size(0), r), dtype=torch.float32, device=x.device) - - bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) - bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) + # LogitsProcessorWithLoRA always using bgmv. + bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, + lora_b_stacked, + y, + self.sampler_indices, + add_inputs=True) y = y.view_as(y_org) From 998eeafe58c0263323b7fd8813c8b3d3f839bcbc Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 6 Dec 2024 00:05:52 +0800 Subject: [PATCH 253/255] [CI/Build] Bump test transformers version (#10106) Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- requirements-test.txt | 2 +- .../vision_language/test_models.py | 25 +------------------ .../vision_language/test_pixtral.py | 2 +- .../vision_language/test_llava_next.py | 4 --- tests/models/test_initialization.py | 5 ---- 5 files changed, 3 insertions(+), 35 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index a59b85023948b..19369254dbe26 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -550,7 +550,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.45.2 +transformers==4.46.3 # via # lm-eval # peft diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index dbb0b4d350d10..924f19c4448b8 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -6,7 +6,6 @@ from typing import Type import pytest -import transformers from transformers import AutoModelForVision2Seq from transformers.utils import is_flash_attn_2_available @@ -187,12 +186,6 @@ comparator=check_outputs_equal, max_tokens=8, dtype="bfloat16", - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.2", - reason="Model broken in HF, see huggingface/transformers#34379" - ), - ] ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], @@ -243,13 +236,7 @@ max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForVision2Seq, - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0" - ), - large_gpu_mark(min_gb=48), - ], + marks=[large_gpu_mark(min_gb=48)], ), "intern_vl": VLMTestInfo( models=[ @@ -318,12 +305,6 @@ auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.2", - reason="Model broken with changes in transformers 4.46" - ) - ], ), "minicpmv_25": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], @@ -404,10 +385,6 @@ cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.", ), - pytest.mark.skipif( - transformers.__version__ < "4.46.2", - reason="Model broken in HF, see huggingface/transformers#34379" - ) ], **COMMON_BROADCAST_SETTINGS # type: ignore ), diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 6233860747b9c..90c0fab99054c 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -228,7 +228,7 @@ def test_model_engine(vllm_runner, model: str, dtype: str) -> None: name_1="output") -@large_gpu_test(min_gb=24) +@large_gpu_test(min_gb=48) @pytest.mark.parametrize( "prompt,expected_ranges", [(_create_engine_inputs_hf(IMG_URLS[:1]), [{ diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 329c6ba279f89..bab8d3897579e 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -2,7 +2,6 @@ import pytest import torch.nn.functional as F -import transformers from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner @@ -86,9 +85,6 @@ def _run_test( ) -@pytest.mark.skipif(transformers.__version__.startswith("4.46"), - reason="Model broken with changes in transformers 4.46") -@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 2a072737db043..3b728f2744fca 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,7 +1,6 @@ from unittest.mock import patch import pytest -import transformers from transformers import PretrainedConfig from vllm import LLM @@ -11,10 +10,6 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): - if (model_arch in {"Idefics3ForConditionalGeneration", "GlmForCausalLM"} - and transformers.__version__ < "4.46.0"): - pytest.skip(reason="Model introduced in HF >= 4.46.0") - model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) if not model_info.is_available_online: pytest.skip("Model is not available online") From a43065272f73a7468b1a35dd44fb5b0ed80f88c7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 5 Dec 2024 17:47:46 +0100 Subject: [PATCH 254/255] [Misc][Gaudi] Avoid torch.compile and enable lazy collectives (#10897) Signed-off-by: Konrad Zawora --- vllm/plugins/__init__.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 81ee9975cdc4a..ae6e5c0a3481f 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -29,6 +29,20 @@ def load_general_plugins(): if current_platform.is_xpu(): # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa os.environ['TORCH_COMPILE_DISABLE'] = 'True' + if current_platform.is_hpu(): + # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) + # does not support torch.compile + # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for + # torch.compile support + is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' + if is_lazy: + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True + # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) + # requires enabling lazy collectives + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' + global plugins_loaded if plugins_loaded: return From 9743d64e4e04a88174c76553fcbffa33a18c7db5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 5 Dec 2024 08:54:47 -0800 Subject: [PATCH 255/255] [ci][build] add tests for python only compilation (#10915) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 11 +++++-- setup.py | 13 ++++---- .../lazy_torch_compile.py} | 0 tests/standalone_tests/python_only_compile.sh | 30 +++++++++++++++++++ 4 files changed, 46 insertions(+), 8 deletions(-) rename tests/{test_lazy_torch_compile.py => standalone_tests/lazy_torch_compile.py} (100%) create mode 100644 tests/standalone_tests/python_only_compile.sh diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 455f02a2062f1..bf0de3f69f14e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,9 +50,9 @@ steps: - tests/multimodal - tests/test_utils - tests/worker - - tests/test_lazy_torch_compile.py + - tests/standalone_tests/lazy_torch_compile.py commands: - - python3 test_lazy_torch_compile.py + - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py @@ -61,6 +61,13 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker +- label: Python-only Installation Test + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true diff --git a/setup.py b/setup.py index 182dabe449674..fcfaa207c176a 100644 --- a/setup.py +++ b/setup.py @@ -465,14 +465,15 @@ def get_vllm_version() -> str: if envs.VLLM_TARGET_DEVICE == "empty": version += f"{sep}empty" elif _is_cuda(): - cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - # skip this for source tarball, required for pypi - if "sdist" not in sys.argv: - version += f"{sep}cu{cuda_version_str}" if envs.VLLM_USE_PRECOMPILED: version += ".precompiled" + else: + cuda_version = str(get_nvcc_cuda_version()) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + # skip this for source tarball, required for pypi + if "sdist" not in sys.argv: + version += f"{sep}cu{cuda_version_str}" elif _is_hip(): # Get the HIP version hipcc_version = get_hipcc_rocm_version() diff --git a/tests/test_lazy_torch_compile.py b/tests/standalone_tests/lazy_torch_compile.py similarity index 100% rename from tests/test_lazy_torch_compile.py rename to tests/standalone_tests/lazy_torch_compile.py diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh new file mode 100644 index 0000000000000..f00895c0997f1 --- /dev/null +++ b/tests/standalone_tests/python_only_compile.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# This script tests if the python only compilation works correctly +# for users who do not have any compilers installed on their system + +set -e +set -x + +cd /vllm-workspace/ + +# uninstall vllm +pip3 uninstall -y vllm +# restore the original files +mv test_docs/vllm ./vllm + +# remove all compilers +apt remove --purge build-essential -y +apt autoremove -y + +echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py + +VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . + +# Run the script +python3 -c 'import vllm' + +# Check if the clangd log file was created +if [ ! -f /tmp/changed.file ]; then + echo "changed.file was not created, python only compilation failed" + exit 1 +fi