From a115250b5fcf9187248d11f68e6a43ae2959dabf Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Wed, 8 May 2024 14:39:54 +0200
Subject: [PATCH 001/819] Re-integrate HPU after upstream refactors (#20)

* Fix setup.py for HPU
* Fix vllm._C import ops -> vllm.hpu import ops
* more of the same thing
* re-add hpex rmsnorm and rope; but rope is crashing
* remove unnecessary comments
* add vllm/hpu files
* add hpu autodetection
* Add HabanaAttention stub
* revert accidental changes
* revert non-habana backend attention changes
* add habana attention/worker/executor, sampling fails now
* Restore unnecessarily changed files
* enable HabanaMemoryProfiler
* Make sampler pass
* restore habana fused rope
* prefill is now working!!!
* fix prefill padding; decode is now working!!!!!
* revert accidental changes
* remove unused stuff in habana_paged_attn.py
* remove diagnostic stuff from llm_engine.py
* use HabanaExecutorAsync in async_llm_engine.py
* add habana copyright headers to habana_*.py files
* fix prefill attention conformance
* minor naming fixes
* remove naive attention from habana_attn (it never worked anyway)
* re-enable profile run
* Add fake HPUGraph support
* add more metrics
* indentation fix
* ~~recipe cache metrics don't work lalalala~~
* i'm done with metrics for now
* fix corner case in which hl-smi is not available but synapse is
* FIXME: temporary setup.py workaround
* WIP: add tensor parallelism stubs
* habana worker cleanup
* tensor parallelism is now working
* remove unused files
* remove unused func
* add hpugraphrunner
* improve hpu layernorm
* Port pipelined PA
* Port context length bucketing
* remove cudagraphrunner from hpu runner
* restore HPUGraphRunner back from FakeHPUGraphRunner
* handle rotary embeddings properly on gaudi3
* oopsie! captured_block_counts was incorrect!
* captured_block_counts.append doesn't do anything
* Restore habana_main KV cache memory layout
* fix memory profiler
* overhaul hpugraph capture
* memory profiling overhaul
* format memory properly in model warmup
* add graph compilation profiler for graph capture phase
* roll back log lvl on graph capture message
* Remove unnecessary view on residual connection in RMSNorm (#25)

---------

Co-authored-by: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com>
---
 pyproject.toml | 57 -
 requirements-hpu.txt | 15 +
 setup.py | 40 +-
 vllm/attention/backends/habana_attn.py | 352 +++++
 vllm/attention/ops/habana_paged_attn.py | 150 +++
 vllm/attention/selector.py | 10 +-
 vllm/config.py | 4 +-
 vllm/engine/arg_utils.py | 2 +-
 vllm/engine/async_llm_engine.py | 8 +
 vllm/engine/llm_engine.py | 15 +-
 vllm/engine/ray_utils.py | 26 +-
 vllm/entrypoints/openai/api_server.py | 2 +-
 vllm/executor/habana_executor.py | 190 +++
 vllm/executor/ray_habana_executor.py | 419 ++++++
 vllm/hpu/__init__.py | 6 +
 vllm/hpu/attn_bias.py | 764 +++++++++++
 vllm/hpu/cache_ops.py | 82 ++
 vllm/hpu/ops.py | 115 ++
 vllm/hpu/rotary_embed.py | 119 ++
 vllm/hpu/utils.py | 99 ++
 vllm/hpu/xops.py | 66 +
 vllm/model_executor/layers/activation.py | 6 +-
 vllm/model_executor/layers/layernorm.py | 23 +-
 .../model_executor/layers/logits_processor.py | 10 +-
 .../model_executor/layers/quantization/awq.py | 7 +-
 .../layers/quantization/gptq.py | 7 +-
 .../layers/quantization/marlin.py | 7 +-
 .../layers/quantization/squeezellm.py | 7 +-
 .../model_executor/layers/rotary_embedding.py | 15 +-
 vllm/model_executor/models/llama.py | 1 -
 .../parallel_utils/communication_op.py | 11 +-
 vllm/model_executor/sampling_metadata.py | 12 +-
 vllm/utils.py | 53 +
 vllm/worker/cache_engine.py | 27 +-
 vllm/worker/habana_model_runner.py | 1168 +++++++++++++++++
 vllm/worker/habana_worker.py | 263 ++++
 36 files changed, 4045 insertions(+), 113 deletions(-)
 delete mode 100644 pyproject.toml
 create mode 100644 requirements-hpu.txt
 create mode 100644 vllm/attention/backends/habana_attn.py
 create mode 100644 vllm/attention/ops/habana_paged_attn.py
 create mode 100644 vllm/executor/habana_executor.py
 create mode 100644 vllm/executor/ray_habana_executor.py
 create mode 100644 vllm/hpu/__init__.py
 create mode 100644 vllm/hpu/attn_bias.py
 create mode 100644 vllm/hpu/cache_ops.py
 create mode 100644 vllm/hpu/ops.py
 create mode 100644 vllm/hpu/rotary_embed.py
 create mode 100644 vllm/hpu/utils.py
 create mode 100644 vllm/hpu/xops.py
 create mode 100644 vllm/worker/habana_model_runner.py
 create mode 100644 vllm/worker/habana_worker.py

diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index 4d6fb5a362fc7..0000000000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,57 +0,0 @@
-[build-system]
-# Should be mirrored in requirements-build.txt
-requires = [
-    "cmake>=3.21",
-    "ninja",
-    "packaging",
-    "setuptools >= 49.4.0",
-    "torch == 2.1.2",
-    "wheel",
-]
-build-backend = "setuptools.build_meta"
-
-[tool.ruff]
-# Allow lines to be as long as 80.
-line-length = 80
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    # "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    # "I",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-]
-
-[tool.mypy]
-python_version = "3.8"
-
-ignore_missing_imports = true
-
-files = "vllm"
-# TODO(woosuk): Include the code from Megatron and HuggingFace.
-exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/" - - -[tool.codespell] -ignore-words-list = "dout, te, indicies" -skip = "./tests/prompts" - -[tool.isort] -use_parentheses = true -skip_gitignore = true diff --git a/requirements-hpu.txt b/requirements-hpu.txt new file mode 100644 index 0000000000000..26fd05eb42d2a --- /dev/null +++ b/requirements-hpu.txt @@ -0,0 +1,15 @@ +cmake>=3.21 +ninja # For faster builds. +psutil +ray == 2.9.3 +sentencepiece # Required for LLaMA tokenizer. +numpy +fastapi +uvicorn[standard] +pydantic >= 2.0 # Required for OpenAI server. +prometheus_client >= 0.18.0 +pynvml == 11.5.0 +triton >= 2.1.0 +outlines == 0.0.34 +pandas +tabulate \ No newline at end of file diff --git a/setup.py b/setup.py index 9c9a428f94683..673c6e709a8f6 100644 --- a/setup.py +++ b/setup.py @@ -174,8 +174,19 @@ def build_extensions(self) -> None: subprocess.check_call(['cmake', *build_args], cwd=self.build_temp) +def _is_hpu() -> bool: + return True + is_hpu_available = True + try: + subprocess.run(["hl-smi"], capture_output=True, check=True) + except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): + if not os.path.exists('/dev/hl0') and not os.path.exists('/dev/hl_controlD0'): + is_hpu_available = False + return is_hpu_available + + def _is_cuda() -> bool: - return torch.version.cuda is not None and not _is_neuron() + return torch.version.cuda is not None and not _is_neuron() and not _is_hpu() def _is_hip() -> bool: @@ -190,7 +201,6 @@ def _is_neuron() -> bool: torch_neuronx_installed = False return torch_neuronx_installed - def _install_punica() -> bool: return bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0"))) @@ -265,6 +275,17 @@ def find_version(filepath: str) -> str: return version_match.group(1) raise RuntimeError("Unable to find version string.") +def get_gaudi_sw_version(): + """ + Returns the driver version. 
+ """ + # Enable console printing for `hl-smi` check + output = subprocess.run( + "hl-smi", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"ENABLE_CONSOLE": "true"} + ) + if output.returncode == 0 and output.stdout: + return output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0] + return "0.0.0" # when hl-smi is not available def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) @@ -286,6 +307,12 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"+neuron{neuron_version_str}" + elif _is_hpu(): + # Get the Intel Gaudi Software Suite version + gaudi_sw_version = str(get_gaudi_sw_version()) + if gaudi_sw_version != MAIN_CUDA_VERSION: + gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] + version += f"+gaudi{gaudi_sw_version}" else: raise RuntimeError("Unknown runtime environment") @@ -318,9 +345,12 @@ def get_requirements() -> List[str]: elif _is_neuron(): with open(get_path("requirements-neuron.txt")) as f: requirements = f.read().strip().split("\n") + elif _is_hpu(): + with open(get_path("requirements-hpu.txt")) as f: + requirements = f.read().strip().split("\n") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCM or Neuron.") + "Unsupported platform, please use CUDA, ROCM, Neuron or HPU.") return requirements @@ -333,7 +363,7 @@ def get_requirements() -> List[str]: if _install_punica(): ext_modules.append(CMakeExtension(name="vllm._punica_C")) -if not _is_neuron(): +if not (_is_neuron() or _is_hpu()): ext_modules.append(CMakeExtension(name="vllm._C")) package_data = { @@ -369,6 +399,6 @@ def get_requirements() -> List[str]: python_requires=">=3.8", install_requires=get_requirements(), ext_modules=ext_modules, - cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {}, + cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() or _is_hpu() else {}, package_data=package_data, ) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py new file mode 100644 index 0000000000000..844dc92b315ac --- /dev/null +++ b/vllm/attention/backends/habana_attn.py @@ -0,0 +1,352 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import importlib +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Type + +import torch +import vllm.hpu.xops as xops +from vllm.hpu.attn_bias import (AttentionBias, + BlockDiagonalCausalMask, + LowerTriangularMaskWithTensorBias) + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata) +from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, + HabanaPagedAttentionMetadata) +from vllm.logger import init_logger +from vllm.utils import is_hip + +logger = init_logger(__name__) + + +class HabanaAttentionBackend(AttentionBackend): + + @staticmethod + def get_impl_cls() -> Type["HabanaAttentionImpl"]: + return HabanaAttentionImpl + + @staticmethod + def make_metadata(*args, **kwargs) -> "HabanaAttentionMetadata": + return HabanaAttentionMetadata(*args, **kwargs) + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + HabanaPagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): + """Metadata for HabanaAttentionbackend. + + NOTE: Any python object stored here is not updated when it is + cuda-graph replayed. If you have values that need to be changed + dynamically, it should be stored in tensor. The tensor has to be + updated from `CUDAGraphRunner.forward` API. + """ + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + # (num_tokens,). The indices of the token slots that input tokens will be + # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size + # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot + # in block 0, and 1st slot in block 1, respectively. + slot_mapping: torch.Tensor + # (batch_size,). The prompt length per sequence. None if it is a decoding. + prompt_lens: Optional[List[int]] + # prompt_lens stored as a tensor. + prompt_lens_tensor: Optional[torch.Tensor] + # The number of prompt tokens. Doesn't include padding. + num_prompt_tokens: int + # The number of generation tokens. Doesn't include padding. + num_generation_tokens: int + + # NOTE(sang): Definition of context_len, subquery_len, and seqlen. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seqlen ----------------------| + # |- subquery_len -| + + # WARNING(sang): context_len has different definition depending on if it is + # prefill vs decoding. When it is prefill, it doesn't include new tokens. + # When it is for decoding, it includes a new token. + + # Maximum subquery length in the batch. + max_subquery_len: Optional[int] + # FIXME: It is for flash attn. + # Maximum prompt length in the batch. 
+ max_prompt_len: Optional[int] + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + subquery_start_loc: Optional[torch.Tensor] + # FIXME: It is for flash attn. + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + + def __post_init__(self): + # Set during the execution of the first attention op. + # It is a list because it is needed to set per prompt + # when alibi slopes is used. It is because of the limitation + # from xformer API. + # will not appear in the __repr__ and __init__ + self.attn_bias: Optional[List[AttentionBias]] = None + + +class HabanaAttentionImpl(AttentionImpl): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prompt_tokens --------------->| + |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1--->| + + Otherwise, the layout is as follows: + |<------------------ num_generation_tokens (M) ----------------->| + |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + alibi_slopes: Optional[List[float]] = None, + sliding_window: Optional[int] = None, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: Optional[torch.Tensor], + attn_metadata: HabanaAttentionMetadata, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. 
+ Returns: + shape = [num_tokens, num_heads * head_size] + """ + batch_size, seq_len, hidden_size = query.shape + _, seq_len_kv, _ = key.shape + + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + if kv_cache is not None: + key_cache, value_cache = HabanaPagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, + value_cache, + attn_metadata.slot_mapping, + attn_metadata.kv_cache_dtype, + attn_metadata.is_prompt) + + if attn_metadata.is_prompt: + # Prompt run. + if kv_cache is None or attn_metadata.block_tables.numel() == 0: + # normal attention. + # block tables are empty if the prompt does not have a cached + # prefix. + if self.num_kv_heads != self.num_heads: + # As of Nov 2023, xformers only supports MHA. For MQA/GQA, + # project the key and value tensors to the desired number of + # heads. + # TODO(woosuk): Use MQA/GQA kernels for higher performance. + query = query.view(query.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + query.shape[-1]) + key = key[:, :, + None, :].expand(key.shape[0], self.num_kv_heads, + self.num_queries_per_kv, + key.shape[-1]) + value = value[:, :, + None, :].expand(value.shape[0], + self.num_kv_heads, + self.num_queries_per_kv, + value.shape[-1]) + + if attn_metadata.attn_bias is None: + if self.alibi_slopes is None: + attn_bias = BlockDiagonalCausalMask.from_seqlens( + [seq_len] * batch_size) + if self.sliding_window is not None: + attn_bias = attn_bias.make_local_attention( + self.sliding_window) + attn_metadata.attn_bias = attn_bias + else: + attn_metadata.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + query_shape = (batch_size, seq_len, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + out = xops.memory_efficient_attention_forward( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_metadata.attn_bias, + p=0.0, + scale=self.scale, + ) + output = out.reshape(batch_size, seq_len, hidden_size) + else: + # prefix-enabled attention + output = HabanaPagedAttention.forward_prefix( + query, + key, + value, + key_cache, + value_cache, + attn_metadata.block_tables, + attn_metadata.subquery_start_loc, + attn_metadata.prompt_lens_tensor, + attn_metadata.context_lens, + attn_metadata.max_subquery_len, + self.alibi_slopes, + ) + else: + # Decoding run. + output = HabanaPagedAttention.forward_decode( + query, + key_cache, + value_cache, + attn_metadata.block_tables, + attn_metadata.context_lens, + attn_metadata.max_context_len, + attn_metadata.kv_cache_dtype, + self.num_kv_heads, + self.scale, + self.alibi_slopes, + ) + + # Reshape the output tensor. 
+ return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + prompt_lens: List[int], +) -> LowerTriangularMaskWithTensorBias: + attn_biases = [] + for prompt_len in prompt_lens: + bias = torch.arange(prompt_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(prompt_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. + bias = bias[None, :] - bias[:, None] + + padded_len = (prompt_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + prompt_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :prompt_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) + + return attn_biases + + +def _naive_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, + num_kv_heads: int, + head_size: int, + scale: float, +) -> torch.Tensor: + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + seq_len, _, _ = query.shape + attn_mask = torch.triu(torch.ones(seq_len, + seq_len, + dtype=query.dtype, + device=query.device), + diagonal=1) + attn_mask = attn_mask * torch.finfo(query.dtype).min + + attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() + attn_weights = attn_weights + attn_mask.float() + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.einsum("hqk,khd->qhd", attn_weights, value) + return out diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py new file mode 100644 index 0000000000000..03027bb01565c --- /dev/null +++ b/vllm/attention/ops/habana_paged_attn.py @@ -0,0 +1,150 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch + +from vllm.hpu import cache_ops, ops + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. +_PARTITION_SIZE = 512 + + +@dataclass +class HabanaPagedAttentionMetadata: + """Metadata for PagedAttention.""" + # (num_tokens,). The indices of the token slots that input tokens will be + # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size + # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot + # in block 0, and 1st slot in block 1, respectively. + slot_mapping: torch.Tensor + # (batch_size,). The length of context (tokens stored in KV cache) per + # sequence. WARNING: When it is a prefill request, it doesn't include new + # tokens. When it is for decoding, it includes a new token. + context_lens: Optional[torch.Tensor] + # Maximum context length in the batch. + max_context_len: Optional[int] + # (batch_size, max_blocks_per_seq). + # Block addresses per sequence. (Seq id -> list of physical block) + # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks + # in the kv cache. 
Each block can contain up to block_size tokens. + # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph + # captured. + block_tables: Optional[torch.Tensor] + kv_cache_dtype: str + + +class HabanaPagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (num_blocks, num_kv_heads, head_size, block_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + key_cache = kv_cache[0] + value_cache = kv_cache[1] + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, + kv_cache_dtype: str, + is_prompt: bool + ) -> None: + cache_ops.reshape_and_cache( + key, + value, + key_cache, + value_cache, + slot_mapping, + kv_cache_dtype, + is_prompt + ) + + @staticmethod + def forward_decode( + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + context_lens: torch.Tensor, + max_context_len: int, + kv_cache_dtype: str, + num_kv_heads: int, + scale: float, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + block_size = value_cache.shape[3] + return ops.paged_attention_v1( + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + context_lens, + block_size, + max_context_len, + alibi_slopes, + kv_cache_dtype, + ) + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + subquery_start_loc: torch.Tensor, + prompt_lens_tensor: torch.Tensor, + context_lens: torch.Tensor, + max_subquery_len: int, + alibi_slopes: Optional[torch.Tensor], + ) -> torch.Tensor: + raise NotImplementedError("forward_prefix is not implemented for HabanaPagedAttention") + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 90fce1a0349b2..1f68e0aad7b59 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -4,7 +4,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu logger = init_logger(__name__) @@ -16,6 +16,11 @@ def get_attn_backend(dtype: torch.dtype) -> AttentionBackend: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) return FlashAttentionBackend + elif is_hpu(): + logger.info("Using HabanaAttention backend.") + from 
vllm.attention.backends.habana_attn import ( # noqa: F401 + HabanaAttentionBackend) + return HabanaAttentionBackend else: logger.info("Using XFormers backend.") from vllm.attention.backends.xformers import ( # noqa: F401 @@ -28,6 +33,9 @@ def _can_use_flash_attn(dtype: torch.dtype) -> bool: # AMD GPUs. logger.info("Cannot use FlashAttention backend for AMD GPUs.") return False + if is_hpu(): + logger.info("Cannot use FlashAttention backend for HPUs.") + return False if torch.cuda.get_device_capability()[0] < 8: # Volta and Turing NVIDIA GPUs. logger.info("Cannot use FlashAttention backend for Volta and Turing " diff --git a/vllm/config.py b/vllm/config.py index 6070d9d9e50f1..17e25fa2fc00f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -9,7 +9,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.config import get_config -from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron +from vllm.utils import get_cpu_memory, get_nvcc_cuda_version, is_hip, is_neuron, is_hpu if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -563,6 +563,8 @@ def __init__(self, device: str = "auto") -> None: # Automated device type detection if is_neuron(): self.device_type = "neuron" + elif is_hpu(): + self.device_type = "hpu" else: # We don't call torch.cuda.is_available() here to # avoid initializing CUDA before workers are forked diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index fc6665dbe64bc..be36db2176d05 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -303,7 +303,7 @@ def add_cli_args( parser.add_argument("--device", type=str, default=EngineArgs.device, - choices=["auto", "cuda", "neuron"], + choices=["auto", "cuda", "neuron", 'hpu'], help='Device type for vLLM execution.') parser.add_argument( '--scheduler-delay-factor', diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index d642915aee192..6b1b9ea32ff76 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -330,6 +330,14 @@ def from_engine_args(cls, if device_config.device_type == "neuron": raise NotImplementedError("Neuron is not supported for " "async engine yet.") + elif device_config.device_type == "hpu": + if parallel_config.worker_use_ray or engine_args.engine_use_ray: + initialize_ray_cluster(parallel_config) + from vllm.executor.ray_habana_executor import RayHabanaExecutorAsync + executor_class = RayHabanaExecutorAsync + else: + from vllm.executor.habana_executor import HabanaExecutorAsync + executor_class = HabanaExecutorAsync elif parallel_config.worker_use_ray or engine_args.engine_use_ray: initialize_ray_cluster(parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f9638d1101906..56941f876a233 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -66,7 +66,7 @@ def __init__( log_stats: bool, ) -> None: logger.info( - f"Initializing an LLM engine (v{vllm.__version__}) with config: " + f"Initializing an LLM engine with config: " f"model={model_config.model!r}, " f"tokenizer={model_config.tokenizer!r}, " f"tokenizer_mode={model_config.tokenizer_mode}, " @@ -132,6 +132,14 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": if device_config.device_type == "neuron": from vllm.executor.neuron_executor import NeuronExecutor executor_class = NeuronExecutor + elif device_config.device_type == "hpu": + if parallel_config.worker_use_ray: + 
initialize_ray_cluster(parallel_config) + from vllm.executor.ray_habana_executor import RayHabanaExecutor + executor_class = RayHabanaExecutor + else: + from vllm.executor.habana_executor import HabanaExecutor + executor_class = HabanaExecutor elif parallel_config.worker_use_ray: initialize_ray_cluster(parallel_config) from vllm.executor.ray_gpu_executor import RayGPUExecutor @@ -141,7 +149,6 @@ def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine": "Ray is required if parallel_config.world_size > 1.") from vllm.executor.gpu_executor import GPUExecutor executor_class = GPUExecutor - # Create the LLM engine. engine = cls(*engine_configs, executor_class=executor_class, @@ -419,7 +426,9 @@ def _process_sequence_group_outputs(self, seq_group: SequenceGroup, self.detokenizer.decode_sequence_inplace(seq, seq_group.sampling_params) self._check_stop(seq, seq_group.sampling_params) - + #emitted_token = seq.tokens[seq.prefix_offset:] if not seq.status == SequenceStatus.FINISHED_STOPPED else '' + #print(f'[{seq.status}] Emitted token: {emitted_token} ({seq.get_token_ids()[-1]}) ({seq.output_text!r})') + # Non-beam search case if not seq_group.sampling_params.use_beam_search: # For newly created child sequences, add them to the sequence group diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index 70d5c9b1fae05..991f1b9e443f1 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -1,9 +1,9 @@ import pickle from typing import List, Optional, Tuple - +import os from vllm.config import ParallelConfig from vllm.logger import init_logger -from vllm.utils import get_ip, is_hip, set_cuda_visible_devices +from vllm.utils import get_ip, is_hip, is_hpu, set_cuda_visible_devices logger = init_logger(__name__) @@ -100,8 +100,10 @@ def initialize_ray_cluster( ignore_reinit_error=True, num_gpus=parallel_config.world_size) else: - ray.init(address=ray_address, ignore_reinit_error=True) - + ray.init(address=ray_address, ignore_reinit_error=True, + log_to_driver=not os.environ.get('VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') + ray_accel_name = "HPU" if is_hpu() else "GPU" + if parallel_config.placement_group: # Placement group is already set. return @@ -114,24 +116,24 @@ def initialize_ray_cluster( # Verify that we can use the placement group. 
gpu_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get("GPU", 0) + bundle_gpus = bundle.get(ray_accel_name, 0) if bundle_gpus > 1: raise ValueError( - "Placement group bundle cannot have more than 1 GPU.") + f"Placement group bundle cannot have more than 1 {ray_accel_name}.") if bundle_gpus: gpu_bundles += 1 if parallel_config.world_size > gpu_bundles: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the placement group.") + f"The number of required {ray_accel_name}s exceeds the total number of " + f"available {ray_accel_name}s in the placement group.") else: - num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) + num_gpus_in_cluster = ray.cluster_resources().get(ray_accel_name, 0) if parallel_config.world_size > num_gpus_in_cluster: raise ValueError( - "The number of required GPUs exceeds the total number of " - "available GPUs in the cluster.") + f"The number of required {ray_accel_name}s exceeds the total number of " + f"available {ray_accel_name}s in the cluster.") # Create a new placement group - placement_group_specs = ([{"GPU": 1}] * parallel_config.world_size) + placement_group_specs = ([{ray_accel_name: 1}] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 06e8bdf11abd3..9eb9a654d2b41 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -144,7 +144,7 @@ async def authentication(request: Request, call_next): raise ValueError(f"Invalid middleware {middleware}. " f"Must be a function or a class.") - logger.info(f"vLLM API server version {vllm.__version__}") + logger.info(f"vLLM API server version") logger.info(f"args: {args}") if args.served_model_name is not None: diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py new file mode 100644 index 0000000000000..dd211eadbea78 --- /dev/null +++ b/vllm/executor/habana_executor.py @@ -0,0 +1,190 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +from typing import Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async, HabanaMemoryProfiler, format_bytes) +import os +import contextlib +logger = init_logger(__name__) + + +class HabanaExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + # Instantiate the worker and load the model to GPU. 
+ self._init_worker() + + # Profile the memory usage and initialize the cache. + self._init_cache() + + def _init_worker(self): + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.habana_worker import HabanaWorker + + assert self.parallel_config.world_size == 1, ( + "HabanaExecutor only supports single GPU.") + + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + self.driver_worker = HabanaWorker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + local_rank=0, + rank=0, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=True, + ) + self.driver_worker.init_device() + self.driver_worker.load_model() + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine first profiles the existing memory usage. + Then, it allocates the remaining memory for KV blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_gpu_blocks, num_cpu_blocks = ( + self.driver_worker.profile_num_available_blocks( + block_size=self.cache_config.block_size, + hpu_memory_utilization=self.cache_config. + gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + )) + + logger.info(f"# HPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + with HabanaMemoryProfiler() as cache_init_m: + self.driver_worker.init_cache_engine(cache_config=self.cache_config) + logger.info(f"init_cache_engine took " + f"{format_bytes(cache_init_m.consumed_memory)} ({cache_init_m.consumed_memory/HabanaMemoryProfiler.total_memory():.2%} of total memory, gpu_memory_utilization: {self.cache_config.gpu_memory_utilization}, {format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + with HabanaMemoryProfiler() as warmup_m: + self.driver_worker.warm_up_model() + logger.info(f"Model warmup took " + f"{format_bytes(warmup_m.consumed_memory)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
+ # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none + log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all + log_cpu_fallbacks_all = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' + log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all + if log_graph_compilation or log_cpu_fallbacks: + from habana_frameworks.torch.hpu.metrics import metric_localcontext + is_prompt = any([seq_group_metadata.is_prompt for seq_group_metadata in seq_group_metadata_list]) + max_context_len = max([max([len(v.prompt_token_ids) + len(v.output_token_ids) for v in seq_group_metadata.seq_data.values()]) for seq_group_metadata in seq_group_metadata_list]) # whoa, that's some spicy stuff right here + max_num_blocks = ((max_context_len - 1) // self.cache_config.block_size) + 1 + input_stats = f'is_prompt: {is_prompt}, num_seqs: {len(seq_group_metadata_list)} max_context_len: {max_context_len}, max_num_blocks {max_num_blocks}' + gc_ctx = metric_localcontext("graph_compilation") if log_graph_compilation else contextlib.nullcontext() + cpu_fallback_ctx = metric_localcontext("cpu_fallback") if log_cpu_fallbacks else contextlib.nullcontext() + with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: + logger.warning(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}") + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: + logger.warning(f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}") + + return output + + output = self.driver_worker.execute_model( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + ) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_loras(self) -> List[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. 
+ return + + +class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + output = await make_async(self.driver_worker.execute_model)( + seq_group_metadata_list=seq_group_metadata_list, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy) + return output + + async def check_health_async(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py new file mode 100644 index 0000000000000..dac8eefb18adc --- /dev/null +++ b/vllm/executor/ray_habana_executor.py @@ -0,0 +1,419 @@ +import asyncio +import copy +import os +import pickle +from collections import defaultdict +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) +from vllm.engine.ray_utils import RayWorkerVllm, ray +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.executor.utils import check_block_size_valid +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + +# If the env var is set, it uses the Ray's compiled DAG API +# which optimizes the control plane overhead. +# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. +USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) + + +class RayHabanaExecutor(ExecutorBase): + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + + assert self.parallel_config.worker_use_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # Profile the memory usage and initialize the cache. + self._init_cache() + + self.forward_dag = None + if USE_RAY_COMPILED_DAG: + self.forward_dag = self._compiled_ray_dag() + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + if self.parallel_config.tensor_parallel_size == 1: + # For single GPU case, we use a ray worker with constrained memory. + num_gpus = self.cache_config.gpu_memory_utilization + else: + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. 
+ self.driver_dummy_worker: RayWorkerVllm = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerVllm] = [] + + # Create the workers. + driver_ip = get_ip() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("HPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={'HPU': num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + else: + # Else, added to the list of workers. + self.workers.append(worker) + + if self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + # Get the set of GPU IDs used on each node. + driver_node_id, driver_gpu_ids = ray.get( + self.driver_dummy_worker.get_node_and_gpu_ids.remote()) + worker_node_and_gpu_ids = ray.get( + [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + + node_workers = defaultdict(list) + node_gpus = defaultdict(list) + + node_workers[driver_node_id].append(0) + node_gpus[driver_node_id].extend(driver_gpu_ids) + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, + start=1): + node_workers[node_id].append(i) + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Lazy import the Worker to avoid importing torch.cuda/xformers + # before CUDA_VISIBLE_DEVICES is set in the Worker + from vllm.worker.habana_worker import HabanaWorker + + model_config = copy.deepcopy(self.model_config) + parallel_config = copy.deepcopy(self.parallel_config) + scheduler_config = copy.deepcopy(self.scheduler_config) + device_config = copy.deepcopy(self.device_config) + lora_config = copy.deepcopy(self.lora_config) + kv_cache_dtype = self.cache_config.cache_dtype + + # Initialize the actual workers with the Worker class. + for rank, (worker, (node_id, _)) in enumerate( + zip(self.workers, worker_node_and_gpu_ids), + start=1, + ): + local_rank = node_workers[node_id].index(rank) + worker.init_worker.remote( + lambda rank=rank, local_rank=local_rank: HabanaWorker( + model_config, + parallel_config, + scheduler_config, + device_config, + local_rank, + rank, + distributed_init_method, + lora_config=lora_config, + kv_cache_dtype=kv_cache_dtype, + )) + + # Initialize the driver worker with the Worker class. + driver_rank = 0 + driver_local_rank = node_workers[driver_node_id].index(driver_rank) + self.driver_worker = HabanaWorker( + self.model_config, + self.parallel_config, + self.scheduler_config, + self.device_config, + driver_local_rank, + driver_rank, + distributed_init_method, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=True, + ) + + self._run_workers("init_device") + self._run_workers( + "load_model", + max_concurrent_workers=self.parallel_config. 
+ max_parallel_loading_workers, + ) + + def _init_cache(self) -> None: + """Profiles the memory usage and initializes the KV cache. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + More details can be found in the + :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method + from class :class:`~vllm.worker.Worker`. + + Afterwards, as there may be multiple workers, + we take the minimum number of blocks across all workers + to ensure this can be applied to all of them. + + Finally, the engine will initialize the KV cache + with the calculated number of blocks. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Get the maximum number of blocks that can be allocated on GPU and CPU. + num_blocks = self._run_workers( + "profile_num_available_blocks", + block_size=self.cache_config.block_size, + hpu_memory_utilization=self.cache_config.gpu_memory_utilization, + cpu_swap_space=self.cache_config.swap_space_bytes, + cache_dtype=self.cache_config.cache_dtype, + ) + + # Since we use a shared centralized controller, we take the minimum + # number of blocks across all workers to make sure all the memory + # operators can be applied to all workers. + num_gpu_blocks = min(b[0] for b in num_blocks) + num_cpu_blocks = min(b[1] for b in num_blocks) + logger.info(f"# HPU blocks: {num_gpu_blocks}, " + f"# CPU blocks: {num_cpu_blocks}") + + check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + # Initialize the cache. + self._run_workers("init_cache_engine", cache_config=self.cache_config) + # Warm up the model. This includes capturing the model into CUDA graph + # if enforce_eager is False. + self._run_workers("warm_up_model") + + def execute_model(self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + all_outputs = self._run_workers( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }, + use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self._run_workers( + "add_lora", + lora_request=lora_request, + ) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self._run_workers( + "remove_lora", + lora_id=lora_id, + ) + + def list_loras(self) -> List[int]: + return self._run_workers("list_loras") + + def _run_workers( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + max_concurrent_workers: Optional[int] = None, + use_ray_compiled_dag: bool = False, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + if use_ray_compiled_dag: + # Right now, compiled DAG can only accept a single + # input. TODO(sang): Fix it. + output_channels = self.forward_dag.execute(1) + else: + # Start the ray workers first. + ray_worker_outputs = [ + worker.execute_method.remote(method, *args, **kwargs) + for worker in self.workers + ] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Start the driver worker after all the ray workers. + driver_worker_output = getattr(self.driver_worker, + method)(*driver_args, **driver_kwargs) + + # Get the results of the ray workers. + if self.workers: + if use_ray_compiled_dag: + try: + ray_worker_outputs = [ + pickle.loads(chan.begin_read()) + for chan in output_channels + ] + finally: + # Has to call end_read in order to reuse the DAG. + for chan in output_channels: + chan.end_read() + else: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return [driver_worker_output] + ray_worker_outputs + + def _compiled_ray_dag(self): + import pkg_resources + required_version = "2.9" + current_version = pkg_resources.get_distribution("ray").version + if current_version < required_version: + raise ValueError(f"Ray version {required_version} or greater is " + f"required, but found {current_version}") + + from ray.dag import InputNode, MultiOutputNode + assert self.parallel_config.worker_use_ray + + # Right now, compiled DAG requires at least 1 arg. We send + # a dummy value for now. It will be fixed soon. + with InputNode() as input_data: + forward_dag = MultiOutputNode([ + worker.execute_model_compiled_dag_remote.bind(input_data) + for worker in self.workers + ]) + return forward_dag.experimental_compile() + + def check_health(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() + + def _check_if_any_actor_is_dead(self): + if not self.workers: + return + + dead_actors = [] + for actor in self.workers: + actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access + if actor_state["State"] == "DEAD": + dead_actors.append(actor) + if dead_actors: + raise RuntimeError("At least one Worker is dead. " + f"Dead Workers: {dead_actors}. ") + + +class RayHabanaExecutorAsync(RayHabanaExecutor, ExecutorAsyncBase): + + async def _run_workers_async( + self, + method: str, + *args, + driver_args: Optional[List[Any]] = None, + driver_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers.""" + coros = [] + + if driver_args is None: + driver_args = args + if driver_kwargs is None: + driver_kwargs = kwargs + + # Run the driver worker asynchronously. + driver_executor = make_async(getattr(self.driver_worker, method)) + coros.append(driver_executor(*driver_args, **driver_kwargs)) + + # Run the ray workers asynchronously. 
+ for worker in self.workers: + coros.append(worker.execute_method.remote(method, *args, **kwargs)) + + all_outputs = await asyncio.gather(*coros) + return all_outputs + + async def execute_model_async( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> SamplerOutput: + all_outputs = await self._run_workers_async( + "execute_model", + driver_kwargs={ + "seq_group_metadata_list": seq_group_metadata_list, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + }) + + # Only the driver worker returns the sampling results. + output = all_outputs[0] + return output + + async def check_health_async(self) -> None: + """Raises an error if engine is unhealthy.""" + self._check_if_any_actor_is_dead() diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py new file mode 100644 index 0000000000000..b8e4d3aac98a7 --- /dev/null +++ b/vllm/hpu/__init__.py @@ -0,0 +1,6 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py new file mode 100644 index 0000000000000..ff508a59cc56a --- /dev/null +++ b/vllm/hpu/attn_bias.py @@ -0,0 +1,764 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. + + +import math +from dataclasses import dataclass +from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union + +import torch + + +class AttentionBias: + """Base class for a custom bias that can be applied \ + as the attn_bias argument in + :attr:`xformers.ops.memory_efficient_attention`. + + That function has the ability to add a tensor, the + attention bias, to the QK^T matrix before it is used + in the softmax part of the attention calculation. + The attention bias tensor with shape + (B or 1, n_queries, number of keys) + can be given as the attn_bias input. + The most common use case is for an attention bias is + to contain only zeros and negative infinities, which forms + a mask so that some queries only attend to some keys. + + Children of this class define alternative things which can + be used as the attn_bias input to define an attention bias which + forms such a mask, for some common cases. + + When using an :attr:`xformers.ops.AttentionBias` + instead of a :attr:`torch.Tensor`, the mask matrix does + not need to be materialized, and can be + hardcoded into some kernels for better performance. + + See: + + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask` + - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask` + - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask` + + """ + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """ + Materializes the bias as a `torch.Tensor`. This is very slow + and we don't attempt to make it fast. Only use for debugging/testing. 
+ + Shape should be like `[*, q_seqlen, k_seqlen]` + """ + raise NotImplementedError() + + +class LowerTriangularMask(AttentionBias): + """ + A lower-triangular (aka causal) mask + + A query Q cannot attend to a key which is farther from the + initial key than Q is from the initial query. + """ + + def __init__(self, *tensor_args, **tensor_kwargs) -> None: + # NOTE: Unused arguments, we keep them for backward compatibility + super().__init__() + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + return torch.triu(tensor, diagonal=1).to(dtype) # type: ignore + + def add_bias(self, bias: torch.Tensor) -> "LowerTriangularMaskWithTensorBias": + return LowerTriangularMaskWithTensorBias(bias) + + +class LowerTriangularMaskWithTensorBias(LowerTriangularMask): + """A lower-triangular (aka causal) mask with an additive bias""" + + def __init__(self, bias: torch.Tensor) -> None: + self._bias = bias + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return super().materialize(shape, dtype=dtype, device=device) + self._bias + + +@dataclass +class _SeqLenInfo: + """ + (Internal) Represents the division of a dimension into blocks. + + For example, to represents a dimension of length 7 divided into + three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`. + The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 2, 5, 7] + seqstart: torch.IntTensor([0, 2, 5, 7]) + """ + + seqstart: torch.Tensor + max_seqlen: int + min_seqlen: int + seqstart_py: List[int] + + def to(self, device: torch.device) -> None: + self.seqstart = self.seqstart.to(device, non_blocking=True) + + def intervals(self) -> Iterable[Tuple[int, int]]: + yield from zip(self.seqstart_py, self.seqstart_py[1:]) + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + """ + assert not isinstance(seqlens, torch.Tensor) + seqstart_py = [0] + max_seqlen = -1 + min_seqlen = -1 + for seqlen in seqlens: + min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen + max_seqlen = max(max_seqlen, seqlen) + seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen) + seqstart = torch.tensor(seqstart_py, dtype=torch.int32) + return cls( + max_seqlen=max_seqlen, + min_seqlen=min_seqlen, + seqstart=seqstart, + seqstart_py=seqstart_py, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1: + raise ValueError( + f"Invalid `torch.Tensor` of shape {x.shape}, expected format " + f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n" + f" seqstart: {self.seqstart_py}" + ) + if batch_sizes is None: + batch_sizes = [1] * (len(self.seqstart_py) - 1) + split_chunks = [] + it = 0 + for batch_size in batch_sizes: + split_chunks.append( + self.seqstart_py[it + batch_size] - self.seqstart_py[it] + ) + it += batch_size + return [ + tensor.reshape([bs, -1, *tensor.shape[2:]]) + for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1)) + ] + + +@dataclass +class _PaddedSeqLenInfo(_SeqLenInfo): + """ + (Internal) Represents the 
division of a dimension into blocks which are + padded out to the same total length. + + For example, to represent a dimension of length 12 with space for + three blocks of length 4, but where the occupied lengths are + 2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`. + + The layout along the dimension is + + 0 ─â–ș block 0 + block 0 + + + 4 ─â–ș block 1 + block 1 + block 1 + + 8 ─â–ș block 2 + block 2 + + + 12 ─â–ș + + The members will be: + max_seqlen: 3 + min_seqlen: 2 + seqstart_py: [0, 4, 8, 12] + seqstart: torch.IntTensor([0, 4, 8, 12]) + seqlen_py: [2, 3, 2] + seqlen: torch.IntTensor([2, 3, 2]) + padding: 4 + """ + + seqlen: torch.Tensor + seqlen_py: Sequence[int] + padding: int + # From parent: seqstart[i] contains the start position + # of the i-th sequence + # seqstart: torch.Tensor + + def __post_init__(self) -> None: + assert len(self.seqstart_py) == len(self.seqlen_py) + 1 + + def to(self, device: torch.device) -> None: + self.seqlen = self.seqlen.to(device, non_blocking=True) + super().to(device) + + def intervals(self) -> Iterable[Tuple[int, int]]: + for (start, _), length in zip(super().intervals(), self.seqlen_py): + yield start, start + length + + @classmethod + def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": + raise RuntimeError( + "Use either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`" + ) + + @classmethod + def from_seqlens_padded( + cls, seqlens: Sequence[int], padding: int + ) -> "_PaddedSeqLenInfo": + """ + Input tensors are assumed to be in shape [B, M, *] + seqstart = padding * torch.arange(batch_size) + """ + assert not isinstance(seqlens, torch.Tensor) + assert all(seqlen <= padding for seqlen in seqlens) + seqstart_py = list(range(0, len(seqlens) * padding + 1, padding)) + return cls( + seqlen=torch.tensor(seqlens, dtype=torch.int32), + seqlen_py=seqlens, + max_seqlen=max(seqlens), + min_seqlen=min(seqlens), + seqstart=torch.tensor(seqstart_py, dtype=torch.int32), + seqstart_py=seqstart_py, + padding=padding, + ) + + def split( + self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None + ) -> List[torch.Tensor]: + raise NotImplementedError("_PaddedSeqLenInfo.split") + + +@dataclass +class BlockDiagonalMask(AttentionBias): + """ + A block-diagonal mask that can be passed as ``attn_bias`` + argument to :attr:`xformers.ops.memory_efficient_attention`. + + Queries and Keys are each divided into the same number of blocks. + Queries in block i only attend to keys in block i. + + .. figure:: /_static/block_diag_bias.png + + This bias can be used to handle a batch of sequences of + different lengths, via :attr:`BlockDiagonalMask.from_tensor_list` + + :Example: + + .. 
code-block:: python + + import torch + from xformers.ops import fmha + + K = 16 + dtype = torch.float16 + device = "cuda" + list_x = [ + torch.randn([1, 3, 1, K], dtype=dtype, device=device), + torch.randn([1, 6, 1, K], dtype=dtype, device=device), + torch.randn([1, 2, 1, K], dtype=dtype, device=device), + ] + attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) + linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) + + q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) + out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) + list_out = attn_bias.split(out) + print(list_out[0].shape) # [1, 3, 1, K] + assert tuple(list_out[0].shape) == (1, 3, 1, K) + + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _SeqLenInfo + _batch_sizes: Optional[Sequence[int]] = None + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return torch.zeros( + shape, + dtype=dtype, + device=device, + ) + + def materialize( + self, + shape: Optional[Tuple[int, ...]] = None, + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape is None: + shape = (self.q_seqinfo.seqstart_py[-1], + self.k_seqinfo.seqstart_py[-1]) + assert shape[-1] == self.k_seqinfo.seqstart_py[-1], ( + shape[-1], + self.k_seqinfo.seqstart_py[-1], + ) + assert shape[-2] == self.q_seqinfo.seqstart_py[-1], ( + shape[-2], + self.q_seqinfo.seqstart_py[-1], + ) + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_seqlen: Optional[Sequence[int]] = None, + ) -> "BlockDiagonalMask": + """Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value. + + Args: + q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors + kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value. + (Defaults to ``q_seqlen``.) + Returns: + BlockDiagonalMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + if kv_seqlen is None or q_seqlen == kv_seqlen: + k_seqinfo = q_seqinfo + else: + k_seqinfo = _SeqLenInfo.from_seqlens(kv_seqlen) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + @classmethod + def from_tensor_list( + cls, + tensors: Sequence[torch.Tensor], + ) -> Tuple["BlockDiagonalMask", torch.Tensor]: + """Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors + concatenated on the sequence length dimension + + .. figure:: /_static/block_diag_cat_split.png + + See also :attr:`BlockDiagonalMask.split` to split the returned + :attr:`torch.Tensor` back to a list of tensors of varying sequence length + + Args: + tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``. + All tensors should have the same dimension and the same batch size ``B``, but + they can have different sequence length ``M``. 
+ + Returns: + Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention + along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]`` + """ + batch_sizes = [tensor.shape[0] for tensor in tensors] + seqlens = [] + for x in tensors: + for _ in range(x.shape[0]): + seqlens.append(x.shape[1]) + block_diag = cls.from_seqlens(seqlens) + block_diag._batch_sizes = batch_sizes + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in tensors) + concat_tensors = torch.cat(tensors_bs1, dim=1) + return block_diag, concat_tensors + + @classmethod + def from_tensor_lists_qkv( + cls, + tensors_q: Sequence[torch.Tensor], + tensors_k: Sequence[torch.Tensor], + tensors_v: Optional[Sequence[torch.Tensor]] = None, + ) -> Tuple["BlockDiagonalMask", torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert len(tensors_q) == len(tensors_k) + assert tensors_v is None or len(tensors_v) == len(tensors_q) + batch_sizes = [tensor.shape[0] for tensor in tensors_q] + q_seqlens, kv_seqlens = [], [] + for i, (q, k) in enumerate(zip(tensors_q, tensors_k)): + assert q.shape[0] == k.shape[0] + q_seqlens += [q.shape[1]] * q.shape[0] + kv_seqlens += [k.shape[1]] * k.shape[0] + assert tensors_v is None or tensors_v[i].shape[:2] == k.shape[:2] + block_diag = cls.from_seqlens(q_seqlens, kv_seqlens) + block_diag._batch_sizes = batch_sizes + return ( + block_diag, + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_q], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_k], dim=1), + torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_v], dim=1) + if tensors_v is not None + else None, + ) + + def split_queries(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def split_kv(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + return self.k_seqinfo.split(tensor, self._batch_sizes) + + def split(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: + """The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list` + + Args: + tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]`` + + Returns: + Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths + """ + assert self.q_seqinfo is self.k_seqinfo + return self.q_seqinfo.split(tensor, self._batch_sizes) + + def make_causal(self) -> "BlockDiagonalCausalMask": + """Makes each block causal""" + return BlockDiagonalCausalMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_causal_from_bottomright(self) -> "BlockDiagonalCausalFromBottomRightMask": + """Makes each block causal with a possible non-causal prefix""" + return BlockDiagonalCausalFromBottomRightMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + ) + + def make_local_attention( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionMask": + """Experimental: Makes each block causal with local attention""" + return BlockDiagonalCausalLocalAttentionMask( + q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + def make_local_attention_from_bottomright( + self, window_size: int + ) -> "BlockDiagonalCausalLocalAttentionFromBottomRightMask": + """Experimental: Makes each block causal with local attention, start from bottom right""" + return BlockDiagonalCausalLocalAttentionFromBottomRightMask( + 
q_seqinfo=self.q_seqinfo, + k_seqinfo=self.k_seqinfo, + _batch_sizes=self._batch_sizes, + _window_size=window_size, + ) + + +@dataclass +class BlockDiagonalCausalMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + + Queries and Keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which is farther from the initial key in block i than Q + is from the initial query in block i. + """ + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + return LowerTriangularMask().materialize( + shape, + dtype=dtype, + device=device, + ) + + +@dataclass +class BlockDiagonalCausalFromBottomRightMask(BlockDiagonalMask): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. + This mask allows for a non-causal prefix + NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not + defined (softmax of vector of `-inf` in the attention) + + Queries and keys are each divided into the same number of blocks. + A query Q in block i cannot attend to a key which is not in block i, + nor one which nearer the final key in block i than Q is to the + final query in block i. + """ + + def __post_init__(self) -> None: + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + num_queries = q_end - q_start + num_keys = k_end - k_start + if num_keys < num_queries: + raise ValueError( + f"Block #{i} has num_keys={num_keys} and num_queries={num_queries}." + " Expected `num_keys >= num_queries`" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=num_keys - num_queries + 1).to(dtype) # type: ignore + + +@dataclass +class BlockDiagonalCausalWithOffsetPaddedKeysMask(AttentionBias): + """ + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`, + except an offset on causality is allowed for each block and we support padding for k/v + + The keys and values are divided into blocks which are padded out to + the same total length. + For example, if there is space for 12 keys, for three blocks of + max length 4, but we only want to use the first 2, 3 and 2 + of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`. + The queries are divided into blocks, without padding, of lengths given by + q_seqlen. + + A query Q in block i cannot attend to a key which is not in block i, + nor one which is not in use (i.e. in the padded area), + nor one which is nearer to the final key in block i + than Q is to the final query in block i. + """ + + q_seqinfo: _SeqLenInfo + k_seqinfo: _PaddedSeqLenInfo + causal_diagonal: Any = None # unused. Exists for BC only. 
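The padded key/value bookkeeping described in the docstring above can be checked with a few lines of plain Python. This sketch only mirrors what _PaddedSeqLenInfo.from_seqlens_padded computes for the docstring example; it is illustrative and does not import the class.

    from typing import List, Tuple

    def padded_intervals(seqlens: List[int], padding: int) -> List[Tuple[int, int]]:
        # Block i starts at i * padding; only the first seqlens[i] positions
        # inside the block are in use, the rest is padding.
        assert all(s <= padding for s in seqlens)
        seqstart = list(range(0, len(seqlens) * padding + 1, padding))
        return [(start, start + length) for start, length in zip(seqstart, seqlens)]

    # The docstring example: three blocks of max length 4 holding 2, 3 and 2 keys.
    print(padded_intervals([2, 3, 2], padding=4))  # [(0, 2), (4, 7), (8, 10)]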
+ + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=float("-inf"), + device=device, + ) + num_queries, num_keys = shape[-2:] + return torch.triu(tensor, diagonal=1 + num_keys - num_queries).to(dtype) # type: ignore + + def materialize( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + """Materialize the attention bias - for debugging & testing""" + if shape[-1] != self.k_seqinfo.seqstart_py[-1]: + raise ValueError("k shapes wrong") + if shape[-2] != self.q_seqinfo.seqstart_py[-1]: + raise ValueError("q shapes wrong") + mask = torch.empty(shape[-2:], dtype=dtype, device=device) + mask.fill_(-math.inf) + for i, ((q_start, q_end), (k_start, k_end)) in enumerate( + zip( + self.q_seqinfo.intervals(), + self.k_seqinfo.intervals(), + ) + ): + mask[q_start:q_end, k_start:k_end] = self._create_block_mask( + (q_end - q_start, k_end - k_start), + dtype=dtype, + device=device, + ) + for _ in range(len(shape) - 2): + mask = mask.unsqueeze(0) + return mask.expand(shape) + + @classmethod + def from_seqlens( + cls, + q_seqlen: Sequence[int], + kv_padding: int, + kv_seqlen: Sequence[int], + causal_diagonal: Any = None, + ) -> "BlockDiagonalCausalWithOffsetPaddedKeysMask": + """Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor + lengths for query and key/value. + + Args: + q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors + kv_padding (int): Padding for k/v - also an upperbound on each individual key length + kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value. + causal_diagonal: unused, for BC only + Returns: + BlockDiagonalCausalWithOffsetPaddedKeysMask + """ + assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen), ( + q_seqlen, + kv_seqlen, + ) + q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) + k_seqinfo = _PaddedSeqLenInfo.from_seqlens_padded(kv_seqlen, kv_padding) + return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) + + +@dataclass +class BlockDiagonalCausalLocalAttentionMask(BlockDiagonalCausalMask): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + q_seqlen = [ + y - x + for x, y in zip( + self.q_seqinfo.seqstart_py[:-1], self.q_seqinfo.seqstart_py[1:] + ) + ] + kv_seqlen = [ + y - x + for x, y in zip( + self.k_seqinfo.seqstart_py[:-1], self.k_seqinfo.seqstart_py[1:] + ) + ] + for q, k in zip(q_seqlen, kv_seqlen): + if q - self._window_size >= k: + # Each query only attends to keys no further than window_size back. + # When q > k + window_size, there will be a query for which the window doesn't reach any key. 
+ raise RuntimeError( + f"No keys are attended in q_seqlen {q} k_seqlen {k} with sliding window {self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=0).to(dtype) # type: ignore + if self._window_size is not None and self._window_size > 0: + mask = torch.triu(mask, diagonal=-self._window_size + 1) + mask = torch.log(mask) + return mask.to(dtype) + + +@dataclass +class BlockDiagonalCausalLocalAttentionFromBottomRightMask( + BlockDiagonalCausalFromBottomRightMask +): + """ + (Experimental feature) + Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. + This makes the mask "local" and the attention pattern banded. + + Query i only attends to keys in its block and cannot attend keys further than "window_size" + from it. + """ + + _window_size: int = 0 # forced due to inheritance and default arguments + + def __post_init__(self): + super().__post_init__() + if self._window_size <= 0: + raise ValueError( + f"Expected `window_size > 0`, but window_size={self._window_size}" + ) + + def _create_block_mask( + self, + shape: Tuple[int, ...], + dtype: torch.dtype = torch.float32, + device: Union[str, torch.device] = "cpu", + ) -> torch.Tensor: + create_as = dtype if dtype is not torch.bfloat16 else torch.float32 + tensor = torch.full( # type: ignore + shape, + dtype=create_as, + fill_value=1, + device=device, + ) + num_queries, num_keys = shape[-2:] + mask = torch.tril(tensor, diagonal=num_keys - num_queries).to(dtype) # type: ignore + if self._window_size is not None: + mask = torch.triu( + mask, diagonal=num_keys - num_queries - self._window_size + 1 + ) + mask = torch.log(mask) + return mask.to(dtype) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py new file mode 100644 index 0000000000000..759fdb65e08ed --- /dev/null +++ b/vllm/hpu/cache_ops.py @@ -0,0 +1,82 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+###############################################################################
+
+from typing import Tuple
+import torch
+import habana_frameworks.torch as htorch
+
+
+def pad_to_full_block(data, block_size, pad_value):
+    seq_dim = 1
+    pad_shape = list(data.shape)
+    remainder = pad_shape[seq_dim] % block_size
+    if remainder == 0:
+        return data
+    pad_shape[seq_dim] = block_size - remainder
+    pad = torch.full(pad_shape, pad_value, dtype=data.dtype, device=data.device)
+    return torch.cat([data, pad], dim=seq_dim)
+
+
+def initialize_cache(data, indices, cache):
+    block_size = cache.size(-1)
+    data = data.unflatten(0, (-1, block_size)).permute(0, 2, 3, 1)
+    indices = indices.unflatten(0, (-1, block_size))[:, 0]
+    cache.index_copy_(0, indices, data)
+
+
+def update_cache(data, indices, offsets, cache):
+    prev = cache.index_select(0, indices)
+    idx = offsets.view(-1, 1, 1, 1).expand(-1, data.size(1), data.size(2), -1)
+    prev.scatter_(-1, idx, data.unsqueeze(-1))
+    cache.index_copy_(0, indices, prev)
+
+
+def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt):
+    block_size = key_cache.size(-1)
+    assert slot_mapping.dim() == 2, 'This implementation requires unflattened slot_mapping!'
+
+    if is_prompt:
+        block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+        batch_size, seq_length = block_indices.shape
+        key = pad_to_full_block(key.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1)
+        value = pad_to_full_block(value.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1)
+        block_indices = pad_to_full_block(block_indices, block_size, -1).flatten(0, 1)
+        initialize_cache(key, block_indices, key_cache)
+        initialize_cache(value, block_indices, value_cache)
+    else:
+        slot_mapping = slot_mapping.flatten()
+        block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
+        block_offsets = torch.fmod(slot_mapping, block_size)
+        update_cache(key, block_indices, block_offsets, key_cache)
+        update_cache(value, block_indices, block_offsets, value_cache)
+
+
+def swap_blocks(src, dst, block_mapping):
+    # The index tensors must live on the same device as the tensors they index.
+    index_src = torch.zeros((1,), dtype=torch.int32, device=src.device)
+    index_dst = torch.zeros((1,), dtype=torch.int32, device=dst.device)
+    for src_idx, dst_idx in block_mapping.items():
+        index_src[0] = src_idx
+        index_dst[0] = dst_idx
+        dst.index_put_([index_dst], src.index_select(0, index_src))
+    if dst.device.type == 'hpu':
+        htorch.core.mark_step()
+        torch.hpu.synchronize()
+
+
+def copy_blocks(key_caches, value_caches, block_mapping):
+    index_src = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device)
+    index_dst = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device)
+    for src, dsts in block_mapping.items():
+        index_src[0] = src
+        for dst in dsts:
+            index_dst[0] = dst
+            for key_cache in key_caches:
+                key_cache.index_copy_(0, index_dst, key_cache.index_select(0, index_src))
+            for value_cache in value_caches:
+                value_cache.index_copy_(0, index_dst, value_cache.index_select(0, index_src))
+    if key_caches[0].device.type == 'hpu':
+        htorch.core.mark_step()
diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py
new file mode 100644
index 0000000000000..d95b301697cea
--- /dev/null
+++ b/vllm/hpu/ops.py
@@ -0,0 +1,115 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd.
an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### +import os +import torch +import torch.nn as nn +import torch.nn.functional as F +import habana_frameworks.torch as htorch +from typing import List, Optional, Tuple + +import vllm.hpu.utils as hpu_utils + +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '0') == '1') + + +def silu_and_mul(output, input): + d = input.shape[-1] // 2 + silu = torch.nn.SiLU().to(input.device) + x, y = torch.split(input, d, dim=-1) + output.copy_(silu(x) * y) + + +def gelu_new(output, input): + raise NotImplementedError + + +def gelu_fast(output, input): + raise NotImplementedError + + +def fetch_from_cache(cache, blocks): + return [cache.index_select(0, blocks[:, i]) for i in range(blocks.size(1))] + + +@hpu_utils.with_mark_steps +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, kv_cache_dtype=None) -> None: + seq_len = block_tables.size(1) + batch_size, query_heads, _ = query.shape + _, kv_heads, _, _ = key_cache.shape + min_inf = torch.finfo(query.dtype).min + mask = (torch.arange(0, seq_len * block_size, dtype=torch.int32, device=key_cache.device) + .view(1, -1) + .expand(batch_size, -1) + .ge(context_lens.view(-1, 1)) + .view(batch_size, 1, 1, -1)) + query = query.unsqueeze(-2) + keys = fetch_from_cache(key_cache, block_tables) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] + mask = mask.unsqueeze(2) + + attn_weights = [torch.matmul(query, k) for k in keys] + attn_weights = (torch.cat(attn_weights, dim=-1) + .mul_(scale) + .masked_fill(mask, min_inf) + .softmax(dim=-1)) + + values = fetch_from_cache(value_cache, block_tables) + if PA_SPLIT_VALUE: + attn_weights = attn_weights.split(block_size, dim=-1) + else: + values = [torch.cat(values, dim=-1)] + attn_weights = [attn_weights] + if query_heads != kv_heads: + values = [v.unflatten(1, (kv_heads, 1)) for v in values] + attn_weights = [torch.matmul(a, v.transpose(-1, -2)).squeeze(-2) for a, v in zip(attn_weights, values)] + if query_heads != kv_heads: + attn_weights = [a.flatten(1, 2) for a in attn_weights] + attn_weights = sum(attn_weights) + + return attn_weights + + +def rms_norm(out, hidden_states, weight, eps): + htorch.core.mark_step() + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + eps) + out.copy_(weight * hidden_states.to(input_dtype)) + htorch.core.mark_step() + + +def rotate_neox(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def rotate_gptj(x: torch.Tensor) -> torch.Tensor: + x1 = x[..., ::2] + x2 = x[..., 1::2] + x = torch.stack((-x2, x1), dim=-1) + return x.flatten(-2) + + +def apply_rope( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + is_neox_style: bool, +) -> Tuple[torch.Tensor, torch.Tensor]: + rotate_fn = rotate_neox if is_neox_style else rotate_gptj + q_embed = (q * cos) + (rotate_fn(q) * sin) + k_embed = (k * cos) + (rotate_fn(k) * sin) + return q_embed, k_embed + + +def awq_gemm(*args): + raise NotImplementedError diff --git a/vllm/hpu/rotary_embed.py 
b/vllm/hpu/rotary_embed.py new file mode 100644 index 0000000000000..30f96153cd4a2 --- /dev/null +++ b/vllm/hpu/rotary_embed.py @@ -0,0 +1,119 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### + +import torch +import torch.nn as nn +import habana_frameworks.torch.utils.experimental as htexp + +def get_device_type(): + return htexp._get_device_type() + +def is_gaudi1(): + return get_device_type() == htexp.synDeviceType.synDeviceGaudi + +def is_gaudi2(): + return get_device_type() == htexp.synDeviceType.synDeviceGaudi2 + +def is_gaudi3(): + return get_device_type() == htexp.synDeviceType.synDeviceGaudi3 + +# TODO: remove this workaround when FusedRoPE properly works on Gaudi +if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): + try: + from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE + except ImportError: + print("Not using HPU fused kernel for apply_rotary_pos_emb") + FusedRoPE = None +else: + FusedRoPE = None + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) + sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class HpuRotaryEmbedding(nn.Module): + def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='hpu'): + super().__init__() + + self.head_size = head_size + self.dim = rotary_dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + if query.dim() == 2: + query = query.unsqueeze(0) + if key.dim() == 2: + key = key.unsqueeze(0) + if positions.dim() == 1: + positions = positions.unsqueeze(0) + seq_len = key.shape[-2] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + if query.device.type == "hpu" and FusedRoPE: + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + else: + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + else: + query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py new file mode 100644 index 0000000000000..8d7f388cf262a --- /dev/null +++ b/vllm/hpu/utils.py @@ -0,0 +1,99 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+############################################################################### + +import habana_frameworks.torch as htorch + +def with_mark_steps(fn): + def wrapped(*args, **kwargs): + htorch.core.mark_step() + result = fn(*args, **kwargs) + del args + del kwargs + htorch.core.mark_step() + return result + return wrapped + + +def profile_reicpes(recipe_names): + from pathlib import Path + import numpy as np + import matplotlib.pyplot as plt + from sklearn.metrics import ConfusionMatrixDisplay + import tqdm + recipe_names_short = [name.replace('.graph_dumps/HabanaFusedOpLazy_', '') for name in recipe_names] + recipes = [Path(Path.cwd().joinpath(name + '-PostGraph-symbol.pbtxt')).open('r').read() for name in recipe_names] + + def generic_similarity_backend(recipes, similarity_func, backend_name=''): + num_recipes = len(recipes) + sim_tri = np.zeros((num_recipes, num_recipes)) + total = (num_recipes * (num_recipes + 1)) // 2 - num_recipes + backend_txt = f' with {backend_name}' if backend_name != '' else '' + with tqdm.tqdm(total=total, desc=f" computing similarity matrix{backend_txt}") as pbar: + for i in range(num_recipes): + for j in range(i): + sim_tri[i,j] = similarity_func(recipes[i], recipes[j]) + pbar.update(1) + sim = sim_tri.T + sim_tri + sim_idx = np.arange(sim_tri.shape[0]) + sim[sim_idx,sim_idx] = 1 + return sim + + def cosine_similarity_rad_backend(recipes): + from strsimpy.cosine import Cosine + s = Cosine(2) + return generic_similarity_backend(recipes, s.similarity, "Cosine (rad)"), "cosine similarity, 1 = max similarity" + + def cosine_similarity_deg_backend(recipes): + from strsimpy.cosine import Cosine + s = Cosine(2) + rad = generic_similarity_backend(recipes, s.similarity, "cosine similarity") + deg = np.degrees(np.arccos(rad)) + return deg, "cosine similarity (deviation in deg, 0 = max similarity)" + + def overlap_coefficient_backend(recipes): + from strsimpy.overlap_coefficient import OverlapCoefficient + s = OverlapCoefficient(2) + return generic_similarity_backend(recipes, s.similarity, OverlapCoefficient.__name__), OverlapCoefficient.__name__ + + def normalized_levenshtein_backend(recipes): + from strsimpy.normalized_levenshtein import NormalizedLevenshtein + s = NormalizedLevenshtein() + return generic_similarity_backend(recipes, s.similarity, NormalizedLevenshtein.__name__), NormalizedLevenshtein.__name__ + + def jaro_winkler_backend(recipes): + from strsimpy.jaro_winkler import JaroWinkler + s = JaroWinkler() + return generic_similarity_backend(recipes, s.similarity, JaroWinkler.__name__), JaroWinkler.__name__ + + def tfidf_weird_backend(recipes): + def tfidf_single_elem(x,y): + from sklearn.feature_extraction.text import TfidfVectorizer + vect = TfidfVectorizer() + tfidf = vect.fit_transform([x,y]) + sim_sparse = tfidf * tfidf.T + sim = sim_sparse.toarray() + return sim[0,1] + return generic_similarity_backend(recipes, tfidf_single_elem, 'TfidfVectorizer (weird)'), 'TfidfVectorizer (weird)' + + def tfidf_backend(recipes): + from sklearn.feature_extraction.text import TfidfVectorizer + vect = TfidfVectorizer() + tfidf = vect.fit_transform(recipes) + sim_sparse = tfidf * tfidf.T + sim = sim_sparse.toarray() + return sim, 'TfidfVectorizer' + + sim, backend_name = tfidf_backend(recipes) + plt.rcParams["figure.figsize"] = [16,16] + plt.rcParams["figure.dpi"] = 300 + cm = ConfusionMatrixDisplay(sim, display_labels=recipe_names_short) + cm.plot(xticks_rotation='vertical', text_kw={"fontsize":5}) + cm.ax_.set_xlabel("Target recipe number") + cm.ax_.set_ylabel("Source 
recipe number") + plt.title(f'Recipe similarity ({backend_name})') + return plt +# plt.savefig('similarity.png') \ No newline at end of file diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py new file mode 100644 index 0000000000000..c9d237744a917 --- /dev/null +++ b/vllm/hpu/xops.py @@ -0,0 +1,66 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### + +import habana_frameworks.torch as htorch +import torch +import torch.nn.functional as F +from typing import List, Optional, Tuple, Union +from .attn_bias import AttentionBias, BlockDiagonalCausalMask + +try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA +except ImportError: + print("Not using HPU fused scaled dot-product attention kernel.") + FusedSDPA = None + +def memory_efficient_attention_forward( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + assert attn_bias is not None, "Attention mask is required for prompt processing" + dim = query.dim() + is_causal = isinstance(attn_bias, BlockDiagonalCausalMask) + if FusedSDPA and (is_causal or attn_bias is None): + bs = query.shape[0] + seq_len_q = query.shape[1] + seq_len_kv = key.shape[1] + heads = query.shape[-2] if dim != 5 else query.shape[-3] + attn_groups = 1 if dim != 5 else query.shape[-2] + head_dim = query.shape[-1] + if dim == 4: + # [bs, seq_len, 1, heads, head_dim] -> [bs, heads, seq_len, head_dim] + query = query.reshape(bs, seq_len_q, heads, head_dim).permute(0, 2, 1, 3) + key = key.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) + value = value.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) + elif dim == 5: + # [bs, seq_len, heads, attn_groups, head_dim] -> [bs, heads, attn_groups, seq_len, head_dim] + query = query.reshape(bs, seq_len_q, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) + key = key.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) + value = value.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) + else: + raise ValueError(f"Unsupported attention dimension: {dim}") + + import habana_frameworks.torch.hpu as ht + with ht.sdp_kernel(enable_recompute=False): # (flash_attention_recompute and q_len == 1)): + out = FusedSDPA.apply( + query, key, value, None, p, is_causal, scale + ) + htorch.core.mark_step() + if dim == 4: + # [bs, heads, seq_len, head_dim] -> [bs, seq_len, heads, head_dim] + out = out.permute(0, 2, 1, 3).reshape(bs, seq_len_q, heads, head_dim) + elif dim == 5: + # [bs, heads, attn_groups, seq_len, head_dim] -> [bs, seq_len, heads, attn_groups, head_dim] + out = out.permute(0, 3, 1, 2, 4).reshape(bs, seq_len_q, heads, attn_groups, head_dim) + else: + raise NotImplementedError(f'Only FusedSDPA causal or non-masked attention is supported.\nFusedSDPA support: {FusedSDPA is not None}\nis_causal: {is_causal}\nmask_present: {attn_bias is not None}') + + return out diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index f569a5a49cbdf..4435748899af2 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -6,7 +6,11 @@ import torch.nn as nn import 
torch.nn.functional as F -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index cb3cee2bad5ad..e194905f68770 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,8 +4,18 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops +if is_hpu(): + try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm + except ImportError: + print("Not using HPU fused kernel for RMSNorm") + FusedRMSNorm = None class RMSNorm(nn.Module): """Root mean square normalization. @@ -49,6 +59,13 @@ def forward( residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: if residual is not None: + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: FusedRMSNorm requires 3D tensors as inputs + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype).view(orig_shape), residual ops.fused_add_rms_norm( x, residual, @@ -56,6 +73,10 @@ def forward( self.variance_epsilon, ) return x, residual + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype) out = torch.empty_like(x) ops.rms_norm( out, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 28e8f6bb7e638..6661164b0b53f 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -5,9 +5,9 @@ import torch.nn as nn from vllm.model_executor.parallel_utils.communication_op import ( - tensor_model_parallel_gather) + tensor_model_parallel_gather, tensor_model_parallel_all_gather) from vllm.model_executor.sampling_metadata import SamplingMetadata - +from vllm.utils import is_hpu class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. @@ -51,7 +51,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - if logits is not None: + if logits is not None and sampling_metadata.perform_sampling: logits *= self.scale # Apply logits processors (if any). @@ -65,7 +65,9 @@ def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, logits = torch.matmul(hidden_states, embedding.t()) if embedding_bias is not None: logits += embedding_bias - logits = tensor_model_parallel_gather(logits) + # NOTE(kzawora): HPU PT bridge is missing support for single-rank gather. We'll use all-gather on Gaudi for now. + gather_op = tensor_model_parallel_all_gather if is_hpu() else tensor_model_parallel_gather + logits = gather_op(logits) # Remove paddings in vocab (if any). 
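The gather-to-all-gather swap in _get_logits above can be illustrated without a distributed setup. The snippet below simulates two tensor-parallel vocab shards with plain tensors, uses torch.cat as a stand-in for tensor_model_parallel_all_gather, and then drops the vocab padding the same way the hunk does; all sizes are toy values.

    import torch

    # Each of 2 "ranks" computes logits for its shard of the padded vocab.
    org_vocab_size, padded_vocab_size, tp = 10, 12, 2
    hidden = torch.randn(3, 8)  # 3 tokens, hidden size 8
    shards = [torch.randn(8, padded_vocab_size // tp) for _ in range(tp)]

    # All-gather concatenates the per-rank results along the vocab dim on
    # every rank; a rank-0 gather would instead return None on other ranks.
    logits = torch.cat([hidden @ w for w in shards], dim=-1)  # [3, 12]

    # Remove paddings in vocab, as done right after the gather.
    logits = logits[:, :org_vocab_size]  # [3, 10]
    print(logits.shape)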
if logits is not None: logits = logits[:, :self.org_vocab_size] diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 2caef5f1ebf50..633b094903a8f 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 53baf710ed811..07471a0856c7a 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -6,7 +6,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 784229878edf4..9803e693b7484 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -3,7 +3,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index ed25455e6ec1f..d2813e8d31265 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -3,7 +3,12 @@ import torch from torch.nn.parameter import Parameter -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops +else: + from vllm._C import ops + from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import ( diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d80e73bbe39e9..9cd243ea47a77 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -27,7 +27,12 @@ import torch import torch.nn as nn -from vllm._C import ops +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import ops + from vllm.hpu.rotary_embed import HpuRotaryEmbedding +else: + from vllm._C import ops def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -353,8 +358,12 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + if is_hpu(): + rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) + else: + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style) else: scaling_type = rope_scaling["type"] scaling_factor = rope_scaling["factor"] 
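A compact sketch of the rotary-embedding math that both HpuRotaryEmbedding and the non-fused apply_rotary_pos_emb fallback implement (NeoX-style rotation). Shapes are toy-sized and this is plain PyTorch, not the fused HPU kernel.

    import torch

    def rotate_half(x: torch.Tensor) -> torch.Tensor:
        # Swap the two halves of the last dim and negate the second half.
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)

    def build_cos_sin(seq_len: int, dim: int, base: float = 10000.0):
        # Same cache construction HpuRotaryEmbedding uses (_set_cos_sin_cache).
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        t = torch.arange(seq_len).float()
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)  # [seq_len, dim]
        return emb.cos(), emb.sin()

    # Apply NeoX-style RoPE to toy query/key tensors of shape [seq_len, dim].
    seq_len, dim = 4, 8
    cos, sin = build_cos_sin(seq_len, dim)
    q = torch.randn(seq_len, dim)
    k = torch.randn(seq_len, dim)
    q_rot = q * cos + rotate_half(q) * sin
    k_rot = k * cos + rotate_half(k) * sin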
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2cd56f0ce59d8..5d637e1585f55 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -216,7 +216,6 @@ def forward( kv_cache=kv_cache, attn_metadata=attn_metadata, ) - # Fully Connected hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py index 04b30b4d093d7..382d884f0315e 100644 --- a/vllm/model_executor/parallel_utils/communication_op.py +++ b/vllm/model_executor/parallel_utils/communication_op.py @@ -10,7 +10,7 @@ from vllm.model_executor.parallel_utils.parallel_state import ( get_tensor_model_parallel_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, is_cupy_nccl_enabled_for_all_reduce) - +from vllm.utils import is_hpu def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group. @@ -161,9 +161,10 @@ def broadcast_tensor_dict( metadata_list = [] for key, value in tensor_dict.items(): if isinstance(value, torch.Tensor): - assert value.is_cuda, ( - f"Tensor {key}: {value} is not on cuda. Currently we only " - f"support broadcasting tensors on cuda.") + if not is_hpu(): + assert value.is_cuda, ( + f"Tensor {key}: {value} is not on cuda or HPU. Currently we only " + f"support broadcasting tensors on cuda or HPU.") metadata_list.append( (key, TensorMetadata(value.dtype, value.size()))) else: @@ -187,7 +188,7 @@ def broadcast_tensor_dict( if isinstance(value, TensorMetadata): tensor = torch.empty(value.size, dtype=value.dtype, - device="cuda") + device="hpu" if is_hpu() else "cuda") async_handle = torch.distributed.broadcast(tensor, src=src, async_op=True, diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 534cb75c2fd2f..9e343d11b151d 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData -from vllm.utils import is_pin_memory_available +from vllm.utils import is_pin_memory_available, is_hpu _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 @@ -269,19 +269,19 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sample_indices_t = torch.tensor( sample_indices, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ) prompt_tensor = torch.tensor( prompt_padded_tokens, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ) output_tensor = torch.tensor( output_padded_tokens, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ) # need to transpose and make contiguous to @@ -290,7 +290,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sampling_seeds_t = torch.tensor( sampling_seeds, device="cpu", - dtype=torch.long, + dtype=torch.int, pin_memory=pin_memory, ).T.contiguous() @@ -339,7 +339,7 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max + lo, hi = torch.iinfo(torch.int).min, torch.iinfo(torch.int).max # If the user/random sets seed = 0 but request should # have sampling, we need 
to change it to something # else. We use a constant in that case. diff --git a/vllm/utils.py b/vllm/utils.py index 4b9558ffe88d8..941be08047967 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -6,6 +6,7 @@ import subprocess import uuid import warnings +import importlib from collections import OrderedDict from functools import lru_cache, partial from platform import uname @@ -125,6 +126,9 @@ def is_neuron() -> bool: transformers_neuronx = None return transformers_neuronx is not None +@lru_cache(maxsize=None) +def is_hpu() -> bool: + return importlib.util.find_spec('habana_frameworks') is not None @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: @@ -350,6 +354,9 @@ def is_pin_memory_available() -> bool: elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False + elif is_hpu(): + print_warning_once("Pin memory is not supported on HPU.") + return False return True @@ -377,6 +384,52 @@ def __exit__(self, exc_type, exc_val, exc_tb): gc.collect() + + +class HabanaMemoryProfiler: + + def __init__(self, device=None): + self.device = device + + def current_memory_usage() -> float: + # Return the memory usage in bytes. + free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() + return total_hpu_memory - free_hpu_memory + + def current_free_memory() -> float: + # Return the memory usage in bytes. + free_hpu_memory, _ = torch.hpu.mem_get_info() + return free_hpu_memory + + def total_memory() -> float: + # Return the memory usage in bytes. + _, total_hpu_memory = torch.hpu.mem_get_info() + return total_hpu_memory + + def __enter__(self): + self.initial_memory = HabanaMemoryProfiler.current_memory_usage() + # This allows us to call methods of the context manager if needed + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.final_memory = HabanaMemoryProfiler.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory + + # Force garbage collection + gc.collect() + +# Adapted from https://stackoverflow.com/a/49361727 +def format_bytes(size): + # 2**10 = 1024 + power = 2**10 + n = 0 + power_labels = {0 : '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} + while abs(size) > power: + size /= power + n += 1 + return f'{size:.4g} {power_labels[n]+"B"}' + + def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]: assert len(x) <= max_len return x + [pad] * (max_len - len(x)) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 27d1727cd16a3..eb21a31842533 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,10 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available, is_hpu + +if is_hpu(): + import habana_frameworks.torch as htorch logger = init_logger(__name__) @@ -46,7 +49,7 @@ def __init__( self.attn_backend = get_attn_backend(model_config.dtype) # Initialize the cache. 
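A usage sketch for the HabanaMemoryProfiler and format_bytes helpers added to vllm/utils.py above. format_bytes is pure Python; the profiler reads HPU memory counters, so that part needs habana_frameworks and an HPU device and is guarded with is_hpu() here.

    from vllm.utils import HabanaMemoryProfiler, format_bytes, is_hpu

    # format_bytes can be sanity-checked anywhere:
    print(format_bytes(1536))          # -> 1.5 KiB
    print(format_bytes(2 * 1024**3))   # -> 2 GiB

    # HabanaMemoryProfiler snapshots device memory on enter/exit and exposes
    # the delta as `consumed_memory`.
    if is_hpu():
        with HabanaMemoryProfiler() as m:
            pass  # e.g. load model weights or allocate the KV cache here
        print(f"consumed {format_bytes(m.consumed_memory)}")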
- self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda") + self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "hpu" if is_hpu() else "cuda") self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") def _allocate_kv_cache( @@ -60,11 +63,21 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_layers): - kv_cache.append( - torch.empty(kv_cache_shape, - dtype=self.dtype, - pin_memory=pin_memory, - device=device)) + if device == 'hpu': + key_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + value_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + kv_layer = (key_cache, value_cache) + kv_cache.append(kv_layer) + else: + kv_layer = torch.empty(kv_cache_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device) + kv_cache.append(kv_layer) return kv_cache def swap_in(self, src_to_dst: Dict[int, int]) -> None: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py new file mode 100644 index 0000000000000..7ced639a7fb03 --- /dev/null +++ b/vllm/worker/habana_model_runner.py @@ -0,0 +1,1168 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import contextlib +import time +from typing import Dict, List, Optional, Set, Tuple + +# for logging hpugraph capture +import tqdm +import pandas as pd +import tabulate + +import os +import contextlib +import math +import itertools +import numpy as np +import torch +import torch.nn as nn +import habana_frameworks.torch as htorch +from habana_frameworks.torch.hpu.metrics import metric_localcontext + +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig) +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.model_loader import get_model +from vllm.model_executor.parallel_utils import cupy_utils, custom_all_reduce +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.parallel_state import ( + with_cupy_nccl_for_all_reduce) +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.utils import (HabanaMemoryProfiler, async_tensor_h2d, + is_pin_memory_available, make_tensor_with_pad, + maybe_expand_dim, pad_to_max_length, format_bytes) + +logger = init_logger(__name__) + +_PAD_SLOT_ID = -1 +LORA_WARMUP_RANK = 8 +_BATCH_SIZE_ALIGNMENT = 16 +# Capture graphs for token size 1, 2, 4, 8, 16, 32, 48, ..., 512. +# NOTE: _get_graph_batch_size needs to be updated if this list is changed. +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4, 8] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) +] + +# Capture graphs for token size 1, 32, 64, 128, 256, 512, 768 ... 
2048 +_MAX_CONTEXT_LEN_ALIGNMENT = 256 +_MAX_CONTEXT_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ + _MAX_CONTEXT_LEN_ALIGNMENT * i for i in range(1, 9) +] + + +class HabanaModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + lora_config: Optional[LoRAConfig], + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + + # model_config can be None in tests/samplers/test_sampler.py. + # FIXME(woosuk): This is a hack to make the tests work. Refactor this. + self.sliding_window = (model_config.get_sliding_window() + if model_config is not None else None) + self.device_config = (device_config + if device_config is not None else DeviceConfig()) + self.device = self.device_config.device + + self.model = None + self.block_size = None # Set after initial profiling. + self.lora_manager = None + self.graph_runner_class = HPUGraphRunner + self.graph_runners: Dict[Tuple[int, int], self.graph_runner_class] = {} + + self.max_context_len_to_capture = ( + self.model_config.max_context_len_to_capture + if self.model_config is not None else 0) + # When using CUDA graph, the input block tables must be padded to + # max_context_len_to_capture. However, creating the block table in + # Python can be expensive. To optimize this, we cache the block table + # in numpy and only copy the actual input content at every iteration. + # The shape of the cached block table will be + # (max batch size to capture, max context len to capture / block size). + self.graph_block_tables = None # Set after initial profiling. 
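+        # Illustrative sizing (assumed values, not necessarily the configured
+        # ones): with max(_BATCH_SIZES_TO_CAPTURE) == 512,
+        # max_context_len_to_capture == 2048 and block_size == 16, the cached
+        # table has shape (512, 128); only the first len(block_table) entries
+        # of a row are rewritten at each decode step.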
+ self.pin_memory = is_pin_memory_available() + self.kv_cache_dtype = kv_cache_dtype + + self.attn_backend = get_attn_backend( + self.model_config.dtype if model_config is not None else None) + + def load_model(self) -> None: + with HabanaMemoryProfiler() as m: + self.model = get_model(self.model_config, + self.device_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + + self.model_memory_usage = m.consumed_memory + logger.info(f"Loading model weights took " + f"{format_bytes(self.model_memory_usage)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + if self.lora_config: + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr( + self.model, + "embedding_modules"), "Model does not have embedding_modules" + assert hasattr(self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, self.vocab_size, + self.lora_config, self.device, self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + + def set_block_size(self, block_size: int) -> None: + self.block_size = block_size + + self.graph_block_tables = np.zeros( + (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), + dtype=np.int32) + + def get_max_block_per_batch(self) -> int: + block_size = self.block_size + return (self.max_context_len_to_capture + block_size - 1) // block_size + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + List[int], List[int], List[int], Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] + lora_requests: Set[LoRARequest] = set() + + prompt_lens: List[int] = [] + context_lens: List[int] = [] + subquery_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + seq_data = seq_group_metadata.seq_data[seq_id] + prompt_tokens = seq_data.get_token_ids() + prompt_len = len(prompt_tokens) + prompt_lens.append(prompt_len) + computed_len = 0 + + # NOTE: This only works for oooooooxxx style attention. + computed_block_nums = seq_group_metadata.computed_block_nums + if computed_block_nums is not None and len( + computed_block_nums) > 0 and self.sliding_window is None: + # Prefix is not supported with sliding_window + computed_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[computed_len:] + prefix_block_tables.append(computed_block_nums) + context_len = computed_len + else: + prefix_block_tables.append([]) + context_len = 0 + # actual prompt lens + context_lens.append(context_len) + if computed_len != 0: + import pdb; pdb.set_trace() # what happens if we hit that path?? 
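+            # subquery_lens tracks only the new (not yet computed) tokens of
+            # each prompt; with a cached prefix it is shorter than prompt_len.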
+ subquery_lens.append(prompt_len - computed_len) + + input_tokens.append(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.append( + list(range(computed_len, computed_len + len(prompt_tokens)))) + + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (prompt_len - computed_len) + lora_prompt_mapping.append( + [lora_id] * + (prompt_len - computed_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([_PAD_SLOT_ID] * prompt_len) + continue + + # Compute the slot mapping. + slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, prompt_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + assert computed_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") + start_idx = max(0, prompt_len - self.sliding_window) + for i in range(computed_len, prompt_len): + if i < start_idx: + slot_mapping[-1].append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + max_subquery_len = max(subquery_lens) + max_prompt_len = max(prompt_lens) + num_prompt_tokens = len(input_tokens) + assert max_subquery_len > 0 + + lora_index_mapping = lora_index_mapping + + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) + max_prompt_len = max(prompt_lens) + input_tokens = make_tensor_with_pad(input_tokens, + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + + input_positions = make_tensor_with_pad(input_positions, + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + + slot_mapping = make_tensor_with_pad(slot_mapping, + max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) + + # Prepare prefix block tables + block_tables = make_tensor_with_pad( + prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + # Query length can be shorter than key (i.e., prompt) when prefill + # is chunked or prefix cached. 
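+        # The two *_start_loc tensors below are exclusive prefix sums, e.g.
+        # subquery_lens [4, 6] -> subquery_start_loc [0, 4, 10]; seq_start_loc
+        # is built the same way from prompt_lens.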
+ subquery_lens_tensor = torch.tensor(subquery_lens, + dtype=torch.long, + device=self.device) + subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + prompt_lens_tensor = torch.tensor(prompt_lens, + dtype=torch.long, + device=self.device) + seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=self.device) + + torch.cumsum(subquery_lens_tensor, + dim=0, + dtype=subquery_start_loc.dtype, + out=subquery_start_loc[1:]) + + torch.cumsum(prompt_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + slot_mapping=slot_mapping, + prompt_lens=prompt_lens, + prompt_lens_tensor=prompt_lens_tensor, + num_prompt_tokens=num_prompt_tokens, + num_generation_tokens=0, + max_subquery_len=max_subquery_len, + max_context_len=None, + max_prompt_len=max_prompt_len, + subquery_start_loc=subquery_start_loc, + seq_start_loc=seq_start_loc, + context_lens=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, attn_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], + List[int], Set[LoRARequest]]: + assert len(seq_group_metadata_list) > 0 + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + context_lens: List[int] = [] + block_tables: List[List[int]] = [] + lora_index_mapping: List[int] = [] + lora_prompt_mapping: List[int] = [] + lora_requests: Set[LoRARequest] = set() + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + + seq_ids = list(seq_group_metadata.seq_data.keys()) + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append([position]) + + context_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + context_lens.append(context_len) + + block_table = seq_group_metadata.block_tables[seq_id] + block_number = block_table[position // self.block_size] + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + lora_index_mapping.append(lora_id) + lora_prompt_mapping.append(lora_id) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + # vLLM uses cuda graph only for decoding requests. + # See `capture_model` API for more details. + # For decoding requests, batch_size == input_tokens. 
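+        # When a captured HPU graph is used, the batch is padded up to the
+        # nearest captured size (see _get_graph_batch_size), e.g. 5 decode
+        # sequences are padded with dummy entries to a batch of 8.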
+ batch_size = len(input_tokens) + max_context_len = max(context_lens) + use_captured_graph = ( + not self.model_config.enforce_eager + and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_context_len <= self.max_context_len_to_capture) + if use_captured_graph: + graph_batch_size = _get_graph_batch_size(batch_size) + assert graph_batch_size >= batch_size + for _ in range(graph_batch_size - batch_size): + input_tokens.append([0]) + input_positions.append([0]) + slot_mapping.append([_PAD_SLOT_ID]) + context_lens.append(1) + block_tables.append([]) + lora_index_mapping.append(0) + batch_size = graph_batch_size + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + context_lens = torch.tensor(context_lens, + dtype=torch.int, + device=self.device) + + if use_captured_graph: + # When using cuda-graph all these tensors should be + # padded. + assert context_lens.shape[0] == input_tokens.shape[0] + assert context_lens.shape[0] == input_positions.shape[0] + assert context_lens.shape[0] == slot_mapping.shape[0] + + # The shape of graph_block_tables is + # [max batch size, max context len // block size]. + graph_max_context_len = _get_graph_max_context_len(max_context_len) + assert graph_max_context_len >= max_context_len + graph_block_count = math.ceil(graph_max_context_len / self.block_size) + input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] + + for i, block_table in enumerate(block_tables): + if block_table: + input_block_tables[i, :len(block_table)] = block_table + block_tables = torch.tensor(input_block_tables, device=self.device) + else: + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping, + prompt_lens=None, + prompt_lens_tensor=None, + num_prompt_tokens=0, + num_generation_tokens=len(input_tokens), + max_subquery_len=None, + max_context_len=max_context_len, + max_prompt_len=None, + subquery_start_loc=None, + seq_start_loc=None, + context_lens=context_lens, + block_tables=block_tables, + use_cuda_graph=use_captured_graph, + kv_cache_dtype=self.kv_cache_dtype, + ) + return (input_tokens, input_positions, attn_metadata, + lora_index_mapping, lora_prompt_mapping, lora_requests) + + def _prepare_sample( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + subquery_lens: Optional[List[int]], + ) -> SamplingMetadata: + seq_groups: List[Tuple[List[int], SamplingParams]] = [] + selected_token_indices: List[int] = [] + generators: List[torch.Generator] = [] + selected_token_start_idx = 0 + categorized_sample_indices = {t: [] for t in SamplingType} + categorized_sample_indices_start_idx = 0 + categorized_sampled_token_indices_start_idx = 0 + max_subquery_len = max(subquery_lens) if subquery_lens else 1 + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + sampling_params = seq_group_metadata.sampling_params + seq_groups.append((seq_ids, sampling_params)) + + if seq_group_metadata.is_prompt: + assert len(seq_ids) == 1 + assert subquery_lens is not None + subquery_len = subquery_lens[i] + if 
sampling_params.prompt_logprobs is not None: + # NOTE: prompt token positions do not need sample, skip + categorized_sample_indices_start_idx += subquery_len - 1 + + categorized_sample_indices[ + sampling_params.sampling_type].append([ + categorized_sample_indices_start_idx, + categorized_sampled_token_indices_start_idx + ]) + categorized_sample_indices_start_idx += 1 + categorized_sampled_token_indices_start_idx += 1 + + if sampling_params.prompt_logprobs is not None: + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + subquery_len - 1)) + selected_token_indices.append(selected_token_start_idx + + subquery_len - 1) + selected_token_start_idx += max_subquery_len + + if sampling_params.seed is not None: + seq_group_metadata.state.generator = torch.Generator( + device=self.device).manual_seed(sampling_params.seed) + else: + num_seqs = len(seq_ids) + selected_token_indices.extend( + range(selected_token_start_idx, + selected_token_start_idx + num_seqs)) + selected_token_start_idx += num_seqs + + categorized_sample_indices[ + sampling_params.sampling_type].extend( + zip( + range( + categorized_sample_indices_start_idx, + categorized_sample_indices_start_idx + + num_seqs), + range( + categorized_sampled_token_indices_start_idx, + categorized_sampled_token_indices_start_idx + + num_seqs))) + categorized_sample_indices_start_idx += num_seqs + categorized_sampled_token_indices_start_idx += num_seqs + + if sampling_params.seed is not None: + generators.append(seq_group_metadata.state.generator) + + selected_token_indices = async_tensor_h2d(selected_token_indices, + dtype=torch.long, + target_device=self.device, + pin_memory=self.pin_memory) + + categorized_sample_indices = { + t: maybe_expand_dim( + async_tensor_h2d(seq_ids, + dtype=torch.int, + target_device=self.device, + pin_memory=self.pin_memory), 2, 2) + for t, seq_ids in categorized_sample_indices.items() + } + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + sampling_metadata = SamplingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + selected_token_indices=selected_token_indices, + categorized_sample_indices=categorized_sample_indices, + generators=generators, + ) + return sampling_metadata + + def prepare_input_tensors( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Set[int], LoRAMapping]: + if self.is_driver_worker: + # NOTE: We assume that all sequences in the group are all prompts or + # all decodes. + is_prompt = seq_group_metadata_list[0].is_prompt + # Prepare input tensors. + if is_prompt: + (input_tokens, input_positions, attn_metadata, prompt_lens, + subquery_lens, lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_prompt(seq_group_metadata_list) + else: + (input_tokens, input_positions, attn_metadata, + lora_index_mapping, lora_prompt_mapping, + lora_requests) = self._prepare_decode(seq_group_metadata_list) + prompt_lens = [] + subquery_lens = None + sampling_metadata = self._prepare_sample(seq_group_metadata_list, + prompt_lens, + subquery_lens) + + if self.lora_config: + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + # Broadcast the metadata. 
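+            # The driver (rank 0) broadcasts the prepared inputs so that every
+            # tensor-parallel worker executes the same padded batch; non-driver
+            # workers rebuild attention/sampling metadata from this dict below.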
+ metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + } + metadata_dict.update(attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + lora_mapping = metadata_dict.pop("lora_mapping") + lora_requests = metadata_dict.pop("lora_requests") + attn_metadata = self.attn_backend.make_metadata(**metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + seq_data=None, + prompt_lens=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + generators=None, + perform_sampling=False, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, lora_requests, lora_mapping) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + kv_caches: List[torch.Tensor], + ) -> Optional[SamplerOutput]: + (input_tokens, input_positions, attn_metadata, sampling_metadata, + lora_requests, + lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + + if self.lora_config: + self.set_active_loras(lora_requests, lora_mapping) + + # Execute the model. + if attn_metadata.use_cuda_graph: + graph_batch_size = input_tokens.shape[0] + graph_block_count = attn_metadata.block_tables.shape[1] + graph_runner_key = (graph_batch_size, graph_block_count) + model_executable = self.graph_runners[graph_runner_key] + logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(attn_metadata.context_lens).item()})") + else: + model_executable = self.model + hidden_states = model_executable( + input_ids=input_tokens, + positions=input_positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + # Compute the logits. + logits = self.model.compute_logits(hidden_states, sampling_metadata) + + # Only perform sampling in the driver worker. + if not sampling_metadata.perform_sampling: + return None + + # Sample the next token. + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + return output + + @torch.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. 
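+        # (One dummy LoRA is registered per max_loras slot and reused
+        # round-robin across the dummy sequences built below.)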
+ dummy_lora_requests = [] + dummy_lora_requests_per_seq = [] + if self.lora_config: + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + seq_data = SequenceData([0] * seq_len) + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + self.execute_model(seqs, kv_caches) + torch.hpu.synchronize() + return + + def remove_all_loras(self) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_all_loras() + + def set_active_loras(self, lora_requests: List[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_loras(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_lora(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_loras() + + @torch.inference_mode() + def capture_model(self, kv_caches: List[torch.Tensor]) -> None: + """Cuda graph capture a model. + + Note that CUDA graph's performance gain is negligible if number + of batched tokens are larger than 200. And since CUDA graph + requires fixed sized tensors, supporting large/variable batch + size requires high GPU memory overhead. Thus, vLLM only captures + decoding requests. Mixed batch (chunked prefill + decoding) or + prefill requests are not captured. + + Since it is used for decoding-only, it assumes there's only 1 token + per sequence in the batch. + """ + # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never + # deleted before the CUDA graphs. + self.cupy_nccl_backend = cupy_utils.get_nccl_backend() + + assert not self.model_config.enforce_eager + logger.info("Capturing the model for HPUGraphs. This may lead to " + "unexpected consequences if the model is not static. To " + "run the model in eager mode, set 'enforce_eager=True' or " + "use '--enforce-eager' in the CLI.") + logger.info("HPUGraphs can take additional ~10 GiB memory per HPU. " + "If you are running out of memory, consider decreasing " + "`gpu_memory_utilization` or enforcing eager mode. 
" + "You can also reduce the `max_num_seqs` as needed " + "to decrease memory usage.") + start_time = time.perf_counter() + + # Prepare dummy inputs. These will be reused for all batch sizes. + max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) + input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') + input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') + slot_mapping = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') # TODO(kzawora): when using torch.empty, following occurs: RuntimeError: Error when trying to cast Long to Int, Input values range [0, 139632108750000] exceeds Int range [-2147483648, 2147483647] + slot_mapping.fill_(_PAD_SLOT_ID) + context_lens = torch.ones(max_batch_size, dtype=torch.int32).to('hpu') + block_tables = torch.from_numpy(self.graph_block_tables).to('hpu') + + graph_batch_size = _get_graph_batch_size( + self.scheduler_config.max_num_seqs) + batch_size_capture_list = [ + bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size + ] + + # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce + # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use + # either custom all-reduce kernel or CuPy NCCL. When not using CUDA + # graph, we use either custom all-reduce kernel or PyTorch NCCL. + # We always prioritize using custom all-reduce kernel but fall back + # to PyTorch or CuPy NCCL if it is disabled or not supported. + with custom_all_reduce.capture(): + # NOTE: Capturing the largest batch size first may help reduce the + # memory usage of CUDA graph. + valid_combinations = [] + total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_CONTEXT_LENS_TO_CAPTURE) + import pandas as pd + df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_CONTEXT_LENS_TO_CAPTURE) + for idx, (batch_size, max_context_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_CONTEXT_LENS_TO_CAPTURE))): + block_count = math.ceil(max_context_len / self.block_size) + # Skip capture of "out-of-bound" batch sizes and context lengths + if batch_size > self.scheduler_config.max_num_seqs: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Batch out of bound.") + df[max_context_len][batch_size] = 'batch OoB' + continue + if max_context_len > self.max_context_len_to_capture: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Nax context length out of bound.") + df[max_context_len][batch_size] = 'ctx OoB' + continue + block_count = math.ceil(max_context_len / self.block_size) + captured_block_counts = [math.ceil(cl / self.block_size) for (n, cl) in valid_combinations if n == batch_size] + if block_count in captured_block_counts: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Block size already captured.") + df[max_context_len][batch_size] = 'redundant' + continue + logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Constraints met.") + df[max_context_len][batch_size] = 'VALID' + valid_combinations.append((batch_size, max_context_len)) + + total_valid_hpugraphs = len(valid_combinations) + logger.info(f"Starting capture {total_valid_hpugraphs} valid HPUGraphs. 
Skipping capture of {total_combinations-total_valid_hpugraphs}/{total_combinations} graphs due to batch/context constraints.") + logger.debug(f"Capture summary (row: batch_size; col: max_context_len):") + logger.debug(tabulate.tabulate(df, tablefmt='mixed_outline', headers='keys', showindex="always")) + + graph_runner_name = self.graph_runner_class.__name__ + graph_mem_usage_df = pd.DataFrame(index=list(reversed(sorted({b for b,c in valid_combinations}))), columns=list(reversed(sorted({c for b,c in valid_combinations})))) + pbar = tqdm.tqdm(valid_combinations) + start_mem = HabanaMemoryProfiler.current_memory_usage() + log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all + + for idx, (batch_size, max_context_len) in enumerate(pbar): + block_count = math.ceil(max_context_len / self.block_size) + # Create dummy attn_metadata. + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + slot_mapping=slot_mapping[:batch_size], + prompt_lens=None, + prompt_lens_tensor=None, + num_prompt_tokens=0, + num_generation_tokens=batch_size, + max_subquery_len=None, + max_context_len=block_count*self.block_size, + max_prompt_len=None, + subquery_start_loc=None, + seq_start_loc=None, + context_lens=context_lens[:batch_size], + block_tables=block_tables[:batch_size, :block_count], + use_cuda_graph=True, + kv_cache_dtype=self.kv_cache_dtype, + ) + + if self.lora_config: + lora_mapping = LoRAMapping( + [0] * batch_size, + [0] * batch_size, + ) + self.set_active_loras(set(), lora_mapping) + graph_runner = self.graph_runner_class(self.model) + local_start_mem = HabanaMemoryProfiler.current_memory_usage() + capture_start = time.time() + desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' + pbar.set_description(desc) + logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}...") + profiling_ctx = contextlib.nullcontext() if not (log_graph_compilation_all or log_graph_compilation) else metric_localcontext("graph_compilation") + with profiling_ctx as gc_local_metric: + graph_runner.capture( + input_tokens[:batch_size], + input_positions[:batch_size], + kv_caches, + attn_metadata, + ) + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: + logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}") + self.graph_runners[(batch_size, block_count)] = graph_runner + capture_end = time.time() + local_end_mem = HabanaMemoryProfiler.current_memory_usage() + mem_usage_str = format_bytes(local_end_mem - local_start_mem) + graph_mem_usage_df[max_context_len][batch_size] = mem_usage_str + logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}... done in {capture_end-capture_start:.2f} seconds! Took {mem_usage_str} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + + end_time = time.perf_counter() + elapsed_time = end_time - start_time + # This usually takes < 10 seconds. 
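+        # Summarize total capture time and the device memory attributed to
+        # all captured graphs (per-graph usage was collected above in
+        # graph_mem_usage_df).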
+ end_mem = HabanaMemoryProfiler.current_memory_usage() + logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + logger.info(f"Graph memory allocation summary (row: batch_size; col: max_context_len):") + logger.info(tabulate.tabulate(graph_mem_usage_df, tablefmt='mixed_outline', headers='keys', showindex="always")) + + def __del__(self) -> None: + # Delete the CUDA graphs before deleting the CuPy NCCL communicator. + # NOTE(woosuk): This is necessary because otherwise deadlocks can + # happen. + # FIXME(woosuk): This is a bit hacky. Find a more robust solution. + self.graph_runners.clear() + self.cupy_nccl_backend = None + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + +class FakeHPUGraphRunner: + + def __init__(self, model: nn.Module): + self.model = model + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + return self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + +class FakeHPUGraphRunnerWithWarmup: + + def __init__(self, model: nn.Module): + self.model = model + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + htorch.core.mark_step() + out = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + htorch.core.mark_step() + htorch.hpu.synchronize() + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + htorch.core.mark_step() + out = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + htorch.core.mark_step() + return out + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) +class HPUGraphRunner: + + def __init__(self, model: nn.Module): + self.model = model + self.graph = None + self.input_buffers: Dict[str, torch.Tensor] = {} + self.output_buffers: Dict[str, torch.Tensor] = {} + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + assert self.graph is None + # Run the model once without capturing the graph. + # This is to make sure that the captured graph does not include the + # kernel launches for initial benchmarking (e.g., Triton autotune). + self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + htorch.hpu.synchronize() + + # Capture the graph. + # NOTE(woosuk): Python 3.8 does not support multi-line with statements. + # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement + self.graph = htorch.hpu.HPUGraph() + with htorch.hpu.graph(self.graph): # noqa: SIM117 + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + ) + torch.hpu.synchronize() + + # Save the input and output buffers. 
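+        # Replays reuse these fixed tensors: forward() copies fresh inputs
+        # into input_buffers and returns the tensor held in output_buffers.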
+ self.input_buffers = { + "input_ids": input_ids, + "positions": positions, + "kv_caches": kv_caches, + "slot_mapping": attn_metadata.slot_mapping, + "context_lens": attn_metadata.context_lens, + "block_tables": attn_metadata.block_tables, + } + self.output_buffers = {"hidden_states": hidden_states} + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + # KV caches are fixed tensors, so we don't need to copy them. + del kv_caches + + # Copy the input tensors to the input buffers. + self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) + self.input_buffers["positions"].copy_(positions, non_blocking=True) + self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, + non_blocking=True) + self.input_buffers["context_lens"].copy_(attn_metadata.context_lens, + non_blocking=True) + self.input_buffers["block_tables"].copy_(attn_metadata.block_tables, + non_blocking=True) + # Run the graph. + self.graph.replay() + + # Return the output tensor. + return self.output_buffers["hidden_states"] + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + +class ExperimentalHPUGraphRunner: + def __init__(self, model: nn.Module): + self.model = model + + def capture( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> None: + class ModelWrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + self.attn_backend = get_attn_backend(torch.bfloat16) + def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, block_tables): + wrapper_attn_metadata = self.attn_backend.make_metadata( + is_prompt=attn_metadata.is_prompt, + slot_mapping=slot_mapping, + prompt_lens=None, + prompt_lens_tensor=None, + num_prompt_tokens=0, + num_generation_tokens=attn_metadata.num_generation_tokens, + max_subquery_len=None, + max_context_len=attn_metadata.max_context_len, + max_prompt_len=None, + subquery_start_loc=None, + seq_start_loc=None, + context_lens=context_lens, + block_tables=block_tables, + use_cuda_graph=True, + kv_cache_dtype=attn_metadata.kv_cache_dtype, + ) + return self.model( + input_ids, + positions, + kv_caches, + wrapper_attn_metadata + ) + self.graph_model = htorch.hpu.wrap_in_hpu_graph(ModelWrapper(self.model)) + out = self.graph_model( + input_ids, + positions, + kv_caches, + attn_metadata.slot_mapping, + attn_metadata.context_lens, + attn_metadata.block_tables, + ) + htorch.hpu.synchronize() + return + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + out = self.graph_model( + input_ids, + positions, + kv_caches, + attn_metadata.slot_mapping, + attn_metadata.context_lens, + attn_metadata.block_tables, + ) + return out + + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +def _get_graph_batch_size(batch_size: int) -> int: + """Returns the padded batch size given actual batch size. + + Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, + 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... 
+ """ + if batch_size <= 2: + return batch_size + elif batch_size <= 4: + return 4 + elif batch_size <= 8: + return 8 + else: + return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // + _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) + + +def _get_graph_max_context_len(max_context_len: int) -> int: + """Returns the padded batch size given actual batch size. + + Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, + 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... + """ + if max_context_len <= 32: + return 32 + elif max_context_len <= 64: + return 64 + elif max_context_len <= 128: + return 128 + else: + return ((max_context_len + _MAX_CONTEXT_LEN_ALIGNMENT - 1) // + _MAX_CONTEXT_LEN_ALIGNMENT * _MAX_CONTEXT_LEN_ALIGNMENT) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py new file mode 100644 index 0000000000000..bbfd7dad7f90a --- /dev/null +++ b/vllm/worker/habana_worker.py @@ -0,0 +1,263 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import gc +import os +from typing import Dict, List, Optional, Set, Tuple + +import torch +import habana_frameworks.torch as htorch +import torch.distributed + +from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig) +from vllm.lora.request import LoRARequest +from vllm.model_executor import set_random_seed +from vllm.model_executor.parallel_utils.communication_op import ( + broadcast_tensor_dict) +from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar +from vllm.model_executor.parallel_utils.parallel_state import ( + ensure_model_parallel_initialized) +from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.habana_model_runner import HabanaModelRunner + + +class HabanaWorker: + """A worker class that executes (a partition of) the model on a HPU. + + Each worker is associated with a single HPU. The worker is responsible for + maintaining the KV cache and executing the model on the HPU. In case of + distributed inference, each worker is assigned a partition of the model. + """ + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + self.model_runner = HabanaModelRunner(model_config, + parallel_config, + scheduler_config, + device_config, + lora_config=self.lora_config, + kv_cache_dtype=kv_cache_dtype, + is_driver_worker=is_driver_worker) + # Uninitialized cache engine. Will be initialized by + # self.init_cache_engine(). 
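+        # NOTE: CacheEngine exposes its device cache as `gpu_cache`; on this
+        # backend the tensors are allocated on the HPU (see init_cache_engine).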
+ self.cache_config = None + self.cache_engine = None + self.hpu_cache = None + + def init_device(self) -> None: + if self.device_config.device.type == "hpu": + self.device = torch.device("hpu") + torch.hpu.set_device(self.device) + self.init_hpu_memory = torch.hpu.mem_get_info()[0] + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + init_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method) + # Set random seed. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def profile_num_available_blocks( + self, + block_size: int, + hpu_memory_utilization: float, + cpu_swap_space: int, + cache_dtype: str, + ) -> Tuple[int, int]: + """Profiles the peak memory usage of the model and returns the maximum + number of HPU and CPU cache blocks that can be allocated. + + Args: + block_size: The size of the cache block. + hpu_memory_utilization: The fraction of the total HPU memory to use. + cpu_swap_space: The size of the CPU swap space in bytes. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + torch.hpu.synchronize() + free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() + # NOTE(woosuk): Here we assume that the other processes using the same + # HPU did not change their memory usage during the profiling. + peak_memory = self.init_hpu_memory - free_hpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. This happens when the hpu memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes( + block_size, cache_dtype) + num_hpu_blocks = int( + (total_hpu_memory * hpu_memory_utilization - peak_memory) // + cache_block_size) + num_cpu_blocks = int(cpu_swap_space // cache_block_size) + num_hpu_blocks = max(num_hpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() + gc.collect() + return num_hpu_blocks, num_cpu_blocks + + def init_cache_engine(self, cache_config: CacheConfig) -> None: + self.cache_config = cache_config + self.cache_engine = CacheEngine(self.cache_config, self.model_config, + self.parallel_config) + self.hpu_cache = self.cache_engine.gpu_cache + self.model_runner.set_block_size(self.cache_engine.block_size) + htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution + + def warm_up_model(self) -> None: + if not self.model_config.enforce_eager: + self.model_runner.capture_model(self.hpu_cache) + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) + + def cache_swap( + self, + blocks_to_swap_in: Dict[int, int], + blocks_to_swap_out: Dict[int, int], + blocks_to_copy: Dict[int, List[int]], + ) -> None: + # Issue cache operations. + # TODO(woosuk): Profile swapping overhead and optimize if needed. 
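+        # Swaps move whole KV blocks between the HPU and CPU caches; copies
+        # duplicate blocks on-device (e.g. for copy-on-write forks).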
+ if blocks_to_swap_in: + self.cache_engine.swap_in(blocks_to_swap_in) + if blocks_to_swap_out: + self.cache_engine.swap_out(blocks_to_swap_out) + if blocks_to_copy: + self.cache_engine.copy(blocks_to_copy) + + @torch.inference_mode() + def execute_model( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, + blocks_to_swap_in: Optional[Dict[int, int]] = None, + blocks_to_swap_out: Optional[Dict[int, int]] = None, + blocks_to_copy: Optional[Dict[int, List[int]]] = None, + ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + assert seq_group_metadata_list is not None + num_seq_groups = len(seq_group_metadata_list) + assert blocks_to_swap_in is not None + assert blocks_to_swap_out is not None + assert blocks_to_copy is not None + data = { + "num_seq_groups": num_seq_groups, + "blocks_to_swap_in": blocks_to_swap_in, + "blocks_to_swap_out": blocks_to_swap_out, + "blocks_to_copy": blocks_to_copy, + } + broadcast_tensor_dict(data, src=0) + else: + data = broadcast_tensor_dict(src=0) + num_seq_groups = data["num_seq_groups"] + blocks_to_swap_in = data["blocks_to_swap_in"] + blocks_to_swap_out = data["blocks_to_swap_out"] + blocks_to_copy = data["blocks_to_copy"] + + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return {} + + output = self.model_runner.execute_model(seq_group_metadata_list, + self.hpu_cache) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_loras(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self, block_size: int, + cache_dtype: str) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(block_size, cache_dtype, + self.model_config, + self.parallel_config) + + +def init_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, +) -> None: + """Initialize the distributed environment.""" + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch world " + "size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + torch.distributed.init_process_group( + backend="hccl", + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup. + torch.distributed.all_reduce(torch.zeros(1).to('hpu')) + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + + # Initialize a custom fast all-reduce implementation. 
+ if not parallel_config.disable_custom_all_reduce: + init_custom_ar() From 6963277d4364d7d98b354b44ddc5978bb9e85786 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 8 May 2024 20:15:22 +0300 Subject: [PATCH 002/819] adapt habana components to changed vllm apis --- vllm/attention/backends/habana_attn.py | 139 ++--- vllm/attention/ops/habana_paged_attn.py | 31 +- vllm/executor/habana_executor.py | 182 ++---- .../model_executor/layers/logits_processor.py | 2 +- vllm/worker/habana_model_runner.py | 565 +++++++++++++----- vllm/worker/habana_worker.py | 174 ++++-- 6 files changed, 669 insertions(+), 424 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 844dc92b315ac..909c2ad955f25 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -13,7 +13,8 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata) + AttentionMetadata, + AttentionMetadataPerStage) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger @@ -40,7 +41,7 @@ def get_kv_cache_shape( head_size: int, ) -> Tuple[int, ...]: return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -59,7 +60,7 @@ def copy_blocks( @dataclass -class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): +class HabanaAttentionMetadata(AttentionMetadataPerStage, HabanaPagedAttentionMetadata): """Metadata for HabanaAttentionbackend. NOTE: Any python object stored here is not updated when it is @@ -70,37 +71,24 @@ class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - # (batch_size,). The prompt length per sequence. None if it is a decoding. - prompt_lens: Optional[List[int]] - # prompt_lens stored as a tensor. - prompt_lens_tensor: Optional[torch.Tensor] - # The number of prompt tokens. Doesn't include padding. - num_prompt_tokens: int - # The number of generation tokens. Doesn't include padding. - num_generation_tokens: int - - # NOTE(sang): Definition of context_len, subquery_len, and seqlen. + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + # |---------- N-1 iteration --------| # |---------------- N iteration ---------------------| # |- tokenA -|......................|-- newTokens ---| # |---------- context_len ----------| - # |-------------------- seqlen ----------------------| - # |- subquery_len -| - - # WARNING(sang): context_len has different definition depending on if it is - # prefill vs decoding. When it is prefill, it doesn't include new tokens. - # When it is for decoding, it includes a new token. 
+ # |-------------------- seq_len ----------------------| + # |-- query_len ---| - # Maximum subquery length in the batch. - max_subquery_len: Optional[int] + # Maximum query length in the batch. + max_query_len: Optional[int] # FIXME: It is for flash attn. - # Maximum prompt length in the batch. - max_prompt_len: Optional[int] + # Maximum sequence length in the batch. + max_seq_len: Optional[int] # (batch_size + 1,). The cumulative subquery lengths of the sequences in # the batch, used to index into subquery. E.g., if the subquery length # is [4, 6], it is [0, 4, 10]. @@ -110,6 +98,9 @@ class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): # the batch, used to index into sequence. E.g., if the sequence length is # [4, 6], it is [0, 4, 10]. seq_start_loc: Optional[torch.Tensor] + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] # Whether or not if cuda graph is enabled. # Cuda-graph is currently enabled for decoding only. @@ -128,12 +119,12 @@ def __post_init__(self): class HabanaAttentionImpl(AttentionImpl): """ If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prompt_tokens --------------->| - |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1--->| + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| Otherwise, the layout is as follows: - |<------------------ num_generation_tokens (M) ----------------->| - |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| Generation tokens can contain padding when cuda-graph is used. Currently, prompt tokens don't contain any padding. @@ -175,7 +166,8 @@ def forward( key: torch.Tensor, value: torch.Tensor, kv_cache: Optional[torch.Tensor], - attn_metadata: HabanaAttentionMetadata, + attn_metadata: AttentionMetadata[HabanaAttentionMetadata], + kv_scale: float, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -191,7 +183,6 @@ def forward( batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape - query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) @@ -203,14 +194,14 @@ def forward( # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - attn_metadata.kv_cache_dtype, - attn_metadata.is_prompt) + value_cache, + attn_metadata.slot_mapping, + attn_metadata.kv_cache_dtype, + attn_metadata.prefill_metadata is not None) - if attn_metadata.is_prompt: + if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. - if kv_cache is None or attn_metadata.block_tables.numel() == 0: + if kv_cache is None or prefill_meta.block_tables.numel() == 0: # normal attention. # block tables are empty if the prompt does not have a cached # prefix. 
@@ -232,16 +223,16 @@ def forward( self.num_queries_per_kv, value.shape[-1]) - if attn_metadata.attn_bias is None: + if prefill_meta.attn_bias is None: if self.alibi_slopes is None: attn_bias = BlockDiagonalCausalMask.from_seqlens( [seq_len] * batch_size) if self.sliding_window is not None: attn_bias = attn_bias.make_local_attention( self.sliding_window) - attn_metadata.attn_bias = attn_bias + prefill_meta.attn_bias = attn_bias else: - attn_metadata.attn_bias = _make_alibi_bias( + prefill_meta.attn_bias = _make_alibi_bias( self.alibi_slopes, self.num_kv_heads, batch_size, seq_len, query.dtype) query_shape = (batch_size, seq_len, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len, self.num_heads, self.head_size) @@ -250,7 +241,7 @@ def forward( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), - attn_bias=attn_metadata.attn_bias, + attn_bias=prefill_meta.attn_bias, p=0.0, scale=self.scale, ) @@ -263,26 +254,27 @@ def forward( value, key_cache, value_cache, - attn_metadata.block_tables, - attn_metadata.subquery_start_loc, - attn_metadata.prompt_lens_tensor, - attn_metadata.context_lens, - attn_metadata.max_subquery_len, + prefill_meta.block_tables, + prefill_meta.subquery_start_loc, + prefill_meta.seq_lens_tensor, + prefill_meta.context_lens_tensor, + prefill_meta.max_query_len, self.alibi_slopes, ) - else: + if decode_meta := attn_metadata.decode_metadata: # Decoding run. output = HabanaPagedAttention.forward_decode( query, key_cache, value_cache, - attn_metadata.block_tables, - attn_metadata.context_lens, - attn_metadata.max_context_len, + decode_meta.block_tables, + decode_meta.seq_lens_tensor, + decode_meta.max_seq_len, attn_metadata.kv_cache_dtype, self.num_kv_heads, self.scale, self.alibi_slopes, + kv_scale ) # Reshape the output tensor. @@ -293,13 +285,13 @@ def _make_alibi_bias( alibi_slopes: torch.Tensor, num_kv_heads: int, dtype: torch.dtype, - prompt_lens: List[int], + seq_lens: List[int], ) -> LowerTriangularMaskWithTensorBias: attn_biases = [] - for prompt_len in prompt_lens: - bias = torch.arange(prompt_len, dtype=dtype) + for seq_len in seq_lens: + bias = torch.arange(seq_len, dtype=dtype) # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(prompt_len, 1)` + # `bias = bias[None, :].repeat(seq_len, 1)` # here. We find that both biases give the same results, but # the bias below more accurately follows the original ALiBi # paper. @@ -307,46 +299,19 @@ def _make_alibi_bias( # element. 
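+        # e.g. seq_len == 3 yields [[0, 1, 2], [-1, 0, 1], [-2, -1, 0]] before
+        # the per-head ALiBi slopes are multiplied in below.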
bias = bias[None, :] - bias[:, None] - padded_len = (prompt_len + 7) // 8 * 8 + padded_len = (seq_len + 7) // 8 * 8 num_heads = alibi_slopes.shape[0] bias = torch.empty( 1, # batch size num_heads, - prompt_len, + seq_len, padded_len, device=alibi_slopes.device, dtype=dtype, - )[:, :, :, :prompt_len].copy_(bias) + )[:, :, :, :seq_len].copy_(bias) bias.mul_(alibi_slopes[:, None, None]) if num_heads != num_kv_heads: bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) return attn_biases - - -def _naive_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - num_heads: int, - num_kv_heads: int, - head_size: int, - scale: float, -) -> torch.Tensor: - query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) - seq_len, _, _ = query.shape - attn_mask = torch.triu(torch.ones(seq_len, - seq_len, - dtype=query.dtype, - device=query.device), - diagonal=1) - attn_mask = attn_mask * torch.finfo(query.dtype).min - - attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 03027bb01565c..8dc79f17f8c9c 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -16,17 +16,11 @@ @dataclass class HabanaPagedAttentionMetadata: """Metadata for PagedAttention.""" - # (num_tokens,). The indices of the token slots that input tokens will be - # stored into. E.g., if `slot_mapping` is [35, 2, 17] and the block size - # is 16, the three tokens are stored in the 3rd slot in block 2, 2nd slot - # in block 0, and 1st slot in block 1, respectively. - slot_mapping: torch.Tensor - # (batch_size,). The length of context (tokens stored in KV cache) per - # sequence. WARNING: When it is a prefill request, it doesn't include new - # tokens. When it is for decoding, it includes a new token. - context_lens: Optional[torch.Tensor] - # Maximum context length in the batch. - max_context_len: Optional[int] + # (batch_size,). The length of sequences (entire tokens seen so far) per + # sequence. + seq_lens_tensor: Optional[torch.Tensor] + # Maximum sequence length in the batch. + max_seq_len: Optional[int] # (batch_size, max_blocks_per_seq). # Block addresses per sequence. (Seq id -> list of physical block) # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks @@ -34,7 +28,6 @@ class HabanaPagedAttentionMetadata: # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph # captured. 
block_tables: Optional[torch.Tensor] - kv_cache_dtype: str class HabanaPagedAttention: @@ -88,12 +81,13 @@ def forward_decode( key_cache: torch.Tensor, value_cache: torch.Tensor, block_tables: torch.Tensor, - context_lens: torch.Tensor, - max_context_len: int, + seq_lens: torch.Tensor, + max_seq_len: int, kv_cache_dtype: str, num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], + kv_scale: float, ) -> torch.Tensor: block_size = value_cache.shape[3] return ops.paged_attention_v1( @@ -103,9 +97,9 @@ def forward_decode( num_kv_heads, scale, block_tables, - context_lens, + seq_lens, block_size, - max_context_len, + max_seq_len, alibi_slopes, kv_cache_dtype, ) @@ -119,10 +113,11 @@ def forward_prefix( value_cache: torch.Tensor, block_tables: torch.Tensor, subquery_start_loc: torch.Tensor, - prompt_lens_tensor: torch.Tensor, + seq_lens_tensor: torch.Tensor, context_lens: torch.Tensor, - max_subquery_len: int, + max_query_len: int, alibi_slopes: Optional[torch.Tensor], + sliding_window: Optional[int], ) -> torch.Tensor: raise NotImplementedError("forward_prefix is not implemented for HabanaPagedAttention") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index dd211eadbea78..cc035f397aa6d 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -2,119 +2,90 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -from typing import Dict, List, Optional - -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from typing import Any, Dict, List, Optional, Set, Tuple from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, make_async, HabanaMemoryProfiler, format_bytes) import os import contextlib +from vllm.worker.worker_base import WorkerWrapperBase + logger = init_logger(__name__) class HabanaExecutor(ExecutorBase): - - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - - # Instantiate the worker and load the model to GPU. + def _init_executor(self) -> None: + """Initialize the worker and load the model.""" self._init_worker() - # Profile the memory usage and initialize the cache. 
- self._init_cache() - - def _init_worker(self): - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.habana_worker import HabanaWorker - - assert self.parallel_config.world_size == 1, ( - "HabanaExecutor only supports single GPU.") - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - self.driver_worker = HabanaWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - local_rank=0, - rank=0, + def _get_worker_kwargs( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None) -> Dict[str, Any]: + """Return worker init args for a given rank.""" + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return dict( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=True, + vision_language_config=self.vision_language_config, + is_driver_worker=rank == 0, ) - self.driver_worker.init_device() - self.driver_worker.load_model() - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + wrapper = WorkerWrapperBase( + worker_module_name="vllm.worker.habana_worker", + worker_class_name="HabanaWorker", + ) + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker + def _init_worker(self): + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") - The engine first profiles the existing memory usage. - Then, it allocates the remaining memory for KV blocks. + self.driver_worker = self._create_worker() + self.driver_worker.init_device() + self.driver_worker.load_model() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ + return self.driver_worker.determine_num_available_blocks() - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. + def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_gpu_blocks, num_cpu_blocks = ( - self.driver_worker.profile_num_available_blocks( - block_size=self.cache_config.block_size, - hpu_memory_utilization=self.cache_config. - gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - )) - - logger.info(f"# HPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. 
We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + with HabanaMemoryProfiler() as cache_init_m: - self.driver_worker.init_cache_engine(cache_config=self.cache_config) + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) logger.info(f"init_cache_engine took " f"{format_bytes(cache_init_m.consumed_memory)} ({cache_init_m.consumed_memory/HabanaMemoryProfiler.total_memory():.2%} of total memory, gpu_memory_utilization: {self.cache_config.gpu_memory_utilization}, {format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - with HabanaMemoryProfiler() as warmup_m: - self.driver_worker.warm_up_model() - logger.info(f"Model warmup took " - f"{format_bytes(warmup_m.consumed_memory)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: - + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any @@ -132,12 +103,7 @@ def execute_model(self, gc_ctx = metric_localcontext("graph_compilation") if log_graph_compilation else contextlib.nullcontext() cpu_fallback_ctx = metric_localcontext("cpu_fallback") if log_cpu_fallbacks else contextlib.nullcontext() with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: - output = self.driver_worker.execute_model( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) + output = self.driver_worker.execute_model(execute_model_req) if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: logger.warning(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}") if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: @@ -145,12 +111,7 @@ def execute_model(self, return output - output = self.driver_worker.execute_model( - seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - ) + output = self.driver_worker.execute_model(execute_model_req) return output def add_lora(self, lora_request: LoRARequest) -> bool: @@ -172,19 +133,8 @@ class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): async def execute_model_async( self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> SamplerOutput: - output = await make_async(self.driver_worker.execute_model)( - 
seq_group_metadata_list=seq_group_metadata_list, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy) + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req, ) return output - - async def check_health_async(self) -> None: - # GPUExecutor will always be healthy as long as - # it's running. - return diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 40ba2130ccd9e..605009e8f695c 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -50,7 +50,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - if logits is not None and sampling_metadata.perform_sampling: + if logits is not None: # and sampling_metadata.perform_sampling: FIXME: this is needed for 8xHPU logits *= self.scale # Apply logits processors (if any). diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 7ced639a7fb03..a25a09c2598fd 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -4,7 +4,8 @@ import contextlib import time -from typing import Dict, List, Optional, Set, Tuple +from enum import IntEnum +from typing import Dict, List, NamedTuple, Optional, Set, Tuple # for logging hpugraph capture import tqdm @@ -21,20 +22,18 @@ import habana_frameworks.torch as htorch from habana_frameworks.torch.hpu.metrics import metric_localcontext -from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig, - SchedulerConfig) +from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage, + get_attn_backend) +from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.distributed import broadcast_tensor_dict +from vllm.distributed.device_communicators import custom_all_reduce from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.model_executor.parallel_utils import cupy_utils, custom_all_reduce -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) -from vllm.model_executor.parallel_utils.parallel_state import ( - with_cupy_nccl_for_all_reduce) from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata from vllm.utils import (HabanaMemoryProfiler, async_tensor_h2d, @@ -59,6 +58,66 @@ ] +class PreparePromptMetadata(NamedTuple): + input_tokens: List[int] + input_positions: List[int] + attn_metadata: Optional[AttentionMetadataPerStage] + seq_lens: List[int] + query_lens: List[int] + lora_index_mapping: List[int] + lora_prompt_mapping: List[int] + lora_requests: Set[LoRARequest] + multi_modal_input: Optional[torch.Tensor] + slot_mapping: List[int] + + @classmethod + def empty(cls): + return PreparePromptMetadata( + input_tokens=[], + input_positions=[], + attn_metadata=None, + seq_lens=[], + query_lens=[], + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + 
multi_modal_input=None, + slot_mapping=[], + ) + + +class PrepareDecodeMetadata(NamedTuple): + input_tokens: List[int] + input_positions: List[int] + attn_metadata: Optional[AttentionMetadata] + lora_index_mapping: List[int] + lora_prompt_mapping: List[int] + lora_requests: Set[LoRARequest] + slot_mapping: List[int] + + @classmethod + def empty(cls): + return PrepareDecodeMetadata( + input_tokens=[], + input_positions=[], + attn_metadata=None, + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + slot_mapping=[], + ) + + +# How batches are constructed. +class BatchType(IntEnum): + # Every batch is prefill. + PREFILL = 0 + # Every batch is decode. + DECODE = 1 + # Batch is a mixture of prefill and decode. + MIXED = 2 + + class HabanaModelRunner: def __init__( @@ -67,14 +126,17 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, + vision_language_config: Optional[VisionLanguageConfig] = None, ): self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.lora_config = lora_config + self.load_config = load_config self.is_driver_worker = is_driver_worker # model_config can be None in tests/samplers/test_sampler.py. @@ -85,35 +147,45 @@ def __init__( if device_config is not None else DeviceConfig()) self.device = self.device_config.device - self.model = None - self.block_size = None # Set after initial profiling. - self.lora_manager = None + # Set after load_model. + self.lora_manager: LRUCacheWorkerLoRAManager = None + self.graph_runner_class = HPUGraphRunner self.graph_runners: Dict[Tuple[int, int], self.graph_runner_class] = {} - self.max_context_len_to_capture = ( - self.model_config.max_context_len_to_capture - if self.model_config is not None else 0) + self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture + if self.model_config is not None else 0) + + self.pin_memory = is_pin_memory_available() + self.kv_cache_dtype = kv_cache_dtype + self.vision_language_config = vision_language_config + + self.attn_backend = get_attn_backend( + self.model_config.dtype if model_config is not None else None) + + # Lazy initialization + self.model: torch.nn.Module # Set after load_model + self.block_size: int # Set after initial profiling. # When using CUDA graph, the input block tables must be padded to - # max_context_len_to_capture. However, creating the block table in + # max_seq_len_to_capture. However, creating the block table in # Python can be expensive. To optimize this, we cache the block table # in numpy and only copy the actual input content at every iteration. # The shape of the cached block table will be # (max batch size to capture, max context len to capture / block size). - self.graph_block_tables = None # Set after initial profiling. - self.pin_memory = is_pin_memory_available() - self.kv_cache_dtype = kv_cache_dtype + self.graph_block_tables: torch.Tensor # Set after initial profiling. 
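# [Editorial aside, not part of the patch] The padded block-table width used
# for HPU graph capture is a plain ceiling division of the maximum capturable
# sequence length by the block size, as `get_max_block_per_batch` below
# computes. A standalone sketch; the sizes are made-up examples:
def _example_max_blocks_per_batch(max_seq_len_to_capture: int,
                                  block_size: int) -> int:
    # Integer ceiling division: number of KV-cache blocks one sequence can
    # span when it grows up to max_seq_len_to_capture tokens.
    return (max_seq_len_to_capture + block_size - 1) // block_size

assert _example_max_blocks_per_batch(2048, 128) == 16
assert _example_max_blocks_per_batch(2049, 128) == 17  # one extra partial block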
- self.attn_backend = get_attn_backend( - self.model_config.dtype if model_config is not None else None) def load_model(self) -> None: with HabanaMemoryProfiler() as m: - self.model = get_model(self.model_config, - self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + ) self.model_memory_usage = m.consumed_memory logger.info(f"Loading model weights took " @@ -144,14 +216,12 @@ def set_block_size(self, block_size: int) -> None: def get_max_block_per_batch(self) -> int: block_size = self.block_size - return (self.max_context_len_to_capture + block_size - 1) // block_size + return (self.max_seq_len_to_capture + block_size - 1) // block_size def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - List[int], List[int], List[int], Set[LoRARequest]]: - assert len(seq_group_metadata_list) > 0 + ) -> PreparePromptMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] @@ -159,78 +229,107 @@ def _prepare_prompt( lora_prompt_mapping: List[List[int]] = [] lora_requests: Set[LoRARequest] = set() - prompt_lens: List[int] = [] + seq_lens: List[int] = [] context_lens: List[int] = [] - subquery_lens: List[int] = [] + query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] + multi_modal_input_list: List[torch.Tensor] = [] + + if len(seq_group_metadata_list) == 0: + return PreparePromptMetadata.empty() + for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) assert len(seq_ids) == 1 seq_id = seq_ids[0] + computed_block_nums = seq_group_metadata.computed_block_nums + if (self.scheduler_config is not None + and self.scheduler_config.chunked_prefill_enabled + and not (computed_block_nums is None + or computed_block_nums == [])): + raise RuntimeError( + "chunked prefill cannot be used with prefix caching " + "now.") + + token_chunk_size = seq_group_metadata.token_chunk_size seq_data = seq_group_metadata.seq_data[seq_id] - prompt_tokens = seq_data.get_token_ids() - prompt_len = len(prompt_tokens) - prompt_lens.append(prompt_len) - computed_len = 0 + context_len = seq_data.get_num_computed_tokens() + # We should use get_len here because in case of preemption + # it contains output tokens. + seq_len = min(seq_data.get_len(), context_len + token_chunk_size) + prompt_tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) # NOTE: This only works for oooooooxxx style attention. 
- computed_block_nums = seq_group_metadata.computed_block_nums if computed_block_nums is not None and len( computed_block_nums) > 0 and self.sliding_window is None: # Prefix is not supported with sliding_window - computed_len = len(computed_block_nums) * self.block_size - prompt_tokens = prompt_tokens[computed_len:] + context_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[context_len:] prefix_block_tables.append(computed_block_nums) - context_len = computed_len + elif self.scheduler_config.chunked_prefill_enabled: + if seq_group_metadata.block_tables is not None: + # Prefill has chunked before. + block_table = seq_group_metadata.block_tables[seq_id] + prefix_block_tables.append(block_table) + else: + # The first prefill. + prefix_block_tables.append([]) else: prefix_block_tables.append([]) - context_len = 0 + # Right now, prefill start is always 0. However, this + # assumption can be changed once chunked prefill is introduced. + assert context_len == 0 + # actual prompt lens context_lens.append(context_len) - if computed_len != 0: + if context_len != 0: import pdb; pdb.set_trace() # what happens if we hit that path?? - subquery_lens.append(prompt_len - computed_len) + query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. - input_positions.append( - list(range(computed_len, computed_len + len(prompt_tokens)))) - + input_positions.append(list(range(context_len, seq_len))) lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (prompt_len - computed_len) + lora_index_mapping += [lora_id] * (seq_len - context_len) lora_prompt_mapping.append( [lora_id] * - (prompt_len - computed_len + (seq_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + if seq_group_metadata.multi_modal_data: + multi_modal_input_list.append( + seq_group_metadata.multi_modal_data.data) + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized # yet. In this case, we just use a dummy slot mapping. - slot_mapping.append([_PAD_SLOT_ID] * prompt_len) + slot_mapping.append([_PAD_SLOT_ID] * seq_len) continue # Compute the slot mapping. slot_mapping.append([]) block_table = seq_group_metadata.block_tables[seq_id] + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, - # where start_idx is max(0, prompt_len - sliding_window). + # where start_idx is max(0, seq_len - sliding_window). # For example, if the prompt len is 10, sliding window is 8, and # block size is 4, the first two tokens are masked and the slot # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. 
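# [Editorial aside, not part of the patch] The sliding-window slot-mapping
# rule described in the comment above, extracted into a standalone, runnable
# sketch; the block table, lengths and window below are made-up example
# values, and the helper name is hypothetical.
def _example_slot_mapping(block_table, seq_len, block_size, sliding_window,
                          pad_slot_id=-1):
    # Tokens older than the sliding window get the pad slot id; every other
    # token i lives in physical block block_table[i // block_size] at offset
    # i % block_size.
    start_idx = (0 if sliding_window is None
                 else max(0, seq_len - sliding_window))
    slots = []
    for i in range(seq_len):
        if i < start_idx:
            slots.append(pad_slot_id)
        else:
            block_number = block_table[i // block_size]
            slots.append(block_number * block_size + i % block_size)
    return slots

# 10 prompt tokens, window of 8, block size 4, physical blocks [0, 1, 2]:
assert _example_slot_mapping([0, 1, 2], 10, 4, 8) == \
    [-1, -1, 2, 3, 4, 5, 6, 7, 8, 9]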
start_idx = 0 if self.sliding_window is not None: - assert computed_len == 0, ( + assert context_len == 0, ( "Prefix caching is currently not supported with " "sliding window attention") - start_idx = max(0, prompt_len - self.sliding_window) - for i in range(computed_len, prompt_len): + start_idx = max(0, seq_len - self.sliding_window) + for i in range(context_len, seq_len): if i < start_idx: slot_mapping[-1].append(_PAD_SLOT_ID) continue @@ -240,18 +339,25 @@ def _prepare_prompt( slot = block_number * self.block_size + block_offset slot_mapping[-1].append(slot) - max_subquery_len = max(subquery_lens) - max_prompt_len = max(prompt_lens) - num_prompt_tokens = len(input_tokens) - assert max_subquery_len > 0 - - lora_index_mapping = lora_index_mapping + max_query_len = max(query_lens) + max_seq_len = max(seq_lens) + assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, dtype=torch.int, device=self.device) + + if multi_modal_input_list: + assert self.vision_language_config, ( + "Multi-modal inputs are only supported by " + "vision language models.") + multi_modal_input = torch.cat(multi_modal_input_list, + dim=0).to(self.device) + else: + multi_modal_input = None + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - max_prompt_len = max(prompt_lens) + max_prompt_len = max(seq_lens) input_tokens = make_tensor_with_pad(input_tokens, max_prompt_len, pad=0, @@ -271,6 +377,7 @@ def _prepare_prompt( device=self.device) # Prepare prefix block tables + max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) block_tables = make_tensor_with_pad( prefix_block_tables, max_len=max_prompt_block_table_len, @@ -281,67 +388,72 @@ def _prepare_prompt( # Query length can be shorter than key (i.e., prompt) when prefill # is chunked or prefix cached. 
- subquery_lens_tensor = torch.tensor(subquery_lens, - dtype=torch.long, - device=self.device) - subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1, + query_lens_tensor = torch.tensor(query_lens, + dtype=torch.long, + device=self.device) + subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, dtype=torch.int32, device=self.device) - prompt_lens_tensor = torch.tensor(prompt_lens, - dtype=torch.long, - device=self.device) - seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1, + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.long, + device=self.device) + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=self.device) - torch.cumsum(subquery_lens_tensor, + torch.cumsum(query_lens_tensor, dim=0, dtype=subquery_start_loc.dtype, out=subquery_start_loc[1:]) - torch.cumsum(prompt_lens_tensor, + torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, out=seq_start_loc[1:]) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - slot_mapping=slot_mapping, - prompt_lens=prompt_lens, - prompt_lens_tensor=prompt_lens_tensor, - num_prompt_tokens=num_prompt_tokens, - num_generation_tokens=0, - max_subquery_len=max_subquery_len, - max_context_len=None, - max_prompt_len=max_prompt_len, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=max_query_len, + max_seq_len=max_seq_len, subquery_start_loc=subquery_start_loc, seq_start_loc=seq_start_loc, - context_lens=context_lens_tensor, + context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, - kv_cache_dtype=self.kv_cache_dtype, ) - return (input_tokens, input_positions, attn_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests) - + return PreparePromptMetadata( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + multi_modal_input=multi_modal_input, + slot_mapping=slot_mapping, + ) def _prepare_decode( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], - List[int], Set[LoRARequest]]: - assert len(seq_group_metadata_list) > 0 + ) -> PrepareDecodeMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] - context_lens: List[int] = [] + seq_lens: List[int] = [] block_tables: List[List[int]] = [] lora_index_mapping: List[int] = [] lora_prompt_mapping: List[int] = [] lora_requests: Set[LoRARequest] = set() + if len(seq_group_metadata_list) == 0: + return PrepareDecodeMetadata.empty() + for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id @@ -358,9 +470,9 @@ def _prepare_decode( position = seq_len - 1 input_positions.append([position]) - context_len = seq_len if self.sliding_window is None else min( + seq_len = seq_len if self.sliding_window is None else min( seq_len, self.sliding_window) - context_lens.append(context_len) + seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] block_number = block_table[position // self.block_size] @@ -380,11 +492,11 @@ def _prepare_decode( # See `capture_model` API for more details. 
# For decoding requests, batch_size == input_tokens. batch_size = len(input_tokens) - max_context_len = max(context_lens) + max_seq_len = max(seq_lens) use_captured_graph = ( not self.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_context_len <= self.max_context_len_to_capture) + and max_seq_len <= self.max_seq_len_to_capture) if use_captured_graph: graph_batch_size = _get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size @@ -392,7 +504,7 @@ def _prepare_decode( input_tokens.append([0]) input_positions.append([0]) slot_mapping.append([_PAD_SLOT_ID]) - context_lens.append(1) + seq_lens.append(1) block_tables.append([]) lora_index_mapping.append(0) batch_size = graph_batch_size @@ -406,24 +518,23 @@ def _prepare_decode( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) - context_lens = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.int, + device=self.device) if use_captured_graph: # When using cuda-graph all these tensors should be # padded. - assert context_lens.shape[0] == input_tokens.shape[0] - assert context_lens.shape[0] == input_positions.shape[0] - assert context_lens.shape[0] == slot_mapping.shape[0] + assert seq_lens_tensor.shape[0] == len(input_tokens) + assert seq_lens_tensor.shape[0] == len(input_positions) + assert seq_lens_tensor.shape[0] == len(slot_mapping) # The shape of graph_block_tables is # [max batch size, max context len // block size]. - graph_max_context_len = _get_graph_max_context_len(max_context_len) - assert graph_max_context_len >= max_context_len - graph_block_count = math.ceil(graph_max_context_len / self.block_size) + graph_max_seq_len = _get_graph_max_context_len(max_seq_len) + assert graph_max_seq_len >= max_seq_len + graph_block_count = math.ceil(graph_max_seq_len / self.block_size) input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] - for i, block_table in enumerate(block_tables): if block_table: input_block_tables[i, :len(block_table)] = block_table @@ -438,26 +549,28 @@ def _prepare_decode( dtype=torch.int, device=self.device, ) - attn_metadata = self.attn_backend.make_metadata( is_prompt=False, - slot_mapping=slot_mapping, - prompt_lens=None, - prompt_lens_tensor=None, - num_prompt_tokens=0, - num_generation_tokens=len(input_tokens), - max_subquery_len=None, - max_context_len=max_context_len, - max_prompt_len=None, + seq_lens=None, + seq_lens_tensor=seq_lens_tensor, + max_query_len=None, + max_seq_len=max_seq_len, subquery_start_loc=None, seq_start_loc=None, - context_lens=context_lens, + context_lens_tensor=None, block_tables=block_tables, use_cuda_graph=use_captured_graph, - kv_cache_dtype=self.kv_cache_dtype, ) - return (input_tokens, input_positions, attn_metadata, - lora_index_mapping, lora_prompt_mapping, lora_requests) + return PrepareDecodeMetadata( + input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + slot_mapping=slot_mapping, + ) + def _prepare_sample( self, @@ -558,6 +671,164 @@ def _prepare_sample( return sampling_metadata def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, + Set[LoRARequest], LoRAMapping, torch.Tensor]: + if self.is_driver_worker: + prefill_reqs = [] + decode_reqs = 
[] + for seq_group_meta in seq_group_metadata_list: + if seq_group_meta.is_prompt: + prefill_reqs.append(seq_group_meta) + else: + decode_reqs.append(seq_group_meta) + + # Prepare input tensors. + ( + input_tokens, + input_positions, + prefill_attn_metadata, + seq_lens, + query_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_input, + slot_mapping, + ) = self._prepare_prompt(prefill_reqs) + ( + decode_input_tokens, + decode_input_positions, + decode_attn_metadata, + decode_lora_index_mapping, + decode_lora_prompt_mapping, + decode_lora_requests, + decode_slot_mapping, + ) = self._prepare_decode(decode_reqs) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, seq_lens, query_lens, self.device, + self.pin_memory) + + if not self.scheduler_config.chunked_prefill_enabled: + assert (len(prefill_reqs) and len(decode_reqs)) == 0 + + num_prefills = len(seq_lens) + num_prefill_tokens = len(input_tokens) + num_decode_tokens = len(decode_input_tokens) + + # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. + assert (num_prefills == 0 and num_decode_tokens > 0) or (num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" + if num_decode_tokens > 0: + input_tokens = decode_input_tokens + input_positions = decode_input_positions + slot_mapping = decode_slot_mapping + lora_index_mapping = decode_lora_index_mapping + lora_prompt_mapping = decode_lora_prompt_mapping + lora_requests = decode_lora_requests + + if self.lora_config: + lora_mapping = LoRAMapping( + lora_index_mapping, + lora_prompt_mapping, + ) + else: + lora_mapping = None + + # Broadcast the metadata. + # If batch contains both prefill and decode, it sends 2 broadcasts. + # If it only contains 1 type, it triggers a single broadcast. + if (prefill_attn_metadata is not None + and decode_attn_metadata is not None): + batch_type = BatchType.MIXED + raise NotImplementedError("Mixed batch is not supported on HPU") + elif prefill_attn_metadata is not None: + batch_type = BatchType.PREFILL + else: + batch_type = BatchType.DECODE + + metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": + sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + "multi_modal_input": multi_modal_input, + "num_prefill_tokens": num_prefill_tokens, + "num_decode_tokens": num_decode_tokens, + "slot_mapping": slot_mapping, + "num_prefills": num_prefills, + "batch_type": batch_type, + } + if prefill_attn_metadata is not None: + metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) + else: + assert decode_attn_metadata is not None + metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) + broadcast_tensor_dict(metadata_dict, src=0) + + # Broadcast decode attn metadata for mixed batch type. + # The additional broadcast costs 300us overhead on 4 A10 GPUs. + # We can potentially reduce the overhead by coelescing tensors. 
+ if batch_type == BatchType.MIXED: + assert decode_attn_metadata is not None + metadata_dict = decode_attn_metadata.asdict_zerocopy() + broadcast_tensor_dict(metadata_dict, src=0) + else: + metadata_dict = broadcast_tensor_dict(src=0) + input_tokens = metadata_dict.pop("input_tokens") + input_positions = metadata_dict.pop("input_positions") + slot_mapping = metadata_dict.pop("slot_mapping") + num_prefills = metadata_dict.pop("num_prefills") + selected_token_indices = metadata_dict.pop( + "selected_token_indices") + lora_mapping = metadata_dict.pop("lora_mapping") + lora_requests = metadata_dict.pop("lora_requests") + multi_modal_input = metadata_dict.pop("multi_modal_input") + num_prefill_tokens = metadata_dict.pop("num_prefill_tokens") + num_decode_tokens = metadata_dict.pop("num_decode_tokens") + batch_type = metadata_dict.pop("batch_type") + + # Create an attention metadata. + prefill_attn_metadata = None + decode_attn_metadata = None + if batch_type == BatchType.PREFILL or batch_type == BatchType.MIXED: + prefill_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + else: + decode_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + sampling_metadata = SamplingMetadata( + seq_groups=None, + selected_token_indices=selected_token_indices, + categorized_sample_indices=None, + num_prompts=0, + ) + + # if it is a mixed batch, decode attn_metadata is broadcasted + # separately. + if batch_type == BatchType.MIXED: + metadata_dict = broadcast_tensor_dict(src=0) + decode_attn_metadata = self.attn_backend.make_metadata( + **metadata_dict) + + attn_metadata = AttentionMetadata( + num_prefills=num_prefills, + slot_mapping=slot_mapping, + num_prefill_tokens=num_prefill_tokens, + num_decode_tokens=num_decode_tokens, + prefill_metadata=prefill_attn_metadata, + decode_metadata=decode_attn_metadata, + kv_cache_dtype=self.kv_cache_dtype, + ) + + return (input_tokens, input_positions, attn_metadata, + sampling_metadata, lora_requests, lora_mapping, + multi_modal_input) + + + def _old_prepare_input_tensors( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, @@ -629,14 +900,16 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, - lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list) + lora_requests, lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) - # Execute the model. - if attn_metadata.use_cuda_graph: + # Currently HPU graph is only supported by the decode phase. 
+ prefill_meta = attn_metadata.prefill_metadata + decode_meta = attn_metadata.decode_metadata + if prefill_meta is None and decode_meta.use_cuda_graph: graph_batch_size = input_tokens.shape[0] graph_block_count = attn_metadata.block_tables.shape[1] graph_runner_key = (graph_batch_size, graph_block_count) @@ -644,25 +917,30 @@ def execute_model( logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(attn_metadata.context_lens).item()})") else: model_executable = self.model - hidden_states = model_executable( - input_ids=input_tokens, - positions=input_positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - ) + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + hidden_states = model_executable(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) - # Compute the logits. + + # Compute the logits. logits = self.model.compute_logits(hidden_states, sampling_metadata) # Only perform sampling in the driver worker. - if not sampling_metadata.perform_sampling: + if not self.is_driver_worker: return None - + # Sample the next token. output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, ) + return output @torch.inference_mode() @@ -697,6 +975,17 @@ def profile_run(self) -> None: # Profile memory usage with max_num_sequences sequences and the total # number of tokens equal to max_num_batched_tokens. seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for vision encoding, which needs + # to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + if self.vision_language_config: + max_num_seqs = min( + max_num_seqs, + int(max_num_batched_tokens / + self.vision_language_config.image_feature_size)) for group_id in range(max_num_seqs): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) @@ -719,12 +1008,12 @@ def profile_run(self) -> None: torch.hpu.synchronize() return - def remove_all_loras(self) -> bool: + def remove_all_loras(self): if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_all_loras() + self.lora_manager.remove_all_loras() - def set_active_loras(self, lora_requests: List[LoRARequest], + def set_active_loras(self, lora_requests: Set[LoRARequest], lora_mapping: LoRAMapping) -> None: if not self.lora_manager: raise RuntimeError("LoRA is not enabled.") @@ -761,7 +1050,6 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: """ # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never # deleted before the CUDA graphs. - self.cupy_nccl_backend = cupy_utils.get_nccl_backend() assert not self.model_config.enforce_eager logger.info("Capturing the model for HPUGraphs. This may lead to " @@ -841,10 +1129,9 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: # Create dummy attn_metadata. 
attn_metadata = self.attn_backend.make_metadata( is_prompt=False, - slot_mapping=slot_mapping[:batch_size], prompt_lens=None, prompt_lens_tensor=None, - num_prompt_tokens=0, + num_prefill_tokens=0, num_generation_tokens=batch_size, max_subquery_len=None, max_context_len=block_count*self.block_size, @@ -900,7 +1187,6 @@ def __del__(self) -> None: # happen. # FIXME(woosuk): This is a bit hacky. Find a more robust solution. self.graph_runners.clear() - self.cupy_nccl_backend = None @property def vocab_size(self) -> int: @@ -1079,10 +1365,9 @@ def __init__(self, model): def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, block_tables): wrapper_attn_metadata = self.attn_backend.make_metadata( is_prompt=attn_metadata.is_prompt, - slot_mapping=slot_mapping, - prompt_lens=None, - prompt_lens_tensor=None, - num_prompt_tokens=0, + seq_lens=None, + seq_lens_tensor=None, + num_prefill_tokens=0, num_generation_tokens=attn_metadata.num_generation_tokens, max_subquery_len=None, max_context_len=attn_metadata.max_context_len, diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bbfd7dad7f90a..a05eee90648b2 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -4,27 +4,28 @@ import gc import os -from typing import Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import torch import habana_frameworks.torch as htorch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) +from vllm.distributed import (broadcast_tensor_dict, + ensure_model_parallel_initialized, + get_tensor_model_parallel_cpu_group, + init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed -from vllm.model_executor.parallel_utils.communication_op import ( - broadcast_tensor_dict) -from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar -from vllm.model_executor.parallel_utils.parallel_state import ( - ensure_model_parallel_initialized) -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.worker_base import WorkerBase -class HabanaWorker: +class HabanaWorker(WorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. 
The worker is responsible for @@ -38,37 +39,51 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, local_rank: int, rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - kv_cache_dtype: Optional[str] = "auto", + vision_language_config: Optional[VisionLanguageConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config + self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method self.lora_config = lora_config + self.load_config = load_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + self.vision_language_config = vision_language_config + if self.vision_language_config: + assert not self.lora_config, ( + "To be tested: vision language model with LoRA settings.") + assert False, "To be tested: vision language model on HPU" + self.model_runner = HabanaModelRunner(model_config, parallel_config, scheduler_config, device_config, + load_config=load_config, lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, + kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by - # self.init_cache_engine(). - self.cache_config = None - self.cache_engine = None - self.hpu_cache = None + # initialize_cache. + self.cache_engine: CacheEngine + self.hpu_cache: List[torch.Tensor] def init_device(self) -> None: if self.device_config.device.type == "hpu": @@ -79,8 +94,9 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - init_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method) + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) # Set random seed. set_random_seed(self.model_config.seed) @@ -88,20 +104,17 @@ def load_model(self): self.model_runner.load_model() @torch.inference_mode() - def profile_num_available_blocks( - self, - block_size: int, - hpu_memory_utilization: float, - cpu_swap_space: int, - cache_dtype: str, - ) -> Tuple[int, int]: - """Profiles the peak memory usage of the model and returns the maximum - number of HPU and CPU cache blocks that can be allocated. - - Args: - block_size: The size of the cache block. - hpu_memory_utilization: The fraction of the total HPU memory to use. - cpu_swap_space: The size of the CPU swap space in bytes. + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. 
""" # Profile the memory usage of the model and get the maximum number of # cache blocks that can be allocated with the remaining free memory. @@ -118,15 +131,15 @@ def profile_num_available_blocks( # HPU did not change their memory usage during the profiling. peak_memory = self.init_hpu_memory - free_hpu_memory assert peak_memory > 0, ( - "Error in memory profiling. This happens when the hpu memory was " + "Error in memory profiling. This happens when the HPU memory was " "not properly cleaned up before initializing the vLLM instance.") - cache_block_size = self.get_cache_block_size_bytes( - block_size, cache_dtype) + cache_block_size = self.get_cache_block_size_bytes() num_hpu_blocks = int( - (total_hpu_memory * hpu_memory_utilization - peak_memory) // - cache_block_size) - num_cpu_blocks = int(cpu_swap_space // cache_block_size) + (total_hpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) if self.model_runner.lora_manager: @@ -134,15 +147,31 @@ def profile_num_available_blocks( gc.collect() return num_hpu_blocks, num_cpu_blocks - def init_cache_engine(self, cache_config: CacheConfig) -> None: - self.cache_config = cache_config + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. + + This also warms up the model, which may record CUDA graphs. + """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._init_cache_engine() + self._warm_up_model() + + def _init_cache_engine(self) -> None: + assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.hpu_cache = self.cache_engine.gpu_cache self.model_runner.set_block_size(self.cache_engine.block_size) htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution - def warm_up_model(self) -> None: + def _warm_up_model(self) -> None: if not self.model_config.enforce_eager: self.model_runner.capture_model(self.hpu_cache) # Reset the seed to ensure that the random state is not affected by @@ -153,7 +182,7 @@ def cache_swap( self, blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], + blocks_to_copy: torch.Tensor, ) -> None: # Issue cache operations. # TODO(woosuk): Profile swapping overhead and optimize if needed. 
@@ -161,24 +190,29 @@ def cache_swap( self.cache_engine.swap_in(blocks_to_swap_in) if blocks_to_swap_out: self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy: + if blocks_to_copy.numel() > 0: self.cache_engine.copy(blocks_to_copy) @torch.inference_mode() def execute_model( self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None, - blocks_to_swap_in: Optional[Dict[int, int]] = None, - blocks_to_swap_out: Optional[Dict[int, int]] = None, - blocks_to_copy: Optional[Dict[int, List[int]]] = None, - ) -> Optional[SamplerOutput]: + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + if execute_model_req is None: + seq_group_metadata_list = None + else: + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + if self.is_driver_worker: assert seq_group_metadata_list is not None + assert execute_model_req is not None num_seq_groups = len(seq_group_metadata_list) - assert blocks_to_swap_in is not None - assert blocks_to_swap_out is not None - assert blocks_to_copy is not None - data = { + blocks_to_swap_in = execute_model_req.blocks_to_swap_in + blocks_to_swap_out = execute_model_req.blocks_to_swap_out + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + data: Dict[str, Any] = { "num_seq_groups": num_seq_groups, "blocks_to_swap_in": blocks_to_swap_in, "blocks_to_swap_out": blocks_to_swap_out, @@ -196,11 +230,11 @@ def execute_model( # If there is no input, we don't need to execute the model. if num_seq_groups == 0: - return {} + return [] output = self.model_runner.execute_model(seq_group_metadata_list, self.hpu_cache) - return output + return [output] def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") @@ -219,21 +253,27 @@ def max_model_len(self) -> int: def vocab_size(self) -> int: return self.model_runner.vocab_size - def get_cache_block_size_bytes(self, block_size: int, - cache_dtype: str) -> int: + def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return CacheEngine.get_cache_block_size(block_size, cache_dtype, + return CacheEngine.get_cache_block_size(self.cache_config, self.model_config, self.parallel_config) -def init_distributed_environment( +def init_worker_distributed_environment( parallel_config: ParallelConfig, rank: int, distributed_init_method: Optional[str] = None, + local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + init_distributed_environment(parallel_config.world_size, rank, + distributed_init_method, local_rank, backend='hccl') + + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() if torch_world_size != parallel_config.world_size: @@ -257,7 +297,17 @@ def init_distributed_environment( torch.distributed.all_reduce(torch.zeros(1).to('hpu')) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - - # Initialize a custom fast all-reduce implementation. - if not parallel_config.disable_custom_all_reduce: - init_custom_ar() +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, + max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. 
" + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") From 737c767cefcfef179b65afd335bc13d5dda39917 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 8 May 2024 20:36:24 +0300 Subject: [PATCH 003/819] fix hpugraph capture/replay post rebase --- vllm/worker/habana_model_runner.py | 269 ++++++----------------------- 1 file changed, 56 insertions(+), 213 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a25a09c2598fd..e418ccc1d5c62 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -52,9 +52,9 @@ ] # Capture graphs for token size 1, 32, 64, 128, 256, 512, 768 ... 2048 -_MAX_CONTEXT_LEN_ALIGNMENT = 256 -_MAX_CONTEXT_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ - _MAX_CONTEXT_LEN_ALIGNMENT * i for i in range(1, 9) +_MAX_SEQ_LEN_ALIGNMENT = 256 +_MAX_SEQ_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ + _MAX_SEQ_LEN_ALIGNMENT * i for i in range(1, 9) ] @@ -531,7 +531,7 @@ def _prepare_decode( # The shape of graph_block_tables is # [max batch size, max context len // block size]. - graph_max_seq_len = _get_graph_max_context_len(max_seq_len) + graph_max_seq_len = _get_graph_max_seq_len(max_seq_len) assert graph_max_seq_len >= max_seq_len graph_block_count = math.ceil(graph_max_seq_len / self.block_size) input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] @@ -572,104 +572,6 @@ def _prepare_decode( ) - def _prepare_sample( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - prompt_lens: List[int], - subquery_lens: Optional[List[int]], - ) -> SamplingMetadata: - seq_groups: List[Tuple[List[int], SamplingParams]] = [] - selected_token_indices: List[int] = [] - generators: List[torch.Generator] = [] - selected_token_start_idx = 0 - categorized_sample_indices = {t: [] for t in SamplingType} - categorized_sample_indices_start_idx = 0 - categorized_sampled_token_indices_start_idx = 0 - max_subquery_len = max(subquery_lens) if subquery_lens else 1 - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - seq_ids = list(seq_group_metadata.seq_data.keys()) - sampling_params = seq_group_metadata.sampling_params - seq_groups.append((seq_ids, sampling_params)) - - if seq_group_metadata.is_prompt: - assert len(seq_ids) == 1 - assert subquery_lens is not None - subquery_len = subquery_lens[i] - if sampling_params.prompt_logprobs is not None: - # NOTE: prompt token positions do not need sample, skip - categorized_sample_indices_start_idx += subquery_len - 1 - - categorized_sample_indices[ - sampling_params.sampling_type].append([ - categorized_sample_indices_start_idx, - categorized_sampled_token_indices_start_idx - ]) - categorized_sample_indices_start_idx += 1 - categorized_sampled_token_indices_start_idx += 1 - - if sampling_params.prompt_logprobs is not None: - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + subquery_len - 1)) - selected_token_indices.append(selected_token_start_idx + - subquery_len - 1) - selected_token_start_idx += max_subquery_len - - if sampling_params.seed is not None: - seq_group_metadata.state.generator = torch.Generator( - 
device=self.device).manual_seed(sampling_params.seed) - else: - num_seqs = len(seq_ids) - selected_token_indices.extend( - range(selected_token_start_idx, - selected_token_start_idx + num_seqs)) - selected_token_start_idx += num_seqs - - categorized_sample_indices[ - sampling_params.sampling_type].extend( - zip( - range( - categorized_sample_indices_start_idx, - categorized_sample_indices_start_idx + - num_seqs), - range( - categorized_sampled_token_indices_start_idx, - categorized_sampled_token_indices_start_idx + - num_seqs))) - categorized_sample_indices_start_idx += num_seqs - categorized_sampled_token_indices_start_idx += num_seqs - - if sampling_params.seed is not None: - generators.append(seq_group_metadata.state.generator) - - selected_token_indices = async_tensor_h2d(selected_token_indices, - dtype=torch.long, - target_device=self.device, - pin_memory=self.pin_memory) - - categorized_sample_indices = { - t: maybe_expand_dim( - async_tensor_h2d(seq_ids, - dtype=torch.int, - target_device=self.device, - pin_memory=self.pin_memory), 2, 2) - for t, seq_ids in categorized_sample_indices.items() - } - - seq_data: Dict[int, SequenceData] = {} - for seq_group_metadata in seq_group_metadata_list: - seq_data.update(seq_group_metadata.seq_data) - - sampling_metadata = SamplingMetadata( - seq_groups=seq_groups, - seq_data=seq_data, - prompt_lens=prompt_lens, - selected_token_indices=selected_token_indices, - categorized_sample_indices=categorized_sample_indices, - generators=generators, - ) - return sampling_metadata - def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -828,71 +730,6 @@ def prepare_input_tensors( multi_modal_input) - def _old_prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[int], LoRAMapping]: - if self.is_driver_worker: - # NOTE: We assume that all sequences in the group are all prompts or - # all decodes. - is_prompt = seq_group_metadata_list[0].is_prompt - # Prepare input tensors. - if is_prompt: - (input_tokens, input_positions, attn_metadata, prompt_lens, - subquery_lens, lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_prompt(seq_group_metadata_list) - else: - (input_tokens, input_positions, attn_metadata, - lora_index_mapping, lora_prompt_mapping, - lora_requests) = self._prepare_decode(seq_group_metadata_list) - prompt_lens = [] - subquery_lens = None - sampling_metadata = self._prepare_sample(seq_group_metadata_list, - prompt_lens, - subquery_lens) - - if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) - else: - lora_mapping = None - - # Broadcast the metadata. 
- metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - } - metadata_dict.update(attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - attn_metadata = self.attn_backend.make_metadata(**metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - seq_data=None, - prompt_lens=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - generators=None, - perform_sampling=False, - ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, lora_requests, lora_mapping) - @torch.inference_mode() def execute_model( self, @@ -911,10 +748,10 @@ def execute_model( decode_meta = attn_metadata.decode_metadata if prefill_meta is None and decode_meta.use_cuda_graph: graph_batch_size = input_tokens.shape[0] - graph_block_count = attn_metadata.block_tables.shape[1] + graph_block_count = decode_meta.block_tables.shape[1] graph_runner_key = (graph_batch_size, graph_block_count) model_executable = self.graph_runners[graph_runner_key] - logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(attn_metadata.context_lens).item()})") + logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(decode_meta.seq_lens_tensor).item()})") else: model_executable = self.model execute_model_kwargs = { @@ -1088,33 +925,33 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. valid_combinations = [] - total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_CONTEXT_LENS_TO_CAPTURE) + total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_SEQ_LENS_TO_CAPTURE) import pandas as pd - df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_CONTEXT_LENS_TO_CAPTURE) - for idx, (batch_size, max_context_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_CONTEXT_LENS_TO_CAPTURE))): - block_count = math.ceil(max_context_len / self.block_size) + df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_SEQ_LENS_TO_CAPTURE) + for idx, (batch_size, max_seq_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_SEQ_LENS_TO_CAPTURE))): + block_count = math.ceil(max_seq_len / self.block_size) # Skip capture of "out-of-bound" batch sizes and context lengths if batch_size > self.scheduler_config.max_num_seqs: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Batch out of bound.") - df[max_context_len][batch_size] = 'batch OoB' + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. 
Reason: Batch out of bound.") + df[max_seq_len][batch_size] = 'batch OoB' continue - if max_context_len > self.max_context_len_to_capture: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Nax context length out of bound.") - df[max_context_len][batch_size] = 'ctx OoB' + if max_seq_len > self.max_seq_len_to_capture: + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Nax context length out of bound.") + df[max_seq_len][batch_size] = 'ctx OoB' continue - block_count = math.ceil(max_context_len / self.block_size) + block_count = math.ceil(max_seq_len / self.block_size) captured_block_counts = [math.ceil(cl / self.block_size) for (n, cl) in valid_combinations if n == batch_size] if block_count in captured_block_counts: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Reason: Block size already captured.") - df[max_context_len][batch_size] = 'redundant' + logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Block size already captured.") + df[max_seq_len][batch_size] = 'redundant' continue - logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}. Constraints met.") - df[max_context_len][batch_size] = 'VALID' - valid_combinations.append((batch_size, max_context_len)) + logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Constraints met.") + df[max_seq_len][batch_size] = 'VALID' + valid_combinations.append((batch_size, max_seq_len)) total_valid_hpugraphs = len(valid_combinations) logger.info(f"Starting capture {total_valid_hpugraphs} valid HPUGraphs. Skipping capture of {total_combinations-total_valid_hpugraphs}/{total_combinations} graphs due to batch/context constraints.") - logger.debug(f"Capture summary (row: batch_size; col: max_context_len):") + logger.debug(f"Capture summary (row: batch_size; col: max_seq_len):") logger.debug(tabulate.tabulate(df, tablefmt='mixed_outline', headers='keys', showindex="always")) graph_runner_name = self.graph_runner_class.__name__ @@ -1124,23 +961,28 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all - for idx, (batch_size, max_context_len) in enumerate(pbar): - block_count = math.ceil(max_context_len / self.block_size) + for idx, (batch_size, max_seq_len) in enumerate(pbar): + block_count = math.ceil(max_seq_len / self.block_size) # Create dummy attn_metadata. 
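Editorial note on the skip logic above: several (batch_size, max_seq_len) pairs collapse onto the same block count, so only the first pair per batch size actually gets a graph. A minimal CPU-only sketch of that deduplication; block_size and the two lists below are illustrative stand-ins for the real _BATCH_SIZES_TO_CAPTURE / _MAX_SEQ_LENS_TO_CAPTURE values:

import itertools
import math

block_size = 128                      # assumed cache block size
batch_sizes = [1, 2, 4, 8]            # illustrative subset
max_seq_lens = [1, 32, 64, 128, 256, 512]

valid = []
for bs, seq in itertools.product(reversed(batch_sizes), reversed(max_seq_lens)):
    block_count = math.ceil(seq / block_size)
    # Same dedup rule as in capture_model: skip if this batch size already
    # has a graph captured for the same block count.
    if block_count in [math.ceil(s / block_size) for (b, s) in valid if b == bs]:
        continue
    valid.append((bs, seq))

# seq lens 1, 32, 64 and 128 all map to block_count 1, so only one of them
# survives per batch size: 4 batch sizes x 3 distinct block counts = 12 graphs.
assert len(valid) == 12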
- attn_metadata = self.attn_backend.make_metadata( + decode_metadata = self.attn_backend.make_metadata( is_prompt=False, - prompt_lens=None, - prompt_lens_tensor=None, - num_prefill_tokens=0, - num_generation_tokens=batch_size, - max_subquery_len=None, - max_context_len=block_count*self.block_size, - max_prompt_len=None, + seq_lens=None, + seq_lens_tensor=context_lens[:batch_size], + max_query_len=None, + max_seq_len=block_count*self.block_size, subquery_start_loc=None, seq_start_loc=None, - context_lens=context_lens[:batch_size], + context_lens_tensor=None, # NOTE(kzawora): this seems sus, shoudn't we have seq_lens tensor here? block_tables=block_tables[:batch_size, :block_count], use_cuda_graph=True, + ) + attn_metadata = AttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=batch_size, + slot_mapping=slot_mapping[:batch_size], + prefill_metadata=None, + decode_metadata=decode_metadata, kv_cache_dtype=self.kv_cache_dtype, ) @@ -1153,7 +995,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: graph_runner = self.graph_runner_class(self.model) local_start_mem = HabanaMemoryProfiler.current_memory_usage() capture_start = time.time() - desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' + desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' pbar.set_description(desc) logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}...") profiling_ctx = contextlib.nullcontext() if not (log_graph_compilation_all or log_graph_compilation) else metric_localcontext("graph_compilation") @@ -1165,12 +1007,12 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: attn_metadata, ) if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: - logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_context_len {max_context_len}, block_count {block_count}") + logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}") self.graph_runners[(batch_size, block_count)] = graph_runner capture_end = time.time() local_end_mem = HabanaMemoryProfiler.current_memory_usage() mem_usage_str = format_bytes(local_end_mem - local_start_mem) - graph_mem_usage_df[max_context_len][batch_size] = mem_usage_str + graph_mem_usage_df[max_seq_len][batch_size] = mem_usage_str logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}... done in {capture_end-capture_start:.2f} seconds! Took {mem_usage_str} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") end_time = time.perf_counter() @@ -1178,7 +1020,7 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None: # This usually takes < 10 seconds. 
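For context, at decode time (as of this patch) the captured runner is looked up by a (padded batch size, block count) key that matches the keys stored by the capture loop above. A rough sketch of that lookup, with a stand-in dictionary instead of self.graph_runners; the concrete numbers are assumptions:

import math

# Hypothetical captured runners, keyed like self.graph_runners:
# (graph_batch_size, graph_block_count) -> runner (stubbed as strings).
graph_runners = {(4, 1): "runner_4x1", (4, 2): "runner_4x2"}

block_size = 128        # assumed cache block size
batch_size = 3          # actual number of decode sequences
max_seq_len = 200       # longest context in the batch

graph_batch_size = 4    # what _get_graph_batch_size(3) would presumably return
graph_block_count = math.ceil(256 / block_size)  # 256 = bucketed max_seq_len;
                                                 # at runtime this is read from
                                                 # the padded block_tables shape

assert graph_runners[(graph_batch_size, graph_block_count)] == "runner_4x2"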
end_mem = HabanaMemoryProfiler.current_memory_usage() logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - logger.info(f"Graph memory allocation summary (row: batch_size; col: max_context_len):") + logger.info(f"Graph memory allocation summary (row: batch_size; col: max_seq_len):") logger.info(tabulate.tabulate(graph_mem_usage_df, tablefmt='mixed_outline', headers='keys', showindex="always")) def __del__(self) -> None: @@ -1312,8 +1154,8 @@ def capture( "positions": positions, "kv_caches": kv_caches, "slot_mapping": attn_metadata.slot_mapping, - "context_lens": attn_metadata.context_lens, - "block_tables": attn_metadata.block_tables, + "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, + "block_tables": attn_metadata.decode_metadata.block_tables, } self.output_buffers = {"hidden_states": hidden_states} return @@ -1324,6 +1166,7 @@ def forward( positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, + **kwargs, ) -> torch.Tensor: # KV caches are fixed tensors, so we don't need to copy them. del kv_caches @@ -1333,10 +1176,10 @@ def forward( self.input_buffers["positions"].copy_(positions, non_blocking=True) self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, non_blocking=True) - self.input_buffers["context_lens"].copy_(attn_metadata.context_lens, - non_blocking=True) - self.input_buffers["block_tables"].copy_(attn_metadata.block_tables, - non_blocking=True) + self.input_buffers["seq_lens_tensor"].copy_( + attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) + self.input_buffers["block_tables"].copy_( + attn_metadata.decode_metadata.block_tables, non_blocking=True) # Run the graph. self.graph.replay() @@ -1370,7 +1213,7 @@ def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, b num_prefill_tokens=0, num_generation_tokens=attn_metadata.num_generation_tokens, max_subquery_len=None, - max_context_len=attn_metadata.max_context_len, + max_seq_len=attn_metadata.max_seq_len, max_prompt_len=None, subquery_start_loc=None, seq_start_loc=None, @@ -1436,18 +1279,18 @@ def _get_graph_batch_size(batch_size: int) -> int: _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) -def _get_graph_max_context_len(max_context_len: int) -> int: +def _get_graph_max_seq_len(max_seq_len: int) -> int: """Returns the padded batch size given actual batch size. Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... 
""" - if max_context_len <= 32: + if max_seq_len <= 32: return 32 - elif max_context_len <= 64: + elif max_seq_len <= 64: return 64 - elif max_context_len <= 128: + elif max_seq_len <= 128: return 128 else: - return ((max_context_len + _MAX_CONTEXT_LEN_ALIGNMENT - 1) // - _MAX_CONTEXT_LEN_ALIGNMENT * _MAX_CONTEXT_LEN_ALIGNMENT) + return ((max_seq_len + _MAX_SEQ_LEN_ALIGNMENT - 1) // + _MAX_SEQ_LEN_ALIGNMENT * _MAX_SEQ_LEN_ALIGNMENT) From b5d403780ef615779d10e460acca904cc4206aec Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 14:11:55 +0300 Subject: [PATCH 004/819] re-enable 8x hpu support --- vllm/distributed/communication_op.py | 3 +- vllm/executor/ray_habana_executor.py | 324 ++++++------------ .../model_executor/layers/logits_processor.py | 4 +- vllm/worker/habana_worker.py | 7 +- 4 files changed, 116 insertions(+), 222 deletions(-) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 80d03129bdb9b..7b2905af7e0ab 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -3,6 +3,7 @@ import torch from torch.distributed import ProcessGroup +from vllm.utils import is_hpu from .parallel_state import (get_cpu_world_group, get_tensor_model_parallel_group, @@ -156,7 +157,7 @@ def _split_tensor_dict( # because it contains not only the device type but also the device # index (e.g. "cuda:0"). We only need the device type. # receiving side will set the device index. - device = "cpu" if value.is_cpu else "cuda" + device = "cpu" if value.is_cpu else ("hpu" if is_hpu() else "cuda") metadata_list.append( (key, TensorMetadata(device, value.dtype, value.size()))) tensor_list.append(value) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index dac8eefb18adc..a17f509f11658 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -1,20 +1,18 @@ import asyncio -import copy import os import pickle from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from itertools import islice, repeat +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) -from vllm.engine.ray_utils import RayWorkerVllm, ray -from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase -from vllm.executor.utils import check_block_size_valid +import vllm.envs as envs +from vllm.executor.distributed_gpu_executor import ( # yapf: disable + DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.lora.request import LoRARequest -from vllm.sequence import SamplerOutput, SequenceGroupMetadata +from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async) + get_vllm_instance_id, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -24,29 +22,14 @@ logger = init_logger(__name__) -# If the env var is set, it uses the Ray's compiled DAG API -# which optimizes the control plane overhead. -# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. 
-USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0)) +USE_RAY_COMPILED_DAG = envs.VLLM_USE_RAY_COMPILED_DAG -class RayHabanaExecutor(ExecutorBase): +class RayHabanaExecutor(DistributedGPUExecutor): - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config + def _init_executor(self) -> None: + assert (not self.speculative_config + ), "Speculative decoding not yet supported for RayGPU backend." assert self.parallel_config.worker_use_ray placement_group = self.parallel_config.placement_group @@ -59,9 +42,6 @@ def __init__( # Create the parallel GPU workers. self._init_workers_ray(placement_group) - # Profile the memory usage and initialize the cache. - self._init_cache() - self.forward_dag = None if USE_RAY_COMPILED_DAG: self.forward_dag = self._compiled_ray_dag() @@ -77,9 +57,9 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. - self.driver_dummy_worker: RayWorkerVllm = None + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerVllm] = [] + self.workers: List[RayWorkerWrapper] = [] # Create the workers. driver_ip = get_ip() @@ -97,13 +77,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", resources={'HPU': num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerVllm).remote(self.model_config.trust_remote_code) + )(RayWorkerWrapper).remote( + worker_module_name="vllm.worker.habana_worker", + worker_class_name="HabanaWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) worker_ip = ray.get(worker.get_node_ip.remote()) if worker_ip == driver_ip and self.driver_dummy_worker is None: # If the worker is on the same node as the driver, we use it # as the resource holder for the driver process. self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + worker_module_name="vllm.worker.habana_worker", + worker_class_name="HabanaWorker", + trust_remote_code=self.model_config.trust_remote_code, + ) else: # Else, added to the list of workers. self.workers.append(worker) @@ -115,201 +104,120 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", "GPU node.") # Get the set of GPU IDs used on each node. 
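The workers above are now created through RayWorkerWrapper with a module/class name pair instead of a concrete class, so the HabanaWorker import happens inside the Ray actor, after per-process environment variables are set. A simplified stand-in for that pattern (illustrative only, not vLLM's actual wrapper):

import importlib

class LazyWorkerWrapper:
    """Sketch: resolve and construct the worker class lazily, by name."""

    def __init__(self, worker_module_name: str, worker_class_name: str):
        self.worker_module_name = worker_module_name
        self.worker_class_name = worker_class_name
        self.worker = None

    def init_worker(self, *args, **kwargs):
        # The import happens here, inside the (remote) process.
        module = importlib.import_module(self.worker_module_name)
        worker_cls = getattr(module, self.worker_class_name)
        self.worker = worker_cls(*args, **kwargs)

    def execute_method(self, method: str, *args, **kwargs):
        return getattr(self.worker, method)(*args, **kwargs)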
- driver_node_id, driver_gpu_ids = ray.get( - self.driver_dummy_worker.get_node_and_gpu_ids.remote()) - worker_node_and_gpu_ids = ray.get( - [worker.get_node_and_gpu_ids.remote() for worker in self.workers]) + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) node_workers = defaultdict(list) node_gpus = defaultdict(list) - node_workers[driver_node_id].append(0) - node_gpus[driver_node_id].extend(driver_gpu_ids) - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids, - start=1): + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for (node_id, _) in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) - # Lazy import the Worker to avoid importing torch.cuda/xformers - # before CUDA_VISIBLE_DEVICES is set in the Worker - from vllm.worker.habana_worker import HabanaWorker - - model_config = copy.deepcopy(self.model_config) - parallel_config = copy.deepcopy(self.parallel_config) - scheduler_config = copy.deepcopy(self.scheduler_config) - device_config = copy.deepcopy(self.device_config) - lora_config = copy.deepcopy(self.lora_config) - kv_cache_dtype = self.cache_config.cache_dtype - - # Initialize the actual workers with the Worker class. - for rank, (worker, (node_id, _)) in enumerate( - zip(self.workers, worker_node_and_gpu_ids), - start=1, - ): - local_rank = node_workers[node_id].index(rank) - worker.init_worker.remote( - lambda rank=rank, local_rank=local_rank: HabanaWorker( - model_config, - parallel_config, - scheduler_config, - device_config, - local_rank, - rank, - distributed_init_method, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - )) - - # Initialize the driver worker with the Worker class. - driver_rank = 0 - driver_local_rank = node_workers[driver_node_id].index(driver_rank) - self.driver_worker = HabanaWorker( - self.model_config, - self.parallel_config, - self.scheduler_config, - self.device_config, - driver_local_rank, - driver_rank, - distributed_init_method, - lora_config=self.lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=True, - ) + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) self._run_workers("init_device") - self._run_workers( - "load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers, - ) - - def _init_cache(self) -> None: - """Profiles the memory usage and initializes the KV cache. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. 
- More details can be found in the - :meth:`~vllm.worker.worker.Worker.profile_num_available_blocks` method - from class :class:`~vllm.worker.Worker`. - - Afterwards, as there may be multiple workers, - we take the minimum number of blocks across all workers - to ensure this can be applied to all of them. - - Finally, the engine will initialize the KV cache - with the calculated number of blocks. - - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Get the maximum number of blocks that can be allocated on GPU and CPU. - num_blocks = self._run_workers( - "profile_num_available_blocks", - block_size=self.cache_config.block_size, - hpu_memory_utilization=self.cache_config.gpu_memory_utilization, - cpu_swap_space=self.cache_config.swap_space_bytes, - cache_dtype=self.cache_config.cache_dtype, - ) - - # Since we use a shared centralized controller, we take the minimum - # number of blocks across all workers to make sure all the memory - # operators can be applied to all workers. - num_gpu_blocks = min(b[0] for b in num_blocks) - num_cpu_blocks = min(b[1] for b in num_blocks) - logger.info(f"# HPU blocks: {num_gpu_blocks}, " - f"# CPU blocks: {num_cpu_blocks}") - - check_block_size_valid(num_gpu_blocks, self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - # Initialize the cache. - self._run_workers("init_cache_engine", cache_config=self.cache_config) - # Warm up the model. This includes capturing the model into CUDA graph - # if enforce_eager is False. - self._run_workers("warm_up_model") - - def execute_model(self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput: + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: all_outputs = self._run_workers( "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - }, + driver_kwargs={"execute_model_req": execute_model_req}, use_ray_compiled_dag=USE_RAY_COMPILED_DAG) # Only the driver worker returns the sampling results. - output = all_outputs[0] - return output - - def add_lora(self, lora_request: LoRARequest) -> bool: - assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." - return self._run_workers( - "add_lora", - lora_request=lora_request, - ) - - def remove_lora(self, lora_id: int) -> bool: - assert lora_id > 0, "lora_id must be greater than 0." 
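The reworked _run_workers below distinguishes shared args, driver-specific args, and per-worker args: the driver consumes index 0 of all_args/all_kwargs and the Ray workers get the remainder via itertools.islice. A small sketch of that dispatch, with plain tuples standing in for remote calls (dispatch is a hypothetical helper, not part of the executor):

from itertools import islice, repeat

def dispatch(args, all_args, num_workers):
    """Mimics the all_args handling in _run_workers (sketch only)."""
    driver_args = args if all_args is None else all_args[0]
    worker_args = repeat(args, num_workers) if all_args is None \
        else islice(all_args, 1, None)
    return driver_args, list(worker_args)

# Shared args: everyone gets the same tuple.
assert dispatch(("x",), None, 2) == (("x",), [("x",), ("x",)])

# Per-rank args: element 0 goes to the driver, the rest to the workers.
per_rank = [("rank0",), ("rank1",), ("rank2",)]
assert dispatch((), per_rank, 2) == (("rank0",), [("rank1",), ("rank2",)])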
- return self._run_workers( - "remove_lora", - lora_id=lora_id, - ) - - def list_loras(self) -> List[int]: - return self._run_workers("list_loras") + return all_outputs[0] def _run_workers( self, method: str, *args, - driver_args: Optional[List[Any]] = None, + driver_args: Optional[Tuple[Any, ...]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: - """Runs the given method on all workers.""" + """Runs the given method on all workers. Can be used in the following + ways: + + - args/kwargs: All workers share the same args/kwargs + - args/kwargs and driver_args/driver_kwargs: Driver worker has + different args + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ if max_concurrent_workers: raise NotImplementedError( "max_concurrent_workers is not supported yet.") + if driver_args is None: + driver_args = args if all_args is None else all_args[0] + if driver_kwargs is None: + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + count = len(self.workers) + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, 1, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, 1, None) + if use_ray_compiled_dag: # Right now, compiled DAG can only accept a single # input. TODO(sang): Fix it. + assert self.forward_dag is not None output_channels = self.forward_dag.execute(1) else: # Start the ray workers first. ray_worker_outputs = [ - worker.execute_method.remote(method, *args, **kwargs) - for worker in self.workers + worker.execute_method.remote(method, *worker_args, + **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - # Start the driver worker after all the ray workers. - driver_worker_output = getattr(self.driver_worker, - method)(*driver_args, **driver_kwargs) - + if not use_dummy_driver: + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) + else: + assert self.driver_dummy_worker is not None + driver_worker_output = ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) # Get the results of the ray workers. if self.workers: if use_ray_compiled_dag: @@ -342,8 +250,9 @@ def _compiled_ray_dag(self): # a dummy value for now. It will be fixed soon. with InputNode() as input_data: forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote.bind(input_data) - for worker in self.workers + worker.execute_model_compiled_dag_remote. + bind( # type: ignore[attr-defined] + input_data) for worker in self.workers ]) return forward_dag.experimental_compile() @@ -365,13 +274,17 @@ def _check_if_any_actor_is_dead(self): f"Dead Workers: {dead_actors}. 
") -class RayHabanaExecutorAsync(RayHabanaExecutor, ExecutorAsyncBase): +class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.driver_executor = make_async(self.driver_worker.execute_method) async def _run_workers_async( self, method: str, *args, - driver_args: Optional[List[Any]] = None, + driver_args: Optional[Tuple[Any, ...]] = None, driver_kwargs: Optional[Dict[str, Any]] = None, **kwargs, ) -> Any: @@ -383,9 +296,8 @@ async def _run_workers_async( if driver_kwargs is None: driver_kwargs = kwargs - # Run the driver worker asynchronously. - driver_executor = make_async(getattr(self.driver_worker, method)) - coros.append(driver_executor(*driver_args, **driver_kwargs)) + coros.append( + self.driver_executor(method, *driver_args, **driver_kwargs)) # Run the ray workers asynchronously. for worker in self.workers: @@ -393,27 +305,3 @@ async def _run_workers_async( all_outputs = await asyncio.gather(*coros) return all_outputs - - async def execute_model_async( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: Dict[int, List[int]], - ) -> SamplerOutput: - all_outputs = await self._run_workers_async( - "execute_model", - driver_kwargs={ - "seq_group_metadata_list": seq_group_metadata_list, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - }) - - # Only the driver worker returns the sampling results. - output = all_outputs[0] - return output - - async def check_health_async(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 605009e8f695c..5e484ff05b2f3 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -50,7 +50,9 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - if logits is not None: # and sampling_metadata.perform_sampling: FIXME: this is needed for 8xHPU + # NOTE(kzawora): allgather on HPU will cause logits to be not None, + # and we need to guard against applying logits processors on non-driver worker + if logits is not None and sampling_metadata.seq_groups is not None: logits *= self.scale # Apply logits processors (if any). diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index a05eee90648b2..43ccd235c174f 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -293,10 +293,13 @@ def init_worker_distributed_environment( init_method=distributed_init_method, ) - # A small all_reduce for warmup. - torch.distributed.all_reduce(torch.zeros(1).to('hpu')) + # A small all_reduce for warmup & checking conformance. 
+ dummy_tensor_hpu = torch.ones(1).to('hpu') + torch.distributed.all_reduce(dummy_tensor_hpu) + assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) + def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: if num_gpu_blocks <= 0: From 90dfa92d8e22b2cc6634dbb5df27a6e253b84be1 Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Thu, 9 May 2024 13:14:46 +0200 Subject: [PATCH 005/819] Fix model_output_idx on HPU (#27) --- vllm/model_executor/sampling_metadata.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 78b3e6417366e..e2076018b5609 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -192,6 +192,12 @@ def _prepare_seq_groups( # Total number of prompts from given sequence groups. num_prompts = 0 + # FIXME: On HPU prompts are right-padded. We need to take that into account + # when updating model_output_idx + if is_hpu() and len(seq_lens) > 0: + assert seq_lens == query_lens, 'Prompt chunking is not yet supported on HPU!' + max_seq_len = max(seq_lens) + for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params @@ -219,10 +225,12 @@ def _prepare_seq_groups( prompt_logprob_len = (query_len - num_prefill_sample if do_sample else query_len) sample_len = num_prefill_sample if do_sample else 0 + padding_len = 0 if not is_hpu() else max_seq_len - seq_len else: # Decode prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 + padding_len = 0 # Update indices to select from the model output. """ @@ -241,6 +249,7 @@ def _prepare_seq_groups( selected_token_indices.extend( range(model_output_idx, model_output_idx + sample_len)) model_output_idx += sample_len + model_output_idx += padding_len # We now find indices for logprob computation and sampling. 
""" From eeef644262f76b4d7af560b88dbbed6946f8c1bd Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Thu, 9 May 2024 13:21:14 +0200 Subject: [PATCH 006/819] Allow block_sizes: 64 and 128 (#28) --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b718ed9cf393a..a8dcaef0e5754 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -249,7 +249,7 @@ def add_cli_args( parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 64, 128], help='Token block size for contiguous chunks of ' 'tokens.') From 84a46987f9981f96b9032432412c81b967219b2e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 15:04:33 +0300 Subject: [PATCH 007/819] add triton to requirements-hpu --- requirements-hpu.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 5176cc23cde47..21666eb116c22 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -3,5 +3,6 @@ # Dependencies for HPU code ray == 2.9.3 +triton pandas tabulate \ No newline at end of file From 972acf3ccb086568b76e49ca76fbe884f3f0fb7e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 16:09:37 +0300 Subject: [PATCH 008/819] Fix out-of-bound HPUGraph capture issue --- vllm/worker/habana_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e418ccc1d5c62..a8f801d62cc3d 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -496,6 +496,7 @@ def _prepare_decode( use_captured_graph = ( not self.model_config.enforce_eager and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] + and max_seq_len <= _MAX_SEQ_LENS_TO_CAPTURE[-1] and max_seq_len <= self.max_seq_len_to_capture) if use_captured_graph: graph_batch_size = _get_graph_batch_size(batch_size) From 61b77632897928cac86c691080df4f125824a60d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 9 May 2024 16:56:10 +0300 Subject: [PATCH 009/819] fix VLLM_HPU_LOG_STEP_GRAPH_COMPILATION --- vllm/executor/habana_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index cc035f397aa6d..5c2cc7e958f96 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -96,6 +96,7 @@ def execute_model( log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all if log_graph_compilation or log_cpu_fallbacks: from habana_frameworks.torch.hpu.metrics import metric_localcontext + seq_group_metadata_list = execute_model_req.seq_group_metadata_list is_prompt = any([seq_group_metadata.is_prompt for seq_group_metadata in seq_group_metadata_list]) max_context_len = max([max([len(v.prompt_token_ids) + len(v.output_token_ids) for v in seq_group_metadata.seq_data.values()]) for seq_group_metadata in seq_group_metadata_list]) # whoa, that's some spicy stuff right here max_num_blocks = ((max_context_len - 1) // self.cache_config.block_size) + 1 From fdf282b9e08560e230565b47de68513a89261050 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 15 May 2024 16:40:02 +0300 Subject: [PATCH 010/819] WA: Disable cumsum in HPU _prepare_prompt --- vllm/worker/habana_model_runner.py | 9 --------- 1 file changed, 9 deletions(-) diff --git 
a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a8f801d62cc3d..e306ef0ae12cb 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -402,15 +402,6 @@ def _prepare_prompt( dtype=torch.int32, device=self.device) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=subquery_start_loc.dtype, - out=subquery_start_loc[1:]) - - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, seq_lens=seq_lens, From ce1670b11156a25db6b992b285aeadc1166df504 Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 21 May 2024 16:06:26 +0200 Subject: [PATCH 011/819] bs/seq bucketing for prompt and decode (#33) * Bucketing/Warmup WIP * Cleanup * Revert "Fix model_output_idx on HPU (#27)" This reverts commit 90dfa92d8e22b2cc6634dbb5df27a6e253b84be1. * Rework selected_token_indices fix to also work with block_size padding * Simple prompt attention POC * Remove cumsum * MQA/GQA support for simple prompt_attention * Cleanup * Fix typo * Restore profiling runs --- vllm/attention/backends/habana_attn.py | 57 +- vllm/hpu/xops.py | 85 +-- vllm/model_executor/sampling_metadata.py | 9 - vllm/worker/habana_model_runner.py | 818 +++++------------------ vllm/worker/habana_worker.py | 19 +- 5 files changed, 225 insertions(+), 763 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 909c2ad955f25..45fe1989f9bff 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -2,14 +2,13 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -import importlib from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Type import torch +import math import vllm.hpu.xops as xops from vllm.hpu.attn_bias import (AttentionBias, - BlockDiagonalCausalMask, LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, @@ -18,7 +17,6 @@ from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger -from vllm.utils import is_hip logger = init_logger(__name__) @@ -119,11 +117,11 @@ def __post_init__(self): class HabanaAttentionImpl(AttentionImpl): """ If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prefill_tokens ----------------->| + |<--------------- num_prefill_tokens ----------------->| |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| - Otherwise, the layout is as follows: - |<----------------- num_decode_tokens ------------------>| + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| Generation tokens can contain padding when cuda-graph is used. @@ -196,48 +194,37 @@ def forward( HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, attn_metadata.slot_mapping, - attn_metadata.kv_cache_dtype, + attn_metadata.kv_cache_dtype, attn_metadata.prefill_metadata is not None) if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if kv_cache is None or prefill_meta.block_tables.numel() == 0: - # normal attention. - # block tables are empty if the prompt does not have a cached - # prefix. 
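The hunk just below drops xformers' BlockDiagonalCausalMask and builds the prefill bias explicitly: key positions past each sequence's real length are masked (prompts are right-padded on HPU), and an upper-triangular mask enforces causality. A CPU-only sketch of that construction with illustrative shapes and lengths:

import math
import torch

batch_size, seq_len = 2, 4
seq_lens = torch.tensor([4, 2], dtype=torch.int32)  # real prompt lengths

# Mask out right-padding: key position >= real length.
len_mask = (torch.arange(seq_len, dtype=torch.int32)
            .view(1, seq_len)
            .ge(seq_lens.unsqueeze(-1))
            .view(batch_size, 1, 1, seq_len))
# Standard causal mask (strictly upper triangular).
causal_mask = torch.triu(
    torch.ones((batch_size, 1, seq_len, seq_len), dtype=torch.bool), diagonal=1)
mask = causal_mask.logical_or(len_mask)
attn_bias = torch.zeros_like(mask, dtype=torch.float32).masked_fill_(mask, -math.inf)

# Second sequence has length 2, so its key positions 2 and 3 are fully masked.
assert torch.isinf(attn_bias[1, 0, 0, 2:]).all()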
- if self.num_kv_heads != self.num_heads: - # As of Nov 2023, xformers only supports MHA. For MQA/GQA, - # project the key and value tensors to the desired number of - # heads. - # TODO(woosuk): Use MQA/GQA kernels for higher performance. - query = query.view(query.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - query.shape[-1]) - key = key[:, :, - None, :].expand(key.shape[0], self.num_kv_heads, - self.num_queries_per_kv, - key.shape[-1]) - value = value[:, :, - None, :].expand(value.shape[0], - self.num_kv_heads, - self.num_queries_per_kv, - value.shape[-1]) - + # TODO: move this outside of model if prefill_meta.attn_bias is None: if self.alibi_slopes is None: - attn_bias = BlockDiagonalCausalMask.from_seqlens( - [seq_len] * batch_size) + lens = torch.tensor(attn_metadata.prefill_metadata.seq_lens, device=query.device, dtype=torch.int32) + len_mask = (torch.arange(0, seq_len, device=query.device, dtype=torch.int32) + .view(1, seq_len) + .ge(lens.unsqueeze(-1)) + .view(batch_size, 1, 1, seq_len)) + causal_mask = torch.triu( + torch.ones((batch_size, 1, seq_len, seq_len), device=query.device, dtype=torch.bool), + diagonal=1 + ) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=query.dtype) + .masked_fill_(mask, -math.inf)) if self.sliding_window is not None: - attn_bias = attn_bias.make_local_attention( - self.sliding_window) + raise NotImplementedError("Sliding window is not supported on HPU") prefill_meta.attn_bias = attn_bias else: prefill_meta.attn_bias = _make_alibi_bias( self.alibi_slopes, self.num_kv_heads, batch_size, seq_len, query.dtype) - query_shape = (batch_size, seq_len, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len, self.num_heads, self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.num_queries_per_kv, self.head_size) if self.num_kv_heads != self.num_heads else (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) - out = xops.memory_efficient_attention_forward( + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + out = xops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py index c9d237744a917..d6404a4872c0d 100644 --- a/vllm/hpu/xops.py +++ b/vllm/hpu/xops.py @@ -5,62 +5,37 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### -import habana_frameworks.torch as htorch import torch -import torch.nn.functional as F -from typing import List, Optional, Tuple, Union -from .attn_bias import AttentionBias, BlockDiagonalCausalMask +from typing import Optional -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA -except ImportError: - print("Not using HPU fused scaled dot-product attention kernel.") - FusedSDPA = None +import vllm.hpu.utils -def memory_efficient_attention_forward( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, -) -> torch.Tensor: - assert attn_bias is not None, "Attention mask is required for prompt processing" - dim = query.dim() - is_causal = isinstance(attn_bias, BlockDiagonalCausalMask) - if FusedSDPA and (is_causal or attn_bias is None): - bs = query.shape[0] - seq_len_q = query.shape[1] - seq_len_kv = key.shape[1] - heads = query.shape[-2] if dim != 5 else query.shape[-3] - attn_groups = 1 if dim != 5 else query.shape[-2] - head_dim = query.shape[-1] - if dim == 4: - # [bs, seq_len, 1, heads, head_dim] -> [bs, heads, seq_len, head_dim] - query = query.reshape(bs, seq_len_q, heads, head_dim).permute(0, 2, 1, 3) - key = key.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) - value = value.reshape(bs, seq_len_kv, heads, head_dim).permute(0, 2, 1, 3) - elif dim == 5: - # [bs, seq_len, heads, attn_groups, head_dim] -> [bs, heads, attn_groups, seq_len, head_dim] - query = query.reshape(bs, seq_len_q, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) - key = key.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) - value = value.reshape(bs, seq_len_kv, heads, attn_groups, head_dim).permute(0, 2, 3, 1, 4) - else: - raise ValueError(f"Unsupported attention dimension: {dim}") - - import habana_frameworks.torch.hpu as ht - with ht.sdp_kernel(enable_recompute=False): # (flash_attention_recompute and q_len == 1)): - out = FusedSDPA.apply( - query, key, value, None, p, is_causal, scale - ) - htorch.core.mark_step() - if dim == 4: - # [bs, heads, seq_len, head_dim] -> [bs, seq_len, heads, head_dim] - out = out.permute(0, 2, 1, 3).reshape(bs, seq_len_q, heads, head_dim) - elif dim == 5: - # [bs, heads, attn_groups, seq_len, head_dim] -> [bs, seq_len, heads, attn_groups, head_dim] - out = out.permute(0, 3, 1, 2, 4).reshape(bs, seq_len_q, heads, attn_groups, head_dim) - else: - raise NotImplementedError(f'Only FusedSDPA causal or non-masked attention is supported.\nFusedSDPA support: {FusedSDPA is not None}\nis_causal: {is_causal}\nmask_present: {attn_bias is not None}') - return out +@vllm.hpu.utils.with_mark_steps +def prompt_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + query_heads = query.size(1) + kv_heads = key.size(1) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + attn_bias = attn_bias.unsqueeze(2) + attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) + if attn_bias is not None: + attn_weights.add_(attn_bias) + attn_weights = torch.softmax(attn_weights, dim=-1) + attn_weights = torch.matmul(attn_weights, 
value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + attn_weights = attn_weights.transpose(1, 2) + return attn_weights diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index e2076018b5609..78b3e6417366e 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -192,12 +192,6 @@ def _prepare_seq_groups( # Total number of prompts from given sequence groups. num_prompts = 0 - # FIXME: On HPU prompts are right-padded. We need to take that into account - # when updating model_output_idx - if is_hpu() and len(seq_lens) > 0: - assert seq_lens == query_lens, 'Prompt chunking is not yet supported on HPU!' - max_seq_len = max(seq_lens) - for i, seq_group_metadata in enumerate(seq_group_metadata_list): seq_ids = list(seq_group_metadata.seq_data.keys()) sampling_params = seq_group_metadata.sampling_params @@ -225,12 +219,10 @@ def _prepare_seq_groups( prompt_logprob_len = (query_len - num_prefill_sample if do_sample else query_len) sample_len = num_prefill_sample if do_sample else 0 - padding_len = 0 if not is_hpu() else max_seq_len - seq_len else: # Decode prompt_logprob_len = 0 sample_len = len(seq_ids) if do_sample else 0 - padding_len = 0 # Update indices to select from the model output. """ @@ -249,7 +241,6 @@ def _prepare_seq_groups( selected_token_indices.extend( range(model_output_idx, model_output_idx + sample_len)) model_output_idx += sample_len - model_output_idx += padding_len # We now find indices for logprob computation and sampling. """ diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e306ef0ae12cb..995864e3f81e7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -2,60 +2,77 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### -import contextlib import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple - -# for logging hpugraph capture -import tqdm -import pandas as pd -import tabulate +from typing import List, NamedTuple, Optional, Set, Tuple, Dict import os -import contextlib import math import itertools -import numpy as np +import operator import torch -import torch.nn as nn import habana_frameworks.torch as htorch -from habana_frameworks.torch.hpu.metrics import metric_localcontext from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage, get_attn_backend) -from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, +from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from vllm.distributed.device_communicators import custom_all_reduce from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model -from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import (HabanaMemoryProfiler, async_tensor_h2d, - is_pin_memory_available, make_tensor_with_pad, - maybe_expand_dim, pad_to_max_length, format_bytes) +from vllm.utils import (HabanaMemoryProfiler, is_pin_memory_available, + make_tensor_with_pad, format_bytes) logger = init_logger(__name__) -_PAD_SLOT_ID = -1 +_PAD_SLOT_ID = 0 LORA_WARMUP_RANK = 8 -_BATCH_SIZE_ALIGNMENT = 16 -# Capture graphs for token size 1, 2, 4, 8, 16, 32, 48, ..., 512. -# NOTE: _get_graph_batch_size needs to be updated if this list is changed. -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4, 8] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33) -] -# Capture graphs for token size 1, 32, 64, 128, 256, 512, 768 ... 
2048 -_MAX_SEQ_LEN_ALIGNMENT = 256 -_MAX_SEQ_LENS_TO_CAPTURE = [1, 32, 64, 128] + [ - _MAX_SEQ_LEN_ALIGNMENT * i for i in range(1, 9) -] + +# Read bucketing configuration from env variables +# phase is either 'prompt' or 'decode' +# dim is either 'bs' or 'seq' +# example env variable: VLLM_DECODE_BS_STEP=128 +def read_bucket_settings(phase: str, dim: str, **defaults: Dict): + params = ['min', 'step', 'max'] + values = [os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p]) for p in params] + return values + + +def warmup_buckets(config: Tuple[int, int, int]): + bmin, bstep, bmax = config + base = itertools.repeat(2) + ramp_up = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up = itertools.takewhile(lambda x: x < bstep and x <= bmax, ramp_up) + stable = range(bstep, bmax + 1, bstep) + return list(ramp_up) + list(stable) + + +def next_pow2(value: int): + res = 1 + while value > 1: + value = (value + 1) // 2 + res *= 2 + return res + + +def round_up(value: int, k: int): + return (value + k - 1) // k * k + + +def find_bucket(value: int, config: Tuple[int, int, int]): + bmin, bstep, bmax = config + if value < bstep: + result = min(next_pow2(value), bstep) + else: + result = round_up(value, bstep) + return result class PreparePromptMetadata(NamedTuple): @@ -127,6 +144,7 @@ def __init__( scheduler_config: SchedulerConfig, device_config: DeviceConfig, load_config: LoadConfig, + cache_config: CacheConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, @@ -139,22 +157,16 @@ def __init__( self.load_config = load_config self.is_driver_worker = is_driver_worker - # model_config can be None in tests/samplers/test_sampler.py. - # FIXME(woosuk): This is a hack to make the tests work. Refactor this. self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) self.device = self.device_config.device - # Set after load_model. - self.lora_manager: LRUCacheWorkerLoRAManager = None - - self.graph_runner_class = HPUGraphRunner - self.graph_runners: Dict[Tuple[int, int], self.graph_runner_class] = {} - - self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture - if self.model_config is not None else 0) + self.max_num_seqs = self.scheduler_config.max_num_seqs + self.max_model_len = self.scheduler_config.max_model_len + self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + self.block_size = cache_config.block_size self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype @@ -164,16 +176,11 @@ def __init__( self.model_config.dtype if model_config is not None else None) # Lazy initialization - self.model: torch.nn.Module # Set after load_model - self.block_size: int # Set after initial profiling. - # When using CUDA graph, the input block tables must be padded to - # max_seq_len_to_capture. However, creating the block table in - # Python can be expensive. To optimize this, we cache the block table - # in numpy and only copy the actual input content at every iteration. - # The shape of the cached block table will be - # (max batch size to capture, max context len to capture / block size). - self.graph_block_tables: torch.Tensor # Set after initial profiling. 
+ self.lora_manager: LRUCacheWorkerLoRAManager = None + self.model: torch.nn.Module = None + self.excluded_from_warmup = [] + self._setup_buckets() def load_model(self) -> None: with HabanaMemoryProfiler() as m: @@ -207,16 +214,18 @@ def load_model(self) -> None: self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) - def set_block_size(self, block_size: int) -> None: - self.block_size = block_size - - self.graph_block_tables = np.zeros( - (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()), - dtype=np.int32) - - def get_max_block_per_batch(self) -> int: - block_size = self.block_size - return (self.max_seq_len_to_capture + block_size - 1) // block_size + def _setup_buckets(self) -> None: + self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min(self.max_num_seqs, 64)) + self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, step=128, max=self.max_num_seqs) + self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, max=1024) + self.decode_seq_bucket_cfg = read_bucket_settings('decode', 'seq', min=self.block_size, step=self.block_size, max=2048) + logger.info(f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}") + logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") + + # FIXME: exclude from warmup as it causes OOM on llama-70b + self.excluded_from_warmup = [ + (64, 1024, True) + ] def _prepare_prompt( self, @@ -285,8 +294,6 @@ def _prepare_prompt( # actual prompt lens context_lens.append(context_len) - if context_len != 0: - import pdb; pdb.set_trace() # what happens if we hit that path?? query_lens.append(seq_len - context_len) input_tokens.append(prompt_tokens) @@ -357,34 +364,31 @@ def _prepare_prompt( multi_modal_input = None max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - max_prompt_len = max(seq_lens) + max_prompt_len = max(find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + input_tokens = make_tensor_with_pad(input_tokens, - max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) - + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + input_positions = make_tensor_with_pad(input_positions, - max_prompt_len, - pad=0, - dtype=torch.long, - device=self.device) - + max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + slot_mapping = make_tensor_with_pad(slot_mapping, - max_prompt_len, - pad=_PAD_SLOT_ID, - dtype=torch.long, - device=self.device) + max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) - # Prepare prefix block tables - max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - block_tables = make_tensor_with_pad( - prefix_block_tables, - max_len=max_prompt_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) + block_tables = make_tensor_with_pad(prefix_block_tables, + max_len=max_prompt_block_table_len, + pad=0, + dtype=torch.int, + device=self.device) # Query length can be shorter than key (i.e., prompt) when prefill # is chunked or prefix cached. 
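To make the bucketing behaviour above concrete: below the step size, values are padded to the next power of two (capped at the step); at or above it, they are rounded up to a multiple of the step. A self-contained sketch replicating find_bucket's arithmetic, assuming a (min=1, step=128, max=256) decode batch-size config; the helpers mirror the ones added above:

def next_pow2(value: int) -> int:
    res = 1
    while value > 1:
        value = (value + 1) // 2
        res *= 2
    return res

def round_up(value: int, k: int) -> int:
    return (value + k - 1) // k * k

def find_bucket(value: int, cfg) -> int:
    bmin, bstep, bmax = cfg  # bmin/bmax bound warmup, not the lookup itself
    return min(next_pow2(value), bstep) if value < bstep else round_up(value, bstep)

decode_bs_cfg = (1, 128, 256)   # assumed values of the decode bs bucket settings
assert find_bucket(3, decode_bs_cfg) == 4       # pow2 padding under the step
assert find_bucket(100, decode_bs_cfg) == 128   # capped at the step
assert find_bucket(130, decode_bs_cfg) == 256   # multiples of the step above it

The same find_bucket call is what execute_model later uses to pad the real batch size to a bucket boundary before replicating the first sequence group as filler.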
@@ -394,7 +398,6 @@ def _prepare_prompt( subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, dtype=torch.int32, device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.long, device=self.device) @@ -426,6 +429,7 @@ def _prepare_prompt( multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, ) + def _prepare_decode( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -479,28 +483,7 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - # vLLM uses cuda graph only for decoding requests. - # See `capture_model` API for more details. - # For decoding requests, batch_size == input_tokens. - batch_size = len(input_tokens) max_seq_len = max(seq_lens) - use_captured_graph = ( - not self.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] - and max_seq_len <= _MAX_SEQ_LENS_TO_CAPTURE[-1] - and max_seq_len <= self.max_seq_len_to_capture) - if use_captured_graph: - graph_batch_size = _get_graph_batch_size(batch_size) - assert graph_batch_size >= batch_size - for _ in range(graph_batch_size - batch_size): - input_tokens.append([0]) - input_positions.append([0]) - slot_mapping.append([_PAD_SLOT_ID]) - seq_lens.append(1) - block_tables.append([]) - lora_index_mapping.append(0) - batch_size = graph_batch_size - input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -514,33 +497,15 @@ def _prepare_decode( dtype=torch.int, device=self.device) - if use_captured_graph: - # When using cuda-graph all these tensors should be - # padded. - assert seq_lens_tensor.shape[0] == len(input_tokens) - assert seq_lens_tensor.shape[0] == len(input_positions) - assert seq_lens_tensor.shape[0] == len(slot_mapping) - - # The shape of graph_block_tables is - # [max batch size, max context len // block size]. - graph_max_seq_len = _get_graph_max_seq_len(max_seq_len) - assert graph_max_seq_len >= max_seq_len - graph_block_count = math.ceil(graph_max_seq_len / self.block_size) - input_block_tables = self.graph_block_tables[:batch_size, :graph_block_count] - for i, block_table in enumerate(block_tables): - if block_table: - input_block_tables[i, :len(block_table)] = block_table - block_tables = torch.tensor(input_block_tables, device=self.device) - else: - max_block_table_len = max( - len(block_table) for block_table in block_tables) - block_tables = make_tensor_with_pad( - block_tables, - max_len=max_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) + max_block_table_len = max( + len(block_table) for block_table in block_tables) + block_tables = make_tensor_with_pad( + block_tables, + max_len=max_block_table_len, + pad=0, + dtype=torch.int, + device=self.device, + ) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, seq_lens=None, @@ -551,7 +516,7 @@ def _prepare_decode( seq_start_loc=None, context_lens_tensor=None, block_tables=block_tables, - use_cuda_graph=use_captured_graph, + use_cuda_graph=False, ) return PrepareDecodeMetadata( input_tokens=input_tokens, @@ -563,7 +528,6 @@ def _prepare_decode( slot_mapping=slot_mapping, ) - def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -611,7 +575,7 @@ def prepare_input_tensors( num_prefill_tokens = len(input_tokens) num_decode_tokens = len(decode_input_tokens) - # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. 
+ # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. assert (num_prefills == 0 and num_decode_tokens > 0) or (num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" if num_decode_tokens > 0: input_tokens = decode_input_tokens @@ -621,6 +585,14 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests + # FIXME: We need to adjust selected_token_indices to accomodate for padding + max_len = input_tokens.size(1) + paddings = [max_len - s for s in seq_lens] + paddings = [0] + paddings[:-1] + paddings = list(itertools.accumulate(paddings)) + paddings = torch.tensor(paddings, dtype=sampling_metadata.selected_token_indices.dtype, device=sampling_metadata.selected_token_indices.device) + sampling_metadata.selected_token_indices.add_(paddings) + if self.lora_config: lora_mapping = LoRAMapping( lora_index_mapping, @@ -629,9 +601,6 @@ def prepare_input_tensors( else: lora_mapping = None - # Broadcast the metadata. - # If batch contains both prefill and decode, it sends 2 broadcasts. - # If it only contains 1 type, it triggers a single broadcast. if (prefill_attn_metadata is not None and decode_attn_metadata is not None): batch_type = BatchType.MIXED @@ -721,13 +690,19 @@ def prepare_input_tensors( sampling_metadata, lora_requests, lora_mapping, multi_modal_input) - @torch.inference_mode() def execute_model( self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: + if self.is_driver_worker: + is_prompt = seq_group_metadata_list[0].is_prompt + real_batch_size = len(seq_group_metadata_list) + bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg + batch_size_padding = find_bucket(real_batch_size, bucket_cfg) - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) @@ -735,17 +710,6 @@ def execute_model( if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) - # Currently HPU graph is only supported by the decode phase. - prefill_meta = attn_metadata.prefill_metadata - decode_meta = attn_metadata.decode_metadata - if prefill_meta is None and decode_meta.use_cuda_graph: - graph_batch_size = input_tokens.shape[0] - graph_block_count = decode_meta.block_tables.shape[1] - graph_runner_key = (graph_batch_size, graph_block_count) - model_executable = self.graph_runners[graph_runner_key] - logger.info(f"Executing {self.graph_runner_class.__name__} with batch {graph_batch_size}, block_count {graph_block_count} (context_len up to {graph_block_count*self.block_size}, currently {torch.max(decode_meta.seq_lens_tensor).item()})") - else: - model_executable = self.model execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, @@ -754,11 +718,14 @@ def execute_model( } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) - hidden_states = model_executable(**execute_model_kwargs) + + htorch.core.mark_step() + hidden_states = self.model(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) # Compute the logits. 
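A worked toy example of the batch and token-index padding above (a sketch with made-up lengths): once every row of `input_tokens` is padded to the bucketed `max_len`, the flat index of each sequence's last real token shifts by the total padding of all preceding rows, which is exactly the correction the accumulated `paddings` tensor applies.

```python
import itertools
import torch

# Hedged toy example: two prompts of length 5 and 3, both padded to a bucket of max_len = 8.
seq_lens = [5, 3]
max_len = 8
# Last-token indices in the unpadded, flattened layout: [4, 5 + 3 - 1] = [4, 7].
selected_token_indices = torch.tensor([4, 7])

paddings = [max_len - s for s in seq_lens]       # [3, 5]
paddings = [0] + paddings[:-1]                   # [0, 3]
paddings = list(itertools.accumulate(paddings))  # [0, 3]
selected_token_indices.add_(torch.tensor(paddings))
print(selected_token_indices)  # tensor([ 4, 10]) -> row 1's last real token sits at 1*8 + 2
```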
logits = self.model.compute_logits(hidden_states, sampling_metadata) + htorch.core.mark_step() # Only perform sampling in the driver worker. if not self.is_driver_worker: @@ -769,520 +736,63 @@ def execute_model( logits=logits, sampling_metadata=sampling_metadata, ) - + output.outputs = output.outputs[:real_batch_size] + htorch.core.mark_step() return output - @torch.inference_mode() - def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - - # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. - dummy_lora_requests = [] - dummy_lora_requests_per_seq = [] - if self.lora_config: - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_local_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for vision encoding, which needs - # to be accounted for when calculating the GPU blocks for - # vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - if self.vision_language_config: - max_num_seqs = min( - max_num_seqs, - int(max_num_batched_tokens / - self.vision_language_config.image_feature_size)) - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data = SequenceData([0] * seq_len) - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - ) - seqs.append(seq) + def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): + sampling_params = SamplingParams(temperature=0) + num_blocks = math.ceil(seq_len / self.block_size) + if is_prompt: + input_len = seq_len + output_len = 0 + block_tables = None + else: + input_len = seq_len - 1 + output_len = 1 + block_tables = {group_id: [0] * num_blocks} + prompt_token_ids = [0] * input_len + output_token_ids = [1] * output_len + seq_data = SequenceData(prompt_token_ids) + seq_data.output_token_ids = output_token_ids + return SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + ) - # Run the model with the dummy inputs. 
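To make the warmup inputs concrete, here is a hedged sketch of what `create_dummy_seq_group_metadata` above returns for a prompt versus a decode scenario; `runner` stands in for an already-initialized `HabanaModelRunner` with `block_size=128`, and the numbers are illustrative only.

```python
# Prompt warmup, seq_len=256:
#   prompt_token_ids = [0] * 256, output_token_ids = [],  block_tables = None,       is_prompt = True
# Decode warmup, seq_len=256:
#   prompt_token_ids = [0] * 255, output_token_ids = [1], block_tables = {0: [0, 0]}, is_prompt = False
seq = runner.create_dummy_seq_group_metadata(group_id=0, seq_len=256, is_prompt=False)
assert not seq.is_prompt
assert len(seq.block_tables[0]) == 2  # ceil(256 / 128) dummy blocks, all pointing at block 0
```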
+ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - self.execute_model(seqs, kv_caches) + seq_len = self.max_model_len // self.max_num_seqs + self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) + + def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: + seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] + _ = self.execute_model(seqs, kv_caches) torch.hpu.synchronize() - return - - def remove_all_loras(self): - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.remove_all_loras() - - def set_active_loras(self, lora_requests: Set[LoRARequest], - lora_mapping: LoRAMapping) -> None: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - self.lora_manager.set_active_loras(lora_requests, lora_mapping) - - def add_lora(self, lora_request: LoRARequest) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.add_lora(lora_request) - - def remove_lora(self, lora_id: int) -> bool: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.remove_lora(lora_id) - - def list_loras(self) -> Set[int]: - if not self.lora_manager: - raise RuntimeError("LoRA is not enabled.") - return self.lora_manager.list_loras() @torch.inference_mode() - def capture_model(self, kv_caches: List[torch.Tensor]) -> None: - """Cuda graph capture a model. - - Note that CUDA graph's performance gain is negligible if number - of batched tokens are larger than 200. And since CUDA graph - requires fixed sized tensors, supporting large/variable batch - size requires high GPU memory overhead. Thus, vLLM only captures - decoding requests. Mixed batch (chunked prefill + decoding) or - prefill requests are not captured. - - Since it is used for decoding-only, it assumes there's only 1 token - per sequence in the batch. - """ - # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never - # deleted before the CUDA graphs. - - assert not self.model_config.enforce_eager - logger.info("Capturing the model for HPUGraphs. This may lead to " - "unexpected consequences if the model is not static. To " - "run the model in eager mode, set 'enforce_eager=True' or " - "use '--enforce-eager' in the CLI.") - logger.info("HPUGraphs can take additional ~10 GiB memory per HPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") + def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + times = 1 # TODO: this is will be updated once HPU graphs are reintroduced + scenarios = [] + scenarios.extend(itertools.product(warmup_buckets(self.decode_bs_bucket_cfg), warmup_buckets(self.decode_seq_bucket_cfg), [False])) + scenarios.extend(itertools.product(warmup_buckets(self.prompt_bs_bucket_cfg), warmup_buckets(self.prompt_seq_bucket_cfg), [True])) + scenarios = [scenario for scenario in reversed(scenarios) for _ in range(times) if scenario not in self.excluded_from_warmup] + + start_mem = HabanaMemoryProfiler.current_memory_usage() start_time = time.perf_counter() - - # Prepare dummy inputs. These will be reused for all batch sizes. 
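A hedged sketch of how the warmup scenarios above are enumerated, using toy bucket configs rather than the real defaults; the actual code takes the same `itertools.product` over batch-size and sequence-length buckets for both phases, walks them largest-first, and drops anything listed in `excluded_from_warmup`.

```python
import itertools

# Hedged toy configs: (min, step, max) for decode batch size and sequence length.
decode_bs_buckets = warmup_buckets((1, 4, 8))         # [1, 2, 4, 8]
decode_seq_buckets = warmup_buckets((128, 128, 256))  # [128, 256]
excluded = [(64, 1024, True)]                         # the llama-70b OOM exclusion above

scenarios = list(itertools.product(decode_bs_buckets, decode_seq_buckets, [False]))
scenarios = [s for s in reversed(scenarios) if s not in excluded]
# Largest shapes first: [(8, 256, False), (8, 128, False), (4, 256, False), ...]
```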
- max_batch_size = max(_BATCH_SIZES_TO_CAPTURE) - input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') - input_positions = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') - slot_mapping = torch.zeros(max_batch_size, 1, dtype=torch.long).to('hpu') # TODO(kzawora): when using torch.empty, following occurs: RuntimeError: Error when trying to cast Long to Int, Input values range [0, 139632108750000] exceeds Int range [-2147483648, 2147483647] - slot_mapping.fill_(_PAD_SLOT_ID) - context_lens = torch.ones(max_batch_size, dtype=torch.int32).to('hpu') - block_tables = torch.from_numpy(self.graph_block_tables).to('hpu') - - graph_batch_size = _get_graph_batch_size( - self.scheduler_config.max_num_seqs) - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - - # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce - # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use - # either custom all-reduce kernel or CuPy NCCL. When not using CUDA - # graph, we use either custom all-reduce kernel or PyTorch NCCL. - # We always prioritize using custom all-reduce kernel but fall back - # to PyTorch or CuPy NCCL if it is disabled or not supported. - with custom_all_reduce.capture(): - # NOTE: Capturing the largest batch size first may help reduce the - # memory usage of CUDA graph. - valid_combinations = [] - total_combinations = len(_BATCH_SIZES_TO_CAPTURE)*len(_MAX_SEQ_LENS_TO_CAPTURE) - import pandas as pd - df = pd.DataFrame(index=_BATCH_SIZES_TO_CAPTURE, columns=_MAX_SEQ_LENS_TO_CAPTURE) - for idx, (batch_size, max_seq_len) in enumerate(itertools.product(reversed(_BATCH_SIZES_TO_CAPTURE), reversed(_MAX_SEQ_LENS_TO_CAPTURE))): - block_count = math.ceil(max_seq_len / self.block_size) - # Skip capture of "out-of-bound" batch sizes and context lengths - if batch_size > self.scheduler_config.max_num_seqs: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Batch out of bound.") - df[max_seq_len][batch_size] = 'batch OoB' - continue - if max_seq_len > self.max_seq_len_to_capture: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Nax context length out of bound.") - df[max_seq_len][batch_size] = 'ctx OoB' - continue - block_count = math.ceil(max_seq_len / self.block_size) - captured_block_counts = [math.ceil(cl / self.block_size) for (n, cl) in valid_combinations if n == batch_size] - if block_count in captured_block_counts: - logger.debug(f"[{idx}/{total_combinations}] Skipping capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Reason: Block size already captured.") - df[max_seq_len][batch_size] = 'redundant' - continue - logger.debug(f"[{idx}/{total_combinations}] Will capture for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}. Constraints met.") - df[max_seq_len][batch_size] = 'VALID' - valid_combinations.append((batch_size, max_seq_len)) - - total_valid_hpugraphs = len(valid_combinations) - logger.info(f"Starting capture {total_valid_hpugraphs} valid HPUGraphs. 
Skipping capture of {total_combinations-total_valid_hpugraphs}/{total_combinations} graphs due to batch/context constraints.") - logger.debug(f"Capture summary (row: batch_size; col: max_seq_len):") - logger.debug(tabulate.tabulate(df, tablefmt='mixed_outline', headers='keys', showindex="always")) - - graph_runner_name = self.graph_runner_class.__name__ - graph_mem_usage_df = pd.DataFrame(index=list(reversed(sorted({b for b,c in valid_combinations}))), columns=list(reversed(sorted({c for b,c in valid_combinations})))) - pbar = tqdm.tqdm(valid_combinations) - start_mem = HabanaMemoryProfiler.current_memory_usage() - log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' - log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all - - for idx, (batch_size, max_seq_len) in enumerate(pbar): - block_count = math.ceil(max_seq_len / self.block_size) - # Create dummy attn_metadata. - decode_metadata = self.attn_backend.make_metadata( - is_prompt=False, - seq_lens=None, - seq_lens_tensor=context_lens[:batch_size], - max_query_len=None, - max_seq_len=block_count*self.block_size, - subquery_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, # NOTE(kzawora): this seems sus, shoudn't we have seq_lens tensor here? - block_tables=block_tables[:batch_size, :block_count], - use_cuda_graph=True, - ) - attn_metadata = AttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=batch_size, - slot_mapping=slot_mapping[:batch_size], - prefill_metadata=None, - decode_metadata=decode_metadata, - kv_cache_dtype=self.kv_cache_dtype, - ) - - if self.lora_config: - lora_mapping = LoRAMapping( - [0] * batch_size, - [0] * batch_size, - ) - self.set_active_loras(set(), lora_mapping) - graph_runner = self.graph_runner_class(self.model) - local_start_mem = HabanaMemoryProfiler.current_memory_usage() - capture_start = time.time() - desc = f'Capturing {graph_runner_name} for batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}, allocated {format_bytes(local_start_mem - start_mem)} device memory in total ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)' - pbar.set_description(desc) - logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}...") - profiling_ctx = contextlib.nullcontext() if not (log_graph_compilation_all or log_graph_compilation) else metric_localcontext("graph_compilation") - with profiling_ctx as gc_local_metric: - graph_runner.capture( - input_tokens[:batch_size], - input_positions[:batch_size], - kv_caches, - attn_metadata, - ) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: - logger.info(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {graph_runner_name}; batch {batch_size}, max_seq_len {max_seq_len}, block_count {block_count}") - self.graph_runners[(batch_size, block_count)] = graph_runner - capture_end = time.time() - local_end_mem = HabanaMemoryProfiler.current_memory_usage() - mem_usage_str = format_bytes(local_end_mem - local_start_mem) - graph_mem_usage_df[max_seq_len][batch_size] = mem_usage_str - logger.debug(f"[{idx}/{total_valid_hpugraphs}] {desc}... done in {capture_end-capture_start:.2f} seconds! 
Took {mem_usage_str} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - + for i, (batch_size, seq_len, is_prompt) in enumerate(scenarios): + mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() + logger.info(f"[Warmup][{i+1}/{len(scenarios)}] batch_size:{batch_size} seq_len:{seq_len} is_prompt:{is_prompt} mem_usage:{mem_usage:0.1f}%") + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) end_time = time.perf_counter() - elapsed_time = end_time - start_time - # This usually takes < 10 seconds. end_mem = HabanaMemoryProfiler.current_memory_usage() - logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") - logger.info(f"Graph memory allocation summary (row: batch_size; col: max_seq_len):") - logger.info(tabulate.tabulate(graph_mem_usage_df, tablefmt='mixed_outline', headers='keys', showindex="always")) - - def __del__(self) -> None: - # Delete the CUDA graphs before deleting the CuPy NCCL communicator. - # NOTE(woosuk): This is necessary because otherwise deadlocks can - # happen. - # FIXME(woosuk): This is a bit hacky. Find a more robust solution. - self.graph_runners.clear() + elapsed_time = end_time - start_time + logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() - - -class FakeHPUGraphRunner: - - def __init__(self, model: nn.Module): - self.model = model - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - return self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - -class FakeHPUGraphRunnerWithWarmup: - - def __init__(self, model: nn.Module): - self.model = model - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - htorch.core.mark_step() - out = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - htorch.core.mark_step() - htorch.hpu.synchronize() - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - htorch.core.mark_step() - out = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - htorch.core.mark_step() - return out - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) -class HPUGraphRunner: - - def __init__(self, model: nn.Module): - self.model = model - self.graph = None - self.input_buffers: Dict[str, torch.Tensor] = {} - self.output_buffers: Dict[str, torch.Tensor] = {} - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - assert self.graph is None - # Run the model once without capturing the graph. 
- # This is to make sure that the captured graph does not include the - # kernel launches for initial benchmarking (e.g., Triton autotune). - self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - htorch.hpu.synchronize() - - # Capture the graph. - # NOTE(woosuk): Python 3.8 does not support multi-line with statements. - # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement - self.graph = htorch.hpu.HPUGraph() - with htorch.hpu.graph(self.graph): # noqa: SIM117 - hidden_states = self.model( - input_ids, - positions, - kv_caches, - attn_metadata, - ) - torch.hpu.synchronize() - - # Save the input and output buffers. - self.input_buffers = { - "input_ids": input_ids, - "positions": positions, - "kv_caches": kv_caches, - "slot_mapping": attn_metadata.slot_mapping, - "seq_lens_tensor": attn_metadata.decode_metadata.seq_lens_tensor, - "block_tables": attn_metadata.decode_metadata.block_tables, - } - self.output_buffers = {"hidden_states": hidden_states} - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - **kwargs, - ) -> torch.Tensor: - # KV caches are fixed tensors, so we don't need to copy them. - del kv_caches - - # Copy the input tensors to the input buffers. - self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) - self.input_buffers["positions"].copy_(positions, non_blocking=True) - self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping, - non_blocking=True) - self.input_buffers["seq_lens_tensor"].copy_( - attn_metadata.decode_metadata.seq_lens_tensor, non_blocking=True) - self.input_buffers["block_tables"].copy_( - attn_metadata.decode_metadata.block_tables, non_blocking=True) - # Run the graph. - self.graph.replay() - - # Return the output tensor. 
- return self.output_buffers["hidden_states"] - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - -class ExperimentalHPUGraphRunner: - def __init__(self, model: nn.Module): - self.model = model - - def capture( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> None: - class ModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - self.attn_backend = get_attn_backend(torch.bfloat16) - def forward(self, input_ids, positions, kv_caches, slot_mapping, context_lens, block_tables): - wrapper_attn_metadata = self.attn_backend.make_metadata( - is_prompt=attn_metadata.is_prompt, - seq_lens=None, - seq_lens_tensor=None, - num_prefill_tokens=0, - num_generation_tokens=attn_metadata.num_generation_tokens, - max_subquery_len=None, - max_seq_len=attn_metadata.max_seq_len, - max_prompt_len=None, - subquery_start_loc=None, - seq_start_loc=None, - context_lens=context_lens, - block_tables=block_tables, - use_cuda_graph=True, - kv_cache_dtype=attn_metadata.kv_cache_dtype, - ) - return self.model( - input_ids, - positions, - kv_caches, - wrapper_attn_metadata - ) - self.graph_model = htorch.hpu.wrap_in_hpu_graph(ModelWrapper(self.model)) - out = self.graph_model( - input_ids, - positions, - kv_caches, - attn_metadata.slot_mapping, - attn_metadata.context_lens, - attn_metadata.block_tables, - ) - htorch.hpu.synchronize() - return - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - ) -> torch.Tensor: - out = self.graph_model( - input_ids, - positions, - kv_caches, - attn_metadata.slot_mapping, - attn_metadata.context_lens, - attn_metadata.block_tables, - ) - return out - - - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) - - -def _get_graph_batch_size(batch_size: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... - """ - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - elif batch_size <= 8: - return 8 - else: - return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // - _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) - - -def _get_graph_max_seq_len(max_seq_len: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... 
- """ - if max_seq_len <= 32: - return 32 - elif max_seq_len <= 64: - return 64 - elif max_seq_len <= 128: - return 128 - else: - return ((max_seq_len + _MAX_SEQ_LEN_ALIGNMENT - 1) // - _MAX_SEQ_LEN_ALIGNMENT * _MAX_SEQ_LEN_ALIGNMENT) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 43ccd235c174f..eeba9e5c4adba 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -73,13 +73,14 @@ def __init__( assert False, "To be tested: vision language model on HPU" self.model_runner = HabanaModelRunner(model_config, - parallel_config, - scheduler_config, - device_config, - load_config=load_config, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker) + parallel_config, + scheduler_config, + device_config, + load_config=load_config, + cache_config=cache_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: CacheEngine @@ -168,12 +169,10 @@ def _init_cache_engine(self) -> None: self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config) self.hpu_cache = self.cache_engine.gpu_cache - self.model_runner.set_block_size(self.cache_engine.block_size) htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution def _warm_up_model(self) -> None: - if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.hpu_cache) + self.model_runner.warmup_model(self.hpu_cache) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) From 14d294d885296f44e2bef3fa7b1b512654fd69a9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:43:47 +0200 Subject: [PATCH 012/819] Cleanup: Fix HPU auto-detection in setup.py (#34) * Fix HPU auto-detection in setup.py * Update setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index f0364cc7a5893..49e20aac0068a 100644 --- a/setup.py +++ b/setup.py @@ -203,12 +203,11 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: - return True is_hpu_available = True try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if not os.path.exists('/dev/hl0') and not os.path.exists('/dev/hl_controlD0'): + if not os.path.exists('/dev/accel/accel0') and not os.path.exists('/dev/accel/accel_controlD0'): is_hpu_available = False return is_hpu_available From f6fb119ca85ddbd280b0c35527cf378a503b1c00 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:43:56 +0200 Subject: [PATCH 013/819] Restore int64 sampling (#35) --- vllm/model_executor/sampling_metadata.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 78b3e6417366e..9969c45963e9a 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -8,7 +8,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - maybe_expand_dim, is_hpu) + maybe_expand_dim) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 
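The `sampling_metadata.py` hunks below switch the sample-index, padded-token and seed tensors from 32-bit to 64-bit integers. A quick, hedged illustration of why that matters (plain PyTorch facts, independent of vLLM): flattened indices and randomly drawn seeds can exceed what int32 can represent, a similar overflow to the one noted in the earlier slot-mapping TODO.

```python
import torch

# int32 tops out around 2.1e9; int64 comfortably covers the full seed/index range.
print(torch.iinfo(torch.int32).max)  # 2147483647
print(torch.iinfo(torch.int64).max)  # 9223372036854775807

# Seeds are now drawn over the full torch.long range, matching the tensors below.
lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max
```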
@@ -501,19 +501,19 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sample_indices_t = torch.tensor( sample_indices, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ) prompt_tensor = torch.tensor( prompt_padded_tokens, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ) output_tensor = torch.tensor( output_padded_tokens, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ) # need to transpose and make contiguous to @@ -522,7 +522,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sampling_seeds_t = torch.tensor( sampling_seeds, device="cpu", - dtype=torch.int, + dtype=torch.long, pin_memory=pin_memory, ).T.contiguous() @@ -571,7 +571,7 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - lo, hi = torch.iinfo(torch.int).min, torch.iinfo(torch.int).max + lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max # If the user/random sets seed = 0 but request should # have sampling, we need to change it to something # else. We use a constant in that case. From 78b0513b3e4ac7be9082dbda4cfef5bf3cd05e97 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:44:05 +0200 Subject: [PATCH 014/819] Llama whitespace fix (#36) --- vllm/model_executor/models/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 4f766c0d6b366..f6d7fc8733fce 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -236,6 +236,7 @@ def forward( kv_cache=kv_cache, attn_metadata=attn_metadata, ) + # Fully Connected hidden_states, residual = self.post_attention_layernorm( hidden_states, residual) From 09c1eb246d7c97b3d082f83b98d173a481573c6e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 22 May 2024 10:44:13 +0200 Subject: [PATCH 015/819] Restore pyproject.toml (#37) --- pyproject.toml | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..6a448defc16e1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,67 @@ +[build-system] +# Should be mirrored in requirements-build.txt +requires = [ + "cmake>=3.21", + "ninja", + "packaging", + "setuptools >= 49.4.0", + "torch == 2.3.0", + "wheel", +] +build-backend = "setuptools.build_meta" + +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 +exclude = [ + # External file, leaving license intact + "examples/fp8/quantizer/quantize.py" +] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + # "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", +] + +[tool.mypy] +python_version = "3.8" + +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "skip" + +files = "vllm" +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", + # Ignore triton kernels in ops. 
+ 'vllm/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies" +skip = "./tests/prompts,./benchmarks/sonnet.txt" + +[tool.isort] +use_parentheses = true +skip_gitignore = true From 7f7500b9de21a455691b90f0a951ac272fa6cbd6 Mon Sep 17 00:00:00 2001 From: Damian Szwichtenberg Date: Wed, 22 May 2024 13:40:22 +0200 Subject: [PATCH 016/819] Add high-level profiler (#29) --- vllm/worker/habana_model_runner.py | 56 ++++++++++--- vllm/worker/profiler.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 166 insertions(+), 11 deletions(-) create mode 100644 vllm/worker/profiler.py diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 995864e3f81e7..1451b6fe38aef 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -29,6 +29,8 @@ from vllm.utils import (HabanaMemoryProfiler, is_pin_memory_available, make_tensor_with_pad, format_bytes) +from .profiler import Profiler + logger = init_logger(__name__) _PAD_SLOT_ID = 0 @@ -156,6 +158,7 @@ def __init__( self.lora_config = lora_config self.load_config = load_config self.is_driver_worker = is_driver_worker + self.profiler = Profiler() self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) @@ -696,16 +699,22 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: + event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + if self.is_driver_worker: - is_prompt = seq_group_metadata_list[0].is_prompt real_batch_size = len(seq_group_metadata_list) bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg - batch_size_padding = find_bucket(real_batch_size, bucket_cfg) - real_batch_size + batch_size_padded = find_bucket(real_batch_size, bucket_cfg) + batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) - (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) + with self.profiler.record_event('internal', 'prepare_input_tensors'): + (input_tokens, input_positions, attn_metadata, sampling_metadata, + lora_requests, lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) @@ -720,11 +729,13 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) htorch.core.mark_step() - hidden_states = self.model(**execute_model_kwargs) + with self.profiler.record_event('internal', f'model_{base_event_name}_eager_bs{real_batch_size}'): + hidden_states = self.model(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) # Compute the logits. - logits = self.model.compute_logits(hidden_states, sampling_metadata) + with self.profiler.record_event('internal', 'compute_logits'): + logits = self.model.compute_logits(hidden_states, sampling_metadata) htorch.core.mark_step() # Only perform sampling in the driver worker. @@ -732,12 +743,30 @@ def execute_model( return None # Sample the next token. 
- output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) + with self.profiler.record_event('internal', 'sample'): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) output.outputs = output.outputs[:real_batch_size] htorch.core.mark_step() + + # Stop recording 'execute_model' event + self.profiler.end() + + if self.profiler.enabled: + event_end = self.profiler.get_timestamp_us() + duration = event_end - event_start + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + counters = { + 'batch_size': batch_size_padded, + 'batch_size_effective': real_batch_size, + 'throughput': throughput, + 'throughput_effective': throughput_effective + } + self.profiler.record_counter(event_start, counters) + return output def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): @@ -770,12 +799,16 @@ def profile_run(self) -> None: self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: + scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}" + self.profiler.start('internal', scenario_name) seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] _ = self.execute_model(seqs, kv_caches) torch.hpu.synchronize() + self.profiler.end() @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + self.profiler.start('internal', 'warmup') times = 1 # TODO: this is will be updated once HPU graphs are reintroduced scenarios = [] scenarios.extend(itertools.product(warmup_buckets(self.decode_bs_bucket_cfg), warmup_buckets(self.decode_seq_bucket_cfg), [False])) @@ -792,6 +825,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: end_mem = HabanaMemoryProfiler.current_memory_usage() elapsed_time = end_time - start_time logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") + self.profiler.end() @property def vocab_size(self) -> int: diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py new file mode 100644 index 0000000000000..2d47f4349d45a --- /dev/null +++ b/vllm/worker/profiler.py @@ -0,0 +1,121 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import json +import os +import queue +import threading +import time +from contextlib import contextmanager + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class FileWriter(threading.Thread): + + def __init__(self, filename, event_queue): + super().__init__() + self.filename = filename + self.event_queue = event_queue + self.daemon = True + self.timer_event = threading.Event() + + def _drain_event_queue(self): + content = '' + while True: + try: + element = self.event_queue.get_nowait() + content += element + except queue.Empty: + break + return content + + def run(self): + # don't check the queue too often + while not self.timer_event.wait(1): + # Block and wait for the next item in the queue + content = self.event_queue.get() + # Collect any other items in the queue + content += self._drain_event_queue() + + with open(self.filename, 'a') as outfile: + outfile.write(content) + + +class Profiler: + profiling_trace_events = queue.Queue() + event_tid = {'counter': 1, 'external': 2, 'internal': 3} + filename = 'server_events.json' + event_cache = [] + + def __init__(self): + self.enabled = os.getenv('VLLM_PROFILER_ENABLED', + 'false').lower() == 'true' and int( + os.getenv('RANK', '0')) == 0 + if self.enabled: + # initialize the trace file (JSON Array Format) + with open(self.filename, 'w') as outfile: + outfile.write('[') + file_writer = FileWriter(self.filename, + self.profiling_trace_events) + file_writer.start() + + def _dump_with_sep(self, entry): + entry = json.dumps(entry) + ',' + self.profiling_trace_events.put(entry) + + def get_timestamp_us(self): + return time.time() * 1000000.0 + + def record_counter(self, ts, counter): + if self.enabled: + self._dump_with_sep({ + 'pid': 1, + 'tid': self.event_tid['counter'], + 'ph': 'C', + 'name': 'utils', + 'ts': ts, + 'args': counter + }) + + def start(self, type, name, args=None): + if self.enabled: + ts = self.get_timestamp_us() + if args is not None and 'counter' in args: + self.record_counter(ts, args['counter']) + del args['counter'] + event = { + 'pid': 1, + 'tid': self.event_tid[type], + 'ph': 'X', + 'name': name, + 'ts': ts, + 'dur': None, + 'args': args + } + self.event_cache.append(event) + + def end(self): + if self.enabled: + ts = self.get_timestamp_us() + if not self.event_cache: + logger.warning( + 'Profiler: end() call does not have matching start() call. Disabling profiler.' 
+ ) + self.enabled = False + return + event = self.event_cache.pop() + event['dur'] = ts - event['ts'] + self._dump_with_sep(event) + + @contextmanager + def record_event(self, type, name, args=None): + if self.enabled: + self.start(type, name, args) + yield + self.end() + else: + yield From b6f5584f9da7b1bd61772ed6a41a64baed00079c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 23 May 2024 13:40:16 +0200 Subject: [PATCH 017/819] Add release docs for Gaudi (#32) * add gaudi installation readme * readme writeup * Create README_GAUDI.md * Update README.md * Update README_GAUDI.md * Update README.md * Update readmes --- README.md | 3 +- README_GAUDI.md | 136 +++++++++++++++++ .../getting_started/gaudi-installation.rst | 144 ++++++++++++++++++ docs/source/index.rst | 2 + 4 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 README_GAUDI.md create mode 100644 docs/source/getting_started/gaudi-installation.rst diff --git a/README.md b/README.md index 524d027137aba..9b180877a5a82 100644 --- a/README.md +++ b/README.md @@ -10,11 +10,12 @@ Easy, fast, and cheap LLM serving for everyone
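A hedged usage sketch for the high-level profiler added above, before the documentation changes continue. The profiler only activates when `VLLM_PROFILER_ENABLED=true` on rank 0, and it streams Chrome-trace events into `server_events.json` (the "JSON Array Format", which trace viewers such as `chrome://tracing` or Perfetto generally accept even without a closing bracket).

```python
import os

# Hedged sketch: enable the profiler before the engine/worker is constructed (rank 0 only).
os.environ["VLLM_PROFILER_ENABLED"] = "true"

from vllm.worker.profiler import Profiler

profiler = Profiler()
with profiler.record_event('internal', 'my_custom_step'):
    do_some_work()  # hypothetical placeholder for the code being timed
profiler.record_counter(profiler.get_timestamp_us(), {'queue_depth': 3})
# Each event/counter is appended to server_events.json by the background FileWriter thread.
```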

-| Documentation | Blog | Paper | Discord | +| IntelÂź GaudiÂź README | Documentation | Blog | Paper | Discord |

*Latest News* đŸ”„ +- [2024/05] vLLM-fork specific: Added IntelÂź GaudiÂź 2 support with SynapseAI 1.16.0. For more information, please refer to IntelÂź GaudiÂź README. - [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing). - [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing). - [2024/01] Added ROCm 6.0 support to vLLM. diff --git a/README_GAUDI.md b/README_GAUDI.md new file mode 100644 index 0000000000000..44e75e690950f --- /dev/null +++ b/README_GAUDI.md @@ -0,0 +1,136 @@ +# vLLM with IntelÂź GaudiÂź 2 AI Accelerators + +This README provides instructions on running vLLM with Intel Gaudi devices. + +Requirements and Installation +============================== + +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the environment. To achieve the best performance, please follow the methods outlined in the +[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + +> [!NOTE] +> In this release (1.16.0), we are only targeting functionality and +> accuracy. Performance will be improved in next releases. + +Requirements +------------- + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi 2 accelerator +- Intel Gaudi software version 1.16.0 + +To verify that the Intel Gaudi software was correctly installed, run: + +``` {.console} +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed +$ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed +``` + +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. + +Run Docker Image +------------------ + +It is highly recommended to use the latest Docker image from Intel +Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. + +Use the following commands to run a Docker image: + +``` {.console} +$ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + ``` + +Build and Install vLLM-fork +----------------------------- + +To build and install vLLM-fork from source, run: + +``` {.console} +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +# git checkout 0.4.2-Gaudi-1.16.0 +$ pip install -e . # This may take 5-10 minutes. 
+``` + +Supported Features +================== + +- [Offline batched inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) +- Online inference via [OpenAI-Compatible Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi 2 + accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU + Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) + for accelerating low-batch latency and throughput + + +Unsupported Features +==================== + +- Beam search +- LoRA adapters +- Attention with Linear Biases (ALiBi) +- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- Prefill chunking (mixed-batch inferencing) + + +Supported Configurations +======================== + +The following configurations have been validated to be function with Gaudi devices. Configurations that are not listed may or may not work. + +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) + with tensor parallelism on 8x HPU, BF16 datatype with random + or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) + with tensor parallelism 8x HPU, BF16 datatype with random + or greedy sampling + + + +Performance Tips +================ + +- We recommend running inference on Gaudi 2 with + `block_size` of 128 for BF16 data type. Using default + values (16, 32) might lead to sub-optimal performance due to Matrix + Multiplication Engine under-utilization (see [Gaudi + Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Troubleshooting: Tweaking HPU Graphs +==================================== + +If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: + +- Tweak `gpu_memory_utilization` knob. It + will decrease the allocation of KV cache, leaving some headroom for + capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. + It attempts to allocate \~90% of HBM left for KV cache after short + profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this methon is not efficient, you can disable `HPUGraph` completely. With + HPU Graphs disabled, you are trading latency and throughput at lower + batches for potentially higher throughput on higher batches. 
You can do + that by adding `--enforce-eager` flag to server (for + online inference), or by passing `enforce_eager=True` + argument to LLM constructor (for offline inference). diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst new file mode 100644 index 0000000000000..cd026df8bf057 --- /dev/null +++ b/docs/source/getting_started/gaudi-installation.rst @@ -0,0 +1,144 @@ +vLLM with IntelÂź GaudiÂź 2 AI Accelerators +========================================= + +This README provides instructions on running vLLM with Intel Gaudi +devices. + +Requirements and Installation +============================= + +Please follow the instructions provided in the `Gaudi Installation +Guide `__ +to set up the environment. To achieve the best performance, please +follow the methods outlined in the `Optimizing Training Platform +Guide `__. + +.. note:: + In this release (1.16.0), we are only targeting functionality + and accuracy. Performance will be improved in next releases. + +Requirements +------------ + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi 2 accelerator +- Intel Gaudi software version 1.16.0 + +To verify that the Intel Gaudi software was correctly installed, run: + +.. code:: console + + $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible + $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed + $ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed + +Refer to `Intel Gaudi Software Stack +Verification `__ +for more details. + +Run Docker Image +---------------- + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the `Intel Gaudi +documentation `__ +for more details. + +Use the following commands to run a Docker image: + +.. code:: console + + $ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + +Build and Install vLLM-fork +--------------------------- + +To build and install vLLM-fork from source, run: + +.. code:: console + + $ git clone https://github.com/HabanaAI/vllm-fork.git + $ cd vllm-fork + # git checkout 0.4.2-Gaudi-1.16.0 + $ pip install -e . # This may take 5-10 minutes. 
+ +Supported Features +================== + +- `Offline batched + inference `__ +- Online inference via `OpenAI-Compatible + Server `__ +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi 2 accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with `HPU + Graphs `__ + for accelerating low-batch latency and throughput + +Unsupported Features +==================== + +- Beam search +- LoRA adapters +- Attention with Linear Biases (ALiBi) +- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- Prefill chunking (mixed-batch inferencing) + +Supported Configurations +======================== + +The following configurations have been validated to be function with +Gaudi devices. Configurations that are not listed may or may not work. + +- `meta-llama/Llama-2-7b `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-7b-chat-hf `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-70b `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- `meta-llama/Llama-2-70b-chat-hf `__ + with tensor parallelism 8x HPU, BF16 datatype with random or greedy + sampling + +Performance Tips +================ + +- We recommend running inference on Gaudi 2 with ``block_size`` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see `Gaudi + Architecture `__). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Troubleshooting: Tweaking HPU Graphs +==================================== + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak ``gpu_memory_utilization`` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default ``gpu_memory_utilization`` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this methon is not efficient, you can disable ``HPUGraph`` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding ``--enforce-eager`` flag to + server (for online inference), or by passing ``enforce_eager=True`` + argument to LLM constructor (for offline inference). 
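To tie the performance and troubleshooting tips above together, a hedged offline-inference sketch using the knobs the guide mentions; the model name and numeric values are illustrative, not required settings.

```python
from vllm import LLM, SamplingParams

# Illustrative values only: block_size=128 per the BF16 recommendation above,
# enforce_eager=True disables HPU Graphs, and gpu_memory_utilization trades
# KV-cache blocks for extra headroom, as described in the troubleshooting section.
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    dtype="bfloat16",
    block_size=128,
    enforce_eager=True,
    gpu_memory_utilization=0.9,
)

outputs = llm.generate(["Hello, Gaudi!"],
                       SamplingParams(temperature=0.8, max_tokens=64))
print(outputs[0].outputs[0].text)
```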
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4022c590843e6..aeb3b60ccb1ad 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,6 +42,7 @@ vLLM is flexible and easy to use with:
 * Streaming outputs
 * OpenAI-compatible API server
 * Support NVIDIA GPUs and AMD GPUs
+* (Experimental) Support for Intel® Gaudi® 2 accelerators
 * (Experimental) Prefix caching support
 * (Experimental) Multi-lora support
 
@@ -64,6 +65,7 @@ Documentation
    getting_started/amd-installation
    getting_started/neuron-installation
    getting_started/cpu-installation
+   getting_started/gaudi-installation
    getting_started/quickstart
    getting_started/examples/examples_index
 
From 6f5629fda71eb40bf761f204a5ca2837e341cd0b Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Thu, 23 May 2024 14:00:28 +0200
Subject: [PATCH 018/819] Update tag in readme (#39)

---
 README_GAUDI.md | 2 +-
 docs/source/getting_started/gaudi-installation.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_GAUDI.md b/README_GAUDI.md
index 44e75e690950f..24d3fe0761f54 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -52,7 +52,7 @@ To build and install vLLM-fork from source, run:
 ``` {.console}
 $ git clone https://github.com/HabanaAI/vllm-fork.git
 $ cd vllm-fork
-# git checkout 0.4.2-Gaudi-1.16.0
+# git checkout v0.4.2-Gaudi-1.16.0
 $ pip install -e . # This may take 5-10 minutes.
 ```
 
diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
index cd026df8bf057..90f97155e1d75 100644
--- a/docs/source/getting_started/gaudi-installation.rst
+++ b/docs/source/getting_started/gaudi-installation.rst
@@ -61,7 +61,7 @@ To build and install vLLM-fork from source, run:
 
    $ git clone https://github.com/HabanaAI/vllm-fork.git
    $ cd vllm-fork
-   # git checkout 0.4.2-Gaudi-1.16.0
+   # git checkout v0.4.2-Gaudi-1.16.0
    $ pip install -e .  # This may take 5-10 minutes.
Supported Features From 3c827b311ee7d0cf3ceae49a42dbb8d830e153ce Mon Sep 17 00:00:00 2001 From: Damian Szwichtenberg Date: Thu, 23 May 2024 15:20:53 +0200 Subject: [PATCH 019/819] Fix error with high-level profiler in multi-card scenario (#38) --- vllm/worker/habana_model_runner.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1451b6fe38aef..2bec899831c49 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -699,12 +699,13 @@ def execute_model( seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: - event_start = self.profiler.get_timestamp_us() - is_prompt = seq_group_metadata_list[0].is_prompt - base_event_name = 'prompt' if is_prompt else 'decode' - self.profiler.start('internal', base_event_name) - if self.is_driver_worker: + # profiler is enabled only for rank == 0 (profiler.py:L57) + event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + real_batch_size = len(seq_group_metadata_list) bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg batch_size_padded = find_bucket(real_batch_size, bucket_cfg) @@ -729,7 +730,11 @@ def execute_model( execute_model_kwargs.update({"image_input": multi_modal_input}) htorch.core.mark_step() - with self.profiler.record_event('internal', f'model_{base_event_name}_eager_bs{real_batch_size}'): + if self.is_driver_worker: + model_event_name = f'model_{base_event_name}_eager_bs{real_batch_size}' + else: + model_event_name = 'model_executable' + with self.profiler.record_event('internal', model_event_name): hidden_states = self.model(**execute_model_kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -751,10 +756,9 @@ def execute_model( output.outputs = output.outputs[:real_batch_size] htorch.core.mark_step() - # Stop recording 'execute_model' event - self.profiler.end() - - if self.profiler.enabled: + if self.is_driver_worker: + # Stop recording 'execute_model' event + self.profiler.end() event_end = self.profiler.get_timestamp_us() duration = event_end - event_start throughput = batch_size_padded / (duration / 1e6) From af0f1a691ef5c20f34d40359bf462924522da6d0 Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Tue, 28 May 2024 13:00:38 +0200 Subject: [PATCH 020/819] Static fused moe op (#41) * Fix mixtral hidden states layout to fit into habana model runner * Add static moe op to mixtral * Add mark_step to static_fused_moe * Update __init__.py * Fix code indentation * Make code compatible with non HPU devices * Move static_fused_moe to vllm.hpu.ops * Update mixtral.py * Move op import from forward to top of the file * Remove circular import --- vllm/hpu/ops.py | 36 +++++++++++++++++++++ vllm/model_executor/models/mixtral.py | 45 ++++++++++++++++++--------- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index d95b301697cea..25bccb43297d5 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -113,3 +113,39 @@ def apply_rope( def awq_gemm(*args): raise NotImplementedError + + +def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, 
dtype=x.dtype, device=x.device) + silu_and_mul(out, x) + return out + + +@hpu_utils.with_mark_steps +def static_fused_moe(hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + num_experts = w1.shape[0] + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros( + (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device + ) + padded_weights = torch.zeros( + (B, num_experts), dtype=hidden_states.dtype, device=hidden_states.device + ) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + + for expert_idx in range(num_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = silu_and_mul_wrapper(torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + + return final_hidden_states.view(-1, D) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index efa4de7516212..4b602203cee79 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -50,7 +50,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput -from vllm.utils import print_warning_once +from vllm.utils import print_warning_once, is_hpu + +if is_hpu(): + from vllm.hpu.ops import static_fused_moe class MixtralMoE(nn.Module): @@ -220,28 +223,40 @@ def process_weights_after_loading(self): requires_grad=False) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - num_tokens, hidden_size = hidden_states.shape + if is_hpu(): + batch_size, sequence_length, hidden_size = hidden_states.shape + else: + num_tokens, hidden_size = hidden_states.shape hidden_states = hidden_states.view(-1, self.hidden_size) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) - final_hidden_states = fused_moe(hidden_states, - self.w13_weight, - self.w2_weight, - router_logits, - self.top_k, - renormalize=True, - inplace=True, - use_fp8=self.use_fp8, - w1_scale=self.w13_scale, - w2_scale=self.w2_scale, - a1_scale=self.a13_scale, - a2_scale=self.a2_scale) + + if is_hpu(): + final_hidden_states = static_fused_moe(hidden_states, + self.w13_weight, + self.w2_weight, + router_logits, + self.top_k) + else: + final_hidden_states = fused_moe(hidden_states, + self.w13_weight, + self.w2_weight, + router_logits, + self.top_k, + renormalize=True, + inplace=True, + use_fp8=self.use_fp8, + w1_scale=self.w13_scale, + w2_scale=self.w2_scale, + a1_scale=self.a13_scale, + a2_scale=self.a2_scale) if self.tp_size > 1: final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states) - return final_hidden_states.view(num_tokens, hidden_size) + return (final_hidden_states.view(batch_size, sequence_length, hidden_size) if is_hpu() + else final_hidden_states.view(num_tokens, hidden_size)) class MixtralAttention(nn.Module): From 8359489977af675b464773b6462059632e589cce Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 3 Jun 2024 11:55:45 +0200 Subject: 
[PATCH 021/819] WA: Remove pyproject.toml, bypass HPU autodetection (#45) --- pyproject.toml | 67 -------------------------------------------------- setup.py | 1 + 2 files changed, 1 insertion(+), 67 deletions(-) delete mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 6a448defc16e1..0000000000000 --- a/pyproject.toml +++ /dev/null @@ -1,67 +0,0 @@ -[build-system] -# Should be mirrored in requirements-build.txt -requires = [ - "cmake>=3.21", - "ninja", - "packaging", - "setuptools >= 49.4.0", - "torch == 2.3.0", - "wheel", -] -build-backend = "setuptools.build_meta" - -[tool.ruff] -# Allow lines to be as long as 80. -line-length = 80 -exclude = [ - # External file, leaving license intact - "examples/fp8/quantizer/quantize.py" -] - -[tool.ruff.lint] -select = [ - # pycodestyle - "E", - # Pyflakes - "F", - # pyupgrade - # "UP", - # flake8-bugbear - "B", - # flake8-simplify - "SIM", - # isort - # "I", - "G", -] -ignore = [ - # star imports - "F405", "F403", - # lambda expression assignment - "E731", - # Loop control variable not used within loop body - "B007", -] - -[tool.mypy] -python_version = "3.8" - -ignore_missing_imports = true -check_untyped_defs = true -follow_imports = "skip" - -files = "vllm" -# TODO(woosuk): Include the code from Megatron and HuggingFace. -exclude = [ - "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", - # Ignore triton kernels in ops. - 'vllm/attention/ops/.*\.py$' -] - -[tool.codespell] -ignore-words-list = "dout, te, indicies" -skip = "./tests/prompts,./benchmarks/sonnet.txt" - -[tool.isort] -use_parentheses = true -skip_gitignore = true diff --git a/setup.py b/setup.py index 49e20aac0068a..964c467fd0a3f 100644 --- a/setup.py +++ b/setup.py @@ -204,6 +204,7 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: is_hpu_available = True + return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. 
try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): From 82f6280b896db33d9ea2e081f303de5b8aae644a Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:49:16 +0200 Subject: [PATCH 022/819] Use setuptools older than 70.0.0 (#42) * Use setuptools older than 70.0.0 * Delete pyproject.toml --------- Co-authored-by: Konrad Zawora --- requirements-build.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 1a07a94e82e04..a944fa31fd74d 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,7 +1,7 @@ -# Should be mirrored in pyproject.toml -cmake>=3.21 -ninja -packaging -setuptools>=49.4.0 -torch==2.3.0 -wheel +# Should be mirrored in pyproject.toml +cmake>=3.21 +ninja +packaging +setuptools>=49.4.0,<70.0.0 +torch==2.3.0 +wheel From 539e394085b722f0860d3174eb0f265f63722c9a Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Mon, 3 Jun 2024 14:50:08 +0200 Subject: [PATCH 023/819] Add VLLM_SKIP_WARMUP flag (#43) --- vllm/worker/habana_model_runner.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2bec899831c49..e7d1c41214369 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -812,6 +812,9 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': + logger.info("Skipping warmup...") + return self.profiler.start('internal', 'warmup') times = 1 # TODO: this is will be updated once HPU graphs are reintroduced scenarios = [] From b3617eef86c506c66de8faeb7b0c25a8a8850abc Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 4 Jun 2024 13:02:02 +0200 Subject: [PATCH 024/819] Graphs v2 (#44) * Trimmed metadata - part 1 * [WIP] HPU graphs for decode * [WIP] Graph allocation algorithm reworked * Cleanup * Add graph memory estimations * Fix multinode synchronization * Create attn_bias inside HPU graph * Cleanup after rebase * Increase default VLLM_GRAPH_RESERVED_MEM to 0.3 * Remove obsolete class * Tweak default HPU graph parameters --- vllm/attention/backends/habana_attn.py | 26 +- vllm/attention/ops/habana_paged_attn.py | 4 - vllm/hpu/ops.py | 2 +- .../model_executor/layers/logits_processor.py | 7 +- vllm/utils.py | 7 +- vllm/worker/habana_model_runner.py | 227 +++++++++++++++--- vllm/worker/habana_worker.py | 23 +- 7 files changed, 216 insertions(+), 80 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 45fe1989f9bff..017cf9c8933e5 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -84,9 +84,6 @@ class HabanaAttentionMetadata(AttentionMetadataPerStage, HabanaPagedAttentionMet # Maximum query length in the batch. max_query_len: Optional[int] - # FIXME: It is for flash attn. - # Maximum sequence length in the batch. - max_seq_len: Optional[int] # (batch_size + 1,). The cumulative subquery lengths of the sequences in # the batch, used to index into subquery. E.g., if the subquery length # is [4, 6], it is [0, 4, 10]. 
@@ -201,27 +198,7 @@ def forward( # Prompt run. if kv_cache is None or prefill_meta.block_tables.numel() == 0: # TODO: move this outside of model - if prefill_meta.attn_bias is None: - if self.alibi_slopes is None: - lens = torch.tensor(attn_metadata.prefill_metadata.seq_lens, device=query.device, dtype=torch.int32) - len_mask = (torch.arange(0, seq_len, device=query.device, dtype=torch.int32) - .view(1, seq_len) - .ge(lens.unsqueeze(-1)) - .view(batch_size, 1, 1, seq_len)) - causal_mask = torch.triu( - torch.ones((batch_size, 1, seq_len, seq_len), device=query.device, dtype=torch.bool), - diagonal=1 - ) - mask = causal_mask.logical_or(len_mask) - attn_bias = (torch.zeros_like(mask, dtype=query.dtype) - .masked_fill_(mask, -math.inf)) - if self.sliding_window is not None: - raise NotImplementedError("Sliding window is not supported on HPU") - prefill_meta.attn_bias = attn_bias - else: - prefill_meta.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) + assert prefill_meta.attn_bias is not None, 'attn_bias must be set before calling model.forward!' query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) out = xops.prompt_attention( @@ -256,7 +233,6 @@ def forward( value_cache, decode_meta.block_tables, decode_meta.seq_lens_tensor, - decode_meta.max_seq_len, attn_metadata.kv_cache_dtype, self.num_kv_heads, self.scale, diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 8dc79f17f8c9c..bd6a58684f567 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -19,8 +19,6 @@ class HabanaPagedAttentionMetadata: # (batch_size,). The length of sequences (entire tokens seen so far) per # sequence. seq_lens_tensor: Optional[torch.Tensor] - # Maximum sequence length in the batch. - max_seq_len: Optional[int] # (batch_size, max_blocks_per_seq). # Block addresses per sequence. 
(Seq id -> list of physical block) # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks @@ -82,7 +80,6 @@ def forward_decode( value_cache: torch.Tensor, block_tables: torch.Tensor, seq_lens: torch.Tensor, - max_seq_len: int, kv_cache_dtype: str, num_kv_heads: int, scale: float, @@ -99,7 +96,6 @@ def forward_decode( block_tables, seq_lens, block_size, - max_seq_len, alibi_slopes, kv_cache_dtype, ) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 25bccb43297d5..10e53312378ad 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -36,7 +36,7 @@ def fetch_from_cache(cache, blocks): @hpu_utils.with_mark_steps -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, max_context_len, alibi_slopes, kv_cache_dtype=None) -> None: +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes, kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape _, kv_heads, _, _ = key_cache.shape diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 5e484ff05b2f3..3951619c6e3ec 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -85,8 +85,11 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) + if sampling_metadata.selected_token_indices is not None: + return hidden_states.index_select(0, + sampling_metadata.selected_token_indices) + else: + return hidden_states def _apply_logits_processors( diff --git a/vllm/utils.py b/vllm/utils.py index a8cefefecb8e5..456c5602cf9d3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -512,16 +512,17 @@ def total_memory() -> float: return total_hpu_memory def __enter__(self): + # Force garbage collection + gc.collect() self.initial_memory = HabanaMemoryProfiler.current_memory_usage() # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): - self.final_memory = HabanaMemoryProfiler.current_memory_usage() - self.consumed_memory = self.final_memory - self.initial_memory - # Force garbage collection gc.collect() + self.final_memory = HabanaMemoryProfiler.current_memory_usage() + self.consumed_memory = self.final_memory - self.initial_memory # Adapted from https://stackoverflow.com/a/49361727 diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e7d1c41214369..9b3511a328c5e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -6,6 +6,8 @@ from enum import IntEnum from typing import List, NamedTuple, Optional, Set, Tuple, Dict +import collections +import gc import os import math import itertools @@ -18,6 +20,7 @@ from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict +from vllm.distributed.parallel_state import get_cpu_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -35,19 +38,21 @@ _PAD_SLOT_ID = 0 LORA_WARMUP_RANK = 8 +_TYPE_CACHE = {} # Read bucketing configuration from env variables # phase is either 'prompt' or 'decode' # dim is either 'bs' or 'seq' -# example env 
variable: VLLM_DECODE_BS_STEP=128 +# param is either 'min', 'step' or 'max' +# example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 def read_bucket_settings(phase: str, dim: str, **defaults: Dict): params = ['min', 'step', 'max'] - values = [os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p]) for p in params] + values = [int(os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p])) for p in params] return values -def warmup_buckets(config: Tuple[int, int, int]): +def warmup_range(config: Tuple[int, int, int]): bmin, bstep, bmax = config base = itertools.repeat(2) ramp_up = itertools.accumulate(base, func=operator.mul, initial=bmin) @@ -56,6 +61,11 @@ def warmup_buckets(config: Tuple[int, int, int]): return list(ramp_up) + list(stable) +def warmup_buckets(bs_bucket_config, seq_bucket_config): + buckets = itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config)) + return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + + def next_pow2(value: int): res = 1 while value > 1: @@ -77,6 +87,78 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result +def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[str, object] = {}): + if obj is None: + return None + fields = set(to_copy) | set(to_override.keys()) + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, ' '.join(fields)) + return _TYPE_CACHE[typename](**values) + + +def align_workers(value, op): + group = get_cpu_world_group() + world_size = torch.distributed.get_world_size() + if world_size <= 1: + return value + value_t = torch.tensor(value, device='cpu') + torch.distributed.all_reduce(value_t, op=op, group=group) + return value_t.item() + + +class HpuModelAdapter(): + def __init__(self, model): + self.model = model + + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): + prefill_metadata = attn_metadata.prefill_metadata + if prefill_metadata is None: + return attn_metadata + #FIXME: Restore alibi support + #if self.alibi_slopes is None: + if True: + seq_lens_t = prefill_metadata.seq_lens_tensor + len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) + .view(1, seq_len) + .ge(seq_lens_t.unsqueeze(-1)) + .view(batch_size, 1, 1, seq_len)) + causal_mask = torch.triu( + torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), + diagonal=1 + ) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=dtype) + .masked_fill_(mask, -math.inf)) + #FIXME: Restore sliding window support + #if self.sliding_window is not None: + prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) + attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + return attn_metadata + else: + # FIXME: This needs updating... 
+ prefill_meta.attn_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, batch_size, + seq_len, query.dtype) + + + def forward(self, *args, **kwargs): + kwargs = kwargs.copy() + selected_token_indices = kwargs.pop('selected_token_indices') + input_ids = kwargs['input_ids'] + kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) + hidden_states = self.model(*args, **kwargs) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = hidden_states.index_select(0, selected_token_indices) + return hidden_states + + def compute_logits(self, *args, **kwargs): + return self.model.compute_logits(*args, **kwargs) + + def sample(self, *args, **kwargs): + return self.model.sample(*args, **kwargs) + + class PreparePromptMetadata(NamedTuple): input_tokens: List[int] input_positions: List[int] @@ -164,8 +246,9 @@ def __init__( if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) - self.device = self.device_config.device + self.device = self.device_config.device + self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens @@ -181,7 +264,6 @@ def __init__( # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None - self.excluded_from_warmup = [] self._setup_buckets() @@ -196,6 +278,8 @@ def load_model(self) -> None: parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, ) + # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. 
This needs to be debugged + self.model = htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(self.model)) self.model_memory_usage = m.consumed_memory logger.info(f"Loading model weights took " @@ -217,18 +301,25 @@ def load_model(self) -> None: self.model.embedding_padding_modules) self.model = self.lora_manager.create_lora_manager(self.model) + def _use_graphs(self, batch_size, seq_len, is_prompt): + if self.enforce_eager: + return False + return (batch_size, seq_len, is_prompt) in self.graphed_buckets + def _setup_buckets(self) -> None: self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min(self.max_num_seqs, 64)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, step=128, max=self.max_num_seqs) self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, max=1024) self.decode_seq_bucket_cfg = read_bucket_settings('decode', 'seq', min=self.block_size, step=self.block_size, max=2048) + self.graphed_buckets = set() + logger.info(f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}") - logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") + self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) + logger.info(f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}") - # FIXME: exclude from warmup as it causes OOM on llama-70b - self.excluded_from_warmup = [ - (64, 1024, True) - ] + logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") + self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) + logger.info(f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}") def _prepare_prompt( self, @@ -350,7 +441,6 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_query_len = max(query_lens) - max_seq_len = max(seq_lens) assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, @@ -413,7 +503,6 @@ def _prepare_prompt( seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, - max_seq_len=max_seq_len, subquery_start_loc=subquery_start_loc, seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, @@ -486,7 +575,6 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - max_seq_len = max(seq_lens) input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -514,7 +602,6 @@ def _prepare_decode( seq_lens=None, seq_lens_tensor=seq_lens_tensor, max_query_len=None, - max_seq_len=max_seq_len, subquery_start_loc=None, seq_start_loc=None, context_lens_tensor=None, @@ -693,6 +780,30 @@ def prepare_input_tensors( sampling_metadata, lora_requests, lora_mapping, multi_modal_input) + def _seq_len(self, attn_metadata): + if attn_metadata.prefill_metadata: + return attn_metadata.slot_mapping.size(1) + else: + return attn_metadata.decode_metadata.block_tables.size(1) * self.block_size + + def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: + prefill_metadata = subtuple(metadata.prefill_metadata, + 'TrimmedPrefillMetadata', + ['block_tables', + 'seq_lens_tensor', + 'attn_bias']) + decode_metadata = subtuple(metadata.decode_metadata, + 'TrimmedDecodeMetadata', + ['block_tables', + 'seq_lens_tensor', + ]) + return subtuple(metadata, + 
'TrimmedMetadata', + ['slot_mapping', + 'kv_cache_dtype'], + {'prefill_metadata': prefill_metadata, + 'decode_metadata': decode_metadata}) + @torch.inference_mode() def execute_model( self, @@ -700,7 +811,6 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: if self.is_driver_worker: - # profiler is enabled only for rank == 0 (profiler.py:L57) event_start = self.profiler.get_timestamp_us() is_prompt = seq_group_metadata_list[0].is_prompt base_event_name = 'prompt' if is_prompt else 'decode' @@ -716,15 +826,19 @@ def execute_model( (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) + is_prompt = attn_metadata.prefill_metadata is not None if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": self.trim_attn_metadata(attn_metadata), } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) @@ -735,11 +849,11 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model(**execute_model_kwargs) - hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = self.model.forward(**execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices, bypass_hpu_graphs=not use_graphs) # Compute the logits. with self.profiler.record_event('internal', 'compute_logits'): + sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, sampling_metadata) htorch.core.mark_step() @@ -803,31 +917,84 @@ def profile_run(self) -> None: self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: - scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}" + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" self.profiler.start('internal', scenario_name) + times = 3 if use_graphs else 1 seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] - _ = self.execute_model(seqs, kv_caches) torch.hpu.synchronize() + for _ in range(times): + self.execute_model(seqs, kv_caches) + torch.hpu.synchronize() self.profiler.end() + gc.collect() + + def log_warmup(self, phase, i, max_i, batch_size, seq_len): + free_mem = format_bytes(HabanaMemoryProfiler.current_free_memory()) + logger.info(f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}") + + def warmup_all_buckets(self, buckets, is_prompt, kv_caches): + for i, (batch_size, seq_len) in enumerate(reversed(buckets)): + mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() + self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): + total_batch_seq = 
0.001 + total_mem = 0 + idx = 0 + phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' + num_candidates = len(buckets) + + if strategy == 'min_tokens': + ordering = lambda b: (b[0] * b[1], b[1], b[0]) + elif strategy == 'max_bs': + ordering = lambda b: (-b[0], b[1]) + else: + raise NotImplementedError(f'Unsupported graph allocation strategy: {strategy}') + buckets = list(sorted(buckets, key=ordering)) + + for idx, (batch_size, seq_len) in enumerate(buckets): + # Graph memory usage is proportional to seq dimension in a batch + batch_seq = batch_size * seq_len if is_prompt else batch_size + mem_estimate = batch_seq / total_batch_seq * total_mem + if mem_estimate >= available_mem: + continue + self.graphed_buckets.add((batch_size, seq_len, is_prompt)) + self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) + with HabanaMemoryProfiler() as mem_prof: + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + used_mem = align_workers(mem_prof.consumed_memory, torch.distributed.ReduceOp.MAX) + available_mem -= used_mem + total_mem += used_mem + total_batch_seq += batch_seq + graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) + logger.info(f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}') + + @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') - times = 1 # TODO: this is will be updated once HPU graphs are reintroduced - scenarios = [] - scenarios.extend(itertools.product(warmup_buckets(self.decode_bs_bucket_cfg), warmup_buckets(self.decode_seq_bucket_cfg), [False])) - scenarios.extend(itertools.product(warmup_buckets(self.prompt_bs_bucket_cfg), warmup_buckets(self.prompt_seq_bucket_cfg), [True])) - scenarios = [scenario for scenario in reversed(scenarios) for _ in range(times) if scenario not in self.excluded_from_warmup] - start_mem = HabanaMemoryProfiler.current_memory_usage() start_time = time.perf_counter() - for i, (batch_size, seq_len, is_prompt) in enumerate(scenarios): - mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() - logger.info(f"[Warmup][{i+1}/{len(scenarios)}] batch_size:{batch_size} seq_len:{seq_len} is_prompt:{is_prompt} mem_usage:{mem_usage:0.1f}%") - self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) + self.warmup_all_buckets(self.decode_buckets, False, kv_caches) + + if not self.enforce_eager: + mem_margin = 1.0 - float(os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) + free_mem = mem_margin * HabanaMemoryProfiler.current_free_memory() + free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) + prompt_graph_mem_ratio = float(os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + prompt_available_memory = prompt_graph_mem_ratio * free_mem + decode_available_memory = free_mem - prompt_available_memory + prompt_strategy = 'min_tokens' + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', 'max_bs') + self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, kv_caches, prompt_available_memory) + self.warmup_graphs(decode_strategy, self.decode_buckets, False, kv_caches, decode_available_memory) + end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_memory_usage() elapsed_time = end_time - 
start_time diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index eeba9e5c4adba..e253e4479a855 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -90,7 +90,6 @@ def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") torch.hpu.set_device(self.device) - self.init_hpu_memory = torch.hpu.mem_get_info()[0] else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -123,22 +122,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. torch.hpu.synchronize() - free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() - # NOTE(woosuk): Here we assume that the other processes using the same - # HPU did not change their memory usage during the profiling. - peak_memory = self.init_hpu_memory - free_hpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. This happens when the HPU memory was " - "not properly cleaned up before initializing the vLLM instance.") + + # At this point we should've allocated the maximum workspace for all recipes + # we will use the extra memory for graphs/blocks + free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() - num_hpu_blocks = int( - (total_hpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) + graph_headroom = 1 - (float(os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) if not self.model_config.enforce_eager else 0) + num_hpu_blocks = int(free_hpu_memory * graph_headroom * self.cache_config.gpu_memory_utilization // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) @@ -298,7 +290,8 @@ def init_worker_distributed_environment( assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - + + def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len) -> None: if num_gpu_blocks <= 0: From 1c5d12e53d19155f5f8f1633c9a758591958477e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 09:57:46 +0200 Subject: [PATCH 025/819] Remove usage of wrap_in_hpu_graph in PT eager (#47) --- vllm/worker/habana_model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9b3511a328c5e..5b9dff97d75e9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -145,6 +145,8 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') + if 'bypass_hpu_graphs' in kwargs: + kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) hidden_states = self.model(*args, **kwargs) @@ -279,7 +281,7 @@ def load_model(self) -> None: scheduler_config=self.scheduler_config, ) # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. 
This needs to be debugged - self.model = htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(self.model)) + self.model = _maybe_wrap_in_hpu_graph(self.model) self.model_memory_usage = m.consumed_memory logger.info(f"Loading model weights took " @@ -1004,3 +1006,6 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() + +def _maybe_wrap_in_hpu_graph(model): + return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) From 9bb5d20ca215b90eb80d7813ffe68b0cf4e41cfb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 12:01:09 +0200 Subject: [PATCH 026/819] Add HPU support to benchmark_latency and benchmark_throughput (#49) --- benchmarks/benchmark_latency.py | 4 ++-- benchmarks/benchmark_throughput.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 44da3bad8d840..e8530c2761acf 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -170,8 +170,8 @@ def run_to_completion(profile_dir: Optional[str] = None): "--device", type=str, default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + choices=["cuda", "cpu", "hpu"], + help='device type for vLLM execution, supporting CUDA, CPU and HPU.') parser.add_argument('--block-size', type=int, default=16, diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 695d06e7b243d..2e8cfd3f2ca3e 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -334,8 +334,8 @@ def main(args: argparse.Namespace): "--device", type=str, default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + choices=["cuda", "cpu", "hpu"], + help='device type for vLLM execution, supporting CUDA, CPU and HPU.') parser.add_argument( "--enable-prefix-caching", action='store_true', From ab359aca159ecf01a9dbcd85074eb4613e218f57 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 14:04:24 +0200 Subject: [PATCH 027/819] Use int32 seeds for random sampler on HPU (#50) --- vllm/model_executor/sampling_metadata.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 9969c45963e9a..4b722aba567a4 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -8,7 +8,7 @@ from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - maybe_expand_dim) + maybe_expand_dim, is_hpu) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 @@ -498,22 +498,23 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype=torch.int, pin_memory=pin_memory, ) + idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support sample_indices_t = torch.tensor( sample_indices, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ) prompt_tensor = torch.tensor( prompt_padded_tokens, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ) output_tensor = torch.tensor( output_padded_tokens, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ) # 
need to transpose and make contiguous to @@ -522,7 +523,7 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], sampling_seeds_t = torch.tensor( sampling_seeds, device="cpu", - dtype=torch.long, + dtype=idx_dtype, pin_memory=pin_memory, ).T.contiguous() @@ -571,7 +572,8 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - lo, hi = torch.iinfo(torch.long).min, torch.iinfo(torch.long).max + idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support + lo, hi = torch.iinfo(idx_dtype).min, torch.iinfo(idx_dtype).max # If the user/random sets seed = 0 but request should # have sampling, we need to change it to something # else. We use a constant in that case. From cf6952d3ba9bf194146d6cfa801649283f038d0c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 5 Jun 2024 14:31:50 +0200 Subject: [PATCH 028/819] Add host memory profiling to HabanaMemoryProfiler (#51) --- vllm/executor/habana_executor.py | 3 +- vllm/utils.py | 40 +++++++++++++++++++++------ vllm/worker/habana_model_runner.py | 44 ++++++++++++++++-------------- 3 files changed, 56 insertions(+), 31 deletions(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 5c2cc7e958f96..cfad194bf9cca 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -80,8 +80,7 @@ def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - logger.info(f"init_cache_engine took " - f"{format_bytes(cache_init_m.consumed_memory)} ({cache_init_m.consumed_memory/HabanaMemoryProfiler.total_memory():.2%} of total memory, gpu_memory_utilization: {self.cache_config.gpu_memory_utilization}, {format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + logger.info(f"init_cache_engine took {cache_init_m.get_summary_string()}") def execute_model( self, diff --git a/vllm/utils.py b/vllm/utils.py index 456c5602cf9d3..6d6d3d4f4590d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -496,33 +496,55 @@ class HabanaMemoryProfiler: def __init__(self, device=None): self.device = device - def current_memory_usage() -> float: - # Return the memory usage in bytes. + def current_device_memory_usage() -> float: + # Return the device memory usage in bytes. free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory - free_hpu_memory - def current_free_memory() -> float: - # Return the memory usage in bytes. + def current_free_device_memory() -> float: + # Return the device memory usage in bytes. free_hpu_memory, _ = torch.hpu.mem_get_info() return free_hpu_memory - def total_memory() -> float: - # Return the memory usage in bytes. + def total_device_memory() -> float: + # Return the device memory usage in bytes. _, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory + def current_host_memory_usage() -> float: + # Return the host memory usage in bytes. + return HabanaMemoryProfiler.total_host_memory() - HabanaMemoryProfiler.current_free_host_memory() + + def current_free_host_memory() -> float: + # Return the host memory usage in bytes. + return psutil.virtual_memory().available + + def total_host_memory() -> float: + # Return the host memory usage in bytes. 
+ return psutil.virtual_memory().total + + def get_summary_string(self): + if getattr(self, 'final_device_memory', None) is None or getattr(self, 'final_host_memory', None) is None: + raise RuntimeError("HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager") + return (f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " + f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") + def __enter__(self): # Force garbage collection gc.collect() - self.initial_memory = HabanaMemoryProfiler.current_memory_usage() + self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage() + self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage() # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): # Force garbage collection gc.collect() - self.final_memory = HabanaMemoryProfiler.current_memory_usage() - self.consumed_memory = self.final_memory - self.initial_memory + self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage() + self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage() + self.consumed_device_memory = self.final_device_memory - self.initial_device_memory + self.consumed_host_memory = self.final_host_memory - self.initial_host_memory + # Adapted from https://stackoverflow.com/a/49361727 diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 5b9dff97d75e9..78290fd59b10a 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -271,21 +271,25 @@ def __init__( def load_model(self) -> None: with HabanaMemoryProfiler() as m: - self.model = get_model( - model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - vision_language_config=self.vision_language_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - ) - # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. This needs to be debugged - self.model = _maybe_wrap_in_hpu_graph(self.model) + with HabanaMemoryProfiler() as m_getmodel: + self.model = get_model( + model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + vision_language_config=self.vision_language_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + ) + logger.info(f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}") - self.model_memory_usage = m.consumed_memory - logger.info(f"Loading model weights took " - f"{format_bytes(self.model_memory_usage)} ({format_bytes(HabanaMemoryProfiler.current_memory_usage())}/{format_bytes(HabanaMemoryProfiler.total_memory())} used)") + # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. 
This needs to be debugged + with HabanaMemoryProfiler() as m_wrap: + self.model = _maybe_wrap_in_hpu_graph(self.model) + logger.info(f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") + + self.model_memory_usage = m.consumed_device_memory + logger.info(f"Loading model weights took in total {m.get_summary_string()}") if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -932,12 +936,12 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: gc.collect() def log_warmup(self, phase, i, max_i, batch_size, seq_len): - free_mem = format_bytes(HabanaMemoryProfiler.current_free_memory()) + free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory()) logger.info(f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}") def warmup_all_buckets(self, buckets, is_prompt, kv_caches): for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - mem_usage = 100.0 * HabanaMemoryProfiler.current_memory_usage() / HabanaMemoryProfiler.total_memory() + mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage() / HabanaMemoryProfiler.total_device_memory() self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) @@ -966,7 +970,7 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - used_mem = align_workers(mem_prof.consumed_memory, torch.distributed.ReduceOp.MAX) + used_mem = align_workers(mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX) available_mem -= used_mem total_mem += used_mem total_batch_seq += batch_seq @@ -980,14 +984,14 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') - start_mem = HabanaMemoryProfiler.current_memory_usage() + start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) self.warmup_all_buckets(self.decode_buckets, False, kv_caches) if not self.enforce_eager: mem_margin = 1.0 - float(os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = mem_margin * HabanaMemoryProfiler.current_free_memory() + free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory() free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float(os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) prompt_available_memory = prompt_graph_mem_ratio * free_mem @@ -998,7 +1002,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_graphs(decode_strategy, self.decode_buckets, False, kv_caches, decode_available_memory) end_time = time.perf_counter() - end_mem = HabanaMemoryProfiler.current_memory_usage() + end_mem = HabanaMemoryProfiler.current_device_memory_usage() elapsed_time = end_time - start_time logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") self.profiler.end() From d3e64dc17c8c2881a443260d03d968cba333fee0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 6 Jun 2024 14:45:17 +0200 Subject: [PATCH 029/819] Update requirements-hpu.txt (#52) --- requirements-hpu.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/requirements-hpu.txt b/requirements-hpu.txt index 21666eb116c22..339fe989bdb7a 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for HPU code -ray == 2.9.3 +ray == 2.23.0 triton pandas -tabulate \ No newline at end of file +tabulate From 0b70e5075453a366f033db13ae281a624606098b Mon Sep 17 00:00:00 2001 From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com> Date: Fri, 7 Jun 2024 11:19:23 +0200 Subject: [PATCH 030/819] Skip incompatible tests with HPU (#46) * Fix setup.py for HPU * Fix vllm._C import ops -> vllm.hpu import ops * more of the same thing * re-add hpex rmsnorm and rope; but rope is crashing * remove unnecessary comments * add vllm/hpu files * add hpu autodetection * Add HabanaAttention stub * revert accidental changes * revert non-habana backend attention changes * add habana attention/worker/executor, sampling fails now * Restore unnecessarily changed files * enable HabanaMemoryProfiler * Make sampler pass * restore habana fused rope * prefill is now working!!! * fix prefill padding; decode is now working!!!!! * revert accidental changes * remove unused stuff in habana_paged_attn.py * remove diagnostic stuff from llm_engine.py * use HabanaExecutorAsync in async_llm_engine.py * add habana copyright headers to habana_*.py files * fix prefill attention conformance * minor naming fixes * remove naive attention from habana_attn (it never worked anyway) * re-enable profile run * Add fake HPUGraph support * add more metrics * indentation fix * ~~recipe cache metrics don't work lalalala~~ * i'm done with metrics for now * fix corner case in which hl-smi is not available but synapse is * FIXME: temporary setup.py workaround * WIP: add tensor parallelism stubs * habana worker cleanup * tensor parallelism is now working * remove unused files * remove unused func * add hpugraphrunner * improve hpu layernorm * Port pipelined PA * Port context length bucketing * remove cudagraphrunner from hpu runner * restore HPUGraphRunner back from FakeHPUGraphRunner * handle rotary embeddings properly on gaudi3 * oopsie! captured_block_counts was incorrect! 
* captured_block_counts.append doesn't do anything * Restore habana_main KV cache memory layout * fix memory profiler * overhaul hpugraph capture * Enable attention tests * Add geneeric changes * Enable activation tests * Enable cache tests: reshape & cache * Enable layernorm tests * Decouple reshape_and_cache prompt and decode tests and change slot mapping generation in prompt tests * Decrease max seq len in attention UTs * Enable pos_encoding tests * Enable cache copy tests * Remove gpu migration from unit tests * skip incompatible on HPU tests * Fix noisy lines * Update sampling_metadata.py Outdated changes * Update test_cache.py; fix code style * fix attention test after rebase * disable rotary embedding tests for hpu * restore oryginal rotary embedding tests * disable multiple sampling test * disable all metrics tests * disable some models tests * disable some sampler tests * restore recently disabled tests --------- Co-authored-by: Konrad Zawora Co-authored-by: Tomasz Krupa Co-authored-by: Artur Fierka --- tests/async_engine/test_api_server.py | 2 + tests/async_engine/test_openapi_server_ray.py | 3 + .../test_basic_correctness.py | 2 + .../basic_correctness/test_chunked_prefill.py | 2 + tests/basic_correctness/test_preemption.py | 6 + tests/core/block/e2e/test_correctness.py | 7 + tests/core/test_chunked_prefill_scheduler.py | 7 + tests/core/test_scheduler.py | 8 + tests/distributed/test_pynccl.py | 24 ++- tests/distributed/test_pynccl_library.py | 3 + tests/engine/test_computed_prefix_blocks.py | 2 + tests/engine/test_skip_tokenizer_init.py | 2 + tests/engine/test_stop_reason.py | 2 + tests/engine/test_stop_strings.py | 3 + tests/entrypoints/test_openai_server.py | 3 + .../test_server_oot_registration.py | 4 +- tests/kernels/test_activation.py | 25 ++- tests/kernels/test_attention.py | 68 ++++-- tests/kernels/test_cache.py | 193 +++++++++++++++--- tests/kernels/test_layernorm.py | 18 +- tests/kernels/test_moe.py | 3 + tests/kernels/test_pos_encoding.py | 19 +- tests/kernels/test_prefix_prefill.py | 10 +- tests/kernels/test_rand.py | 2 + tests/kernels/test_sampler.py | 4 + tests/lora/test_baichuan.py | 3 + tests/lora/test_chatglm3.py | 4 + tests/lora/test_gemma.py | 4 + tests/lora/test_layer_variation.py | 2 + tests/lora/test_layers.py | 6 + tests/lora/test_llama.py | 4 + tests/lora/test_lora.py | 4 + tests/lora/test_lora_manager.py | 9 + tests/lora/test_punica.py | 4 + tests/lora/test_quant_model.py | 2 + tests/lora/test_worker.py | 4 + tests/metrics/test_metrics.py | 4 + tests/models/test_aqlm.py | 15 +- tests/models/test_big_models.py | 4 +- tests/models/test_fp8.py | 13 +- tests/models/test_gptq_marlin.py | 13 +- tests/models/test_llava.py | 2 + tests/models/test_marlin.py | 13 +- tests/models/test_mistral.py | 2 + tests/models/test_models.py | 3 + tests/models/test_oot_registration.py | 1 + tests/quantization/test_configs.py | 5 +- tests/quantization/test_fp8.py | 10 +- tests/samplers/test_beam_search.py | 2 + tests/samplers/test_logits_processor.py | 2 + tests/samplers/test_logprobs.py | 2 + tests/samplers/test_ranks.py | 2 + tests/samplers/test_rejection_sampler.py | 20 +- tests/samplers/test_sampler.py | 30 +-- tests/samplers/test_seeded_generate.py | 2 + tests/spec_decode/e2e/test_compatibility.py | 5 + tests/spec_decode/e2e/test_logprobs.py | 6 + .../e2e/test_multistep_correctness.py | 12 ++ .../spec_decode/e2e/test_ngram_correctness.py | 5 + tests/spec_decode/test_batch_expansion.py | 2 + tests/spec_decode/test_metrics.py | 6 + tests/spec_decode/test_multi_step_worker.py | 
6 + tests/spec_decode/test_ngram_worker.py | 5 + tests/spec_decode/test_spec_decode_worker.py | 9 +- tests/tensorizer_loader/test_tensorizer.py | 10 + tests/test_config.py | 3 + tests/test_logits_processor.py | 13 +- tests/tokenization/test_detokenize.py | 9 + tests/worker/test_model_runner.py | 6 +- tests/worker/test_swap.py | 4 +- vllm/hpu/cache_ops.py | 4 +- vllm/utils.py | 12 +- 72 files changed, 601 insertions(+), 129 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 7f57d5cf9b182..8b0e79cf9a6ee 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -6,6 +6,7 @@ import pytest import requests +from vllm.utils import is_hpu def _query_server(prompt: str, max_tokens: int = 5) -> dict: @@ -44,6 +45,7 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, uvicorn_process.terminate() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) @pytest.mark.parametrize("worker_use_ray", [False, True]) @pytest.mark.parametrize("engine_use_ray", [False, True]) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 4b97af88012b9..2dd1d74c7eba6 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -10,6 +10,7 @@ # and debugging. import ray import requests +from vllm.utils import is_hpu MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds # any model with a chat template should work here @@ -57,6 +58,8 @@ def __del__(self): @pytest.fixture(scope="session") def server(): + if is_hpu(): + pytest.skip("Skipping test on HPU") ray.init() server_runner = ServerRunner.remote([ "--model", diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index d75279dd9cfa9..27468c6054258 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -5,6 +5,7 @@ import os import pytest +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -13,6 +14,7 @@ VLLM_ATTENTION_BACKEND = "VLLM_ATTENTION_BACKEND" +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index 47d582c726c66..0b66f10f29acc 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,6 +7,7 @@ Run `pytest tests/models/test_chunked_prefill.py`. 
""" import pytest +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -14,6 +15,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index ffb0717b3bfdb..b9d46cb8f5f52 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -10,6 +10,7 @@ from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -21,6 +22,7 @@ "tests/basic_correctness/test_preemption.py`") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -67,6 +69,7 @@ def test_chunked_prefill_recompute( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -102,6 +105,7 @@ def test_preemption( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -139,6 +143,7 @@ def test_swap( f"vLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -183,6 +188,7 @@ def test_swap_infeasible( assert req_outputs[0].outputs[0].finish_reason == "length" +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index c3666da7542b5..00fb9223c742e 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -3,8 +3,10 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -77,6 +79,7 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -138,6 +141,7 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -230,6 +234,7 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [ @@ -300,6 +305,7 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( 
"common_llm_kwargs", [{ @@ -375,6 +381,7 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( assert baseline_token_ids == test_token_ids +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 92498c0014666..ca43a7701ac39 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -7,6 +7,7 @@ from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler from vllm.sequence import Logprob, SequenceGroup +from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -27,6 +28,7 @@ def schedule_and_update_computed_tokens(scheduler): return metas, out +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_simple(): """Verify basic scheduling works.""" block_size = 4 @@ -69,6 +71,7 @@ def test_simple(): assert len(seq_group_meta) == num_seq_group +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunk(): """Verify prefills are chunked properly.""" block_size = 4 @@ -113,6 +116,7 @@ def test_chunk(): assert out.num_batched_tokens == 57 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_complex(): block_size = 4 max_seqs = 60 @@ -176,6 +180,7 @@ def test_complex(): assert running[2].is_prefill() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_maximal_decoding(): """Verify decoding requests are prioritized.""" block_size = 4 @@ -369,6 +374,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert out.blocks_to_swap_out == {} +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_running_prefill_prioritized_over_swap(): block_size = 4 max_seqs = 30 @@ -517,6 +523,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 3f0c918a89abb..3f45d55520934 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -11,6 +11,7 @@ from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, SequenceGroup, SequenceStatus +from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -77,6 +78,7 @@ def test_scheduler_abort_seq_group(): assert scheduler.get_num_unfinished_seq_groups() == 0 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_simple(): block_size = 4 num_seq_group = 4 @@ -144,6 +146,7 @@ def test_scheduler_prefill_prioritized(): assert get_sequence_groups(out) == [seq_group_b] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 @@ -192,6 +195,7 @@ def test_scheduler_schedule_preempt_abort(): assert scheduler.get_num_unfinished_seq_groups() == 1 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_max_seqs(): block_size = 4 num_seq_group = 4 @@ -233,6 +237,7 @@ def test_scheduler_max_seqs(): assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_delay_factor(): block_size = 4 scheduler_config = 
SchedulerConfig(100, 64, 16, delay_factor=0.5) @@ -270,6 +275,7 @@ def test_scheduler_delay_factor(): append_new_token(out, 1) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swapped_out_prioritized(): scheduler = initialize_scheduler(max_num_seqs=6) # best_of=2 * 3 == 6 sequences. @@ -571,6 +577,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_decode_swap_beam_search(): """ Test best_of > 1 swap out blocks @@ -621,6 +628,7 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_schedule_decode_blocks_to_copy_update(): """ Verify blocks_to_copy is updated. diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index b6f461b76ed03..1e0b85a1a17f0 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -3,14 +3,17 @@ import pytest import torch -import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils -from vllm.distributed.communication_op import tensor_model_parallel_all_reduce -from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator, - ncclGetUniqueId) -from vllm.distributed.parallel_state import ( - ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group, - init_distributed_environment, with_pynccl_for_all_reduce) -from vllm.utils import update_environment_variables +from vllm.utils import is_hpu, update_environment_variables + +if not is_hpu(): + import vllm.distributed.device_communicators.pynccl_utils as pynccl_utils + from vllm.distributed.communication_op import tensor_model_parallel_all_reduce + from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator, + ncclGetUniqueId) + from vllm.distributed.parallel_state import ( + ensure_model_parallel_initialized, get_tensor_model_parallel_cpu_group, + init_distributed_environment, with_pynccl_for_all_reduce) + def distributed_run(fn, world_size): @@ -56,6 +59,7 @@ def worker_fn(): assert result == comm.world_size +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl(): @@ -84,6 +88,7 @@ def multiple_tp_worker_fn(): assert result == 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_tp(): @@ -113,6 +118,7 @@ def multiple_tp_with_vllm_worker_fn(): assert result == 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_tp_with_vllm(): @@ -140,12 +146,14 @@ def worker_fn_with_cudagraph(): assert a.mean().cpu().item() == comm.world_size**1 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl_with_cudagraph(): distributed_run(worker_fn_with_cudagraph, 2) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ncclGetUniqueId(): unique_id = ncclGetUniqueId() # `list(unique_id.internal)` is something like this: diff --git a/tests/distributed/test_pynccl_library.py b/tests/distributed/test_pynccl_library.py index ec60a5ed3114d..67533a5866b55 
100644 --- a/tests/distributed/test_pynccl_library.py +++ b/tests/distributed/test_pynccl_library.py @@ -1,6 +1,8 @@ import multiprocessing import tempfile +import pytest +from vllm.utils import is_hpu def target_fn(env, filepath): from vllm.utils import update_environment_variables @@ -9,6 +11,7 @@ def target_fn(env, filepath): nccl_integrity_check(filepath) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_library_file(): # note: don't import vllm.distributed.device_communicators.pynccl # before running this test, otherwise the library file will be loaded diff --git a/tests/engine/test_computed_prefix_blocks.py b/tests/engine/test_computed_prefix_blocks.py index ed35212cc3f11..ec64cdd9749ff 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -3,8 +3,10 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) def test_computed_prefix_blocks(model: str, block_size: int): diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index baa463a316902..169c9186599cd 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -2,8 +2,10 @@ from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index b2f521a8ae4ce..c3109330785b7 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -10,6 +10,7 @@ import transformers from vllm import SamplingParams +from vllm.utils import is_hpu MODEL = "facebook/opt-350m" STOP_STR = "." 
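For context, the conditional-import pattern applied above in tests/distributed/test_pynccl.py reduces to the following sketch; the test body is illustrative only (not part of the patch), but the is_hpu() guard and the pynccl import mirror the change itself.

import pytest

from vllm.utils import is_hpu

if not is_hpu():
    # CUDA-only bindings are imported lazily so that merely collecting the
    # test suite on an HPU host does not fail at import time.
    from vllm.distributed.device_communicators.pynccl import ncclGetUniqueId


@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU")
def test_nccl_unique_id_is_generated():
    # Only reached on CUDA hosts; on HPU the skip fires before the
    # NCCL symbol is ever resolved.
    assert ncclGetUniqueId() is not None

The same skipif(is_hpu(), ...) marker is applied throughout the suite, so HPU runs simply report these cases as skipped instead of failing at import or call time.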
@@ -24,6 +25,7 @@ def vllm_model(vllm_runner): del vllm_model +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 6b747beb4b543..61d3cd485b80d 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -3,6 +3,7 @@ import pytest from vllm import CompletionOutput, LLMEngine, SamplingParams +from vllm.utils import is_hpu MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 @@ -10,6 +11,8 @@ @pytest.fixture(scope="session") def vllm_model(vllm_runner): + if is_hpu(): + pytest.skip("Skipping test on HPU") return vllm_runner(MODEL) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index e53e64a0c1ff8..e5fe2246f9934 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -19,6 +19,7 @@ from openai import BadRequestError from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils import is_hpu MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 60 seconds # any model with a chat template should work here @@ -123,6 +124,8 @@ def zephyr_lora_files(): @pytest.fixture(scope="session") def server(zephyr_lora_files): + if is_hpu(): + pytest.skip("Skipping test on HPU") ray.init() server_runner = ServerRunner.remote([ "--model", diff --git a/tests/entrypoints/test_server_oot_registration.py b/tests/entrypoints/test_server_oot_registration.py index 22e65bf7e7da1..c65d90a80e14c 100644 --- a/tests/entrypoints/test_server_oot_registration.py +++ b/tests/entrypoints/test_server_oot_registration.py @@ -2,13 +2,14 @@ import sys import time +import pytest import torch from openai import OpenAI, OpenAIError from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hpu class MyOPTForCausalLM(OPTForCausalLM): @@ -32,6 +33,7 @@ def server_function(port): runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_oot_registration_for_api_server(): port = get_open_port() server = multiprocessing.Process(target=server_function, args=(port, )) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 86ecc6412c648..7c6ed07ba61f0 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -6,14 +6,18 @@ from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) +from vllm.utils import is_hpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -21,7 +25,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) 
+@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -31,9 +35,15 @@ def test_act_and_mul( seed: int, device: str, ) -> None: + + if is_hpu() and activation != "silu": + pytest.skip("Only SiluAndMul supported on HPU.") + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -54,7 +64,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], @@ -64,9 +74,14 @@ def test_activation( seed: int, device: str, ) -> None: + if is_hpu(): + pytest.skip("GELU not supported on HPU.") + torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 84539205e0ae3..f17a51b0ccf78 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -4,16 +4,21 @@ import pytest import torch from allclose_default import get_default_atol, get_default_rtol -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip +from vllm.utils import get_max_shared_memory_bytes, is_hip, is_hpu +if is_hpu(): + from vllm.hpu import ops, cache_ops + from vllm.hpu import xops + from vllm.hpu.attn_bias import BlockDiagonalCausalMask +else: + from vllm._C import ops, cache_ops + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. # - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 +MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 if not is_hpu() else 128 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. 
NUM_BLOCKS = 4321 # Arbitrary values for testing @@ -34,9 +39,12 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] def ref_masked_attention( @@ -84,7 +92,10 @@ def ref_single_query_cached_kv_attention( block_number = int(block_table[j // block_size]) block_offset = j % block_size - k = key_cache[block_number, :, :, block_offset, :] + if is_hpu(): + k = key_cache[block_number, :, :, block_offset] + else: + k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) keys.append(k) @@ -119,7 +130,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -133,14 +144,24 @@ def test_paged_attention( seed: int, device: str, ) -> None: + if is_hpu(): + if version != "v1": + pytest.skip("Paged attention v2 not supported on HPU") + if kv_cache_dtype != "auto": + pytest.skip("Only auto kv_cache_dtype supported on HPU") + if use_alibi: + pytest.skip("Alibi not supported on HPU") + random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) + query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype, device=device) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 @@ -163,7 +184,7 @@ def test_paged_attention( for _ in range(max_num_blocks_per_seq) ] block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) + block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) # Create the KV caches. key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -177,7 +198,21 @@ def test_paged_attention( # Call the paged attention kernel. output = torch.empty_like(query) - if version == "v1": + + if is_hpu(): + output = ops.paged_attention_v1( + query, + key_cache, + value_cache, + num_kv_heads, + scale, + block_tables, + seq_lens, + block_size, + alibi_slopes, + kv_cache_dtype, + ) + elif version == "v1": ops.paged_attention_v1( output, query, @@ -307,12 +342,13 @@ def ref_multi_query_kv_attention( # TODO(woosuk): Add tests for USE_ALIBI=True. 
+@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, @@ -339,7 +375,8 @@ def test_multi_query_kv_attention( qkv = torch.empty(num_tokens, num_query_heads + 2 * num_kv_heads, head_size, - dtype=dtype) + dtype=dtype, + device=device) qkv.uniform_(-scale, scale) query, key, value = qkv.split( [num_query_heads, num_kv_heads, num_kv_heads], dim=1) @@ -373,4 +410,5 @@ def test_multi_query_kv_attention( ) atol = get_default_atol(output) if is_hip() else 1e-3 rtol = get_default_rtol(output) if is_hip() else 1e-5 + assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 94a577139596e..f8163deb05223 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,14 +1,18 @@ import random from typing import Tuple +import math import pytest import torch -from vllm import _custom_ops as ops -from vllm._C import cache_ops -from vllm.utils import is_hip +from vllm.utils import is_hip, is_hpu +from vllm import _custom_ops as ops +if is_hpu(): + from vllm.hpu import cache_ops +else: + from vllm._C import cache_ops + -COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -22,9 +26,14 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + COPYING_DIRECTION = [('hpu', 'cpu'), ('hpu', 'hpu'), ('cpu', 'hpu')] + DEVICES = ["hpu"] +else: + COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] KV_CACHE_DTYPE = ["auto", "fp8"] @@ -36,8 +45,8 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_copy_blocks( kv_cache_factory, @@ -52,10 +61,15 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: + if is_hpu() and kv_cache_dtype != "auto": + pytest.skip("Only auto kv_cache_dtype supported on HPU") + random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. @@ -78,14 +92,25 @@ def test_copy_blocks( dtype, seed, device) # Clone the KV caches. - cloned_key_caches = [key_cache.clone() for key_cache in key_caches] - cloned_value_caches = [value_cache.clone() for value_cache in value_caches] + cloned_key_caches = [key_cache.clone().to("cpu") for key_cache in key_caches] + cloned_value_caches = [value_cache.clone().to("cpu") for value_cache in value_caches] # Call the copy blocks kernel. 
block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64, device=device).view(-1, 2) - ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) + if is_hpu(): + tmp_block_mapping_dict = {} + for src, dst in block_mapping: + print(src, dst, tmp_block_mapping_dict) + if not tmp_block_mapping_dict.get(src): + tmp_block_mapping_dict[src] = [dst] + continue + tmp_block_mapping_dict[src].append(dst) + + ops.copy_blocks(key_caches, value_caches, tmp_block_mapping_dict) + else: + ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) # Run the reference implementation. for src, dst in block_mapping: @@ -109,7 +134,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( @@ -126,14 +151,20 @@ def test_reshape_and_cache( ) -> None: if not is_hip() and kv_cache_dtype == "fp8": pytest.skip() # This test is not tuned for e5m2 cuda precision + if is_hpu() and kv_cache_dtype != "auto": + pytest.skip("Only auto kv_cache_dtype supported on HPU") random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) + # Create a random slot mapping. - num_slots = block_size * num_blocks - slot_mapping = random.sample(range(num_slots), num_tokens) + blocks = random.sample(range(num_blocks), num_tokens) + offsets = random.choices(range(block_size), k=num_tokens) + slot_mapping = [block * block_size + offset for block, offset in zip(blocks, offsets)] slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) @@ -160,25 +191,29 @@ def test_reshape_and_cache( kv_scale = 1.0 # Call the reshape_and_cache kernel. - ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, - kv_cache_dtype, kv_scale) - - if kv_cache_dtype == "fp8": - result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(key_cache, result_key_cache) - result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(value_cache, result_value_cache) + if is_hpu(): + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.view((1, -1)), "auto", False) + else: + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, "auto") # Run the reference implementation. 
- reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indicies = block_indicies.cpu().tolist() + if is_hpu(): + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) + else: + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices = block_indices.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indicies[i] + block_idx = block_indices[i] block_offset = block_offsets[i] - cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] + if is_hpu(): + cloned_key_cache[block_idx, :, :, block_offset] = reshaped_key[i] + else: + cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": @@ -195,6 +230,7 @@ def test_reshape_and_cache( assert torch.allclose(value_cache, cloned_value_cache) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -202,7 +238,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache_flash( @@ -279,7 +315,7 @@ def test_reshape_and_cache_flash( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( @@ -295,6 +331,8 @@ def test_swap_blocks( device: str, kv_cache_dtype: str, ) -> None: + if is_hpu() and direction[0] == "hpu" and direction[1] == "cpu": + pytest.skip("Skipping test on HPU") if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() if not is_hip() and kv_cache_dtype == "fp8": @@ -303,9 +341,15 @@ def test_swap_blocks( torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) - src_device = device if direction[0] == "cuda" else 'cpu' - dst_device = device if direction[1] == "cuda" else 'cpu' + if is_hpu(): + src_device = device if direction[0] == "hpu" else 'cpu' + dst_device = device if direction[1] == "hpu" else 'cpu' + else: + src_device = device if direction[0] == "cuda" else 'cpu' + dst_device = device if direction[1] == "cuda" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap @@ -341,6 +385,7 @@ def test_swap_blocks( dist_value_caches[0][dst].cpu()) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_hip(), reason="FP8 conversion test requires e4m3") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -348,7 +393,7 @@ def test_swap_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) 
@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_fp8_conversion( num_heads: int, @@ -376,3 +421,87 @@ def test_fp8_conversion( ops.convert_fp8(cache_fp8, converted_cache) assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1) + + +@pytest.mark.skipif(not is_hpu(), reason="This case is HPU-specific") +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", DEVICES) +@torch.inference_mode() +def test_reshape_and_cache_prompt( + kv_cache_factory, + num_tokens: int, + num_heads: int, + head_size: int, + block_size: int, + num_blocks: int, + dtype: torch.dtype, + seed: int, + device: str, +) -> None: + random.seed(seed) + torch.random.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) + torch.set_default_device(device) + + # Create a random slot mapping. + num_block_indices_to_generate = math.ceil(num_tokens / block_size) + block_indices_ = random.sample(range(num_blocks), num_block_indices_to_generate) + block_offsets_ = [] + slot_mapping = [] + for i in block_indices_: + for j in range(block_size): + slot_mapping.append(i * block_size + j) + slot_mapping = slot_mapping[:num_tokens] + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) + + qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) + _, key, value = qkv.unbind(dim=1) + + # Create the KV caches. + key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, + num_heads, head_size, dtype, + None, seed, device) + key_cache, value_cache = key_caches[0], value_caches[0] + + # Clone the KV caches. + cloned_key_cache = key_cache.clone() + cloned_value_cache = value_cache.clone() + + # Call the reshape_and_cache kernel. + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping.view((1, -1)), "auto", True) + + # Run the reference implementation. + if is_hpu(): + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) + else: + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) + block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indices = block_indices.cpu().tolist() + block_offsets = slot_mapping % block_size + block_offsets = block_offsets.cpu().tolist() + for i in range(0, num_tokens): + block_idx = block_indices[i] + block_offset = block_offsets[i] + cloned_key_cache[block_idx, :, :, block_offset] = key[i, :, :] + cloned_value_cache[block_idx, :, :, block_offset] = value[i, :, :] + + # Note: only checking cache areas specified by the slot mapping because + # the implementation may initialize whole blocks even if some of the offsets of the block + # are not present in the slot mapping. 
+ for i in range(0, num_tokens): + block_idx = block_indices[i] + block_offset = block_offsets[i] + assert torch.allclose(key_cache[block_idx, :, :, block_offset], + cloned_key_cache[block_idx, :, :, block_offset]) + assert torch.allclose(value_cache[block_idx, :, :, block_offset], + cloned_value_cache[block_idx, :, :, block_offset]) diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 210d59e4f32fa..54385c6074068 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -2,6 +2,7 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.utils import is_hpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -9,9 +10,12 @@ 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -19,7 +23,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, @@ -29,14 +33,18 @@ def test_rms_norm( seed: int, device: str, ) -> None: + if is_hpu() and dtype == torch.half and add_residual: + pytest.skip("Skipping test on HPU") torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) + elif is_hpu(): + torch.hpu.manual_seed(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x = torch.randn(1, num_tokens, hidden_size, dtype=dtype, device=device) x *= scale residual = torch.randn_like(x) * scale if add_residual else None diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 2356b9ec18b0d..8d52fbaa6cc25 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.utils import is_hpu def torch_moe(a, w1, w2, score, topk): @@ -29,6 +30,7 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -53,6 +55,7 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index bf1856972cf33..49407acdf1a0e 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -6,6 +6,7 @@ from allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.utils import is_hpu IS_NEOX_STYLE = [True, False] 
DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -15,11 +16,15 @@ BATCH_SIZES = [1, 5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -28,7 +33,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -76,6 +81,7 @@ def test_rotary_embedding( rtol=get_default_rtol(out_key)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -84,7 +90,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -138,6 +144,7 @@ def test_batched_rotary_embedding( rtol=get_default_rtol(out_key)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -146,7 +153,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 5a5987e2242fa..9350ab25c0f86 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -3,10 +3,15 @@ import pytest import torch -from xformers import ops as xops -from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask from vllm.attention.ops.prefix_prefill import context_attention_fwd +from vllm.utils import is_hpu +if is_hpu(): + from vllm.hpu import xops + from vllm.hpu.attn_bias import BlockDiagonalCausalFromBottomRightMask +else: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -18,6 +23,7 @@ SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index a4242d22eb489..a96a238834a49 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -5,8 +5,10 @@ from 
vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("use_3d", [True, False]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index e28f809309ec5..4bab8caedbf62 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -10,6 +10,7 @@ sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 @@ -30,6 +31,7 @@ def _uniform_to_exponential_kernel(input, output, n: tl.constexpr): tl.store(output + idx, y) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_uniform_to_exponential(): """Test that we can convert uniform to exponential without div by 0.""" input = torch.tensor([0.0, 1.0 - torch.finfo(torch.float32).eps], @@ -42,6 +44,7 @@ def test_uniform_to_exponential(): assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) @@ -121,6 +124,7 @@ def test_sample_decoding_only(random_sampling, max_best_of, assert sampled_logprobs is None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 5ab863eea94b3..18fc5dfb55367 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -2,6 +2,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -39,6 +40,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -61,6 +63,7 @@ def test_baichuan_lora(baichuan_lora_files): assert output2[i] == expected_lora_output[i] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skip("Requires multiple GPUs") def test_baichuan_tensor_parallel_equality(baichuan_lora_files): # Cannot use as it will initialize torch.cuda too early... 
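The kernel and LoRA test edits above all reuse the same device-selection and seeding idiom; in isolation it looks roughly like this (a sketch only — the test name and tensor shapes are made up for illustration, while the DEVICES construction and seeding branches mirror the patch).

import pytest
import torch

from vllm.utils import is_hpu

# Run on the single HPU when present, otherwise on up to two CUDA devices,
# matching the DEVICES lists used in the kernel tests above.
if is_hpu():
    DEVICES = ["hpu"]
else:
    DEVICES = [
        f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
    ]


@pytest.mark.parametrize("device", DEVICES)
@torch.inference_mode()
def test_elementwise_add(device: str) -> None:
    torch.random.manual_seed(0)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(0)
    elif is_hpu():
        # torch.hpu becomes available once habana_frameworks.torch is loaded.
        torch.hpu.manual_seed(0)
    torch.set_default_device(device)
    x = torch.randn(8, 16)
    assert torch.allclose(x + x, 2 * x)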
diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index bd8cc98ef8ca0..34528c9a6bdcd 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,5 +1,8 @@ +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu MODEL_PATH = "THUDM/chatglm3-6b" @@ -35,6 +38,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 0082c6e74e888..07219b4502822 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,5 +1,8 @@ +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu MODEL_PATH = "google/gemma-7b" @@ -26,6 +29,7 @@ def do_sample(llm, lora_path: str, lora_id: int) -> str: return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 7d37aa6474adc..a4f56a20ce838 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -8,6 +8,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -70,6 +71,7 @@ def do_sample(llm, # step 1: init a base model and serve with LoRA to get the reference results # step 2: merge the same LoRA to the base model, serve the merged model # step 3: compare the results from step 1 and step 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) @pytest.mark.parametrize("rank", [8, 16, 32, 64]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 0eb04f4ccd133..db482c9821c73 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -32,6 +32,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -171,6 +172,7 @@ def create_random_inputs( return inputs, index_mapping, prompt_mapping +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -265,6 +267,7 @@ def create_random_embedding_layer(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") @@ -402,6 +405,7 @@ def create_random_embedding_layer(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -525,6 +529,7 @@ def _pretest(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("orientation", ["row", "column"]) @@ -636,6 +641,7 @@ def create_random_linear_parallel_layer(): atol=atol) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() 
@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("repeats", [1, 2, 3]) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index f5a571e81acba..c12fc1a1cc213 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -3,6 +3,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -36,6 +37,7 @@ def do_sample(llm, lora_path: str, lora_id: int): return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tp_size", [1]) def test_llama_lora(sql_lora_files, tp_size): # Cannot use as it will initialize torch.cuda too early... @@ -80,6 +82,7 @@ def test_llama_lora(sql_lora_files, tp_size): print("removing lora") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skip("Requires multiple GPUs") def test_llama_tensor_parallel_equality(sql_lora_files): # Cannot use as it will initialize torch.cuda too early... @@ -121,6 +124,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files): assert output_tp1 == output_tp4 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and is more conservative""" diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 3415d36b7e341..90363305e137c 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -2,6 +2,7 @@ import torch from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice +from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -21,6 +22,7 @@ } +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -71,6 +73,7 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: manager.reset_lora() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -140,6 +143,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: manager.reset_lora() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index c08eee9910149..5146e22f77d57 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -17,6 +17,7 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.utils import is_hpu EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -26,6 +27,7 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_from_lora_tensors(sql_lora_files): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) @@ -98,6 +100,7 @@ def create_packed_lora( return LoRAModel(lora_id, 8, loras) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_replace_submodules(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "layer1.dense2"] @@ -116,6 +119,7 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) +@pytest.mark.skipif(is_hpu(), 
reason="Skipping test on HPU") def test_lora_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -162,6 +166,7 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[1] == 2 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lora_lru_cache_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -211,6 +216,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[1] == 3 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_lora_model_manager(dist_init, dummy_model): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -289,6 +295,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert all(x is None for x in manager.lora_index_to_id) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -362,6 +369,7 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): # Should remove every LoRA not specified in the request. @@ -432,6 +440,7 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_packed_loras(dist_init, dummy_model_gate_up): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index fd2a1b75f460c..f2d62d5fca0bb 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,6 +4,7 @@ import torch import vllm.lora.punica as punica +from vllm.utils import is_hpu def assert_close(a, b): @@ -102,6 +103,7 @@ def _lora_ref_impl( ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("r", R) @@ -144,6 +146,7 @@ def test_lora_a_extra_shapes(dtype_str, h1, r, seed): assert_close(y_ref, y_our) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) @@ -178,6 +181,7 @@ def test_lora_correctness(dtype_str, h1, h2, seed, device): assert_close(y_ref, y_our) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 3d86a4366aa57..3e8440fd4c25f 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,6 +7,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hpu from .conftest import cleanup @@ -54,6 +55,7 @@ def format_prompt_tuples(prompt): return generated_texts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, model, tp_size): diff --git 
a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..943a9170605c2 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,13 +3,17 @@ import tempfile from unittest.mock import patch +import pytest + from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): worker = Worker( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index e0aa14f165c2d..f8a4da4349a76 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -7,12 +7,14 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -46,6 +48,7 @@ def test_metric_counter_prompt_tokens( f"metric: {metric_count!r}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -78,6 +81,7 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py index a7abc011f57d7..4425fc87e59b9 100644 --- a/tests/models/test_aqlm.py +++ b/tests/models/test_aqlm.py @@ -7,11 +7,15 @@ import torch from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -aqlm_not_supported = (capability < - QUANTIZATION_METHODS["aqlm"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + aqlm_not_supported = (capability < + QUANTIZATION_METHODS["aqlm"].get_min_capability()) +else: + aqlm_not_supported = False # In this test we hardcode prompts and generations for the model so we don't # need to require the AQLM package as a dependency @@ -63,7 +67,7 @@ 'The early bird catches the worm.\nThe early bird catches the' ] - +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(aqlm_not_supported, reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @@ -78,7 +82,6 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - vllm_model = vllm_runner(model, dtype=dtype) vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 3dde498bcd639..f8425598cd756 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -5,6 +5,7 @@ Run `pytest tests/models/test_big_models.py`. 
""" import pytest +from vllm.utils import is_hpu MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -17,6 +18,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -44,7 +46,7 @@ def test_models( assert hf_output_ids == vllm_output_ids, ( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") - +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_model_print( diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index e87a1783a83f1..00bdcf578ed15 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -10,6 +10,7 @@ from vllm import LLM, SamplingParams from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu os.environ["TOKENIZERS_PARALLELISM"] = "true" @@ -43,12 +44,16 @@ ], } -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -fp8_not_supported = (capability < - QUANTIZATION_METHODS["fp8"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + fp8_not_supported = (capability < + QUANTIZATION_METHODS["fp8"].get_min_capability()) +else: + fp8_not_supported = True +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(fp8_not_supported, reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 4d73843f970c4..37930e2708eaa 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -17,15 +17,19 @@ from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -gptq_marlin_not_supported = ( - capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + gptq_marlin_not_supported = ( + capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability()) +else: + gptq_marlin_not_supported = True MODELS = [ # act_order==False, group_size=channelwise @@ -49,6 +53,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(gptq_marlin_not_supported, reason="gptq_marlin is not supported on this GPU type.") diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index f86cd3fa88f5d..1d2e99cd566e5 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -8,6 +8,7 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig +from vllm.utils import is_hpu model_and_vl_config = [ ("llava-hf/llava-1.5-7b-hf", @@ -62,6 +63,7 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str], return sanitized_input_ids, sanitzied_output_str +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("worker_use_ray", [False]) @pytest.mark.parametrize("model_and_config", model_and_vl_config) @pytest.mark.parametrize("dtype", ["half"]) diff --git 
a/tests/models/test_marlin.py b/tests/models/test_marlin.py index fa846d43d0e88..9b3d4bdba775c 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -17,11 +17,15 @@ from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import is_hpu -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] -marlin_not_supported = (capability < - QUANTIZATION_METHODS["marlin"].get_min_capability()) +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] + marlin_not_supported = (capability < + QUANTIZATION_METHODS["marlin"].get_min_capability()) +else: + marlin_not_supported = True @dataclass @@ -40,6 +44,7 @@ class ModelPair: ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(marlin_not_supported, reason="Marlin is not supported on this GPU type.") diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index 7aeff3a913098..2500d572ebefa 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -3,12 +3,14 @@ Run `pytest tests/models/test_mistral.py`. """ import pytest +from vllm.utils import is_hpu MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index e4609620387fa..9fc62f29ed0c9 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -6,6 +6,7 @@ Run `pytest tests/models/test_models.py`.
""" import pytest +from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", @@ -20,6 +21,7 @@ ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -51,6 +53,7 @@ def test_models( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) def test_model_print( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index 50ab06631500b..f03c657dac4a2 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,3 +1,4 @@ +import pytest import torch from vllm import LLM, ModelRegistry, SamplingParams diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 6820b2728e3c9..b125de8906b9b 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -8,7 +8,7 @@ import pytest from vllm.config import ModelConfig - +from vllm.utils import is_hpu @dataclass class ModelPair: @@ -53,7 +53,8 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) def test_auto_gptq(model_arg_exptype: str) -> None: model_path, quantization_arg, expected_type = model_arg_exptype - + if is_hpu() and model_path in ('TheBloke/Llama-2-7B-Chat-GPTQ', 'LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit'): + pytest.skip("Skipping test on HPU") try: model_config = ModelConfig(model_path, model_path, diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 607544a1c8394..c9ee2d5d05fa4 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -7,11 +7,15 @@ from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod +from vllm.utils import is_hpu -capability = torch.cuda.get_device_capability() -capability = capability[0] * 10 + capability[1] - +if not is_hpu(): + capability = torch.cuda.get_device_capability() + capability = capability[0] * 10 + capability[1] +else: + capability = 0 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif( capability < QUANTIZATION_METHODS["fp8"].get_min_capability(), reason="FP8 is not supported on this GPU type.") diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index 2682f284505bd..176371b5cd166 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -6,6 +6,7 @@ import pytest import torch +from vllm.utils import is_hpu # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. 
@@ -16,6 +17,7 @@ MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 3788e9e9752ff..7e95d0fa60c3d 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -2,10 +2,12 @@ import torch from vllm import SamplingParams +from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_logits_processor_force_generate( diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 57d6d2a410ee5..3d2597ab33cce 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -3,10 +3,12 @@ from tests.conftest import VllmRunner from vllm import SamplingParams +from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 5e93238d709ec..81d78ed987dac 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,10 +1,12 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 13b5b80cccfdc..c96862f3231a3 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -7,10 +7,14 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] def mock_causal_accepted_tensor( @@ -38,11 +42,12 @@ def mock_causal_accepted_tensor( return accepted +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", list(range(10))) @pytest.mark.parametrize( "which_tokens_accepted", ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_correct_output_format(which_tokens_accepted: str, seed: int, device: str): @@ -124,10 +129,11 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, assert torch.all(output_token_ids[subsequent_mask] == -1) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("k", list(range(1, 6))) @pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_no_crash_with_varying_dims(k: int, 
vocab_size: int, batch_size: int, device: str): @@ -150,10 +156,11 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, draft_token_ids) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) @pytest.mark.parametrize("which_token_ids", ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_raises_when_vocab_oob(above_or_below_vocab_range: str, which_token_ids: str, device: str): @@ -198,6 +205,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, draft_token_ids) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("seed", list(range(5))) @torch.inference_mode() diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index e4fea165a4d46..570cbe420f592 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -11,7 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter +from vllm.utils import Counter, is_hpu from vllm.worker.model_runner import ModelRunner @@ -44,9 +44,12 @@ def _prepare_test( VOCAB_SIZE = 32000 RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] def _do_sample( @@ -80,7 +83,7 @@ def _do_sample( @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_greedy(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -100,7 +103,7 @@ def test_sampler_all_greedy(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_random(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -125,8 +128,9 @@ def test_sampler_all_random(seed: int, device: str): del model_runner +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_random_seed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -151,8 +155,9 @@ def test_sampler_all_random_seed(seed: int, device: str): del model_runner +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_random_seed_deterministic(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -176,7 +181,7 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_all_beam(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) 
@@ -198,7 +203,7 @@ def test_sampler_all_beam(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_min_tokens_penalty(seed: int, device: str): seq_id_counter = Counter(start=random.randint(0, 100)) set_random_seed(seed) @@ -486,8 +491,9 @@ def run_test_case(*, run_test_case(**test_case) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_mixed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -589,7 +595,7 @@ def test_sampling(model_runner: ModelRunner): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_sampler_top_k_top_p(seed: int, device: str): set_random_seed(seed) batch_size = random.randint(1, 256) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 3cd659cef58da..72a792c2d757f 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -10,6 +10,7 @@ from vllm import SamplingParams from vllm.model_executor.utils import set_random_seed +from vllm.utils import is_hpu MODEL = "facebook/opt-125m" RANDOM_SEEDS = list(range(5)) @@ -22,6 +23,7 @@ def vllm_model(vllm_runner): del vllm_model +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_random_sample_with_seed( vllm_model, diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 60c20ed7db7a3..7786e864d2a4f 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,10 +1,12 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu from .conftest import get_output_from_llm_generator +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -55,6 +57,7 @@ def test_spec_decode_xfail_ray(test_llm_generator): ray.shutdown() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -94,6 +97,7 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): sampling_params) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -146,6 +150,7 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): sampling_params) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("common_llm_kwargs", [{ "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 9572aac7df6e0..882cb8dd9dbac 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,10 +4,12 @@ import pytest from vllm import SamplingParams +from vllm.utils import is_hpu from .conftest import get_logprobs_from_llm_generator +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -45,6 +47,7 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") 
@pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -86,6 +89,7 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator, logprob_rank=num_logprobs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -125,6 +129,7 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -168,6 +173,7 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index f15fcc4746d20..565936dd50c5d 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -34,11 +34,13 @@ from transformers import AutoTokenizer from vllm import SamplingParams +from vllm.utils import is_hpu from .conftest import (get_output_from_llm_generator, run_greedy_equality_correctness_test) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -110,6 +112,7 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, assert actual_tokens.strip() == expected_tokens.strip() +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -148,6 +151,7 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -202,6 +206,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -253,6 +258,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -299,6 +305,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( force_output_len=False) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -343,6 +350,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -387,6 +395,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -434,6 +443,7 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -487,6 +497,7 @@ def test_spec_decode_different_block_size(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -536,6 +547,7 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( 
"common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 44ef400c91d34..bf2641ff2ffbc 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,9 +26,12 @@ import pytest +from vllm.utils import is_hpu + from .conftest import run_greedy_equality_correctness_test +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -70,6 +73,7 @@ def test_ngram_e2e_greedy_correctness(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -119,6 +123,7 @@ def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator, force_output_len=True) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 43cfd78ddb0cc..6013ce17b608f 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -2,6 +2,7 @@ import torch from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer +from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, mock_worker @@ -27,6 +28,7 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): assert next(iterator) > max_seq_id +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index 312878804b86e..d85f264c33099 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -5,8 +5,10 @@ import torch from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_initial_call_returns_none(): """Expect first call to get metrics to return None. """ @@ -25,6 +27,7 @@ def test_initial_call_returns_none(): assert maybe_metrics is None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_second_call_returns_metrics(): """Expect second call to not return None. """ @@ -52,6 +55,7 @@ def test_second_call_returns_metrics(): assert metrics is not None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("rank", [1, 2, 3, 4]) def test_nonzero_rank_noop(rank): """Verify nonzero ranks don't collect metrics. @@ -72,6 +76,7 @@ def test_nonzero_rank_noop(rank): assert metrics is None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_noop_until_time(): """Verify metrics aren't collected until enough time passes. """ @@ -105,6 +110,7 @@ def test_noop_until_time(): assert metrics is not None +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("has_data", [True, False]) def test_initial_metrics_has_correct_values(has_data: bool): """Test correctness of metrics data. 
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index cb2de97a4af94..b907365d6261d 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -8,6 +8,7 @@ from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer +from vllm.utils import is_hpu from vllm.worker.worker import Worker from .utils import (assert_logprobs_dict_allclose, create_batch, @@ -68,6 +69,7 @@ def test_assert_enough_kv_space(num_steps: int): seq_group_metadata.block_tables = original_block_tables +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_single_step(): """Verify the multi step worker produces the same output as the normal @@ -150,6 +152,7 @@ def test_same_output_for_single_step(): assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_multi_step(): """Verify the multi-step worker produces the same output as the normal @@ -269,6 +272,7 @@ def test_same_output_for_multi_step(): single_step_logprobs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences @@ -321,6 +325,7 @@ def test_draft_proposals_full_speculation_len(): assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_no_speculations(): """Verify Top1Proposer correctly handles case where no sequences @@ -358,6 +363,7 @@ def test_draft_proposals_no_speculations(): assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_mixed_k(): """Verify Top1Proposer correctly handles case some sequences can diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index de305c4030aa9..931a7176b4555 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,12 +1,15 @@ import torch +import pytest from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer +from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, create_worker +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_single_no_match(): """Verify our ngram algo find the right candidate in the prompt @@ -63,6 +66,7 @@ def test_ngram_algo_correctness_for_single_no_match(): assert proposals.proposal_lens.tolist() == [0] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_not_match_all(): """Verify our ngram algo find the right candidate in the prompt @@ -139,6 +143,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): assert proposals.proposal_token_ids[4][i] == -1 +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_match_all(): """Verify our ngram algo find the right candidate in the prompt diff --git a/tests/spec_decode/test_spec_decode_worker.py 
b/tests/spec_decode/test_spec_decode_worker.py index ef9d32f73d668..9076ed3ce6eb0 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -14,10 +14,11 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) +from vllm.utils import is_hpu from .utils import create_batch, create_sampler_output_list, mock_worker - +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -50,6 +51,7 @@ def test_correctly_calls_draft_model(k: int, batch_size: int): assert actual_execute_model_data == execute_model_req +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -130,6 +132,7 @@ def test_correctly_calls_target_model(k: int, batch_size: int): assert expected_seen_contexts == seen_contexts +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -218,6 +221,7 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int): assert torch.equal(actual.draft_probs, proposal_probs) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -340,6 +344,7 @@ def test_correctly_formats_output(k: int, batch_size: int): i].output_token +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2]) @pytest.mark.parametrize('batch_size', [1]) @pytest.mark.parametrize('returns_metrics', [True, False]) @@ -436,6 +441,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool): assert args[0] == k or kwargs.get('k', -1) == k +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @torch.inference_mode() @@ -476,6 +482,7 @@ def test_k_equals_zero(k: int, batch_size: int): target_worker.execute_model.assert_called_once_with(execute_model_req) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0, 5]) @pytest.mark.parametrize('batch_size', [0]) @torch.inference_mode() diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index df1db4e6c4001..9426dc2e6d45f 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -14,6 +14,7 @@ from vllm.model_executor.model_loader.tensorizer import ( EncryptionParams, TensorizerConfig, TensorSerializer, is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream) +from vllm.utils import is_hpu prompts = [ "Hello, my name is", @@ -74,6 +75,7 @@ def test_is_vllm_model_without_vllm_in_uri(tensorizer_config): assert result is False +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path): vllm_model = vllm_runner(model_ref) model_path = tmp_path / (model_ref + ".tensors") @@ -99,6 +101,7 @@ def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path): assert outputs == deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") 
@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" @@ -118,6 +121,7 @@ def test_can_deserialize_s3(vllm_runner): assert deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): @@ -151,6 +155,7 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( assert outputs == deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, tmp_path): hf_model = hf_runner(model_ref) @@ -176,6 +181,7 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): from huggingface_hub import snapshot_download @@ -217,6 +223,7 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): assert loaded_vllm_model +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_load_without_tensorizer_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner(model_ref, @@ -224,6 +231,7 @@ def test_load_without_tensorizer_load_format(vllm_runner): tensorizer_uri="test", vllm_tensorized=False)) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_tensorize_vllm_model(tmp_path): # Test serialize command @@ -251,6 +259,7 @@ def test_tensorize_vllm_model(tmp_path): f"\n{result.stdout}\n{result.stderr}") +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_openai_apiserver_with_tensorizer(tmp_path): ## Serialize model @@ -301,6 +310,7 @@ def test_openai_apiserver_with_tensorizer(tmp_path): completion_tokens=5, prompt_tokens=6, total_tokens=11) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_raise_value_error_on_invalid_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner(model_ref, diff --git a/tests/test_config.py b/tests/test_config.py index 19db10630bbae..5b642666e7cca 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,9 @@ +import pytest from vllm.config import ModelConfig +from vllm.utils import is_hpu +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_get_sliding_window(): TEST_SLIDING_WINDOW = 4096 # Test that the sliding window is correctly computed. 
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index 179e8d25a341b..536ba3dc1d6fa 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -9,6 +9,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.utils import is_hpu from vllm.worker.model_runner import ModelRunner @@ -48,13 +49,17 @@ def _prepare_test( RANDOM_SEEDS = list(range(128)) -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] +if is_hpu(): + DEVICES = ["hpu"] +else: + DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) def test_logits_processors(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 9bc9becb2a6f1..b3cd37a43a8ef 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -7,6 +7,7 @@ from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.utils import is_hpu TRUTH = [ "Hello here, this is a simple test", @@ -55,6 +56,8 @@ def _run_incremental_decode(tokenizer, all_input_ids, @pytest.mark.parametrize("skip_special_tokens", (True, False)) def test_decode_streaming(tokenizer_id, truth, with_prompt, skip_special_tokens): + if is_hpu() and tokenizer_id == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) if with_prompt: truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"] @@ -114,6 +117,8 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, tokenizer_name: str) -> List[int]: + if is_hpu() and tokenizer_name == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"] return complete_sequence_token_ids @@ -145,6 +150,8 @@ def test_decode_sequence_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" + if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, logprobs=2) @@ -181,6 +188,8 @@ def test_decode_prompt_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes prompt logprobs correctly.""" + if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": + pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, prompt_logprobs=1) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index e7975d0ef48b9..e54071c8b7dca 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -5,10 +5,11 @@ from vllm.distributed.parallel_state import 
init_distributed_environment from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port +from vllm.utils import get_open_port, is_hpu from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_prompt(batch_size): scheduler_config = SchedulerConfig(100000, @@ -121,6 +122,7 @@ def test_prepare_prompt(batch_size): torch.testing.assert_close(actual, expected) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_decode_cuda_graph(batch_size): model_config = ModelConfig( @@ -212,6 +214,7 @@ def test_prepare_decode_cuda_graph(batch_size): torch.testing.assert_close(actual, expected) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_empty_seq_group(): """Verify prepare prompt and decode returns empty output.""" model_config = ModelConfig( @@ -257,6 +260,7 @@ def distributed_init(): local_rank=0) +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(2, 128))) @pytest.mark.parametrize("enforce_eager", [True, False]) def test_hybrid_batches(batch_size, enforce_eager, distributed_init): diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 4d2d3add27d59..36f9cdb1e59e1 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,11 +1,13 @@ import torch +import pytest from vllm.engine.arg_utils import EngineArgs from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.utils import get_distributed_init_method, get_ip, get_open_port, is_hpu from vllm.worker.worker import Worker +@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swap() -> None: # Configure the engine. 
engine_args = EngineArgs(model="facebook/opt-125m", diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 759fdb65e08ed..4b2e5c9201d30 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -56,8 +56,8 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, i def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) - index_dst = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) + index_src = torch.zeros((1,), dtype=torch.int32, device=src.device) + index_dst = torch.zeros((1,), dtype=torch.int32, device=dst.device) for src_idx, dst_idx in block_mapping.items(): index_src[0] = src_idx index_dst[0] = dst_idx diff --git a/vllm/utils.py b/vllm/utils.py index 6d6d3d4f4590d..19786035cb18e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -409,14 +409,18 @@ def create_kv_caches_with_random( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) scale = head_size**-0.5 - x = 16 // torch.tensor([], dtype=torch_dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + if is_hpu(): + key_cache_shape = (num_blocks, num_heads, head_size, block_size) + else: + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: + cache_dtype = str(cache_dtype) + if cache_dtype in ["auto", "half", "float16", "torch.float16", "torch.bfloat16", "torch.float32"]: key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) @@ -431,7 +435,7 @@ def create_kv_caches_with_random( value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "bfloat16", "float"]: + if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: value_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(value_cache, -scale, scale) From eaa6c06c2cd354880d3bc8b3921a21fdbabc489f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 7 Jun 2024 12:02:29 +0200 Subject: [PATCH 031/819] Update ops.py (#54) --- vllm/hpu/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 10e53312378ad..a93508a50d7a8 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -13,7 +13,7 @@ import vllm.hpu.utils as hpu_utils -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '0') == '1') +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') def silu_and_mul(output, input): From 45fb692a259909ab6f0cb6e2e015e3f2a7228f5a Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Mon, 10 Jun 2024 16:55:00 +0200 Subject: [PATCH 032/819] Add syncs in mixtral weight loader (#55) * Add hpu syncs in model loader to prevent memory peak after loading weights * Remove spaces * Fix typo --- vllm/model_executor/models/mixtral.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 4b602203cee79..e725c4d7cde8a 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -591,7 +591,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): 
weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - + + if is_hpu(): + torch.hpu.synchronize() def all_close_1d(x: torch.Tensor) -> bool: assert len(x.shape) == 1 From 2825ddec9c69ec72b17cd6fb600afaf33ebe2103 Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 11 Jun 2024 09:24:25 +0200 Subject: [PATCH 033/819] HPU: Change KV-cache layout (#56) * HPU: Change KV-cache layout to (num_blocks, block_size, num_heads, head_size) * Fix UTs * Fix UTs - part 2 --- tests/kernels/test_attention.py | 18 ++++++--- tests/kernels/test_cache.py | 20 ++++------ vllm/attention/ops/habana_paged_attn.py | 4 +- vllm/hpu/cache_ops.py | 50 ++++--------------------- vllm/hpu/ops.py | 19 +++++----- vllm/utils.py | 7 +++- vllm/worker/habana_model_runner.py | 3 +- 7 files changed, 45 insertions(+), 76 deletions(-) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index f17a51b0ccf78..b034fd6d8ce32 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -74,9 +74,14 @@ def ref_single_query_cached_kv_attention( alibi_slopes: Optional[torch.Tensor], ) -> None: num_query_heads = query.shape[1] - num_kv_heads = value_cache.shape[1] - head_size = value_cache.shape[2] - block_size = value_cache.shape[3] + if not is_hpu(): + num_kv_heads = value_cache.shape[1] + head_size = value_cache.shape[2] + block_size = value_cache.shape[3] + else: + block_size = value_cache.shape[1] + num_kv_heads = value_cache.shape[2] + head_size = value_cache.shape[3] num_seqs = query.shape[0] block_tables = block_tables.cpu().tolist() @@ -93,13 +98,16 @@ def ref_single_query_cached_kv_attention( block_offset = j % block_size if is_hpu(): - k = key_cache[block_number, :, :, block_offset] + k = key_cache[block_number, block_offset, :, :] else: k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) keys.append(k) - v = value_cache[block_number, :, :, block_offset] + if is_hpu(): + v = value_cache[block_number, block_offset, :, :] + else: + v = value_cache[block_number, :, :, block_offset] values.append(v) keys = torch.stack(keys, dim=0) values = torch.stack(values, dim=0) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index f8163deb05223..db1a8b556f47e 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -102,7 +102,6 @@ def test_copy_blocks( if is_hpu(): tmp_block_mapping_dict = {} for src, dst in block_mapping: - print(src, dst, tmp_block_mapping_dict) if not tmp_block_mapping_dict.get(src): tmp_block_mapping_dict[src] = [dst] continue @@ -191,17 +190,11 @@ def test_reshape_and_cache( kv_scale = 1.0 # Call the reshape_and_cache kernel. - if is_hpu(): - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping.view((1, -1)), "auto", False) - else: - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, "auto") # Run the reference implementation. 
- if is_hpu(): - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) - else: + if not is_hpu(): reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") block_indices = block_indices.cpu().tolist() @@ -211,10 +204,13 @@ def test_reshape_and_cache( block_idx = block_indices[i] block_offset = block_offsets[i] if is_hpu(): - cloned_key_cache[block_idx, :, :, block_offset] = reshaped_key[i] + cloned_key_cache[block_idx, block_offset, :, :] = key[i] else: cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] - cloned_value_cache[block_idx, :, :, block_offset] = value[i] + if is_hpu(): + cloned_value_cache[block_idx, block_offset, :, :] = value[i] + else: + cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": assert torch.allclose(result_key_cache, diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index bd6a58684f567..c8ed500f7af1c 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -41,7 +41,7 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return (num_blocks, num_kv_heads, head_size, block_size) + return (num_blocks, block_size, num_kv_heads, head_size) @staticmethod def split_kv_cache( @@ -86,7 +86,7 @@ def forward_decode( alibi_slopes: Optional[torch.Tensor], kv_scale: float, ) -> torch.Tensor: - block_size = value_cache.shape[3] + block_size = value_cache.shape[1] return ops.paged_attention_v1( query, key_cache, diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 4b2e5c9201d30..56aafd2a4d0a9 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -10,49 +10,13 @@ import habana_frameworks.torch as htorch -def pad_to_full_block(data, block_size, pad_value): - seq_dim = 1 - pad_shape = list(data.shape) - remainder = pad_shape[seq_dim] % block_size - if remainder == 0: - return data - pad_shape[seq_dim] = block_size - remainder - pad = torch.full(pad_shape, pad_value, dtype=data.dtype, device=data.device) - return torch.cat([data, pad], dim=seq_dim) - - -def initialize_cache(data, indices, cache): - block_size = cache.size(-1) - data = data.unflatten(0, (-1, block_size)).permute(0, 2, 3, 1) - indices = indices.unflatten(0, (-1, block_size))[:,0] - cache.index_copy_(0, indices, data) - - -def update_cache(data, indices, offsets, cache): - prev = cache.index_select(0, indices) - idx = offsets.view(-1, 1, 1, 1).expand(-1, data.size(1), data.size(2), -1) - prev.scatter_(-1, idx, data.unsqueeze(-1)) - cache.index_copy_(0, indices, prev) - - -def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt): - block_size = key_cache.size(-1) - assert slot_mapping.dim() == 2, 'This implementation requires unflattened slot_mapping!' 
- - if is_prompt: - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - batch_size, seq_length = block_indices.shape - key = pad_to_full_block(key.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1) - value = pad_to_full_block(value.unflatten(0, (batch_size, seq_length)), block_size, 0).flatten(0, 1) - block_indices = pad_to_full_block(block_indices, block_size, -1).flatten(0, 1) - initialize_cache(key, block_indices, key_cache) - initialize_cache(value, block_indices, value_cache) - else: - slot_mapping = slot_mapping.flatten() - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_offsets = torch.fmod(slot_mapping, block_size) - update_cache(key, block_indices, block_offsets, key_cache) - update_cache(value, block_indices, block_offsets, value_cache) +def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt=False): + block_size = key_cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + key_cache.index_put_((indices, offsets), key) + value_cache.index_put_((indices, offsets), value) def swap_blocks(src, dst, block_mapping): diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index a93508a50d7a8..d4b4c488b1bf2 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -31,23 +31,24 @@ def gelu_fast(output, input): raise NotImplementedError -def fetch_from_cache(cache, blocks): - return [cache.index_select(0, blocks[:, i]) for i in range(blocks.size(1))] +def fetch_from_cache(cache, blocks, permutations): + return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))] @hpu_utils.with_mark_steps def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes, kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape - _, kv_heads, _, _ = key_cache.shape + _, _, kv_heads, _ = key_cache.shape min_inf = torch.finfo(query.dtype).min mask = (torch.arange(0, seq_len * block_size, dtype=torch.int32, device=key_cache.device) .view(1, -1) .expand(batch_size, -1) .ge(context_lens.view(-1, 1)) .view(batch_size, 1, 1, -1)) + query.mul_(scale) query = query.unsqueeze(-2) - keys = fetch_from_cache(key_cache, block_tables) + keys = fetch_from_cache(key_cache, block_tables, (0, 2, 3, 1)) if query_heads != kv_heads: query = query.unflatten(1, (kv_heads, -1)) keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] @@ -55,24 +56,22 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block attn_weights = [torch.matmul(query, k) for k in keys] attn_weights = (torch.cat(attn_weights, dim=-1) - .mul_(scale) .masked_fill(mask, min_inf) .softmax(dim=-1)) - values = fetch_from_cache(value_cache, block_tables) + values = fetch_from_cache(value_cache, block_tables, (0, 2, 1, 3)) if PA_SPLIT_VALUE: attn_weights = attn_weights.split(block_size, dim=-1) else: - values = [torch.cat(values, dim=-1)] + values = [torch.cat(values, dim=-2)] attn_weights = [attn_weights] if query_heads != kv_heads: values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [torch.matmul(a, v.transpose(-1, -2)).squeeze(-2) for a, v in zip(attn_weights, values)] + attn_weights = [torch.matmul(a, v) for a, v in zip(attn_weights, values)] if query_heads != kv_heads: attn_weights = [a.flatten(1, 2) for a in attn_weights] attn_weights = sum(attn_weights) - - 
return attn_weights + return attn_weights.squeeze(-2) def rms_norm(out, hidden_states, weight, eps): diff --git a/vllm/utils.py b/vllm/utils.py index 19786035cb18e..e7a2cde3e0f5d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -410,7 +410,7 @@ def create_kv_caches_with_random( scale = head_size**-0.5 if is_hpu(): - key_cache_shape = (num_blocks, num_heads, head_size, block_size) + key_cache_shape = (num_blocks, block_size, num_heads, head_size) else: x = 16 // torch.tensor([], dtype=torch_dtype).element_size() key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) @@ -429,7 +429,10 @@ def create_kv_caches_with_random( f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) - value_cache_shape = (num_blocks, num_heads, head_size, block_size) + if is_hpu(): + value_cache_shape = (num_blocks, block_size, num_heads, head_size) + else: + value_cache_shape = (num_blocks, num_heads, head_size, block_size) value_caches = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 78290fd59b10a..0eaf6dd9f0b93 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -976,8 +976,7 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): total_batch_seq += batch_seq graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) logger.info(f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}') - - + @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': From b2d0e1e3327a9951bf9eef5452d0829e7380de23 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 11 Jun 2024 12:18:14 +0200 Subject: [PATCH 034/819] Add more detailed event names to profiler (#57) --- vllm/worker/habana_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 0eaf6dd9f0b93..fa3c113c45bf1 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -851,14 +851,14 @@ def execute_model( htorch.core.mark_step() if self.is_driver_worker: - model_event_name = f'model_{base_event_name}_eager_bs{real_batch_size}' + model_event_name = f"model_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): hidden_states = self.model.forward(**execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices, bypass_hpu_graphs=not use_graphs) # Compute the logits. - with self.profiler.record_event('internal', 'compute_logits'): + with self.profiler.record_event('internal', f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, sampling_metadata) htorch.core.mark_step() @@ -868,7 +868,7 @@ def execute_model( return None # Sample the next token. 
- with self.profiler.record_event('internal', 'sample'): + with self.profiler.record_event('internal', f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, From 47c0c5b95c4d8f52f9991a495e3d021e73ea957e Mon Sep 17 00:00:00 2001 From: madamczykhabana <110973826+madamczykhabana@users.noreply.github.com> Date: Tue, 11 Jun 2024 14:05:07 +0200 Subject: [PATCH 035/819] Disable value splitting on G3 (#58) --- vllm/hpu/ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index d4b4c488b1bf2..1f2e07bd59ccb 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -9,11 +9,14 @@ import torch.nn as nn import torch.nn.functional as F import habana_frameworks.torch as htorch +import habana_frameworks.torch.utils.experimental as htexp from typing import List, Optional, Tuple import vllm.hpu.utils as hpu_utils -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') +# FIXME: For some reason splitting value causes DFAs on G3. This needs to be debugged +PA_SPLIT_VALUE_DEFAULT = '0' if (htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi3) else '1' +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', PA_SPLIT_VALUE_DEFAULT) == '1') def silu_and_mul(output, input): From 628869caae26df9363eaf644d7e8cbfc57a633dd Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:17:03 +0200 Subject: [PATCH 036/819] Fix for OOM in Llama 70b (#60) --- vllm/worker/habana_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index fa3c113c45bf1..6a9cb6f066ea1 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -919,8 +919,10 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - seq_len = self.max_model_len // self.max_num_seqs - self.warmup_scenario(self.max_num_seqs, seq_len, True, kv_caches) + max_batch_size = self.prompt_bs_bucket_cfg[-1] + max_seq_len = self.prompt_seq_bucket_cfg[-1] + + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) From ad890f1f51fcc87c0b04ed2ee3a1ab94eeb97baf Mon Sep 17 00:00:00 2001 From: Damian Szwichtenberg Date: Mon, 17 Jun 2024 13:27:18 +0200 Subject: [PATCH 037/819] Enable high-level profiler on multiple instances (#61) --- vllm/worker/profiler.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index 2d47f4349d45a..34221d2553909 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -10,12 +10,12 @@ from contextlib import contextmanager from vllm.logger import init_logger +from vllm.utils import get_vllm_instance_id logger = init_logger(__name__) class FileWriter(threading.Thread): - def __init__(self, filename, event_queue): super().__init__() self.filename = filename @@ -48,13 +48,15 @@ def run(self): class Profiler: profiling_trace_events = queue.Queue() event_tid = {'counter': 1, 'external': 2, 'internal': 3} - filename = 'server_events.json' + vllm_instance_id = get_vllm_instance_id() + filename = 
f'server_events_{vllm_instance_id}.json' event_cache = [] def __init__(self): self.enabled = os.getenv('VLLM_PROFILER_ENABLED', 'false').lower() == 'true' and int( os.getenv('RANK', '0')) == 0 + logger.info(f'Profiler enabled for: {self.vllm_instance_id}') if self.enabled: # initialize the trace file (JSON Array Format) with open(self.filename, 'w') as outfile: From 11f047c1f122710f58929857981d5f270f4a4121 Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:35:14 +0200 Subject: [PATCH 038/819] Add mark steps to prevent oom in static moe op (#65) --- vllm/hpu/ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 1f2e07bd59ccb..fa9d5ff521a6a 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -125,7 +125,6 @@ def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: return out -@hpu_utils.with_mark_steps def static_fused_moe(hidden_states, w1, w2, score, topk): B, D = hidden_states.shape num_experts = w1.shape[0] @@ -142,6 +141,8 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + htorch.core.mark_step() + for expert_idx in range(num_experts): padded_weight = padded_weights[expert_idx] current_state_static = hidden_states.reshape(-1, D) @@ -149,5 +150,6 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) current_hidden_states_static = w_output * padded_weight final_hidden_states += current_hidden_states_static + htorch.core.mark_step() return final_hidden_states.view(-1, D) From fc6d4b4198ad20e9072780b7c87d8b862f80c180 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 24 Jun 2024 19:08:29 +0300 Subject: [PATCH 039/819] post-rebase api adjustments --- vllm/attention/backends/habana_attn.py | 28 +++++---- vllm/engine/llm_engine.py | 11 +++- vllm/model_executor/custom_op.py | 7 ++- vllm/model_executor/layers/layernorm.py | 36 ++++++++++++ vllm/worker/habana_model_runner.py | 78 ++++++++++++++++--------- vllm/worker/habana_worker.py | 3 +- 6 files changed, 118 insertions(+), 45 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 017cf9c8933e5..518cbae81f465 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -3,7 +3,7 @@ ############################################################################### from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, Type +from typing import Any, Dict, List, Optional, Tuple, Type import torch import math @@ -12,8 +12,7 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata, - AttentionMetadataPerStage) + AttentionMetadata) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger @@ -58,7 +57,7 @@ def copy_blocks( @dataclass -class HabanaAttentionMetadata(AttentionMetadataPerStage, HabanaPagedAttentionMetadata): +class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): """Metadata for HabanaAttentionbackend. 
NOTE: Any python object stored here is not updated when it is @@ -133,10 +132,13 @@ def __init__( num_heads: int, head_size: int, scale: float, - num_kv_heads: Optional[int] = None, - alibi_slopes: Optional[List[float]] = None, - sliding_window: Optional[int] = None, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, ) -> None: + self.kv_cache_dtype = kv_cache_dtype self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -161,7 +163,7 @@ def forward( key: torch.Tensor, value: torch.Tensor, kv_cache: Optional[torch.Tensor], - attn_metadata: AttentionMetadata[HabanaAttentionMetadata], + attn_metadata: HabanaAttentionMetadata, kv_scale: float, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -191,10 +193,11 @@ def forward( HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, value_cache, attn_metadata.slot_mapping, - attn_metadata.kv_cache_dtype, + self.kv_cache_dtype, attn_metadata.prefill_metadata is not None) - if prefill_meta := attn_metadata.prefill_metadata: + if attn_metadata.num_prefills > 0: + prefill_meta = attn_metadata # Prompt run. if kv_cache is None or prefill_meta.block_tables.numel() == 0: # TODO: move this outside of model @@ -225,7 +228,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, ) - if decode_meta := attn_metadata.decode_metadata: + if attn_metadata.num_decode_tokens > 0: + decode_meta = attn_metadata # Decoding run. output = HabanaPagedAttention.forward_decode( query, @@ -233,7 +237,7 @@ def forward( value_cache, decode_meta.block_tables, decode_meta.seq_lens_tensor, - attn_metadata.kv_cache_dtype, + self.kv_cache_dtype, self.num_kv_heads, self.scale, self.alibi_slopes, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3dfd38ce60b91..810e64a873647 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -348,7 +348,7 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor elif engine_config.device_config.device_type == "hpu": - if engine_config.parallel_config.worker_use_ray: + if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_habana_executor import RayHabanaExecutor executor_class = RayHabanaExecutor @@ -796,7 +796,6 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: request_outputs = self._process_model_outputs( output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups, seq_group_metadata_list) - # Log stats. self.do_log_stats(scheduler_outputs, output) @@ -808,6 +807,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # queued control plane messages, such as add/remove lora adapters. 
self.model_executor.stop_remote_worker_execution_loop() + out_prompt = [ro.prompt for ro in request_outputs] + out_indices = [ro.outputs[-1].index for ro in request_outputs] + out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] + for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): + logger.info(f'\tPROMPT ({idx}): {p}') + logger.info(f'\tGEN IDX ({idx}): {i}') + logger.info(f'\tGEN TXT ({idx}): {t}') + logger.info('') return request_outputs def do_log_stats( diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 0db72d8d95f24..5276ada2a3086 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,6 +1,6 @@ import torch.nn as nn -from vllm.utils import is_cpu, is_hip, is_tpu, is_xpu +from vllm.utils import is_cpu, is_hip, is_hpu, is_tpu, is_xpu class CustomOp(nn.Module): @@ -31,6 +31,9 @@ def forward_hip(self, *args, **kwargs): def forward_xpu(self, *args, **kwargs): raise NotImplementedError + def forward_hpu(self, *args, **kwargs): + return self.forward_cuda(*args, **kwargs) + def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. return self.forward_cuda(*args, **kwargs) @@ -54,6 +57,8 @@ def dispatch_forward(self): return self.forward_hip elif is_cpu(): return self.forward_cpu + elif is_hpu(): + return self.forward_hpu elif is_tpu(): return self.forward_tpu elif is_xpu(): diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index d8b25ea9566e3..43015068b6685 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -73,6 +73,41 @@ def forward_cuda( ) return out + def forward_hpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm._ipex_ops import ipex_ops as ops + + if residual is not None: + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: FusedRMSNorm requires 3D tensors as inputs + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype).view(orig_shape), residual + ops.fused_add_rms_norm( + x, + residual, + self.weight.data, + self.variance_epsilon, + ) + return x, residual + if x.device.type == "hpu" and FusedRMSNorm: + orig_dtype = x.dtype + x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + return x.to(orig_dtype) + out = torch.empty_like(x) + ops.rms_norm( + out, + x, + self.weight.data, + self.variance_epsilon, + ) + return out + def forward_xpu( self, x: torch.Tensor, @@ -108,6 +143,7 @@ def forward_xpu( ) return out + def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6a9cb6f066ea1..56eaaa490b025 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -15,12 +15,11 @@ import torch import habana_frameworks.torch as htorch -from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage, - get_attn_backend) +from vllm.attention import (AttentionMetadata, get_attn_backend) from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -from 
vllm.distributed.parallel_state import get_cpu_world_group +#from vllm.distributed.parallel_state import get_cpu_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -98,13 +97,14 @@ def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[s def align_workers(value, op): - group = get_cpu_world_group() - world_size = torch.distributed.get_world_size() - if world_size <= 1: - return value - value_t = torch.tensor(value, device='cpu') - torch.distributed.all_reduce(value_t, op=op, group=group) - return value_t.item() + #group = get_cpu_world_group() + #world_size = torch.distributed.get_world_size() + #if world_size <= 1: + # return value + #value_t = torch.tensor(value, device='cpu') + #torch.distributed.all_reduce(value_t, op=op, group=group) + #return value_t.item() + return 0 class HpuModelAdapter(): @@ -112,7 +112,7 @@ def __init__(self, model): self.model = model def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): - prefill_metadata = attn_metadata.prefill_metadata + prefill_metadata = attn_metadata if prefill_metadata is None: return attn_metadata #FIXME: Restore alibi support @@ -132,8 +132,9 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): .masked_fill_(mask, -math.inf)) #FIXME: Restore sliding window support #if self.sliding_window is not None: - prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) - attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + #prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) +# attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + attn_metadata.attn_bias = attn_bias return attn_metadata else: # FIXME: This needs updating... 
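For reference, the additive bias assembled in _set_attn_bias above reduces to a standard causal mask broadcast over the batch. A minimal sketch with toy shapes (make_causal_bias is an illustrative helper, not part of the runner, and it omits any per-sequence length or padding handling the runner may also fold into the mask):

    import math
    import torch

    def make_causal_bias(batch_size, seq_len, device, dtype):
        # Positions strictly above the diagonal may not be attended to.
        mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device),
                          diagonal=1)
        bias = torch.zeros(batch_size, 1, seq_len, seq_len, device=device, dtype=dtype)
        # The same causal pattern is broadcast across every sequence and head.
        return bias.masked_fill_(mask, -math.inf)

    attn_bias = make_causal_bias(2, 8, device='cpu', dtype=torch.bfloat16)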
@@ -149,6 +150,7 @@ def forward(self, *args, **kwargs): kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) + import pdb; pdb.set_trace() hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) @@ -164,7 +166,7 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): input_tokens: List[int] input_positions: List[int] - attn_metadata: Optional[AttentionMetadataPerStage] + attn_metadata: Optional[AttentionMetadata] seq_lens: List[int] query_lens: List[int] lora_index_mapping: List[int] @@ -241,6 +243,7 @@ def __init__( self.scheduler_config = scheduler_config self.lora_config = lora_config self.load_config = load_config + self.cache_config = cache_config self.is_driver_worker = is_driver_worker self.profiler = Profiler() @@ -261,7 +264,14 @@ def __init__( self.vision_language_config = vision_language_config self.attn_backend = get_attn_backend( - self.model_config.dtype if model_config is not None else None) + self.model_config.get_num_attention_heads(self.parallel_config), + self.model_config.get_head_size(), + self.model_config.get_num_kv_heads(self.parallel_config), + self.model_config.get_sliding_window(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + ) # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None @@ -280,6 +290,7 @@ def load_model(self) -> None: vision_language_config=self.vision_language_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, + cache_config=self.cache_config ) logger.info(f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}") @@ -447,6 +458,8 @@ def _prepare_prompt( slot_mapping[-1].append(slot) max_query_len = max(query_lens) + sum_query_len = sum(query_lens) + real_num_seqs = len(query_lens) assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, @@ -514,6 +527,10 @@ def _prepare_prompt( context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, + num_prefills=real_num_seqs, + num_prefill_tokens=sum_query_len, + num_decode_tokens=0, + slot_mapping=slot_mapping ) return PreparePromptMetadata( input_tokens=input_tokens, @@ -593,7 +610,7 @@ def _prepare_decode( seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int, device=self.device) - + num_decode_tokens = sum(seq_lens) max_block_table_len = max( len(block_table) for block_table in block_tables) block_tables = make_tensor_with_pad( @@ -613,6 +630,10 @@ def _prepare_decode( context_lens_tensor=None, block_tables=block_tables, use_cuda_graph=False, + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=num_decode_tokens, + slot_mapping=slot_mapping ) return PrepareDecodeMetadata( input_tokens=input_tokens, @@ -772,25 +793,26 @@ def prepare_input_tensors( decode_attn_metadata = self.attn_backend.make_metadata( **metadata_dict) - attn_metadata = AttentionMetadata( - num_prefills=num_prefills, - slot_mapping=slot_mapping, - num_prefill_tokens=num_prefill_tokens, - num_decode_tokens=num_decode_tokens, - prefill_metadata=prefill_attn_metadata, - decode_metadata=decode_attn_metadata, - kv_cache_dtype=self.kv_cache_dtype, - ) + attn_metadata = prefill_attn_metadata if prefill_attn_metadata is not None else 
decode_attn_metadata +# attn_metadata = AttentionMetadata( +# num_prefills=num_prefills, +# slot_mapping=slot_mapping, +# num_prefill_tokens=num_prefill_tokens, +# num_decode_tokens=num_decode_tokens, +# prefill_metadata=prefill_attn_metadata, +# decode_metadata=decode_attn_metadata, +# kv_cache_dtype=self.kv_cache_dtype, +# ) return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input) def _seq_len(self, attn_metadata): - if attn_metadata.prefill_metadata: + if attn_metadata.num_prefills != 0: return attn_metadata.slot_mapping.size(1) else: - return attn_metadata.decode_metadata.block_tables.size(1) * self.block_size + return attn_metadata.block_tables.size(1) * self.block_size def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: prefill_metadata = subtuple(metadata.prefill_metadata, @@ -844,7 +866,7 @@ def execute_model( "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), + "attn_metadata": attn_metadata, } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index e253e4479a855..7abaa155708c6 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -15,7 +15,6 @@ VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, - get_tensor_model_parallel_cpu_group, init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed @@ -159,7 +158,7 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self) -> None: assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config) + self.parallel_config, self.device_config) self.hpu_cache = self.cache_engine.gpu_cache htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution From 07a200e60e8dc5bce66e240948cb13a6ff121ede Mon Sep 17 00:00:00 2001 From: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:13:15 +0200 Subject: [PATCH 040/819] Add Mistal&Mixtral supported configurations (#64) --- README_GAUDI.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README_GAUDI.md b/README_GAUDI.md index 24d3fe0761f54..3b72ad71069c4 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -100,6 +100,10 @@ The following configurations have been validated to be function with Gaudi devic - [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) with tensor parallelism 8x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) + on single HPU or with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) + with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling From d12bff7f0d68084b9c851616933254db1ecab901 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 15:40:06 +0300 Subject: [PATCH 041/819] add pin_lora to habana components --- vllm/executor/habana_executor.py | 5 ++++- vllm/worker/habana_worker.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git 
a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index cfad194bf9cca..cbb30e39e11a4 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -113,7 +113,7 @@ def execute_model( output = self.driver_worker.execute_model(execute_model_req) return output - + def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") @@ -123,6 +123,9 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> List[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def pin_lora(self) -> List[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + def check_health(self) -> None: # GPUExecutor will always be healthy as long as # it's running. diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 7abaa155708c6..1a82aa9ef7738 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -235,6 +235,9 @@ def remove_lora(self, lora_id: int) -> bool: def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + @property def max_model_len(self) -> int: return self.model_config.max_model_len From efce3c48f3752c652d3a17504a56c7df4ed34f6a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 15:57:10 +0300 Subject: [PATCH 042/819] add WA for model loader --- vllm/engine/llm_engine.py | 16 ++++++++-------- vllm/model_executor/model_loader/loader.py | 3 ++- vllm/worker/habana_model_runner.py | 1 - 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 4c76649a2d862..aadb8e08de5d4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -832,14 +832,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # queued control plane messages, such as add/remove lora adapters. self.model_executor.stop_remote_worker_execution_loop() - out_prompt = [ro.prompt for ro in request_outputs] - out_indices = [ro.outputs[-1].index for ro in request_outputs] - out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] - for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): - logger.info(f'\tPROMPT ({idx}): {p}') - logger.info(f'\tGEN IDX ({idx}): {i}') - logger.info(f'\tGEN TXT ({idx}): {t}') - logger.info('') +# out_prompt = [ro.prompt for ro in request_outputs] +# out_indices = [ro.outputs[-1].index for ro in request_outputs] +# out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] +# for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): +# logger.info(f'\tPROMPT ({idx}): {p}') +# logger.info(f'\tGEN IDX ({idx}): {i}') +# logger.info(f'\tGEN TXT ({idx}): {t}') +# logger.info('') return request_outputs def do_log_stats( diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d3babcf9c3451..9f41b3e28e6ed 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -257,7 +257,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): + with torch.device('cpu'): # FIXME(kzawora): this is a nasty workaround!!! 
model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config, cache_config) @@ -277,6 +277,7 @@ def load_model(self, *, model_config: ModelConfig, # to use quant_method. if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() + model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! return model.eval() diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 56eaaa490b025..4c6b6600397b4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -150,7 +150,6 @@ def forward(self, *args, **kwargs): kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) - import pdb; pdb.set_trace() hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) From c1e758927a4735f3c12bbe65e40a29317563305a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 17:24:43 +0300 Subject: [PATCH 043/819] fix api mismatches with ray --- vllm/executor/ray_habana_executor.py | 8 ++++++-- vllm/worker/habana_model_runner.py | 4 ---- vllm/worker/habana_worker.py | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index a17f509f11658..21d7c5ffceff2 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -31,7 +31,7 @@ def _init_executor(self) -> None: assert (not self.speculative_config ), "Speculative decoding not yet supported for RayGPU backend." - assert self.parallel_config.worker_use_ray + assert self.parallel_config.distributed_executor_backend == "ray" placement_group = self.parallel_config.placement_group # Disable Ray usage stats collection. @@ -146,7 +146,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_concurrent_workers=self.parallel_config. max_parallel_loading_workers) - def execute_model( + def _driver_execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: all_outputs = self._run_workers( @@ -273,6 +273,10 @@ def _check_if_any_actor_is_dead(self): raise RuntimeError("At least one Worker is dead. " f"Dead Workers: {dead_actors}. 
") + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4c6b6600397b4..93a44654f5375 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -758,15 +758,11 @@ def prepare_input_tensors( metadata_dict = broadcast_tensor_dict(src=0) input_tokens = metadata_dict.pop("input_tokens") input_positions = metadata_dict.pop("input_positions") - slot_mapping = metadata_dict.pop("slot_mapping") - num_prefills = metadata_dict.pop("num_prefills") selected_token_indices = metadata_dict.pop( "selected_token_indices") lora_mapping = metadata_dict.pop("lora_mapping") lora_requests = metadata_dict.pop("lora_requests") multi_modal_input = metadata_dict.pop("multi_modal_input") - num_prefill_tokens = metadata_dict.pop("num_prefill_tokens") - num_decode_tokens = metadata_dict.pop("num_decode_tokens") batch_type = metadata_dict.pop("batch_type") # Create an attention metadata. diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 1a82aa9ef7738..0d42304d3b47a 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -12,7 +12,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig) + VisionLanguageConfig, SpeculativeConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) @@ -45,6 +45,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, vision_language_config: Optional[VisionLanguageConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config From 58bd037c7ac570c5139aa4f9e36578955f92ac8b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 18:21:13 +0300 Subject: [PATCH 044/819] tensor parallel fixes --- vllm/_custom_ops.py | 12 +-- vllm/executor/ray_habana_executor.py | 82 +++++++++---------- .../model_executor/layers/logits_processor.py | 2 +- vllm/worker/habana_worker.py | 34 ++++++++ 4 files changed, 82 insertions(+), 48 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e899c57ce0e86..57a96c9f988b5 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -46,10 +46,12 @@ def wrapper(*args, **kwargs): return wrapper _ops = torch.ops._C +_cache_ops = torch.ops._C_cache_ops if importlib.util.find_spec('habana_frameworks') is not None: from vllm.hpu import ops as vllm_ops from vllm.hpu import cache_ops as vllm_cache_ops _ops = vllm_ops + _cache_ops = vllm_cache_ops # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: @@ -369,7 +371,7 @@ def reshape_and_cache( kv_cache_dtype: str, kv_scale: float, ) -> None: - torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, + _cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, kv_scale) @@ -382,7 +384,7 @@ def reshape_and_cache_flash( slot_mapping: torch.Tensor, kv_cache_dtype: str, ) -> None: - torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, + _cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) 
@@ -390,19 +392,19 @@ def reshape_and_cache_flash( def copy_blocks(key_caches: List[torch.Tensor], value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + _cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: - torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) + _cache_ops.swap_blocks(src, dst, block_mapping) def convert_fp8(output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8") -> None: - torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype) + _cache_ops.convert_fp8(output, input, scale, kv_dtype) def get_device_attribute(attribute: int, device: int) -> int: diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 21d7c5ffceff2..b9c800e85728b 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -147,22 +147,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_parallel_loading_workers) def _driver_execute_model( - self, - execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - all_outputs = self._run_workers( - "execute_model", - driver_kwargs={"execute_model_req": execute_model_req}, - use_ray_compiled_dag=USE_RAY_COMPILED_DAG) + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + """Run execute_model in the driver worker. - # Only the driver worker returns the sampling results. - return all_outputs[0] + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. + """ + return self.driver_worker.execute_method("execute_model", + execute_model_req) def _run_workers( self, method: str, *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, + async_run_remote_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, @@ -173,9 +173,11 @@ def _run_workers( """Runs the given method on all workers. Can be used in the following ways: + - async_run_remote_workers_only: If True the method will be run only + in the remote workers, not the driver worker. It will also be + run asynchronously and return a list of futures rather than blocking + on the results. - args/kwargs: All workers share the same args/kwargs - - args/kwargs and driver_args/driver_kwargs: Driver worker has - different args - all_args/all_kwargs: args/kwargs for each worker are specified individually """ @@ -184,11 +186,6 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - if driver_args is None: - driver_args = args if all_args is None else all_args[0] - if driver_kwargs is None: - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - count = len(self.workers) all_worker_args = repeat(args, count) if all_args is None \ else islice(all_args, 1, None) @@ -200,6 +197,7 @@ def _run_workers( # input. TODO(sang): Fix it. assert self.forward_dag is not None output_channels = self.forward_dag.execute(1) + ray_worker_outputs = [] else: # Start the ray workers first. 
ray_worker_outputs = [ @@ -209,6 +207,13 @@ def _run_workers( ) in zip(self.workers, all_worker_args, all_worker_kwargs) ] + if async_run_remote_workers_only: + # Just return futures + return ray_worker_outputs + + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + # Start the driver worker after all the ray workers. if not use_dummy_driver: driver_worker_output = self.driver_worker.execute_method( @@ -235,6 +240,11 @@ def _run_workers( return [driver_worker_output] + ray_worker_outputs + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + def _compiled_ray_dag(self): import pkg_resources required_version = "2.9" @@ -282,30 +292,18 @@ class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.driver_executor = make_async(self.driver_worker.execute_method) + self.driver_exec_method = make_async(self.driver_worker.execute_method) - async def _run_workers_async( + async def _driver_execute_model_async( self, - method: str, - *args, - driver_args: Optional[Tuple[Any, ...]] = None, - driver_kwargs: Optional[Dict[str, Any]] = None, - **kwargs, - ) -> Any: - """Runs the given method on all workers.""" - coros = [] - - if driver_args is None: - driver_args = args - if driver_kwargs is None: - driver_kwargs = kwargs - - coros.append( - self.driver_executor(method, *driver_args, **driver_kwargs)) - - # Run the ray workers asynchronously. - for worker in self.workers: - coros.append(worker.execute_method.remote(method, *args, **kwargs)) - - all_outputs = await asyncio.gather(*coros) - return all_outputs + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + return await self.driver_exec_method("execute_model", + execute_model_req) + + async def _start_worker_execution_loop(self): + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.workers + ] + return await asyncio.gather(*coros) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 607f93d8b335d..de0f9d36dce87 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -54,7 +54,7 @@ def forward( # NOTE(kzawora): allgather on HPU will cause logits to be not None, # and we need to guard against applying logits processors on non-driver worker #if logits is not None and sampling_metadata.seq_groups is not None: - if logits is not None: + if logits is not None and sampling_metadata.seq_groups is not None: if self.scale != 1.0: logits *= self.scale diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 0d42304d3b47a..e1c374124633f 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -227,6 +227,40 @@ def execute_model( self.hpu_cache) return [output] + @torch.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop in parallel worker. + + You can stop the loop by executing a driver worker with an empty output. + See `stop_remote_worker_execution_loop` for more details. + """ + while self._execute_model_non_driver(): + pass + + def _execute_model_non_driver(self) -> bool: + """Execute model in parallel worker. 
+ + Returns True iff there are remaining sequences to process. + """ + assert not self.is_driver_worker + data = broadcast_tensor_dict(src=0) + if not data: + return False + + num_seq_groups = data.get("num_seq_groups", 0) + blocks_to_swap_in = data.get("blocks_to_swap_in") + blocks_to_swap_out = data.get("blocks_to_swap_out") + blocks_to_copy = data.get("blocks_to_copy") + self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) + + # If there is no input, we don't need to execute the model. + if num_seq_groups == 0: + return False + + self.model_runner.execute_model(None, self.hpu_cache) + return True + + def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") From 1d6409b0e327cb24abaf044b7367323e8d7b3309 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 18:41:33 +0300 Subject: [PATCH 045/819] workers cpu alignment fix --- vllm/worker/habana_model_runner.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 93a44654f5375..2c243ade40a23 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -19,7 +19,7 @@ from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict -#from vllm.distributed.parallel_state import get_cpu_world_group +from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -97,14 +97,13 @@ def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[s def align_workers(value, op): - #group = get_cpu_world_group() - #world_size = torch.distributed.get_world_size() - #if world_size <= 1: - # return value - #value_t = torch.tensor(value, device='cpu') - #torch.distributed.all_reduce(value_t, op=op, group=group) - #return value_t.item() - return 0 + group = get_world_group().cpu_group + world_size = torch.distributed.get_world_size() + if world_size <= 1: + return value + value_t = torch.tensor(value, device='cpu') + torch.distributed.all_reduce(value_t, op=op, group=group) + return value_t.item() class HpuModelAdapter(): From 952b7c4ffa5a208b8f82de701f8d4a4c12913ec5 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 20:11:05 +0300 Subject: [PATCH 046/819] prefill/decode metadata fixes --- vllm/attention/backends/habana_attn.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 518cbae81f465..7d5fb5146cc2f 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -194,21 +194,20 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - attn_metadata.prefill_metadata is not None) + attn_metadata.num_prefills > 0) if attn_metadata.num_prefills > 0: - prefill_meta = attn_metadata # Prompt run. - if kv_cache is None or prefill_meta.block_tables.numel() == 0: + if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model - assert prefill_meta.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' 
query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) out = xops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), - attn_bias=prefill_meta.attn_bias, + attn_bias=attn_metadata.attn_bias, p=0.0, scale=self.scale, ) @@ -221,22 +220,21 @@ def forward( value, key_cache, value_cache, - prefill_meta.block_tables, - prefill_meta.subquery_start_loc, - prefill_meta.seq_lens_tensor, - prefill_meta.context_lens_tensor, - prefill_meta.max_query_len, + attn_metadata.block_tables, + attn_metadata.subquery_start_loc, + attn_metadata.seq_lens_tensor, + attn_metadata.context_lens_tensor, + attn_metadata.max_query_len, self.alibi_slopes, ) if attn_metadata.num_decode_tokens > 0: - decode_meta = attn_metadata # Decoding run. output = HabanaPagedAttention.forward_decode( query, key_cache, value_cache, - decode_meta.block_tables, - decode_meta.seq_lens_tensor, + attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, self.kv_cache_dtype, self.num_kv_heads, self.scale, From cf04c81db857d73028d4959d3053229d7015467d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 20:25:00 +0300 Subject: [PATCH 047/819] re-enable attn metadata trimming --- vllm/worker/habana_model_runner.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2c243ade40a23..da6dfb847a6ed 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -131,9 +131,7 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): .masked_fill_(mask, -math.inf)) #FIXME: Restore sliding window support #if self.sliding_window is not None: - #prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) -# attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) - attn_metadata.attn_bias = attn_bias + attn_metadata = attn_metadata._replace(attn_bias=attn_bias) return attn_metadata else: # FIXME: This needs updating... 
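A single metadata object now carries both prefill and decode information, the backend branches on its counters, and derived fields such as attn_bias are attached via _replace because the tuple is immutable. A toy sketch of that pattern (ToyMetadata is illustrative, not the real HabanaAttentionMetadata):

    from typing import NamedTuple, Optional
    import torch

    class ToyMetadata(NamedTuple):
        num_prefills: int
        num_decode_tokens: int
        attn_bias: Optional[torch.Tensor] = None

    meta = ToyMetadata(num_prefills=2, num_decode_tokens=0)
    # NamedTuples are immutable, so derived fields are attached by rebuilding the tuple.
    meta = meta._replace(attn_bias=torch.zeros(2, 1, 8, 8))

    if meta.num_prefills > 0:
        assert meta.attn_bias is not None, 'attn_bias must be set before the prompt path'
    elif meta.num_decode_tokens > 0:
        pass  # decode path would run here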
@@ -809,22 +807,15 @@ def _seq_len(self, attn_metadata): return attn_metadata.block_tables.size(1) * self.block_size def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: - prefill_metadata = subtuple(metadata.prefill_metadata, - 'TrimmedPrefillMetadata', + prefill_metadata = subtuple(metadata, + 'TrimmedAttentionMetadata', ['block_tables', 'seq_lens_tensor', - 'attn_bias']) - decode_metadata = subtuple(metadata.decode_metadata, - 'TrimmedDecodeMetadata', - ['block_tables', - 'seq_lens_tensor', - ]) - return subtuple(metadata, - 'TrimmedMetadata', - ['slot_mapping', - 'kv_cache_dtype'], - {'prefill_metadata': prefill_metadata, - 'decode_metadata': decode_metadata}) + 'attn_bias', + 'num_prefills', + 'num_decode_tokens', + 'slot_mapping']) + return prefill_metadata @torch.inference_mode() def execute_model( @@ -860,7 +851,7 @@ def execute_model( "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, - "attn_metadata": attn_metadata, + "attn_metadata": self.trim_attn_metadata(attn_metadata), } if self.vision_language_config: execute_model_kwargs.update({"image_input": multi_modal_input}) From 2b850fe749946c132524eba90d2a63995b22b52e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 25 Jun 2024 20:47:34 +0300 Subject: [PATCH 048/819] worker_use_ray fix --- vllm/engine/async_llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 84d8b7913b4e0..a35820d36d322 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -390,7 +390,7 @@ def from_engine_args( from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync elif engine_config.device_config.device_type == "hpu": - if engine_config.parallel_config.worker_use_ray or engine_args.engine_use_ray: + if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) from vllm.executor.ray_habana_executor import RayHabanaExecutorAsync executor_class = RayHabanaExecutorAsync From 27285993ca35682d72275011c0c3308a2fa62961 Mon Sep 17 00:00:00 2001 From: jkaniecki <153085639+jkaniecki@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:27:10 +0200 Subject: [PATCH 049/819] Update ops.py (#72) --- vllm/hpu/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index fa9d5ff521a6a..54dd4332902a3 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -130,6 +130,7 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): num_experts = w1.shape[0] routing_weights = F.softmax(score, dim=1, dtype=torch.float32) routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(hidden_states.dtype) final_hidden_states = torch.zeros( (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device From 4a45bbfd28417da67acbea4e5ac7eb5d673be7a8 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:05:21 +0200 Subject: [PATCH 050/819] Revert "Disable value splitting on G3 (#58)" (#74) This reverts commit 47c0c5b95c4d8f52f9991a495e3d021e73ea957e. 
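For context: PA_SPLIT_VALUE only changes how paged_attention_v1 combines the softmaxed attention weights with the cached value blocks, either as a single matmul against the concatenated blocks or as one matmul per block followed by a sum. Both paths are equivalent up to floating-point accumulation order, as the toy sketch below illustrates (shapes are arbitrary):

    import torch

    torch.manual_seed(0)
    num_blocks, block_size, heads, head_dim = 3, 4, 2, 8
    attn = torch.randn(1, heads, 1, num_blocks * block_size).softmax(dim=-1)
    values = [torch.randn(1, heads, block_size, head_dim) for _ in range(num_blocks)]

    # PA_SPLIT_VALUE=0: single matmul against the concatenated blocks.
    fused = attn @ torch.cat(values, dim=-2)
    # PA_SPLIT_VALUE=1: one matmul per block, accumulated afterwards.
    split = sum(a @ v for a, v in zip(attn.split(block_size, dim=-1), values))

    assert torch.allclose(fused, split, atol=1e-6)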
--- vllm/hpu/ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 54dd4332902a3..b66f6709977c8 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -9,14 +9,11 @@ import torch.nn as nn import torch.nn.functional as F import habana_frameworks.torch as htorch -import habana_frameworks.torch.utils.experimental as htexp from typing import List, Optional, Tuple import vllm.hpu.utils as hpu_utils -# FIXME: For some reason splitting value causes DFAs on G3. This needs to be debugged -PA_SPLIT_VALUE_DEFAULT = '0' if (htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi3) else '1' -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', PA_SPLIT_VALUE_DEFAULT) == '1') +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') def silu_and_mul(output, input): From 1fd06cc516f218b59e0342bf7b29f60e3e1f3149 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 27 Jun 2024 17:28:34 +0300 Subject: [PATCH 051/819] add collective crash WA --- vllm/distributed/communication_op.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 32394a07b00b9..eb3046105f820 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -5,9 +5,14 @@ from .parallel_state import get_tp_group +from vllm.utils import is_hpu +if is_hpu(): + import habana_frameworks.torch as htorch def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" + if is_hpu(): + htorch.core.mark_step() return get_tp_group().all_reduce(input_) From 940f5250de17711a6a903f3fbd8695a85530c077 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 27 Jun 2024 17:34:15 +0300 Subject: [PATCH 052/819] add comment to the weird mark_step --- vllm/distributed/communication_op.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index eb3046105f820..233be75b47f5a 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -12,6 +12,9 @@ def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" if is_hpu(): + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occuring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) htorch.core.mark_step() return get_tp_group().all_reduce(input_) From 20eafe9fae7d55ecf5a4802b1c4480158e18f60f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 28 Jun 2024 11:15:03 +0200 Subject: [PATCH 053/819] Add more metrics to high level profiler (#63) * Add more detailed event names to profiler * Add more profiler stats * separate prompt and decode batch utilization * Add more metrics * revert engine/metrics.py changes * un-singletonify (what a funny word) habana profiler * formatting * add batch block utilization metric * fix division by zero * fix batch_block_utilization formula * minor refactors --- vllm/worker/habana_model_runner.py | 82 ++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6a9cb6f066ea1..1a9206a314d5c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -241,6 +241,7 @@ def __init__( self.scheduler_config = 
scheduler_config self.lora_config = lora_config self.load_config = load_config + self.cache_config = cache_config self.is_driver_worker = is_driver_worker self.profiler = Profiler() @@ -267,6 +268,9 @@ def __init__( self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None + # Profiler stats + self.profiler_counter_helper = HabanaProfilerCounterHelper() + self._setup_buckets() def load_model(self) -> None: @@ -876,19 +880,18 @@ def execute_model( output.outputs = output.outputs[:real_batch_size] htorch.core.mark_step() - if self.is_driver_worker: + if self.is_driver_worker and self.profiler.enabled: # Stop recording 'execute_model' event self.profiler.end() event_end = self.profiler.get_timestamp_us() - duration = event_end - event_start - throughput = batch_size_padded / (duration / 1e6) - throughput_effective = real_batch_size / (duration / 1e6) - counters = { - 'batch_size': batch_size_padded, - 'batch_size_effective': real_batch_size, - 'throughput': throughput, - 'throughput_effective': throughput_effective - } + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end-event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + seq_group_metadata_list=seq_group_metadata_list, + is_prompt=is_prompt) self.profiler.record_counter(event_start, counters) return output @@ -1014,3 +1017,62 @@ def vocab_size(self) -> int: def _maybe_wrap_in_hpu_graph(model): return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) + + +class HabanaProfilerCounterHelper(): + def __init__(self): + self.niter = 0 + self.average_real_throughput = None + self.logged_once = False + + def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, real_batch_size, seq_group_metadata_list, is_prompt): + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + real_seq_lens = [len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] + real_max_seq_len = max(real_seq_lens) + real_num_tokens = sum(real_seq_lens) + padded_num_tokens = batch_size_padded * seq_len + batch_token_utilization = real_num_tokens / padded_num_tokens + if self.average_real_throughput is None: + self.average_real_throughput = throughput_effective + else: # https://www.heikohoffmann.de/htmlthesis/node134.html + self.average_real_throughput = self.average_real_throughput + 1/(self.niter+1) * (throughput_effective-self.average_real_throughput) + phase = "prompt" if is_prompt else "decode" + counters = { + f'{phase}_bucket_batch_size': batch_size_padded, + f'{phase}_batch_size': real_batch_size, + f'{phase}_bucket_seq_len': seq_len, + f'{phase}_seq_len': real_max_seq_len, + f'{phase}_bucket_gen_throughput': throughput, + f'{phase}_real_gen_throughput': throughput_effective, + f'{phase}_batch_token_utilization': batch_token_utilization, + 'average_real_throughput': self.average_real_throughput, + 'engine_iteration': self.niter, + } + self.niter += 1 + if is_prompt: + prompt_seq_lens = [len(seq_data.prompt_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] + prompt_bucket_in_throughput = (seq_len*batch_size_padded) / (duration / 1e6) + prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) + 
counters[f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput + counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput + + # KV cache might not be created yet (e.g. for profiling run) + if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0: + cache_num_blocks_used = [math.ceil(sl/cache_config.block_size) for sl in real_seq_lens] + cache_total_num_blocks_used = sum(cache_num_blocks_used) + num_cache_blocks = cache_config.num_gpu_blocks + cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used + cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks + max_blocks_per_seq = math.ceil(seq_len/cache_config.block_size) + batch_block_utilization = cache_total_num_blocks_used / (batch_size_padded * max_blocks_per_seq) + counters['cache_num_blocks_used'] = cache_total_num_blocks_used + counters['cache_num_free_blocks'] = cache_total_num_free_blocks + counters['cache_computed_utilization'] = cache_computed_utilization + counters[f'{phase}_batch_block_utilization'] = batch_block_utilization + if not self.logged_once: + counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks + counters['const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization + counters['const_block_size'] = cache_config.block_size + self.logged_once = True + return counters From a3ac366a2cc9b47d5f167573b4f4baa3f8424c04 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:25:09 +0300 Subject: [PATCH 054/819] Revert test changes --- tests/async_engine/test_api_server.py | 2 - tests/async_engine/test_openapi_server_ray.py | 3 - .../test_basic_correctness.py | 3 +- .../basic_correctness/test_chunked_prefill.py | 2 - tests/basic_correctness/test_preemption.py | 6 - tests/core/block/e2e/test_correctness.py | 7 - tests/core/test_chunked_prefill_scheduler.py | 7 - tests/core/test_scheduler.py | 8 - tests/distributed/test_pynccl.py | 20 +- tests/engine/test_computed_prefix_blocks.py | 2 - tests/engine/test_skip_tokenizer_init.py | 2 - tests/engine/test_stop_reason.py | 2 - tests/engine/test_stop_strings.py | 3 - tests/entrypoints/openai/test_chat.py | 2 - .../openai/test_oot_registration.py | 3 +- tests/kernels/test_activation.py | 25 +-- tests/kernels/test_attention.py | 87 +++------ tests/kernels/test_cache.py | 176 +++--------------- tests/kernels/test_layernorm.py | 18 +- tests/kernels/test_moe.py | 3 - tests/kernels/test_pos_encoding.py | 19 +- tests/kernels/test_prefix_prefill.py | 10 +- tests/kernels/test_rand.py | 2 - tests/kernels/test_sampler.py | 4 - tests/lora/test_baichuan.py | 3 - tests/lora/test_chatglm3.py | 3 - tests/lora/test_gemma.py | 3 - tests/lora/test_layer_variation.py | 2 - tests/lora/test_layers.py | 6 - tests/lora/test_llama.py | 6 +- tests/lora/test_lora.py | 4 - tests/lora/test_lora_manager.py | 9 - tests/lora/test_punica.py | 4 - tests/lora/test_quant_model.py | 2 - tests/lora/test_worker.py | 4 - tests/metrics/test_metrics.py | 4 - tests/models/test_big_models.py | 4 +- tests/models/test_llava.py | 1 - tests/models/test_mistral.py | 2 - tests/models/test_models.py | 3 - tests/models/test_oot_registration.py | 1 - tests/quantization/test_configs.py | 5 +- tests/samplers/test_beam_search.py | 2 - tests/samplers/test_logits_processor.py | 2 - tests/samplers/test_logprobs.py | 2 - tests/samplers/test_ranks.py | 2 - tests/samplers/test_rejection_sampler.py | 18 +- tests/samplers/test_sampler.py | 30 ++- tests/samplers/test_seeded_generate.py | 2 - tests/spec_decode/e2e/test_compatibility.py 
| 4 - tests/spec_decode/e2e/test_logprobs.py | 6 - .../e2e/test_multistep_correctness.py | 12 -- .../spec_decode/e2e/test_ngram_correctness.py | 5 - tests/spec_decode/test_batch_expansion.py | 2 - tests/spec_decode/test_metrics.py | 6 - tests/spec_decode/test_multi_step_worker.py | 6 - tests/spec_decode/test_ngram_worker.py | 5 - tests/spec_decode/test_spec_decode_worker.py | 9 +- tests/tensorizer_loader/test_tensorizer.py | 10 +- tests/test_config.py | 3 +- tests/test_logits_processor.py | 13 +- tests/tokenization/test_detokenize.py | 9 - tests/worker/test_model_runner.py | 7 +- tests/worker/test_swap.py | 4 +- 64 files changed, 107 insertions(+), 534 deletions(-) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 8b0e79cf9a6ee..7f57d5cf9b182 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -6,7 +6,6 @@ import pytest import requests -from vllm.utils import is_hpu def _query_server(prompt: str, max_tokens: int = 5) -> dict: @@ -45,7 +44,6 @@ def api_server(tokenizer_pool_size: int, engine_use_ray: bool, uvicorn_process.terminate() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) @pytest.mark.parametrize("worker_use_ray", [False, True]) @pytest.mark.parametrize("engine_use_ray", [False, True]) diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py index 50fb22901f957..332937b874e93 100644 --- a/tests/async_engine/test_openapi_server_ray.py +++ b/tests/async_engine/test_openapi_server_ray.py @@ -3,7 +3,6 @@ # using Ray for overall ease of process management, parallel requests, # and debugging. import ray -from vllm.utils import is_hpu from ..utils import RemoteOpenAIServer @@ -20,8 +19,6 @@ def ray_ctx(): @pytest.fixture(scope="module") def server(ray_ctx): - if is_hpu(): - pytest.skip("Skipping test on HPU") return RemoteOpenAIServer([ "--model", MODEL_NAME, diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 72f36b0df98ac..a7b0fef533ccb 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -5,7 +5,6 @@ import weakref import pytest -from vllm.utils import is_hpu from vllm import LLM @@ -26,7 +25,7 @@ def test_vllm_gc_ed(): # because llm instance is not GC'ed. assert weak_llm() is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index d559537baa9be..767e0628765bd 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -7,7 +7,6 @@ Run `pytest tests/models/test_chunked_prefill.py`. 
""" import pytest -from vllm.utils import is_hpu from ..models.utils import check_outputs_equal @@ -17,7 +16,6 @@ ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 30d9e3b36fedf..d60cc95d75433 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -11,7 +11,6 @@ from vllm import SamplingParams from vllm.core.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT, ENABLE_ARTIFICIAL_PREEMPT) -from vllm.utils import is_hpu from ..models.utils import check_outputs_equal @@ -25,7 +24,6 @@ "tests/basic_correctness/test_preemption.py`") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -70,7 +68,6 @@ def test_chunked_prefill_recompute( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -122,7 +119,6 @@ def test_preemption( assert total_preemption == total_recorded_preemption -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -181,7 +177,6 @@ def test_swap( assert total_preemption == total_recorded_preemption -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -226,7 +221,6 @@ def test_swap_infeasible( assert req_outputs[0].outputs[0].finish_reason == "length" -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 0d728f6b10047..8502eab0f8da0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -3,12 +3,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import get_token_ids_from_llm_generator -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -87,7 +85,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -155,7 +152,6 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -255,7 +251,6 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [ @@ -326,7 +321,6 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, assert baseline_token_ids == 
test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -408,7 +402,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( assert baseline_token_ids == test_token_ids -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 9347472a64a3a..a3b76327e0a53 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -7,7 +7,6 @@ from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler from vllm.sequence import Logprob, SequenceGroup -from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -28,7 +27,6 @@ def schedule_and_update_computed_tokens(scheduler): return metas, out -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_simple(): """Verify basic scheduling works.""" block_size = 4 @@ -71,7 +69,6 @@ def test_simple(): assert len(seq_group_meta) == num_seq_group -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunk(): """Verify prefills are chunked properly.""" block_size = 4 @@ -116,7 +113,6 @@ def test_chunk(): assert out.num_batched_tokens == 57 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_complex(): block_size = 4 max_seqs = 60 @@ -180,7 +176,6 @@ def test_complex(): assert running[2].is_prefill() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_maximal_decoding(): """Verify decoding requests are prioritized.""" block_size = 4 @@ -374,7 +369,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert out.blocks_to_swap_out == [] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_running_prefill_prioritized_over_swap(): block_size = 4 max_seqs = 30 @@ -523,7 +517,6 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 3377d735f21b7..bae958211cb7b 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -11,7 +11,6 @@ from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, SequenceGroup, SequenceStatus -from vllm.utils import is_hpu from .utils import create_dummy_prompt @@ -78,7 +77,6 @@ def test_scheduler_abort_seq_group(): assert scheduler.get_num_unfinished_seq_groups() == 0 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_simple(): block_size = 4 num_seq_group = 4 @@ -146,7 +144,6 @@ def test_scheduler_prefill_prioritized(): assert get_sequence_groups(out) == [seq_group_b] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 @@ -196,7 +193,6 @@ def test_scheduler_schedule_preempt_abort(): assert scheduler.get_num_unfinished_seq_groups() == 1 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_scheduler_max_seqs(): block_size = 4 num_seq_group = 4 @@ -238,7 +234,6 @@ def test_scheduler_max_seqs(): assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) -@pytest.mark.skipif(is_hpu(), 
reason="Skipping test on HPU") def test_scheduler_delay_factor(): block_size = 4 scheduler_config = SchedulerConfig(100, 64, 16, delay_factor=0.5) @@ -276,7 +271,6 @@ def test_scheduler_delay_factor(): append_new_token(out, 1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swapped_out_prioritized(): scheduler = initialize_scheduler(max_num_seqs=6) # best_of=2 * 3 == 6 sequences. @@ -578,7 +572,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_decode_swap_beam_search(): """ Test best_of > 1 swap out blocks @@ -629,7 +622,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_schedule_decode_blocks_to_copy_update(): """ Verify blocks_to_copy is updated. diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index cb404fef15797..e0e424439e3a5 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -8,13 +8,13 @@ from vllm.distributed.communication_op import ( # noqa tensor_model_parallel_all_reduce) -from vllm.utils import update_environment_variables, is_hpu -if not is_hpu(): - from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator - from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary - from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, - get_world_group, graph_capture, - init_distributed_environment) +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.device_communicators.pynccl_wrapper import NCCLLibrary +from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, + get_world_group, graph_capture, + init_distributed_environment) +from vllm.utils import update_environment_variables + def distributed_run(fn, world_size): number_of_processes = world_size @@ -65,7 +65,6 @@ def worker_fn(): assert result == pynccl_comm.world_size -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl(): @@ -95,7 +94,6 @@ def multiple_allreduce_worker_fn(): assert result == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_allreduce(): @@ -122,7 +120,6 @@ def multiple_allreduce_with_vllm_worker_fn(): assert result == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_allreduce_with_vllm(): @@ -153,7 +150,6 @@ def worker_fn_with_cudagraph(): assert a.mean().cpu().item() == pynccl_comm.world_size**1 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Need at least 2 GPUs to run the test.") def test_pynccl_with_cudagraph(): @@ -224,7 +220,7 @@ def multiple_send_recv_worker_fn(): else: assert result == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="Need at least 4 GPUs to run the test.") def test_pynccl_multiple_send_recv(): diff --git a/tests/engine/test_computed_prefix_blocks.py 
b/tests/engine/test_computed_prefix_blocks.py index ec64cdd9749ff..ed35212cc3f11 100644 --- a/tests/engine/test_computed_prefix_blocks.py +++ b/tests/engine/test_computed_prefix_blocks.py @@ -3,10 +3,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.sampling_params import SamplingParams -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("block_size", [16]) def test_computed_prefix_blocks(model: str, block_size: int): diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py index 56faa15d14c3d..338b208723ba9 100644 --- a/tests/engine/test_skip_tokenizer_init.py +++ b/tests/engine/test_skip_tokenizer_init.py @@ -2,10 +2,8 @@ from vllm.entrypoints.llm import LLM from vllm.sampling_params import SamplingParams -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", ["facebook/opt-125m"]) def test_skip_tokenizer_initialization(model: str): # This test checks if the flag skip_tokenizer_init skips the initialization diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py index d87ff3e39b3e3..b0bd6c4aa95d3 100644 --- a/tests/engine/test_stop_reason.py +++ b/tests/engine/test_stop_reason.py @@ -10,7 +10,6 @@ import transformers from vllm import SamplingParams -from vllm.utils import is_hpu MODEL = "facebook/opt-350m" STOP_STR = "." @@ -24,7 +23,6 @@ def vllm_model(vllm_runner): yield vllm_model -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py index 18afc02b88ba3..1584b85aeb064 100644 --- a/tests/engine/test_stop_strings.py +++ b/tests/engine/test_stop_strings.py @@ -3,7 +3,6 @@ import pytest from vllm import CompletionOutput, LLMEngine, SamplingParams -from vllm.utils import is_hpu MODEL = "meta-llama/llama-2-7b-hf" MAX_TOKENS = 200 @@ -11,8 +10,6 @@ @pytest.fixture(scope="session") def vllm_model(vllm_runner): - if is_hpu(): - pytest.skip("Skipping test on HPU") with vllm_runner(MODEL) as vllm_model: yield vllm_model diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 639f4d3fd6361..f4c0af1adfdf9 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -13,8 +13,6 @@ # downloading lora to test lora requests from huggingface_hub import snapshot_download from openai import BadRequestError -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import is_hpu from ...utils import RemoteOpenAIServer diff --git a/tests/entrypoints/openai/test_oot_registration.py b/tests/entrypoints/openai/test_oot_registration.py index fea991be6b913..dbbda6de1fa09 100644 --- a/tests/entrypoints/openai/test_oot_registration.py +++ b/tests/entrypoints/openai/test_oot_registration.py @@ -7,7 +7,7 @@ from vllm import ModelRegistry from vllm.model_executor.models.opt import OPTForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.utils import get_open_port, is_hpu +from vllm.utils import get_open_port class MyOPTForCausalLM(OPTForCausalLM): @@ -31,7 +31,6 @@ def server_function(port): 
runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_oot_registration_for_api_server(): port = get_open_port() ctx = torch.multiprocessing.get_context() diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index f5da4f55d9231..a4b9f91c7688b 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -5,7 +5,6 @@ from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, NewGELU, SiluAndMul) -from vllm.utils import is_hpu from .allclose_default import get_default_atol, get_default_rtol @@ -13,12 +12,9 @@ NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing D = [512, 4096, 5120, 13824] # Arbitrary values for testing SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) @@ -26,7 +22,7 @@ @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_act_and_mul( activation: str, @@ -36,15 +32,9 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - - if is_hpu() and activation != "silu": - pytest.skip("Only SiluAndMul supported on HPU.") - torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -65,7 +55,7 @@ def test_act_and_mul( @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_activation( activation: Type[torch.nn.Module], @@ -75,14 +65,9 @@ def test_activation( seed: int, device: str, ) -> None: - if is_hpu(): - pytest.skip("GELU not supported on HPU.") - torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 5d141fb111407..f848ad51c7014 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -3,23 +3,18 @@ import pytest import torch +from xformers import ops as xops +from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask -from vllm.utils import get_max_shared_memory_bytes, is_hip, is_hpu -if is_hpu(): - from vllm.hpu import ops, cache_ops - from vllm.hpu import xops - from vllm.hpu.attn_bias import BlockDiagonalCausalMask -else: - from vllm._C import ops, cache_ops - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask +from vllm import _custom_ops as ops +from vllm.utils import get_max_shared_memory_bytes, is_hip from .allclose_default import get_default_atol, get_default_rtol FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. 
# - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 if not is_hpu() else 128 +MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. NUM_BLOCKS = 4321 # Arbitrary values for testing @@ -40,12 +35,9 @@ USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def ref_masked_attention( @@ -75,14 +67,9 @@ def ref_single_query_cached_kv_attention( alibi_slopes: Optional[torch.Tensor], ) -> None: num_query_heads = query.shape[1] - if not is_hpu(): - num_kv_heads = value_cache.shape[1] - head_size = value_cache.shape[2] - block_size = value_cache.shape[3] - else: - block_size = value_cache.shape[1] - num_kv_heads = value_cache.shape[2] - head_size = value_cache.shape[3] + num_kv_heads = value_cache.shape[1] + head_size = value_cache.shape[2] + block_size = value_cache.shape[3] num_seqs = query.shape[0] block_tables_lst = block_tables.cpu().tolist() @@ -98,18 +85,12 @@ def ref_single_query_cached_kv_attention( block_number = int(block_table[j // block_size]) block_offset = j % block_size - if is_hpu(): - k = key_cache[block_number, block_offset, :, :] - else: - k = key_cache[block_number, :, :, block_offset, :] + k = key_cache[block_number, :, :, block_offset, :] k = k.reshape(num_kv_heads, head_size) keys_lst.append(k) - if is_hpu(): - v = value_cache[block_number, block_offset, :, :] - else: - v = value_cache[block_number, :, :, block_offset] - values.append(v) + v = value_cache[block_number, :, :, block_offset] + values_lst.append(v) keys = torch.stack(keys_lst, dim=0) values = torch.stack(values_lst, dim=0) if num_queries_per_kv > 1: @@ -139,7 +120,7 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_paged_attention( kv_cache_factory, version: str, @@ -153,24 +134,14 @@ def test_paged_attention( seed: int, device: str, ) -> None: - if is_hpu(): - if version != "v1": - pytest.skip("Paged attention v2 not supported on HPU") - if kv_cache_dtype != "auto": - pytest.skip("Only auto kv_cache_dtype supported on HPU") - if use_alibi: - pytest.skip("Alibi not supported on HPU") - random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype, device=device) + query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) query.uniform_(-scale, scale) assert num_query_heads % num_kv_heads == 0 @@ -193,7 +164,8 @@ def test_paged_attention( for _ in range(max_num_blocks_per_seq) ] block_tables_lst.append(block_table) - block_tables = torch.tensor(block_tables_lst, dtype=torch.int, device=device) + + block_tables = torch.tensor(block_tables_lst, dtype=torch.int) # Create the KV caches. 
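For readers following the layout change in the reference attention above: the restored CUDA path gathers one token's keys as key_cache[block_number, :, :, block_offset, :], while the removed HPU branch indexed key_cache[block_number, block_offset, :, :]. The following is a minimal sketch contrasting the two gather paths; the concrete sizes are illustrative assumptions, not values taken from the test suite.

import torch

# Illustrative sizes only (assumptions for demonstration).
num_blocks, block_size, num_kv_heads, head_size, x = 8, 16, 4, 64, 8

# CUDA-style key cache layout restored by this revert:
# [num_blocks, num_kv_heads, head_size // x, block_size, x]
cuda_key_cache = torch.zeros(num_blocks, num_kv_heads, head_size // x, block_size, x)

# HPU-style layout used by the removed is_hpu() branch:
# [num_blocks, block_size, num_kv_heads, head_size]
hpu_key_cache = torch.zeros(num_blocks, block_size, num_kv_heads, head_size)

block_number, block_offset = 3, 5

# Both gathers yield one token's keys as [num_kv_heads, head_size].
k_cuda = cuda_key_cache[block_number, :, :, block_offset, :].reshape(num_kv_heads, head_size)
k_hpu = hpu_key_cache[block_number, block_offset, :, :]
assert k_cuda.shape == k_hpu.shape == (num_kv_heads, head_size)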
key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, @@ -207,21 +179,7 @@ def test_paged_attention( # Call the paged attention kernel. output = torch.empty_like(query) - - if is_hpu(): - output = ops.paged_attention_v1( - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - alibi_slopes, - kv_cache_dtype, - ) - elif version == "v1": + if version == "v1": ops.paged_attention_v1( output, query, @@ -351,13 +309,12 @@ def ref_multi_query_kv_attention( # TODO(woosuk): Add tests for USE_ALIBI=True. -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_multi_query_kv_attention( num_seqs: int, @@ -384,8 +341,7 @@ def test_multi_query_kv_attention( qkv = torch.empty(num_tokens, num_query_heads + 2 * num_kv_heads, head_size, - dtype=dtype, - device=device) + dtype=dtype) qkv.uniform_(-scale, scale) query, key, value = qkv.split( [num_query_heads, num_kv_heads, num_kv_heads], dim=1) @@ -419,5 +375,4 @@ def test_multi_query_kv_attention( ) atol = get_default_atol(output) if is_hip() else 1e-3 rtol = get_default_rtol(output) if is_hip() else 1e-5 - assert torch.allclose(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 83fa7e47bcfac..23b6baa60c05b 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -1,13 +1,12 @@ import random from typing import List, Tuple -import math import pytest import torch from vllm import _custom_ops as ops -from vllm.utils import is_hpu +COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing @@ -21,14 +20,11 @@ NUM_MAPPINGS = [256] # Arbitrary values for testing SEEDS = [0] -if is_hpu(): - COPYING_DIRECTION = [('hpu', 'cpu'), ('hpu', 'hpu'), ('cpu', 'hpu')] - DEVICES = ["hpu"] -else: - COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +# We assume fp8 is always enabled for testing. KV_CACHE_DTYPE = ["auto", "fp8"] @@ -40,8 +36,8 @@ @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("device", DEVICES) @torch.inference_mode() def test_copy_blocks( kv_cache_factory, @@ -56,15 +52,10 @@ def test_copy_blocks( kv_cache_dtype: str, device: str, ) -> None: - if is_hpu() and kv_cache_dtype != "auto": - pytest.skip("Only auto kv_cache_dtype supported on HPU") - random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. 
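The copy_blocks hunk that follows drops the HPU-specific dict mapping (source block -> list of destinations) in favor of the flat (src, dst) pair tensor used upstream. A small standalone sketch of how the same mapping translates between the two forms; variable names here are illustrative, not the test's own.

import torch

# One source block copied to two destinations, as the test's comment describes.
block_mapping = [(0, 2), (0, 3), (1, 4), (1, 5)]  # (src, dst) pairs

# Upstream form restored below: a flat int64 tensor viewed as [-1, 2].
block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64).view(-1, 2)

# Removed HPU form: a dict grouping destination blocks per source block.
tmp_block_mapping_dict = {}
for src, dst in block_mapping:
    tmp_block_mapping_dict.setdefault(src, []).append(dst)

assert tmp_block_mapping_dict == {0: [2, 3], 1: [4, 5]}
assert block_mapping_tensor.shape == (len(block_mapping), 2)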
@@ -87,24 +78,14 @@ def test_copy_blocks( dtype, seed, device) # Clone the KV caches. - cloned_key_caches = [key_cache.clone().to("cpu") for key_cache in key_caches] - cloned_value_caches = [value_cache.clone().to("cpu") for value_cache in value_caches] + cloned_key_caches = [key_cache.clone() for key_cache in key_caches] + cloned_value_caches = [value_cache.clone() for value_cache in value_caches] # Call the copy blocks kernel. block_mapping_tensor = torch.tensor(block_mapping, dtype=torch.int64, device=device).view(-1, 2) - if is_hpu(): - tmp_block_mapping_dict = {} - for src, dst in block_mapping: - if not tmp_block_mapping_dict.get(src): - tmp_block_mapping_dict[src] = [dst] - continue - tmp_block_mapping_dict[src].append(dst) - - ops.copy_blocks(key_caches, value_caches, tmp_block_mapping_dict) - else: - ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) + ops.copy_blocks(key_caches, value_caches, block_mapping_tensor) # Run the reference implementation. for src, dst in block_mapping: @@ -128,7 +109,7 @@ def test_copy_blocks( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache( @@ -143,16 +124,11 @@ def test_reshape_and_cache( device: str, kv_cache_dtype: str, ) -> None: - if is_hpu() and kv_cache_dtype != "auto": - pytest.skip("Only auto kv_cache_dtype supported on HPU") random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) - # Create a random slot mapping. num_slots = block_size * num_blocks slot_mapping_lst = random.sample(range(num_slots), num_tokens) @@ -182,8 +158,9 @@ def test_reshape_and_cache( kv_scale = 1.0 # Call the reshape_and_cache kernel. - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping, "auto") + ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, + kv_cache_dtype, kv_scale) + if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) ops.convert_fp8(result_key_cache, key_cache) @@ -191,23 +168,16 @@ def test_reshape_and_cache( ops.convert_fp8(result_value_cache, value_cache) # Run the reference implementation. 
- if not is_hpu(): - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indices = block_indices.cpu().tolist() + reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) + block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") + block_indicies_lst = block_indicies.cpu().tolist() block_offsets = slot_mapping % block_size block_offsets_lst = block_offsets.cpu().tolist() for i in range(num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - if is_hpu(): - cloned_key_cache[block_idx, block_offset, :, :] = key[i] - else: - cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] - if is_hpu(): - cloned_value_cache[block_idx, block_offset, :, :] = value[i] - else: - cloned_value_cache[block_idx, :, :, block_offset] = value[i] + block_idx = block_indicies_lst[i] + block_offset = block_offsets_lst[i] + cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] + cloned_value_cache[block_idx, :, :, block_offset] = value[i] if kv_cache_dtype == "fp8": assert torch.allclose(result_key_cache, @@ -223,7 +193,6 @@ def test_reshape_and_cache( assert torch.allclose(value_cache, cloned_value_cache) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -231,7 +200,7 @@ def test_reshape_and_cache( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_reshape_and_cache_flash( @@ -312,7 +281,7 @@ def test_reshape_and_cache_flash( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) @torch.inference_mode() def test_swap_blocks( @@ -328,23 +297,15 @@ def test_swap_blocks( device: str, kv_cache_dtype: str, ) -> None: - if is_hpu() and direction[0] == "hpu" and direction[1] == "cpu": - pytest.skip("Skipping test on HPU") if kv_cache_dtype == "fp8" and "cpu" in direction: pytest.skip() random.seed(seed) torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) - if is_hpu(): - src_device = device if direction[0] == "hpu" else 'cpu' - dst_device = device if direction[1] == "hpu" else 'cpu' - else: - src_device = device if direction[0] == "cuda" else 'cpu' - dst_device = device if direction[1] == "cuda" else 'cpu' + src_device = device if direction[0] == "cuda" else 'cpu' + dst_device = device if direction[1] == "cuda" else 'cpu' src_blocks = random.sample(range(num_blocks), num_mappings) # For the same device, mapping must not overlap @@ -385,14 +346,13 @@ def test_swap_blocks( dist_value_caches[0][dst].cpu()) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("dtype", 
DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_fp8_e4m3_conversion( num_heads: int, @@ -420,87 +380,3 @@ def test_fp8_e4m3_conversion( ops.convert_fp8(converted_cache, cache_fp8) assert torch.allclose(cache, converted_cache, atol=0.001, rtol=0.1) - - -@pytest.mark.skipif(not is_hpu(), reason="This case is HPU-specific") -@pytest.mark.parametrize("num_tokens", NUM_TOKENS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) -@torch.inference_mode() -def test_reshape_and_cache_prompt( - kv_cache_factory, - num_tokens: int, - num_heads: int, - head_size: int, - block_size: int, - num_blocks: int, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - random.seed(seed) - torch.random.manual_seed(seed) - if torch.cuda.is_available(): - torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) - torch.set_default_device(device) - - # Create a random slot mapping. - num_block_indices_to_generate = math.ceil(num_tokens / block_size) - block_indices_ = random.sample(range(num_blocks), num_block_indices_to_generate) - block_offsets_ = [] - slot_mapping = [] - for i in block_indices_: - for j in range(block_size): - slot_mapping.append(i * block_size + j) - slot_mapping = slot_mapping[:num_tokens] - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long) - - qkv = torch.randn(num_tokens, 3, num_heads, head_size, dtype=dtype) - _, key, value = qkv.unbind(dim=1) - - # Create the KV caches. - key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, - num_heads, head_size, dtype, - None, seed, device) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Clone the KV caches. - cloned_key_cache = key_cache.clone() - cloned_value_cache = value_cache.clone() - - # Call the reshape_and_cache kernel. - cache_ops.reshape_and_cache(key, value, key_cache, value_cache, - slot_mapping.view((1, -1)), "auto", True) - - # Run the reference implementation. - if is_hpu(): - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0].shape) - else: - reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) - block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - block_indices = block_indices.cpu().tolist() - block_offsets = slot_mapping % block_size - block_offsets = block_offsets.cpu().tolist() - for i in range(0, num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - cloned_key_cache[block_idx, :, :, block_offset] = key[i, :, :] - cloned_value_cache[block_idx, :, :, block_offset] = value[i, :, :] - - # Note: only checking cache areas specified by the slot mapping because - # the implementation may initialize whole blocks even if some of the offsets of the block - # are not present in the slot mapping. 
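The removed HPU-only test above and the restored reference implementation rely on the same cache addressing: a flat slot index decomposes into a block index and an offset within that block. A minimal standalone sketch of that decomposition, with illustrative slot values.

import torch

block_size = 16
slot_mapping = torch.tensor([0, 5, 16, 47], dtype=torch.long)  # illustrative slots

# Same decomposition the tests use: slot -> (block index, offset inside the block).
block_indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
block_offsets = slot_mapping % block_size

assert block_indices.tolist() == [0, 0, 1, 2]
assert block_offsets.tolist() == [0, 5, 0, 15]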
- for i in range(0, num_tokens): - block_idx = block_indices[i] - block_offset = block_offsets[i] - assert torch.allclose(key_cache[block_idx, :, :, block_offset], - cloned_key_cache[block_idx, :, :, block_offset]) - assert torch.allclose(value_cache[block_idx, :, :, block_offset], - cloned_value_cache[block_idx, :, :, block_offset]) diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 0cd33494f9a1e..a635e6c12c594 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -2,7 +2,6 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import is_hpu DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -10,12 +9,9 @@ 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @@ -23,7 +19,7 @@ @pytest.mark.parametrize("add_residual", ADD_RESIDUAL) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rms_norm( num_tokens: int, @@ -33,18 +29,14 @@ def test_rms_norm( seed: int, device: str, ) -> None: - if is_hpu() and dtype == torch.half and add_residual: - pytest.skip("Skipping test on HPU") torch.random.manual_seed(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) - elif is_hpu(): - torch.hpu.manual_seed(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) scale = 1 / (2 * hidden_size) - x = torch.randn(1, num_tokens, hidden_size, dtype=dtype, device=device) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) x *= scale residual = torch.randn_like(x) * scale if add_residual else None diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 8d52fbaa6cc25..2356b9ec18b0d 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,7 +10,6 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.models.mixtral import MixtralMoE -from vllm.utils import is_hpu def torch_moe(a, w1, w2, score, topk): @@ -30,7 +29,6 @@ def torch_moe(a, w1, w2, score, topk): topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", [512, 222, 33, 1]) @pytest.mark.parametrize("n", [2048, 256, 1024]) @pytest.mark.parametrize("k", [128, 511, 1024]) @@ -55,7 +53,6 @@ def test_fused_moe( assert torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @torch.inference_mode() diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 0e08055bf12fe..4c83659929d41 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,7 +5,6 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.utils import is_hpu from .allclose_default import get_default_atol, get_default_rtol @@ -17,15 +16,11 
@@ BATCH_SIZES = [1, 5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -34,7 +29,7 @@ @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_rotary_embedding( is_neox_style: bool, @@ -82,7 +77,6 @@ def test_rotary_embedding( rtol=get_default_rtol(out_key)) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -91,7 +85,7 @@ def test_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding( is_neox_style: bool, @@ -145,7 +139,6 @@ def test_batched_rotary_embedding( rtol=get_default_rtol(out_key)) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @pytest.mark.parametrize("batch_size", BATCH_SIZES) @pytest.mark.parametrize("seq_len", SEQ_LENS) @@ -154,7 +147,7 @@ def test_batched_rotary_embedding( @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_batched_rotary_embedding_multi_lora( is_neox_style: bool, diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 1240411841d3f..99fda8364dc0e 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -4,16 +4,11 @@ import pytest import torch +from xformers import ops as xops +from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import is_hpu -if is_hpu(): - from vllm.hpu import xops - from vllm.hpu.attn_bias import BlockDiagonalCausalFromBottomRightMask -else: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalCausalFromBottomRightMask NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -25,7 +20,6 @@ SLIDING_WINDOW = [0, 16, 64, 128, 256, 512, 2048] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) @pytest.mark.parametrize("head_size", HEAD_SIZES) diff --git a/tests/kernels/test_rand.py b/tests/kernels/test_rand.py index a96a238834a49..a4242d22eb489 100644 --- a/tests/kernels/test_rand.py +++ b/tests/kernels/test_rand.py @@ -5,10 +5,8 @@ from 
vllm.model_executor.layers.ops.rand import seeded_uniform from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16]) @pytest.mark.parametrize("use_3d", [True, False]) diff --git a/tests/kernels/test_sampler.py b/tests/kernels/test_sampler.py index 4bab8caedbf62..e28f809309ec5 100644 --- a/tests/kernels/test_sampler.py +++ b/tests/kernels/test_sampler.py @@ -10,7 +10,6 @@ sample) from vllm.model_executor.sampling_metadata import SamplingTensors from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu SINGLE_SPLIT_VOCAB_SIZE = 32000 # llama/mistral/mixtral vocab size MULTI_SPLIT_VOCAB_SIZE = MAX_TRITON_N_COLS + 100 @@ -31,7 +30,6 @@ def _uniform_to_exponential_kernel(input, output, n: tl.constexpr): tl.store(output + idx, y) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_uniform_to_exponential(): """Test that we can convert uniform to exponential without div by 0.""" input = torch.tensor([0.0, 1.0 - torch.finfo(torch.float32).eps], @@ -44,7 +42,6 @@ def test_uniform_to_exponential(): assert torch.all(torch.isfinite(torch.full_like(output, 1.0) / output)) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) @@ -124,7 +121,6 @@ def test_sample_decoding_only(random_sampling, max_best_of, assert sampled_logprobs is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("random_sampling", [True, False, "mixed"]) @pytest.mark.parametrize("max_best_of", [1, 2, 3, 4, 5]) @pytest.mark.parametrize("modify_greedy_probs", [True, False]) diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index 4a00abd15266a..56cec4db89e64 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -4,7 +4,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -42,7 +41,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_baichuan_lora(baichuan_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -65,7 +63,6 @@ def test_baichuan_lora(baichuan_lora_files): assert output2[i] == expected_lora_output[i] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skip("Requires multiple GPUs") @pytest.mark.parametrize("fully_sharded", [True, False]) def test_baichuan_tensor_parallel_equality(baichuan_lora_files, fully_sharded): diff --git a/tests/lora/test_chatglm3.py b/tests/lora/test_chatglm3.py index c17ebac3e4543..de4cbea80924e 100644 --- a/tests/lora/test_chatglm3.py +++ b/tests/lora/test_chatglm3.py @@ -1,9 +1,7 @@ -import pytest from typing import List import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu MODEL_PATH = "THUDM/chatglm3-6b" @@ -39,7 +37,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 
6022b82e8a7cb..709246179bfe4 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,9 +1,7 @@ -import pytest from typing import List import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu MODEL_PATH = "google/gemma-7b" @@ -30,7 +28,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layer_variation.py b/tests/lora/test_layer_variation.py index 8b6585e4cf76e..ec9776b77df76 100644 --- a/tests/lora/test_layer_variation.py +++ b/tests/lora/test_layer_variation.py @@ -8,7 +8,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -71,7 +70,6 @@ def do_sample(llm: vllm.LLM, # step 1: init a base model and serve with LoRA to get the reference results # step 2: merge the same LoRA to the base model, serve the merged model # step 3: compare the results from step 1 and step 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("target_modules", TARGET_MODULES_LIST) @pytest.mark.parametrize("rank", [8, 16, 32, 64]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 2908bc7ee70af..2e51e95a38f2e 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -36,7 +36,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -179,7 +178,6 @@ def create_random_inputs( return inputs, index_mapping, prompt_mapping -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -274,7 +272,6 @@ def create_random_embedding_layer(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") @@ -412,7 +409,6 @@ def create_random_embedding_layer(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @@ -537,7 +533,6 @@ def _pretest(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("orientation", ["row", "column"]) @@ -649,7 +644,6 @@ def create_random_linear_parallel_layer(): atol=atol) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("repeats", [1, 2, 3]) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index 42d7e7ab78e16..ad8490353998f 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -5,7 +5,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -38,7 +37,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") return generated_texts 
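The LoRA tests in this revert all funnel through a do_sample(llm, lora_path, lora_id) helper that conditionally attaches an adapter to the request. A hedged sketch of that pattern follows; the prompt text, adapter name, and exact generate() keyword are assumptions for illustration rather than lines copied from the tests.

import vllm
from vllm.lora.request import LoRARequest

def do_sample_sketch(llm: vllm.LLM, lora_path: str, lora_id: int):
    prompts = ["Write a SQL query listing all users."]  # illustrative prompt
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=64)
    # lora_id == 0 means "no adapter"; otherwise wrap the adapter path in a request.
    lora_request = LoRARequest(str(lora_id), lora_id, lora_path) if lora_id else None
    outputs = llm.generate(prompts, sampling_params, lora_request=lora_request)
    return [output.outputs[0].text.strip() for output in outputs]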
-@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("tp_size", [1, 2, 4]) def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): if num_gpus_available < tp_size: @@ -81,7 +80,7 @@ def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): print("removing lora") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): if num_gpus_available < 4: pytest.skip("Not enough GPUs for tensor parallelism 4") @@ -121,7 +120,6 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): assert output_tp1 == output_tp4 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and is more conservative""" diff --git a/tests/lora/test_lora.py b/tests/lora/test_lora.py index 90363305e137c..3415d36b7e341 100644 --- a/tests/lora/test_lora.py +++ b/tests/lora/test_lora.py @@ -2,7 +2,6 @@ import torch from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice -from vllm.utils import is_hpu from .utils import DummyLoRAManager @@ -22,7 +21,6 @@ } -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -73,7 +71,6 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: manager.reset_lora() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("m", TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) @@ -143,7 +140,6 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: manager.reset_lora() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) @pytest.mark.parametrize("n", TENSOR_SIZES) @pytest.mark.parametrize("k", BATCH_SIZES) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index f25e55e0b2ea3..2133bce14957b 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -17,7 +17,6 @@ from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear -from vllm.utils import is_hpu EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -27,7 +26,6 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_from_lora_tensors(sql_lora_files): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) @@ -100,7 +98,6 @@ def create_packed_lora( return LoRAModel(lora_id, 8, loras) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_replace_submodules(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "layer1.dense2"] @@ -119,7 +116,6 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lora_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] @@ -166,7 +162,6 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[1] == 2 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lora_lru_cache_model_manager(dist_init, dummy_model): model = dummy_model model.supported_lora_modules 
= ["dense1", "dense2", "lm_head"] @@ -244,7 +239,6 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): assert manager.pin_lora(3) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_lora_model_manager(dist_init, dummy_model): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager @@ -359,7 +353,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert set(manager.list_loras()) == {1} -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) @@ -433,7 +426,6 @@ def test_lru_cache_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, sql_lora_files): # Should remove every LoRA not specified in the request. @@ -504,7 +496,6 @@ def test_worker_lora_manager(llama_2_7b_model_extra_embeddings, ], mapping) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_packed_loras(dist_init, dummy_model_gate_up): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py index fccfcb1864422..dbeb16cb21ad3 100644 --- a/tests/lora/test_punica.py +++ b/tests/lora/test_punica.py @@ -4,7 +4,6 @@ import torch import vllm.lora.punica as punica -from vllm.utils import is_hpu def assert_close(a, b): @@ -130,7 +129,6 @@ def _lora_ref_impl( ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("r", R) @@ -173,7 +171,6 @@ def test_lora_a_extra_shapes(dtype_str, h1, r, seed): assert_close(y_ref, y_our) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) @@ -208,7 +205,6 @@ def test_lora_correctness(dtype_str, h1, h2, seed, device): assert_close(y_ref, y_our) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("dtype_str", ["float16", "bfloat16"]) @pytest.mark.parametrize("h1", H1) @pytest.mark.parametrize("h2", H2) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 73d87bda255f4..8fd968c69e58f 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,7 +7,6 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hpu from .conftest import cleanup @@ -58,7 +57,6 @@ def format_prompt_tuples(prompt): return generated_texts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("tp_size", [1]) def test_quant_model_lora(tinyllama_lora_files, model, tp_size): diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 943a9170605c2..732e91a52c0a9 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -3,17 +3,13 @@ import tempfile from unittest.mock import patch -import pytest - from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker -from vllm.utils 
import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): worker = Worker( diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 13e910e74fff7..0191d85194e33 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -7,14 +7,12 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams -from vllm.utils import is_hpu MODELS = [ "facebook/opt-125m", ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -50,7 +48,6 @@ def test_metric_counter_prompt_tokens( f"metric: {metric_count!r}") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -83,7 +80,6 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py index 4cec529a2f5c3..c3e48b56ee58f 100644 --- a/tests/models/test_big_models.py +++ b/tests/models/test_big_models.py @@ -6,7 +6,6 @@ """ import pytest import torch -from vllm.utils import is_hpu from .utils import check_outputs_equal @@ -26,7 +25,6 @@ target_dtype = "half" -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [32]) @@ -51,7 +49,7 @@ def test_models( name_1="vllm", ) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", [target_dtype]) def test_model_print( diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 6c83f711b62af..b4220dc599551 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -4,7 +4,6 @@ from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig -from vllm.utils import is_hpu from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from .utils import check_outputs_equal diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py index f30384a85ed0d..6acc057fe588c 100644 --- a/tests/models/test_mistral.py +++ b/tests/models/test_mistral.py @@ -3,7 +3,6 @@ Run `pytest tests/models/test_mistral.py`. """ import pytest -from vllm.utils import is_hpu from .utils import check_logprobs_close @@ -13,7 +12,6 @@ ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 927c5569a9a33..4cd2cb665c8f0 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -6,7 +6,6 @@ Run `pytest tests/models/test_models.py`. 
""" import pytest -from vllm.utils import is_hpu from .utils import check_outputs_equal @@ -24,7 +23,6 @@ ] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("max_tokens", [96]) @@ -53,7 +51,6 @@ def test_models( ) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) def test_model_print( diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py index f03c657dac4a2..50ab06631500b 100644 --- a/tests/models/test_oot_registration.py +++ b/tests/models/test_oot_registration.py @@ -1,4 +1,3 @@ -import pytest import torch from vllm import LLM, ModelRegistry, SamplingParams diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index 70ef3c2cfcbf0..b63a8d01d6621 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -9,7 +9,7 @@ import pytest from vllm.config import ModelConfig -from vllm.utils import is_hpu + @dataclass class ModelPair: @@ -54,8 +54,7 @@ class ModelPair: @pytest.mark.parametrize("model_arg_exptype", MODEL_ARG_EXPTYPES) def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: model_path, quantization_arg, expected_type = model_arg_exptype - if is_hpu() and model_path in ('TheBloke/Llama-2-7B-Chat-GPTQ', 'LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit'): - pytest.skip("Skipping test on HPU") + try: model_config = ModelConfig(model_path, model_path, diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py index f2ac194be59d4..64f3ce94b7a83 100644 --- a/tests/samplers/test_beam_search.py +++ b/tests/samplers/test_beam_search.py @@ -4,7 +4,6 @@ """ import pytest -from vllm.utils import is_hpu # FIXME(zhuohan): The test can not pass if we: # 1. Increase max_tokens to 256. 
@@ -15,7 +14,6 @@ MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", MAX_TOKENS) diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 3a66a4a48772f..2979470120710 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -2,12 +2,10 @@ import torch from vllm import SamplingParams -from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_logits_processor_force_generate( diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index f121a809be380..02a953da04659 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -4,14 +4,12 @@ import torch from vllm import SamplingParams -from vllm.utils import is_hpu from ..conftest import VllmRunner MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1]) diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py index 9b874722e3cfd..ed2fee1ae252e 100644 --- a/tests/samplers/test_ranks.py +++ b/tests/samplers/test_ranks.py @@ -1,12 +1,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu MODELS = ["facebook/opt-125m"] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_ranks( diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 41f095b18c8bd..6dd643bbea2bb 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -7,14 +7,10 @@ from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def mock_causal_accepted_tensor( @@ -42,7 +38,6 @@ def mock_causal_accepted_tensor( return accepted -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", list(range(10))) @pytest.mark.parametrize( "which_tokens_accepted", @@ -134,11 +129,10 @@ def test_correct_output_format(which_tokens_accepted: str, assert torch.all(output_token_ids[subsequent_mask] == -1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("k", list(range(1, 6))) @pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, device: str): @@ -161,11 +155,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, draft_token_ids) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("above_or_below_vocab_range", 
["above", "below"]) @pytest.mark.parametrize("which_token_ids", ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) @torch.inference_mode() def test_raises_when_vocab_oob(above_or_below_vocab_range: str, which_token_ids: str, device: str): @@ -210,7 +203,6 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, draft_token_ids) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("seed", list(range(5))) @torch.inference_mode() diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index d64f3c4e6fd13..9572588ce6e53 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -11,7 +11,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import Counter, is_pin_memory_available, is_hpu +from vllm.utils import Counter, is_pin_memory_available class MockLogitsSampler(Sampler): @@ -37,12 +37,9 @@ def _prepare_test( VOCAB_SIZE = 32000 RANDOM_SEEDS = list(range(128)) -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] def _do_sample( @@ -75,7 +72,7 @@ def _do_sample( @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_greedy(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -92,7 +89,7 @@ def test_sampler_all_greedy(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_random(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -114,9 +111,8 @@ def test_sampler_all_random(seed: int, device: str): assert nth_output.output_token == i -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_random_seed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -139,9 +135,8 @@ def test_sampler_all_random_seed(seed: int, device: str): assert nth_output.output_token == i -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_random_seed_deterministic(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -163,7 +158,7 @@ def test_sampler_all_random_seed_deterministic(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_all_beam(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -183,7 +178,7 @@ def test_sampler_all_beam(seed: int, device: str): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", 
CUDA_DEVICES) def test_sampler_min_tokens_penalty(seed: int, device: str): seq_id_counter = Counter(start=random.randint(0, 100)) set_random_seed(seed) @@ -468,9 +463,8 @@ def run_test_case(*, expected_penalization: List[bool], run_test_case(**test_case) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_mixed(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) @@ -572,7 +566,7 @@ def test_sampling(): @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_sampler_top_k_top_p(seed: int, device: str): set_random_seed(seed) batch_size = random.randint(1, 256) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 6fed73ec7b3ce..88067f19c8f07 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -10,7 +10,6 @@ from vllm import SamplingParams from vllm.model_executor.utils import set_random_seed -from vllm.utils import is_hpu MODEL = "facebook/opt-125m" RANDOM_SEEDS = list(range(5)) @@ -22,7 +21,6 @@ def vllm_model(vllm_runner): yield vllm_model -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) def test_random_sample_with_seed( vllm_model, diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index c39a143ba3371..81f91c5e10b0d 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -1,12 +1,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import get_output_from_llm_generator -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -46,7 +44,6 @@ def test_spec_decode_xfail_chunked_prefill(test_llm_generator): sampling_params) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -99,7 +96,6 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): sampling_params) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("common_llm_kwargs", [{ "model": "JackFram/llama-68m", "speculative_model": "JackFram/llama-68m", diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 882cb8dd9dbac..9572aac7df6e0 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,12 +4,10 @@ import pytest from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import get_logprobs_from_llm_generator -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -47,7 +45,6 @@ def test_logprobs_equality(baseline_llm_generator, test_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -89,7 +86,6 @@ def test_diff_num_logprobs(baseline_llm_generator, test_llm_generator, logprob_rank=num_logprobs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -129,7 +125,6 @@ def test_logprobs_different_k(baseline_llm_generator, test_llm_generator, force_output_len=True) 
-@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -173,7 +168,6 @@ def test_logprobs_when_skip_speculation(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index ca9158ec72a08..94cc36f22875a 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -40,13 +40,11 @@ from transformers import AutoTokenizer from vllm import SamplingParams -from vllm.utils import is_hpu from .conftest import (get_output_from_llm_generator, run_greedy_equality_correctness_test) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -118,7 +116,6 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, assert actual_tokens.strip() == expected_tokens.strip() -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -157,7 +154,6 @@ def test_spec_decode_e2e_with_async_engine(test_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -212,7 +208,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -264,7 +259,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -311,7 +305,6 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( force_output_len=False) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -356,7 +349,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -401,7 +393,6 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -449,7 +440,6 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -503,7 +493,6 @@ def test_spec_decode_different_block_size(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -553,7 +542,6 @@ def test_skip_speculation(baseline_llm_generator, test_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 179125891e74d..d475d37af6425 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,12 +26,9 @@ import pytest -from vllm.utils import is_hpu - from .conftest import 
run_greedy_equality_correctness_test -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -73,7 +70,6 @@ def test_ngram_e2e_greedy_correctness(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -123,7 +119,6 @@ def test_ngram_e2e_greedy_correctness_with_preemption(baseline_llm_generator, force_output_len=True) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py index 0fbd4fefbde7c..42dd90422ec47 100644 --- a/tests/spec_decode/test_batch_expansion.py +++ b/tests/spec_decode/test_batch_expansion.py @@ -4,7 +4,6 @@ import torch from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, mock_worker @@ -30,7 +29,6 @@ def test_create_target_seq_id_iterator(num_target_seq_ids: int): assert next(iterator) > max_seq_id -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.skip_global_cleanup def test_get_token_ids_to_score(k: int): diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py index adab972e610f8..2918fabddc900 100644 --- a/tests/spec_decode/test_metrics.py +++ b/tests/spec_decode/test_metrics.py @@ -5,10 +5,8 @@ import torch from vllm.spec_decode.metrics import AsyncMetricsCollector -from vllm.utils import is_hpu -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_initial_call_returns_none(): """Expect first call to get metrics to return None. """ @@ -27,7 +25,6 @@ def test_initial_call_returns_none(): assert maybe_metrics is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_second_call_returns_metrics(): """Expect second call to not return None. """ @@ -55,7 +52,6 @@ def test_second_call_returns_metrics(): assert metrics is not None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("rank", [1, 2, 3, 4]) def test_nonzero_rank_noop(rank): """Verify nonzero ranks don't collect metrics. @@ -76,7 +72,6 @@ def test_nonzero_rank_noop(rank): assert metrics is None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_noop_until_time(): """Verify metrics aren't collected until enough time passes. """ @@ -110,7 +105,6 @@ def test_noop_until_time(): assert metrics is not None -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("has_data", [True, False]) def test_initial_metrics_has_correct_values(has_data: bool): """Test correctness of metrics data. 
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index 81c6763ebc64b..7744b2640fe94 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -10,7 +10,6 @@ from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.utils import is_hpu from vllm.worker.worker import Worker from .utils import (assert_logprobs_dict_allclose, create_batch, @@ -71,7 +70,6 @@ def test_assert_enough_kv_space(num_steps: int): seq_group_metadata.block_tables = original_block_tables -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_single_step(): """Verify the multi step worker produces the same output as the normal @@ -155,7 +153,6 @@ def test_same_output_for_single_step(): assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_same_output_for_multi_step(): """Verify the multi-step worker produces the same output as the normal @@ -280,7 +277,6 @@ def test_same_output_for_multi_step(): single_step_logprobs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences @@ -334,7 +330,6 @@ def test_draft_proposals_full_speculation_len(): assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_no_speculations(): """Verify Top1Proposer correctly handles case where no sequences @@ -373,7 +368,6 @@ def test_draft_proposals_no_speculations(): assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @torch.inference_mode() def test_draft_proposals_mixed_k(): """Verify Top1Proposer correctly handles case some sequences can diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index e7968cf0c7737..b1537884f896e 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -1,15 +1,12 @@ import torch -import pytest from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.ngram_worker import NGramWorker from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.utils import is_hpu from .utils import create_seq_group_metadata_from_prompts, create_worker -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_single_no_match(): """Verify our ngram algo find the right candidate in the prompt @@ -67,7 +64,6 @@ def test_ngram_algo_correctness_for_single_no_match(): assert proposals.proposal_lens.tolist() == [0] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_not_match_all(): """Verify our ngram algo find the right candidate in the prompt @@ -146,7 +142,6 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): assert proposals.proposal_token_ids[4][i] == -1 -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_ngram_algo_correctness_for_batches_match_all(): """Verify our ngram algo find the right candidate in the prompt diff --git 
a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index 6b17af0b767b8..527e7eddd7e33 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -14,12 +14,11 @@ from vllm.spec_decode.multi_step_worker import MultiStepWorker from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, split_num_cache_blocks_evenly) -from vllm.utils import is_hpu from .test_utils import mock_spec_decode_sampler from .utils import create_batch, create_sampler_output_list, mock_worker -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -54,7 +53,6 @@ def test_correctly_calls_draft_model(k: int, batch_size: int, assert actual_execute_model_data == execute_model_req -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -137,7 +135,6 @@ def test_correctly_calls_target_model(k: int, batch_size: int, assert expected_seen_contexts == seen_contexts -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -229,7 +226,6 @@ def test_correctly_calls_spec_decode_sampler(k: int, batch_size: int, assert torch.equal(actual.draft_probs, proposal_probs) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2, 6]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -358,7 +354,6 @@ def test_correctly_formats_output(k: int, batch_size: int, i].output_token -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [1, 2]) @pytest.mark.parametrize('batch_size', [1]) @pytest.mark.parametrize('returns_metrics', [True, False]) @@ -456,7 +451,6 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool, assert args[0] == k or kwargs.get('k', -1) == k -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0]) @pytest.mark.parametrize('batch_size', [1, 2, 32]) @pytest.mark.parametrize("acceptance_sampler_method", @@ -501,7 +495,6 @@ def test_k_equals_zero(k: int, batch_size: int, target_worker.execute_model.assert_called_once_with(execute_model_req) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize('k', [0, 5]) @pytest.mark.parametrize('batch_size', [0]) @pytest.mark.parametrize("acceptance_sampler_method", diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 12c6d1d5c7b3f..c8f86133f41ac 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -20,7 +20,6 @@ open_stream, serialize_vllm_model, tensorize_vllm_model) -from vllm.utils import is_hpu from ..conftest import VllmRunner, cleanup from ..utils import RemoteOpenAIServer @@ -86,8 +85,7 @@ def test_load_with_tensorizer(mock_agent, tensorizer_config): assert result == mock_agent_instance.deserialize.return_value -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") -s@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") +@pytest.mark.skipif(not is_curl_installed(), 
reason="cURL is not installed") def test_can_deserialize_s3(vllm_runner): model_ref = "EleutherAI/pythia-1.4b" tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors" @@ -105,7 +103,6 @@ def test_can_deserialize_s3(vllm_runner): assert deserialized_outputs -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_deserialized_encrypted_vllm_model_has_same_outputs( vllm_runner, tmp_path): @@ -137,7 +134,6 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( assert outputs == deserialized_outputs -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, tmp_path): with hf_runner(model_ref) as hf_model: @@ -161,7 +157,6 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, assert outputs == deserialized_outputs -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): from huggingface_hub import snapshot_download @@ -198,7 +193,6 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): assert loaded_vllm_model -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_load_without_tensorizer_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner( @@ -206,7 +200,6 @@ def test_load_without_tensorizer_load_format(vllm_runner): model_loader_extra_config=TensorizerConfig(tensorizer_uri="test")) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed") def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Serialize model @@ -244,7 +237,6 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): completion_tokens=5, prompt_tokens=6, total_tokens=11) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_raise_value_error_on_invalid_load_format(vllm_runner): with pytest.raises(ValueError): vllm_runner( diff --git a/tests/test_config.py b/tests/test_config.py index 84ba9eec27969..6c8af9d7966b4 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,6 @@ import pytest + from vllm.config import ModelConfig -from vllm.utils import is_hpu MODEL_IDS_EXPECTED = [ ("Qwen/Qwen1.5-7B", 32768), @@ -25,7 +25,6 @@ def test_disable_sliding_window(model_id_expected): assert model_config.max_model_len == expected -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_get_sliding_window(): TEST_SLIDING_WINDOW = 4096 # Test that the sliding window is correctly computed. 
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py index dfae9c6ef61a5..4ee980505a3ab 100644 --- a/tests/test_logits_processor.py +++ b/tests/test_logits_processor.py @@ -9,7 +9,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_hpu from vllm.utils import is_pin_memory_available @@ -43,17 +42,13 @@ def _prepare_test( RANDOM_SEEDS = list(range(128)) -if is_hpu(): - DEVICES = ["hpu"] -else: - DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) - ] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("seed", RANDOM_SEEDS) -@pytest.mark.parametrize("device", DEVICES) +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_logits_processors(seed: int, device: str): set_random_seed(seed) torch.set_default_device(device) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index c3bbc110fd69d..12e5ae85adea6 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -7,7 +7,6 @@ from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group -from vllm.utils import is_hpu TRUTH = [ "Hello here, this is a simple test", @@ -56,8 +55,6 @@ def _run_incremental_decode(tokenizer, all_input_ids, @pytest.mark.parametrize("skip_special_tokens", (True, False)) def test_decode_streaming(tokenizer_id, truth, with_prompt, skip_special_tokens): - if is_hpu() and tokenizer_id == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) if with_prompt: truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"] @@ -117,8 +114,6 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, tokenizer_name: str) -> List[int]: - if is_hpu() and tokenizer_name == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"] return complete_sequence_token_ids @@ -152,8 +147,6 @@ def test_decode_sequence_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes logprobs correctly.""" - if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, logprobs=2) @@ -190,8 +183,6 @@ def test_decode_prompt_logprobs(complete_sequence: str, detokenizer: Detokenizer, skip_special_tokens: bool): """Verify Detokenizer decodes prompt logprobs correctly.""" - if is_hpu() and detokenizer == "meta-llama/Llama-2-7b-hf": - pytest.skip("Skipping test on HPU") sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens, prompt_logprobs=1) diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index c766c69874755..e1775790c0a03 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -8,7 +8,7 @@ from vllm.engine.arg_utils import EngineArgs from 
vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import get_open_port, is_hpu +from vllm.utils import get_open_port from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size @@ -27,7 +27,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: ) return model_runner -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") + @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_prompt(batch_size): model_runner = _create_model_runner( @@ -142,7 +142,6 @@ def test_prepare_prompt(batch_size): torch.testing.assert_close(actual, expected) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(1, 257))) def test_prepare_decode_cuda_graph(batch_size): model_runner = _create_model_runner( @@ -253,7 +252,6 @@ def test_prepare_decode_cuda_graph(batch_size): torch.testing.assert_close(actual, expected) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_empty_seq_group(): """Verify prepare prompt and decode returns empty output.""" model_runner = _create_model_runner( @@ -298,7 +296,6 @@ def distributed_init(): ensure_model_parallel_initialized(1, 1) -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") @pytest.mark.parametrize("batch_size", list(range(2, 128))) @pytest.mark.parametrize("enforce_eager", [True, False]) def test_hybrid_batches(batch_size, enforce_eager, distributed_init): diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index b831370fe81ad..d941ffdb5588a 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -1,13 +1,11 @@ import torch -import pytest from vllm.engine.arg_utils import EngineArgs from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_distributed_init_method, get_ip, get_open_port, is_hpu +from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.worker.worker import Worker -@pytest.mark.skipif(is_hpu(), reason="Skipping test on HPU") def test_swap() -> None: # Configure the engine. 
engine_args = EngineArgs(model="facebook/opt-125m", From 85af27e06082a9dd9a3324806f67ca513db2b315 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:36:30 +0300 Subject: [PATCH 055/819] cleanup --- vllm/hpu/utils.py | 83 +---------------------------------------------- 1 file changed, 1 insertion(+), 82 deletions(-) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 8d7f388cf262a..4ce9e2591c6b9 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -15,85 +15,4 @@ def wrapped(*args, **kwargs): del kwargs htorch.core.mark_step() return result - return wrapped - - -def profile_reicpes(recipe_names): - from pathlib import Path - import numpy as np - import matplotlib.pyplot as plt - from sklearn.metrics import ConfusionMatrixDisplay - import tqdm - recipe_names_short = [name.replace('.graph_dumps/HabanaFusedOpLazy_', '') for name in recipe_names] - recipes = [Path(Path.cwd().joinpath(name + '-PostGraph-symbol.pbtxt')).open('r').read() for name in recipe_names] - - def generic_similarity_backend(recipes, similarity_func, backend_name=''): - num_recipes = len(recipes) - sim_tri = np.zeros((num_recipes, num_recipes)) - total = (num_recipes * (num_recipes + 1)) // 2 - num_recipes - backend_txt = f' with {backend_name}' if backend_name != '' else '' - with tqdm.tqdm(total=total, desc=f" computing similarity matrix{backend_txt}") as pbar: - for i in range(num_recipes): - for j in range(i): - sim_tri[i,j] = similarity_func(recipes[i], recipes[j]) - pbar.update(1) - sim = sim_tri.T + sim_tri - sim_idx = np.arange(sim_tri.shape[0]) - sim[sim_idx,sim_idx] = 1 - return sim - - def cosine_similarity_rad_backend(recipes): - from strsimpy.cosine import Cosine - s = Cosine(2) - return generic_similarity_backend(recipes, s.similarity, "Cosine (rad)"), "cosine similarity, 1 = max similarity" - - def cosine_similarity_deg_backend(recipes): - from strsimpy.cosine import Cosine - s = Cosine(2) - rad = generic_similarity_backend(recipes, s.similarity, "cosine similarity") - deg = np.degrees(np.arccos(rad)) - return deg, "cosine similarity (deviation in deg, 0 = max similarity)" - - def overlap_coefficient_backend(recipes): - from strsimpy.overlap_coefficient import OverlapCoefficient - s = OverlapCoefficient(2) - return generic_similarity_backend(recipes, s.similarity, OverlapCoefficient.__name__), OverlapCoefficient.__name__ - - def normalized_levenshtein_backend(recipes): - from strsimpy.normalized_levenshtein import NormalizedLevenshtein - s = NormalizedLevenshtein() - return generic_similarity_backend(recipes, s.similarity, NormalizedLevenshtein.__name__), NormalizedLevenshtein.__name__ - - def jaro_winkler_backend(recipes): - from strsimpy.jaro_winkler import JaroWinkler - s = JaroWinkler() - return generic_similarity_backend(recipes, s.similarity, JaroWinkler.__name__), JaroWinkler.__name__ - - def tfidf_weird_backend(recipes): - def tfidf_single_elem(x,y): - from sklearn.feature_extraction.text import TfidfVectorizer - vect = TfidfVectorizer() - tfidf = vect.fit_transform([x,y]) - sim_sparse = tfidf * tfidf.T - sim = sim_sparse.toarray() - return sim[0,1] - return generic_similarity_backend(recipes, tfidf_single_elem, 'TfidfVectorizer (weird)'), 'TfidfVectorizer (weird)' - - def tfidf_backend(recipes): - from sklearn.feature_extraction.text import TfidfVectorizer - vect = TfidfVectorizer() - tfidf = vect.fit_transform(recipes) - sim_sparse = tfidf * tfidf.T - sim = sim_sparse.toarray() - return sim, 'TfidfVectorizer' - - sim, backend_name = tfidf_backend(recipes) - 
plt.rcParams["figure.figsize"] = [16,16] - plt.rcParams["figure.dpi"] = 300 - cm = ConfusionMatrixDisplay(sim, display_labels=recipe_names_short) - cm.plot(xticks_rotation='vertical', text_kw={"fontsize":5}) - cm.ax_.set_xlabel("Target recipe number") - cm.ax_.set_ylabel("Source recipe number") - plt.title(f'Recipe similarity ({backend_name})') - return plt -# plt.savefig('similarity.png') \ No newline at end of file + return wrapped \ No newline at end of file From f856a85adb003e8fedeb69bf7cd811a0fff12ea0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:38:06 +0300 Subject: [PATCH 056/819] llm engine cleanup --- vllm/engine/llm_engine.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f98fb58592a32..96ac0bdd59012 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -408,6 +408,7 @@ def from_engine_args( else: from vllm.executor.gpu_executor import GPUExecutor executor_class = GPUExecutor + # Create the LLM engine. engine = cls( **engine_config.to_dict(), @@ -838,6 +839,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: request_outputs = self._process_model_outputs( output, scheduler_outputs.scheduled_seq_groups, scheduler_outputs.ignored_seq_groups, seq_group_metadata_list) + # Log stats. self.do_log_stats(scheduler_outputs, output) @@ -852,14 +854,6 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: # queued control plane messages, such as add/remove lora adapters. self.model_executor.stop_remote_worker_execution_loop() -# out_prompt = [ro.prompt for ro in request_outputs] -# out_indices = [ro.outputs[-1].index for ro in request_outputs] -# out_text = [f'{ro.outputs[-1].text!r}' for ro in request_outputs] -# for idx, (p, i, t) in enumerate(zip(out_prompt, out_indices, out_text)): -# logger.info(f'\tPROMPT ({idx}): {p}') -# logger.info(f'\tGEN IDX ({idx}): {i}') -# logger.info(f'\tGEN TXT ({idx}): {t}') -# logger.info('') return request_outputs def add_logger(self, logger_name: str, logger: StatLoggerBase) -> None: From b1f8b71e5bda8c71f0f9ebf806c9db36a47ffa13 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:39:20 +0300 Subject: [PATCH 057/819] utils.py cleanup --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 4e745ab96bc4c..2fb77a0fc431c 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -517,7 +517,7 @@ def create_kv_caches_with_random( dtype=torch_dtype, device=device) cache_dtype = str(cache_dtype) - if cache_dtype in ["auto", "half", "float16", "torch.float16", "torch.bfloat16", "torch.float32"]: + if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) From fb744547dc8a7d4e4d650516b95d826b68bf3e2d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 13:54:40 +0300 Subject: [PATCH 058/819] custom ops refactor --- vllm/_custom_ops.py | 74 +++++++++++------------- vllm/hpu/ops.py | 16 ++++- vllm/model_executor/custom_op.py | 6 +- vllm/model_executor/layers/activation.py | 9 +++ 4 files changed, 57 insertions(+), 48 deletions(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index e4d07bae6dd11..479ea08e49072 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,6 +1,5 @@ import contextlib import functools -import importlib from typing import List, 
Optional, Tuple, Type import torch @@ -45,33 +44,26 @@ def wrapper(*args, **kwargs): return wrapper -_ops = torch.ops._C -_cache_ops = torch.ops._C_cache_ops -if importlib.util.find_spec('habana_frameworks') is not None: - from vllm.hpu import ops as vllm_ops - from vllm.hpu import cache_ops as vllm_cache_ops - _ops = vllm_ops - _cache_ops = vllm_cache_ops # activation ops def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.silu_and_mul(out, x) + torch.ops._C.silu_and_mul(out, x) def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_and_mul(out, x) + torch.ops._C.gelu_and_mul(out, x) def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_tanh_and_mul(out, x) + torch.ops._C.gelu_tanh_and_mul(out, x) def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_fast(out, x) + torch.ops._C.gelu_fast(out, x) def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None: - _ops.gelu_new(out, x) + torch.ops._C.gelu_new(out, x) def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None: @@ -99,7 +91,7 @@ def paged_attention_v1( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> None: - _ops.paged_attention_v1( + torch.ops._C.paged_attention_v1( out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, @@ -129,7 +121,7 @@ def paged_attention_v2( blocksparse_block_size: int = 64, blocksparse_head_sliding_step: int = 0, ) -> None: - _ops.paged_attention_v2( + torch.ops._C.paged_attention_v2( out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype, kv_scale, tp_rank, @@ -146,7 +138,7 @@ def rotary_embedding( cos_sin_cache: torch.Tensor, is_neox: bool, ) -> None: - _ops.rotary_embedding(positions, query, key, head_size, + torch.ops._C.rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox) @@ -155,7 +147,7 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, cos_sin_cache: torch.Tensor, is_neox: bool, rot_dim: int, cos_sin_cache_offsets: torch.Tensor) -> None: - _ops.batched_rotary_embedding(positions, query, key, head_size, + torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox, rot_dim, cos_sin_cache_offsets) @@ -163,12 +155,12 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, # layer norm ops def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: - _ops.rms_norm(out, input, weight, epsilon) + torch.ops._C.rms_norm(out, input, weight, epsilon) def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, weight: torch.Tensor, epsilon: float) -> None: - _ops.fused_add_rms_norm(input, residual, weight, epsilon) + torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) # quantization ops @@ -176,13 +168,13 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor, zeros: torch.Tensor, split_k_iters: int, thx: int, thy: int) -> torch.Tensor: - return _ops.awq_dequantize(qweight, scales, zeros, split_k_iters, + return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters, thx, thy) def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor, scales: torch.Tensor, split_k_iters: int) -> 
torch.Tensor: - return _ops.awq_gemm(input, qweight, qzeros, scales, split_k_iters) + return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters) # gptq @@ -190,26 +182,26 @@ def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor, use_exllama: bool, bit: int) -> torch.Tensor: - return _ops.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, + return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales, b_g_idx, use_exllama, bit) def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, bit: int) -> None: - _ops.gptq_shuffle(q_weight, q_perm, bit) + torch.ops._C.gptq_shuffle(q_weight, q_perm, bit) # squeezellm def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor, lookup_table: torch.Tensor) -> None: - _ops.squeezellm_gemm(vec, mat, mul, lookup_table) + torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table) # marlin def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return _ops.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, + return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m, size_n, size_k) @@ -218,7 +210,7 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_meta: torch.Tensor, b_scales: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, size_n: int, size_k: int) -> torch.Tensor: - return _ops.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, + return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales, workspace, num_bits, size_m, size_n, size_k) @@ -241,7 +233,7 @@ def cutlass_scaled_mm(a: torch.Tensor, n = b.shape[1] out = torch.empty((m, n), dtype=out_dtype, device=a.device) - _ops.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) + torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) return out @@ -251,13 +243,13 @@ def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor, codebook_partition_sizes: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: - return _ops.aqlm_gemm(input, codes, codebooks, scales, + return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales, codebook_partition_sizes, bias) def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, codebook_partition_sizes: torch.Tensor) -> torch.Tensor: - return _ops.aqlm_dequant(codes, codebooks, + return torch.ops._C.aqlm_dequant(codes, codebooks, codebook_partition_sizes) @@ -265,7 +257,7 @@ def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor, def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor, size_k: int, size_n: int, num_bits: int) -> torch.Tensor: - return _ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, + return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits) @@ -274,7 +266,7 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, perm: torch.Tensor, workspace: torch.Tensor, num_bits: int, size_m: int, size_n: int, size_k: int, is_k_full: bool) -> torch.Tensor: - return _ops.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm, + return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm, workspace, num_bits, size_m, size_n, size_k, is_k_full) @@ -313,9 +305,9 @@ def scaled_fp8_quant( output = torch.empty_like(input, dtype=torch.float8_e4m3fn) if scale is None: scale = torch.zeros(1, device=input.device, 
dtype=torch.float32) - _ops.dynamic_scaled_fp8_quant(output, input, scale) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: - _ops.static_scaled_fp8_quant(output, input, scale) + torch.ops._C.static_scaled_fp8_quant(output, input, scale) return output, scale @@ -338,14 +330,14 @@ def scaled_int8_quant( output = torch.empty_like(input, dtype=torch.int8) if scale is not None: # static-per-tensor quantization. - _ops.static_scaled_int8_quant(output, input, scale) + torch.ops._C.static_scaled_int8_quant(output, input, scale) return output, scale # dynamic-per-token quantization. input_scales = torch.empty((input.numel() // input.shape[-1], 1), device=input.device, dtype=torch.float32) - _ops.dynamic_scaled_int8_quant(output, input, input_scales) + torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales) return output, input_scales @@ -354,7 +346,7 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int, block_size: int, sorted_token_ids: torch.Tensor, experts_ids: torch.Tensor, num_tokens_post_pad: torch.Tensor) -> None: - _ops.moe_align_block_size(topk_ids, num_experts, block_size, + torch.ops._C.moe_align_block_size(topk_ids, num_experts, block_size, sorted_token_ids, experts_ids, num_tokens_post_pad) @@ -375,7 +367,7 @@ def reshape_and_cache( kv_cache_dtype: str, kv_scale: float, ) -> None: - _cache_ops.reshape_and_cache(key, value, key_cache, + torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, kv_scale) @@ -388,7 +380,7 @@ def reshape_and_cache_flash( slot_mapping: torch.Tensor, kv_cache_dtype: str, ) -> None: - _cache_ops.reshape_and_cache_flash(key, value, key_cache, + torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype) @@ -396,19 +388,19 @@ def reshape_and_cache_flash( def copy_blocks(key_caches: List[torch.Tensor], value_caches: List[torch.Tensor], block_mapping: torch.Tensor) -> None: - _cache_ops.copy_blocks(key_caches, value_caches, block_mapping) + torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping) def swap_blocks(src: torch.Tensor, dst: torch.Tensor, block_mapping: torch.Tensor) -> None: - _cache_ops.swap_blocks(src, dst, block_mapping) + torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping) def convert_fp8(output: torch.Tensor, input: torch.Tensor, scale: float = 1.0, kv_dtype: str = "fp8") -> None: - _cache_ops.convert_fp8(output, input, scale, kv_dtype) + torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype) def get_device_attribute(attribute: int, device: int) -> int: diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 1f2e07bd59ccb..c91f8c6a86afe 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -26,12 +26,24 @@ def silu_and_mul(output, input): output.copy_(silu(x) * y) +def gelu_and_mul(output, input): + raise NotImplementedError("gelu_and_mul is not implemented for HPU backend") + + +def gelu_tanh_and_mul(output, input): + raise NotImplementedError("gelu_tanh_and_mul is not implemented for HPU backend") + + def gelu_new(output, input): - raise NotImplementedError + raise NotImplementedError("gelu_new is not implemented for HPU backend") def gelu_fast(output, input): - raise NotImplementedError + raise NotImplementedError("gelu_fast is not implemented for HPU backend") + + +def gelu_quick(output, input): + raise NotImplementedError("gelu_quick is not implemented for HPU backend") def fetch_from_cache(cache, blocks, permutations): diff --git 
a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 5276ada2a3086..d474490b98797 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -31,9 +31,6 @@ def forward_hip(self, *args, **kwargs): def forward_xpu(self, *args, **kwargs): raise NotImplementedError - def forward_hpu(self, *args, **kwargs): - return self.forward_cuda(*args, **kwargs) - def forward_cpu(self, *args, **kwargs): # By default, we assume that CPU ops are compatible with CUDA ops. return self.forward_cuda(*args, **kwargs) @@ -44,10 +41,9 @@ def forward_tpu(self, *args, **kwargs): # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) - def forward_gaudi(self, *args, **kwargs): + def forward_hpu(self, *args, **kwargs): # By default, we assume that Gaudi ops are compatible with the # PyTorch-native implementation. - # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) def dispatch_forward(self): diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 5bfdba67b443d..69f889ed1a1b8 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,6 +37,15 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out + def forward_hpu(self, x: torch.Tensor) -> torch.Tensor: + import vllm.hpu.ops as ops + + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + ops.silu_and_mul(out, x) + return out + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: from vllm._ipex_ops import ipex_ops as ops From aae39b10aa480f5d3b969445c55c192eea8ae610 Mon Sep 17 00:00:00 2001 From: ChenWenbin Date: Mon, 1 Jul 2024 19:16:32 +0800 Subject: [PATCH 059/819] Add alibi support (#69) Signed-off-by: Wenbin Chen --- vllm/attention/backends/abstract.py | 1 + vllm/attention/backends/habana_attn.py | 74 ++++++++++++++------------ vllm/attention/layer.py | 3 +- vllm/hpu/ops.py | 7 ++- vllm/model_executor/models/mpt.py | 3 +- vllm/worker/habana_model_runner.py | 43 +++++++-------- 6 files changed, 67 insertions(+), 64 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index b2b6e7ac810e3..9024c830c0fcb 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -111,6 +111,7 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + max_seq_len : Optional[int] = 4096, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 017cf9c8933e5..5b31d9fc47ba8 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -136,16 +136,21 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + max_seq_len : Optional[int] = 4096, ) -> None: self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window + self.position_bias = None if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.bfloat16) + self.position_bias = 
_make_alibi_bias(alibi_slopes, + num_kv_heads, + alibi_slopes.dtype, + max_seq_len) self.alibi_slopes = alibi_slopes - assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -199,13 +204,17 @@ def forward( if kv_cache is None or prefill_meta.block_tables.numel() == 0: # TODO: move this outside of model assert prefill_meta.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + attn_bias = prefill_meta.attn_bias + if self.alibi_slopes is not None: + attn_bias.add_(self.position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):]) + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) out = xops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), - attn_bias=prefill_meta.attn_bias, + attn_bias=attn_bias, p=0.0, scale=self.scale, ) @@ -236,10 +245,9 @@ def forward( attn_metadata.kv_cache_dtype, self.num_kv_heads, self.scale, - self.alibi_slopes, + self.position_bias, kv_scale ) - # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) @@ -248,33 +256,29 @@ def _make_alibi_bias( alibi_slopes: torch.Tensor, num_kv_heads: int, dtype: torch.dtype, - seq_lens: List[int], -) -> LowerTriangularMaskWithTensorBias: - attn_biases = [] - for seq_len in seq_lens: - bias = torch.arange(seq_len, dtype=dtype) - # NOTE(zhuohan): HF uses - # `bias = bias[None, :].repeat(seq_len, 1)` - # here. We find that both biases give the same results, but - # the bias below more accurately follows the original ALiBi - # paper. - # Calculate a matrix where each element represents ith element- jth - # element. - bias = bias[None, :] - bias[:, None] - - padded_len = (seq_len + 7) // 8 * 8 - num_heads = alibi_slopes.shape[0] - bias = torch.empty( - 1, # batch size - num_heads, - seq_len, - padded_len, - device=alibi_slopes.device, - dtype=dtype, - )[:, :, :, :seq_len].copy_(bias) - bias.mul_(alibi_slopes[:, None, None]) - if num_heads != num_kv_heads: - bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) - attn_biases.append(LowerTriangularMaskWithTensorBias(bias)) - - return attn_biases + seq_len: int, +) -> torch.Tensor: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. 
+ bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + return bias diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index ee7be26c0876c..ec2616c1ab69a 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -29,12 +29,13 @@ def __init__( num_kv_heads: Optional[int] = None, alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, + max_seq_len: Optional[int] = 4096, ) -> None: super().__init__() self.backend = get_attn_backend(torch.get_default_dtype()) impl_cls = self.backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, - alibi_slopes, sliding_window) + alibi_slopes, sliding_window, max_seq_len) def forward( self, diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index b66f6709977c8..51dcff74fab5b 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -36,7 +36,7 @@ def fetch_from_cache(cache, blocks, permutations): @hpu_utils.with_mark_steps -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes, kv_cache_dtype=None) -> None: +def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes=None, kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape _, _, kv_heads, _ = key_cache.shape @@ -55,7 +55,10 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block mask = mask.unsqueeze(2) attn_weights = [torch.matmul(query, k) for k in keys] - attn_weights = (torch.cat(attn_weights, dim=-1) + attn_weights = torch.cat(attn_weights, dim=-1) + if alibi_slopes is not None: + attn_weights.add_(alibi_slopes[:,:,-attn_weights.size(2):, -attn_weights.size(3):]) + attn_weights = (attn_weights .masked_fill(mask, min_inf) .softmax(dim=-1)) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 6fa5c5bd3014a..c1bebd6e30106 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -107,7 +107,8 @@ def __init__( self.head_dim, scaling, alibi_slopes=alibi_slopes, - num_kv_heads=self.num_kv_heads) + num_kv_heads=self.num_kv_heads, + max_seq_len=config.max_seq_len) def forward( self, diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1a9206a314d5c..4571eb631e6d7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -115,31 +115,24 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): prefill_metadata = attn_metadata.prefill_metadata if prefill_metadata is None: return attn_metadata - #FIXME: Restore alibi support - #if self.alibi_slopes is None: - if True: - seq_lens_t = prefill_metadata.seq_lens_tensor - len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) - .view(1, seq_len) - .ge(seq_lens_t.unsqueeze(-1)) - .view(batch_size, 1, 1, seq_len)) - causal_mask = torch.triu( - torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), - diagonal=1 - ) - mask = causal_mask.logical_or(len_mask) - attn_bias = (torch.zeros_like(mask, dtype=dtype) - .masked_fill_(mask, 
-math.inf)) - #FIXME: Restore sliding window support - #if self.sliding_window is not None: - prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) - attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) - return attn_metadata - else: - # FIXME: This needs updating... - prefill_meta.attn_bias = _make_alibi_bias( - self.alibi_slopes, self.num_kv_heads, batch_size, - seq_len, query.dtype) + + seq_lens_t = prefill_metadata.seq_lens_tensor + len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) + .view(1, seq_len) + .ge(seq_lens_t.unsqueeze(-1)) + .view(batch_size, 1, 1, seq_len)) + causal_mask = torch.triu( + torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), + diagonal=1 + ) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=dtype) + .masked_fill_(mask, -math.inf)) + #FIXME: Restore sliding window support + #if self.sliding_window is not None: + prefill_metadata = prefill_metadata._replace(attn_bias=attn_bias) + attn_metadata = attn_metadata._replace(prefill_metadata=prefill_metadata) + return attn_metadata def forward(self, *args, **kwargs): From 0e63941c1f84eec0ad7d398e54838a1a658fe9ef Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 15:28:29 +0300 Subject: [PATCH 060/819] move xops to ops --- vllm/attention/backends/habana_attn.py | 9 +-- vllm/hpu/ops.py | 91 +++++++++----------------- vllm/hpu/xops.py | 41 ------------ 3 files changed, 34 insertions(+), 107 deletions(-) delete mode 100644 vllm/hpu/xops.py diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 7d5fb5146cc2f..5184a4d9d4c44 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -6,10 +6,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -import math -import vllm.hpu.xops as xops -from vllm.hpu.attn_bias import (AttentionBias, - LowerTriangularMaskWithTensorBias) +import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) @@ -107,7 +104,7 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[AttentionBias]] = None + self.attn_bias: Optional[List[torch.Tensor]] = None class HabanaAttentionImpl(AttentionImpl): @@ -203,7 +200,7 @@ def forward( assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' 
query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) - out = xops.prompt_attention( + out = ops.prompt_attention( query.view(query_shape), key.view(kv_shape), value.view(kv_shape), diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index c91f8c6a86afe..cecdb7cc67d43 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -10,7 +10,7 @@ import torch.nn.functional as F import habana_frameworks.torch as htorch import habana_frameworks.torch.utils.experimental as htexp -from typing import List, Optional, Tuple +from typing import Optional import vllm.hpu.utils as hpu_utils @@ -26,25 +26,6 @@ def silu_and_mul(output, input): output.copy_(silu(x) * y) -def gelu_and_mul(output, input): - raise NotImplementedError("gelu_and_mul is not implemented for HPU backend") - - -def gelu_tanh_and_mul(output, input): - raise NotImplementedError("gelu_tanh_and_mul is not implemented for HPU backend") - - -def gelu_new(output, input): - raise NotImplementedError("gelu_new is not implemented for HPU backend") - - -def gelu_fast(output, input): - raise NotImplementedError("gelu_fast is not implemented for HPU backend") - - -def gelu_quick(output, input): - raise NotImplementedError("gelu_quick is not implemented for HPU backend") - def fetch_from_cache(cache, blocks, permutations): return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))] @@ -89,46 +70,6 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block return attn_weights.squeeze(-2) -def rms_norm(out, hidden_states, weight, eps): - htorch.core.mark_step() - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + eps) - out.copy_(weight * hidden_states.to(input_dtype)) - htorch.core.mark_step() - - -def rotate_neox(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def rotate_gptj(x: torch.Tensor) -> torch.Tensor: - x1 = x[..., ::2] - x2 = x[..., 1::2] - x = torch.stack((-x2, x1), dim=-1) - return x.flatten(-2) - - -def apply_rope( - q: torch.Tensor, - k: torch.Tensor, - cos: torch.Tensor, - sin: torch.Tensor, - is_neox_style: bool, -) -> Tuple[torch.Tensor, torch.Tensor]: - rotate_fn = rotate_neox if is_neox_style else rotate_gptj - q_embed = (q * cos) + (rotate_fn(q) * sin) - k_embed = (k * cos) + (rotate_fn(k) * sin) - return q_embed, k_embed - - -def awq_gemm(*args): - raise NotImplementedError - - def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) @@ -163,3 +104,33 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): final_hidden_states += current_hidden_states_static return final_hidden_states.view(-1, D) + + +@hpu_utils.with_mark_steps +def prompt_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, +) -> torch.Tensor: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + query_heads = query.size(1) + kv_heads = key.size(1) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + attn_bias = attn_bias.unsqueeze(2) + attn_weights = 
torch.matmul(query * scale, key.transpose(-1, -2)) + if attn_bias is not None: + attn_weights.add_(attn_bias) + attn_weights = torch.softmax(attn_weights, dim=-1) + attn_weights = torch.matmul(attn_weights, value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + attn_weights = attn_weights.transpose(1, 2) + return attn_weights diff --git a/vllm/hpu/xops.py b/vllm/hpu/xops.py deleted file mode 100644 index d6404a4872c0d..0000000000000 --- a/vllm/hpu/xops.py +++ /dev/null @@ -1,41 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### - -import torch -from typing import Optional - -import vllm.hpu.utils - - -@vllm.hpu.utils.with_mark_steps -def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, -) -> torch.Tensor: - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - query_heads = query.size(1) - kv_heads = key.size(1) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - attn_bias = attn_bias.unsqueeze(2) - attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = torch.softmax(attn_weights, dim=-1) - attn_weights = torch.matmul(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) - attn_weights = attn_weights.transpose(1, 2) - return attn_weights From 0141d5751076a9b0e5040a551b10d3150a79ae59 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 15:38:58 +0300 Subject: [PATCH 061/819] remove vllm/hpu/attn_bias.py --- vllm/hpu/attn_bias.py | 764 ------------------------------------------ 1 file changed, 764 deletions(-) delete mode 100644 vllm/hpu/attn_bias.py diff --git a/vllm/hpu/attn_bias.py b/vllm/hpu/attn_bias.py deleted file mode 100644 index ff508a59cc56a..0000000000000 --- a/vllm/hpu/attn_bias.py +++ /dev/null @@ -1,764 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. - - -import math -from dataclasses import dataclass -from typing import Any, Iterable, List, Optional, Sequence, Tuple, Union - -import torch - - -class AttentionBias: - """Base class for a custom bias that can be applied \ - as the attn_bias argument in - :attr:`xformers.ops.memory_efficient_attention`. - - That function has the ability to add a tensor, the - attention bias, to the QK^T matrix before it is used - in the softmax part of the attention calculation. - The attention bias tensor with shape - (B or 1, n_queries, number of keys) - can be given as the attn_bias input. - The most common use case is for an attention bias is - to contain only zeros and negative infinities, which forms - a mask so that some queries only attend to some keys. - - Children of this class define alternative things which can - be used as the attn_bias input to define an attention bias which - forms such a mask, for some common cases. 
- - When using an :attr:`xformers.ops.AttentionBias` - instead of a :attr:`torch.Tensor`, the mask matrix does - not need to be materialized, and can be - hardcoded into some kernels for better performance. - - See: - - - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMask` - - :attr:`xformers.ops.fmha.attn_bias.LowerTriangularMaskWithTensorBias` - - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask` - - :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask` - - """ - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - """ - Materializes the bias as a `torch.Tensor`. This is very slow - and we don't attempt to make it fast. Only use for debugging/testing. - - Shape should be like `[*, q_seqlen, k_seqlen]` - """ - raise NotImplementedError() - - -class LowerTriangularMask(AttentionBias): - """ - A lower-triangular (aka causal) mask - - A query Q cannot attend to a key which is farther from the - initial key than Q is from the initial query. - """ - - def __init__(self, *tensor_args, **tensor_kwargs) -> None: - # NOTE: Unused arguments, we keep them for backward compatibility - super().__init__() - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=float("-inf"), - device=device, - ) - return torch.triu(tensor, diagonal=1).to(dtype) # type: ignore - - def add_bias(self, bias: torch.Tensor) -> "LowerTriangularMaskWithTensorBias": - return LowerTriangularMaskWithTensorBias(bias) - - -class LowerTriangularMaskWithTensorBias(LowerTriangularMask): - """A lower-triangular (aka causal) mask with an additive bias""" - - def __init__(self, bias: torch.Tensor) -> None: - self._bias = bias - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - return super().materialize(shape, dtype=dtype, device=device) + self._bias - - -@dataclass -class _SeqLenInfo: - """ - (Internal) Represents the division of a dimension into blocks. - - For example, to represents a dimension of length 7 divided into - three blocks of lengths 2, 3 and 2, use `from_seqlength([2, 3, 2])`. 
- The members will be: - max_seqlen: 3 - min_seqlen: 2 - seqstart_py: [0, 2, 5, 7] - seqstart: torch.IntTensor([0, 2, 5, 7]) - """ - - seqstart: torch.Tensor - max_seqlen: int - min_seqlen: int - seqstart_py: List[int] - - def to(self, device: torch.device) -> None: - self.seqstart = self.seqstart.to(device, non_blocking=True) - - def intervals(self) -> Iterable[Tuple[int, int]]: - yield from zip(self.seqstart_py, self.seqstart_py[1:]) - - @classmethod - def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": - """ - Input tensors are assumed to be in shape [B, M, *] - """ - assert not isinstance(seqlens, torch.Tensor) - seqstart_py = [0] - max_seqlen = -1 - min_seqlen = -1 - for seqlen in seqlens: - min_seqlen = min(min_seqlen, seqlen) if min_seqlen != -1 else seqlen - max_seqlen = max(max_seqlen, seqlen) - seqstart_py.append(seqstart_py[len(seqstart_py) - 1] + seqlen) - seqstart = torch.tensor(seqstart_py, dtype=torch.int32) - return cls( - max_seqlen=max_seqlen, - min_seqlen=min_seqlen, - seqstart=seqstart, - seqstart_py=seqstart_py, - ) - - def split( - self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None - ) -> List[torch.Tensor]: - if self.seqstart_py[-1] != x.shape[1] or x.shape[0] != 1: - raise ValueError( - f"Invalid `torch.Tensor` of shape {x.shape}, expected format " - f"(B, M, *) with B=1 and M={self.seqstart_py[-1]}\n" - f" seqstart: {self.seqstart_py}" - ) - if batch_sizes is None: - batch_sizes = [1] * (len(self.seqstart_py) - 1) - split_chunks = [] - it = 0 - for batch_size in batch_sizes: - split_chunks.append( - self.seqstart_py[it + batch_size] - self.seqstart_py[it] - ) - it += batch_size - return [ - tensor.reshape([bs, -1, *tensor.shape[2:]]) - for bs, tensor in zip(batch_sizes, x.split(split_chunks, dim=1)) - ] - - -@dataclass -class _PaddedSeqLenInfo(_SeqLenInfo): - """ - (Internal) Represents the division of a dimension into blocks which are - padded out to the same total length. - - For example, to represent a dimension of length 12 with space for - three blocks of length 4, but where the occupied lengths are - 2, 3 and 2, use `from_seqlens_padded([2, 3, 2], 4)`. 
- - The layout along the dimension is - - 0 ─â–ș block 0 - block 0 - - - 4 ─â–ș block 1 - block 1 - block 1 - - 8 ─â–ș block 2 - block 2 - - - 12 ─â–ș - - The members will be: - max_seqlen: 3 - min_seqlen: 2 - seqstart_py: [0, 4, 8, 12] - seqstart: torch.IntTensor([0, 4, 8, 12]) - seqlen_py: [2, 3, 2] - seqlen: torch.IntTensor([2, 3, 2]) - padding: 4 - """ - - seqlen: torch.Tensor - seqlen_py: Sequence[int] - padding: int - # From parent: seqstart[i] contains the start position - # of the i-th sequence - # seqstart: torch.Tensor - - def __post_init__(self) -> None: - assert len(self.seqstart_py) == len(self.seqlen_py) + 1 - - def to(self, device: torch.device) -> None: - self.seqlen = self.seqlen.to(device, non_blocking=True) - super().to(device) - - def intervals(self) -> Iterable[Tuple[int, int]]: - for (start, _), length in zip(super().intervals(), self.seqlen_py): - yield start, start + length - - @classmethod - def from_seqlens(cls, seqlens: Iterable[int]) -> "_SeqLenInfo": - raise RuntimeError( - "Use either `_SeqLenInfo.from_seqlens` or `_PaddedSeqLenInfo.from_seqlens_padded`" - ) - - @classmethod - def from_seqlens_padded( - cls, seqlens: Sequence[int], padding: int - ) -> "_PaddedSeqLenInfo": - """ - Input tensors are assumed to be in shape [B, M, *] - seqstart = padding * torch.arange(batch_size) - """ - assert not isinstance(seqlens, torch.Tensor) - assert all(seqlen <= padding for seqlen in seqlens) - seqstart_py = list(range(0, len(seqlens) * padding + 1, padding)) - return cls( - seqlen=torch.tensor(seqlens, dtype=torch.int32), - seqlen_py=seqlens, - max_seqlen=max(seqlens), - min_seqlen=min(seqlens), - seqstart=torch.tensor(seqstart_py, dtype=torch.int32), - seqstart_py=seqstart_py, - padding=padding, - ) - - def split( - self, x: torch.Tensor, batch_sizes: Optional[Sequence[int]] = None - ) -> List[torch.Tensor]: - raise NotImplementedError("_PaddedSeqLenInfo.split") - - -@dataclass -class BlockDiagonalMask(AttentionBias): - """ - A block-diagonal mask that can be passed as ``attn_bias`` - argument to :attr:`xformers.ops.memory_efficient_attention`. - - Queries and Keys are each divided into the same number of blocks. - Queries in block i only attend to keys in block i. - - .. figure:: /_static/block_diag_bias.png - - This bias can be used to handle a batch of sequences of - different lengths, via :attr:`BlockDiagonalMask.from_tensor_list` - - :Example: - - .. 
code-block:: python - - import torch - from xformers.ops import fmha - - K = 16 - dtype = torch.float16 - device = "cuda" - list_x = [ - torch.randn([1, 3, 1, K], dtype=dtype, device=device), - torch.randn([1, 6, 1, K], dtype=dtype, device=device), - torch.randn([1, 2, 1, K], dtype=dtype, device=device), - ] - attn_bias, x = fmha.BlockDiagonalMask.from_tensor_list(list_x) - linear = torch.nn.Linear(K, K * 3).to(device=device, dtype=dtype) - - q, k, v = linear(x).reshape([1, -1, 1, 3, K]).unbind(-2) - out = fmha.memory_efficient_attention(q, k, v, attn_bias=attn_bias) - list_out = attn_bias.split(out) - print(list_out[0].shape) # [1, 3, 1, K] - assert tuple(list_out[0].shape) == (1, 3, 1, K) - - """ - - q_seqinfo: _SeqLenInfo - k_seqinfo: _SeqLenInfo - _batch_sizes: Optional[Sequence[int]] = None - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - return torch.zeros( - shape, - dtype=dtype, - device=device, - ) - - def materialize( - self, - shape: Optional[Tuple[int, ...]] = None, - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - """Materialize the attention bias - for debugging & testing""" - if shape is None: - shape = (self.q_seqinfo.seqstart_py[-1], - self.k_seqinfo.seqstart_py[-1]) - assert shape[-1] == self.k_seqinfo.seqstart_py[-1], ( - shape[-1], - self.k_seqinfo.seqstart_py[-1], - ) - assert shape[-2] == self.q_seqinfo.seqstart_py[-1], ( - shape[-2], - self.q_seqinfo.seqstart_py[-1], - ) - mask = torch.empty(shape[-2:], dtype=dtype, device=device) - mask.fill_(-math.inf) - for i, ((q_start, q_end), (k_start, k_end)) in enumerate( - zip( - self.q_seqinfo.intervals(), - self.k_seqinfo.intervals(), - ) - ): - mask[q_start:q_end, k_start:k_end] = self._create_block_mask( - (q_end - q_start, k_end - k_start), - dtype=dtype, - device=device, - ) - for _ in range(len(shape) - 2): - mask = mask.unsqueeze(0) - return mask.expand(shape) - - @classmethod - def from_seqlens( - cls, - q_seqlen: Sequence[int], - kv_seqlen: Optional[Sequence[int]] = None, - ) -> "BlockDiagonalMask": - """Creates a :attr:`BlockDiagonalMask` from a list of tensors lengths for query and key/value. - - Args: - q_seqlen (Union[Sequence[int], torch.Tensor]): List or tensor of sequence lengths for query tensors - kv_seqlen (Union[Sequence[int], torch.Tensor], optional): List or tensor of sequence lengths for key/value. - (Defaults to ``q_seqlen``.) - Returns: - BlockDiagonalMask - """ - assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen) - q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) - if kv_seqlen is None or q_seqlen == kv_seqlen: - k_seqinfo = q_seqinfo - else: - k_seqinfo = _SeqLenInfo.from_seqlens(kv_seqlen) - return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) - - @classmethod - def from_tensor_list( - cls, - tensors: Sequence[torch.Tensor], - ) -> Tuple["BlockDiagonalMask", torch.Tensor]: - """Creates a :attr:`BlockDiagonalMask` from a list of tensors, and returns the tensors - concatenated on the sequence length dimension - - .. figure:: /_static/block_diag_cat_split.png - - See also :attr:`BlockDiagonalMask.split` to split the returned - :attr:`torch.Tensor` back to a list of tensors of varying sequence length - - Args: - tensors (Sequence[torch.Tensor]): A list of tensors of shape ``[B, M_i, *]``. - All tensors should have the same dimension and the same batch size ``B``, but - they can have different sequence length ``M``. 
- - Returns: - Tuple[BlockDiagonalMask, torch.Tensor]: The corresponding bias for the attention - along with `tensors` concatenated on the sequence length dimension, with shape ``[1, sum_i{M_i}, *]`` - """ - batch_sizes = [tensor.shape[0] for tensor in tensors] - seqlens = [] - for x in tensors: - for _ in range(x.shape[0]): - seqlens.append(x.shape[1]) - block_diag = cls.from_seqlens(seqlens) - block_diag._batch_sizes = batch_sizes - tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in tensors) - concat_tensors = torch.cat(tensors_bs1, dim=1) - return block_diag, concat_tensors - - @classmethod - def from_tensor_lists_qkv( - cls, - tensors_q: Sequence[torch.Tensor], - tensors_k: Sequence[torch.Tensor], - tensors_v: Optional[Sequence[torch.Tensor]] = None, - ) -> Tuple["BlockDiagonalMask", torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - assert len(tensors_q) == len(tensors_k) - assert tensors_v is None or len(tensors_v) == len(tensors_q) - batch_sizes = [tensor.shape[0] for tensor in tensors_q] - q_seqlens, kv_seqlens = [], [] - for i, (q, k) in enumerate(zip(tensors_q, tensors_k)): - assert q.shape[0] == k.shape[0] - q_seqlens += [q.shape[1]] * q.shape[0] - kv_seqlens += [k.shape[1]] * k.shape[0] - assert tensors_v is None or tensors_v[i].shape[:2] == k.shape[:2] - block_diag = cls.from_seqlens(q_seqlens, kv_seqlens) - block_diag._batch_sizes = batch_sizes - return ( - block_diag, - torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_q], dim=1), - torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_k], dim=1), - torch.cat([x.reshape([1, -1, *x.shape[2:]]) for x in tensors_v], dim=1) - if tensors_v is not None - else None, - ) - - def split_queries(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: - return self.q_seqinfo.split(tensor, self._batch_sizes) - - def split_kv(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: - return self.k_seqinfo.split(tensor, self._batch_sizes) - - def split(self, tensor: torch.Tensor) -> Sequence[torch.Tensor]: - """The inverse operation of :attr:`BlockDiagonalCausalMask.from_tensor_list` - - Args: - tensor (torch.Tensor): Tensor of tokens of shape ``[1, sum_i{M_i}, *]`` - - Returns: - Sequence[torch.Tensor]: A list of tokens with possibly different sequence lengths - """ - assert self.q_seqinfo is self.k_seqinfo - return self.q_seqinfo.split(tensor, self._batch_sizes) - - def make_causal(self) -> "BlockDiagonalCausalMask": - """Makes each block causal""" - return BlockDiagonalCausalMask( - q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - ) - - def make_causal_from_bottomright(self) -> "BlockDiagonalCausalFromBottomRightMask": - """Makes each block causal with a possible non-causal prefix""" - return BlockDiagonalCausalFromBottomRightMask( - q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - ) - - def make_local_attention( - self, window_size: int - ) -> "BlockDiagonalCausalLocalAttentionMask": - """Experimental: Makes each block causal with local attention""" - return BlockDiagonalCausalLocalAttentionMask( - q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - _window_size=window_size, - ) - - def make_local_attention_from_bottomright( - self, window_size: int - ) -> "BlockDiagonalCausalLocalAttentionFromBottomRightMask": - """Experimental: Makes each block causal with local attention, start from bottom right""" - return BlockDiagonalCausalLocalAttentionFromBottomRightMask( - 
q_seqinfo=self.q_seqinfo, - k_seqinfo=self.k_seqinfo, - _batch_sizes=self._batch_sizes, - _window_size=window_size, - ) - - -@dataclass -class BlockDiagonalCausalMask(BlockDiagonalMask): - """ - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. - - Queries and Keys are each divided into the same number of blocks. - A query Q in block i cannot attend to a key which is not in block i, - nor one which is farther from the initial key in block i than Q - is from the initial query in block i. - """ - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - return LowerTriangularMask().materialize( - shape, - dtype=dtype, - device=device, - ) - - -@dataclass -class BlockDiagonalCausalFromBottomRightMask(BlockDiagonalMask): - """ - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalMask`, except that each block is causal. - This mask allows for a non-causal prefix - NOTE: Each block should have `num_keys >= num_queries` otherwise the forward pass is not - defined (softmax of vector of `-inf` in the attention) - - Queries and keys are each divided into the same number of blocks. - A query Q in block i cannot attend to a key which is not in block i, - nor one which nearer the final key in block i than Q is to the - final query in block i. - """ - - def __post_init__(self) -> None: - for i, ((q_start, q_end), (k_start, k_end)) in enumerate( - zip( - self.q_seqinfo.intervals(), - self.k_seqinfo.intervals(), - ) - ): - num_queries = q_end - q_start - num_keys = k_end - k_start - if num_keys < num_queries: - raise ValueError( - f"Block #{i} has num_keys={num_keys} and num_queries={num_queries}." - " Expected `num_keys >= num_queries`" - ) - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=float("-inf"), - device=device, - ) - num_queries, num_keys = shape[-2:] - return torch.triu(tensor, diagonal=num_keys - num_queries + 1).to(dtype) # type: ignore - - -@dataclass -class BlockDiagonalCausalWithOffsetPaddedKeysMask(AttentionBias): - """ - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`, - except an offset on causality is allowed for each block and we support padding for k/v - - The keys and values are divided into blocks which are padded out to - the same total length. - For example, if there is space for 12 keys, for three blocks of - max length 4, but we only want to use the first 2, 3 and 2 - of each block, use `kv_padding=4` and `kv_seqlens=[2, 3, 2]`. - The queries are divided into blocks, without padding, of lengths given by - q_seqlen. - - A query Q in block i cannot attend to a key which is not in block i, - nor one which is not in use (i.e. in the padded area), - nor one which is nearer to the final key in block i - than Q is to the final query in block i. - """ - - q_seqinfo: _SeqLenInfo - k_seqinfo: _PaddedSeqLenInfo - causal_diagonal: Any = None # unused. Exists for BC only. 
- - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=float("-inf"), - device=device, - ) - num_queries, num_keys = shape[-2:] - return torch.triu(tensor, diagonal=1 + num_keys - num_queries).to(dtype) # type: ignore - - def materialize( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - """Materialize the attention bias - for debugging & testing""" - if shape[-1] != self.k_seqinfo.seqstart_py[-1]: - raise ValueError("k shapes wrong") - if shape[-2] != self.q_seqinfo.seqstart_py[-1]: - raise ValueError("q shapes wrong") - mask = torch.empty(shape[-2:], dtype=dtype, device=device) - mask.fill_(-math.inf) - for i, ((q_start, q_end), (k_start, k_end)) in enumerate( - zip( - self.q_seqinfo.intervals(), - self.k_seqinfo.intervals(), - ) - ): - mask[q_start:q_end, k_start:k_end] = self._create_block_mask( - (q_end - q_start, k_end - k_start), - dtype=dtype, - device=device, - ) - for _ in range(len(shape) - 2): - mask = mask.unsqueeze(0) - return mask.expand(shape) - - @classmethod - def from_seqlens( - cls, - q_seqlen: Sequence[int], - kv_padding: int, - kv_seqlen: Sequence[int], - causal_diagonal: Any = None, - ) -> "BlockDiagonalCausalWithOffsetPaddedKeysMask": - """Creates a :attr:`BlockDiagonalCausalWithOffsetPaddedKeysMask` from a list of tensor - lengths for query and key/value. - - Args: - q_seqlen (Sequence[int]): List or tensor of sequence lengths for query tensors - kv_padding (int): Padding for k/v - also an upperbound on each individual key length - kv_seqlen (Sequence[int]): List or tensor of sequence lengths for key/value. - causal_diagonal: unused, for BC only - Returns: - BlockDiagonalCausalWithOffsetPaddedKeysMask - """ - assert kv_seqlen is None or len(q_seqlen) == len(kv_seqlen), ( - q_seqlen, - kv_seqlen, - ) - q_seqinfo = _SeqLenInfo.from_seqlens(q_seqlen) - k_seqinfo = _PaddedSeqLenInfo.from_seqlens_padded(kv_seqlen, kv_padding) - return cls(q_seqinfo=q_seqinfo, k_seqinfo=k_seqinfo) - - -@dataclass -class BlockDiagonalCausalLocalAttentionMask(BlockDiagonalCausalMask): - """ - (Experimental feature) - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. - This makes the mask "local" and the attention pattern banded. - - Query i only attends to keys in its block and cannot attend keys further than "window_size" - from it. - """ - - _window_size: int = 0 # forced due to inheritance and default arguments - - def __post_init__(self): - if self._window_size <= 0: - raise ValueError( - f"Expected `window_size > 0`, but window_size={self._window_size}" - ) - q_seqlen = [ - y - x - for x, y in zip( - self.q_seqinfo.seqstart_py[:-1], self.q_seqinfo.seqstart_py[1:] - ) - ] - kv_seqlen = [ - y - x - for x, y in zip( - self.k_seqinfo.seqstart_py[:-1], self.k_seqinfo.seqstart_py[1:] - ) - ] - for q, k in zip(q_seqlen, kv_seqlen): - if q - self._window_size >= k: - # Each query only attends to keys no further than window_size back. - # When q > k + window_size, there will be a query for which the window doesn't reach any key. 
- raise RuntimeError( - f"No keys are attended in q_seqlen {q} k_seqlen {k} with sliding window {self._window_size}" - ) - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=1, - device=device, - ) - - num_queries, num_keys = shape[-2:] - mask = torch.tril(tensor, diagonal=0).to(dtype) # type: ignore - if self._window_size is not None and self._window_size > 0: - mask = torch.triu(mask, diagonal=-self._window_size + 1) - mask = torch.log(mask) - return mask.to(dtype) - - -@dataclass -class BlockDiagonalCausalLocalAttentionFromBottomRightMask( - BlockDiagonalCausalFromBottomRightMask -): - """ - (Experimental feature) - Same as :attr:`xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask`. - This makes the mask "local" and the attention pattern banded. - - Query i only attends to keys in its block and cannot attend keys further than "window_size" - from it. - """ - - _window_size: int = 0 # forced due to inheritance and default arguments - - def __post_init__(self): - super().__post_init__() - if self._window_size <= 0: - raise ValueError( - f"Expected `window_size > 0`, but window_size={self._window_size}" - ) - - def _create_block_mask( - self, - shape: Tuple[int, ...], - dtype: torch.dtype = torch.float32, - device: Union[str, torch.device] = "cpu", - ) -> torch.Tensor: - create_as = dtype if dtype is not torch.bfloat16 else torch.float32 - tensor = torch.full( # type: ignore - shape, - dtype=create_as, - fill_value=1, - device=device, - ) - num_queries, num_keys = shape[-2:] - mask = torch.tril(tensor, diagonal=num_keys - num_queries).to(dtype) # type: ignore - if self._window_size is not None: - mask = torch.triu( - mask, diagonal=num_keys - num_queries - self._window_size + 1 - ) - mask = torch.log(mask) - return mask.to(dtype) From 90f900cbbd41e7f880d9c6ac4423fd4379ff1c31 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 15:21:50 +0200 Subject: [PATCH 062/819] Remove allgather workaround in logits_processor (#76) --- vllm/model_executor/layers/logits_processor.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 3951619c6e3ec..57b6c7f907ae2 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -4,9 +4,9 @@ import torch import torch.nn as nn -from vllm.distributed import tensor_model_parallel_gather, tensor_model_parallel_all_gather +from vllm.distributed import tensor_model_parallel_gather from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.utils import is_hpu + class LogitsProcessor(nn.Module): """Process logits and apply logits processors from sampling metadata. @@ -50,9 +50,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - # NOTE(kzawora): allgather on HPU will cause logits to be not None, - # and we need to guard against applying logits processors on non-driver worker - if logits is not None and sampling_metadata.seq_groups is not None: + if logits is not None: logits *= self.scale # Apply logits processors (if any). 
@@ -66,9 +64,7 @@ def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor, logits = torch.matmul(hidden_states, embedding.t()) if embedding_bias is not None: logits += embedding_bias - # NOTE(kzawora): HPU PT bridge is missing support for single-rank gather. We'll use all-gather on Gaudi for now. - gather_op = tensor_model_parallel_all_gather if is_hpu() else tensor_model_parallel_gather - logits = gather_op(logits) + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[:, :self.org_vocab_size] From a21fe62dcb3cbdf8aba8758687c5cdcd209f6b0e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 16:28:07 +0300 Subject: [PATCH 063/819] whitespace fix --- vllm/model_executor/layers/logits_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 7028936c54b03..39d142b158445 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -53,7 +53,7 @@ def forward( # Get the logits for the next tokens. logits = self._get_logits(hidden_states, embedding, embedding_bias) - + if logits is not None: if self.soft_cap is not None: logits = logits / self.soft_cap From aaf544633b8d7c9d8dc48ee9c70afa8e726d71df Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 16:31:40 +0300 Subject: [PATCH 064/819] revert accidental changes in rmsnorm --- vllm/model_executor/layers/layernorm.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index f992dfc64fa80..d0d1577b26a10 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -116,13 +116,6 @@ def forward_xpu( from vllm._ipex_ops import ipex_ops as ops if residual is not None: - if x.device.type == "hpu" and FusedRMSNorm: - orig_dtype = x.dtype - orig_shape = x.shape - residual += x.view(residual.shape) - # Note: FusedRMSNorm requires 3D tensors as inputs - x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) - return x.to(orig_dtype).view(orig_shape), residual ops.fused_add_rms_norm( x, residual, @@ -130,10 +123,6 @@ def forward_xpu( self.variance_epsilon, ) return x, residual - if x.device.type == "hpu" and FusedRMSNorm: - orig_dtype = x.dtype - x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) - return x.to(orig_dtype) out = torch.empty_like(x) ops.rms_norm( out, @@ -143,7 +132,6 @@ def forward_xpu( ) return out - def extra_repr(self) -> str: s = f"hidden_size={self.weight.data.size(0)}" s += f", eps={self.variance_epsilon}" From 1ec95c405cbf7eee1f6dcb63cf0b55cf0afb5486 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 17:20:37 +0300 Subject: [PATCH 065/819] Fix hpugraph hashing --- vllm/attention/backends/habana_attn.py | 6 +++--- vllm/worker/habana_model_runner.py | 11 +++++------ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 661bed749679d..3c8aebad976b7 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -196,9 +196,9 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - attn_metadata.num_prefills > 0) + attn_metadata.is_prompt) - if attn_metadata.num_prefills > 0: + if 
attn_metadata.is_prompt: # Prompt run. if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model @@ -233,7 +233,7 @@ def forward( attn_metadata.max_query_len, self.alibi_slopes, ) - if attn_metadata.num_decode_tokens > 0: + else: # Decoding run. output = HabanaPagedAttention.forward_decode( query, diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9cdf951fed6ee..e53350ecfc1fc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -14,7 +14,7 @@ import operator import torch import habana_frameworks.torch as htorch - +import contextlib from vllm.attention import (AttentionMetadata, get_attn_backend) from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) @@ -522,7 +522,7 @@ def _prepare_prompt( num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, num_decode_tokens=0, - slot_mapping=slot_mapping + slot_mapping=slot_mapping, ) return PreparePromptMetadata( input_tokens=input_tokens, @@ -625,7 +625,7 @@ def _prepare_decode( num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, - slot_mapping=slot_mapping + slot_mapping=slot_mapping, ) return PrepareDecodeMetadata( input_tokens=input_tokens, @@ -808,9 +808,8 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ['block_tables', 'seq_lens_tensor', 'attn_bias', - 'num_prefills', - 'num_decode_tokens', - 'slot_mapping']) + 'slot_mapping', + 'is_prompt']) return prefill_metadata @torch.inference_mode() From 2394c41b9d03b80fe43534aeca2b66408ea78e02 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 18:57:41 +0300 Subject: [PATCH 066/819] add trim_attn_metadata comment --- vllm/worker/habana_model_runner.py | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e53350ecfc1fc..49f66ae1e0863 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -803,14 +803,34 @@ def _seq_len(self, attn_metadata): return attn_metadata.block_tables.size(1) * self.block_size def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: - prefill_metadata = subtuple(metadata, - 'TrimmedAttentionMetadata', - ['block_tables', - 'seq_lens_tensor', - 'attn_bias', - 'slot_mapping', - 'is_prompt']) - return prefill_metadata + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) + + # If you use primitive types here - they will get hashed based + # on their value. You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. 
Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple(metadata, + 'TrimmedAttentionMetadata', + ['block_tables', + 'seq_lens_tensor', + 'attn_bias', + 'slot_mapping', + 'is_prompt']) + return attention_metadata @torch.inference_mode() def execute_model( From 98fb698c6c6140cca5f56283b3313460af270914 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 1 Jul 2024 19:26:34 +0300 Subject: [PATCH 067/819] fix prompt bucketing: --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 49f66ae1e0863..21d34a3924c86 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -854,7 +854,7 @@ def execute_model( (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) - is_prompt = attn_metadata.prefill_metadata is not None + is_prompt = attn_metadata.is_prompt if self.lora_config: self.set_active_loras(lora_requests, lora_mapping) From d99d9862a960668bbcd3d8af3554587db82d42ef Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 16:21:24 +0300 Subject: [PATCH 068/819] guard model loader wa for hpu --- vllm/model_executor/model_loader/loader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 48060481b2ee2..cade78114be42 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.interfaces import (supports_lora, supports_vision) from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import get_device_capability_stateless, is_tpu +from vllm.utils import get_device_capability_stateless, is_tpu, is_hpu logger = init_logger(__name__) @@ -262,7 +262,8 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device('cpu'): # FIXME(kzawora): this is a nasty workaround!!! + load_device = torch.device(device_config.device) if not is_hpu() else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! + with torch.device(load_device): model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config, cache_config) @@ -282,7 +283,8 @@ def load_model(self, *, model_config: ModelConfig, # to use quant_method. if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() - model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! + if is_hpu(): + model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! 
return model.eval() From 235a7bf749dce23295288fc4a28cf12d5ba6a6a4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 18:10:04 +0300 Subject: [PATCH 069/819] Make mypy happy --- pyproject.toml | 73 ++++ setup.py | 23 +- vllm/attention/backends/abstract.py | 2 +- vllm/attention/backends/habana_attn.py | 56 ++- vllm/attention/ops/habana_paged_attn.py | 28 +- vllm/attention/selector.py | 2 +- vllm/distributed/communication_op.py | 3 +- vllm/executor/habana_executor.py | 66 +++- vllm/executor/ray_habana_executor.py | 4 - vllm/executor/ray_utils.py | 15 +- vllm/hpu/cache_ops.py | 26 +- vllm/hpu/ops.py | 66 ++-- vllm/hpu/rotary_embed.py | 79 ++-- vllm/hpu/utils.py | 5 +- vllm/model_executor/layers/activation.py | 2 +- vllm/model_executor/layers/layernorm.py | 7 +- .../model_executor/layers/logits_processor.py | 4 +- .../model_executor/layers/rotary_embedding.py | 9 +- vllm/model_executor/model_loader/loader.py | 6 +- vllm/model_executor/models/mixtral.py | 15 +- vllm/model_executor/sampling_metadata.py | 6 +- vllm/utils.py | 67 ++-- vllm/worker/cache_engine.py | 8 +- vllm/worker/habana_model_runner.py | 338 ++++++++++++------ vllm/worker/habana_worker.py | 48 ++- vllm/worker/profiler.py | 6 +- 26 files changed, 633 insertions(+), 331 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000..790e013620286 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,73 @@ +[build-system] +# Should be mirrored in requirements-build.txt +requires = [ + "cmake>=3.21", + "ninja", + "packaging", + "setuptools >= 49.4.0", + "torch == 2.3.0", + "wheel", +] +build-backend = "setuptools.build_meta" + +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 +exclude = [ + # External file, leaving license intact + "examples/fp8/quantizer/quantize.py" +] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + # "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", +] + +[tool.mypy] +python_version = "3.8" + +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "skip" + +files = "vllm" +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm/model_executor/parallel_utils/|vllm/model_executor/models/", + # Ignore triton kernels in ops. + 'vllm/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile" +skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" + +[tool.isort] +use_parentheses = true +skip_gitignore = true + +[tool.pytest.ini_options] +markers = [ + "skip_global_cleanup", + "vlm: run tests for vision language models only", +] diff --git a/setup.py b/setup.py index e9a72c24b2391..ddf1cdf034c1b 100644 --- a/setup.py +++ b/setup.py @@ -207,11 +207,12 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: is_hpu_available = True - return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. + return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. 
try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): - if not os.path.exists('/dev/accel/accel0') and not os.path.exists('/dev/accel/accel_controlD0'): + if not os.path.exists('/dev/accel/accel0') and not os.path.exists( + '/dev/accel/accel_controlD0'): is_hpu_available = False return is_hpu_available @@ -331,17 +332,23 @@ def find_version(filepath: str) -> str: return version_match.group(1) raise RuntimeError("Unable to find version string.") + def get_gaudi_sw_version(): """ Returns the driver version. """ # Enable console printing for `hl-smi` check - output = subprocess.run( - "hl-smi", shell=True, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env={"ENABLE_CONSOLE": "true"} - ) + output = subprocess.run("hl-smi", + shell=True, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env={"ENABLE_CONSOLE": "true"}) if output.returncode == 0 and output.stdout: - return output.stdout.split("\n")[2].replace(" ", "").split(":")[1][:-1].split("-")[0] - return "0.0.0" # when hl-smi is not available + return output.stdout.split("\n")[2].replace( + " ", "").split(":")[1][:-1].split("-")[0] + return "0.0.0" # when hl-smi is not available + def get_vllm_version() -> str: version = find_version(get_path("vllm", "version.py")) @@ -365,7 +372,7 @@ def get_vllm_version() -> str: version += f"+neuron{neuron_version_str}" elif _is_hpu(): # Get the Intel Gaudi Software Suite version - gaudi_sw_version = str(get_gaudi_sw_version()) + gaudi_sw_version = str(get_gaudi_sw_version()) if gaudi_sw_version != MAIN_CUDA_VERSION: gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] version += f"+gaudi{gaudi_sw_version}" diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 1f912d5432537..55d9a43b35652 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -116,7 +116,7 @@ def __init__( sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len : Optional[int] = 4096, + max_seq_len: Optional[int] = 4096, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 3c8aebad976b7..09e717f61ac74 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -43,7 +43,8 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: Dict[int, int], ) -> None: - HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, + src_to_dst) @staticmethod def copy_blocks( @@ -104,7 +105,7 @@ def __post_init__(self): # when alibi slopes is used. It is because of the limitation # from xformer API. 
# will not appear in the __repr__ and __init__ - self.attn_bias: Optional[List[torch.Tensor]] = None + self.attn_bias: Optional[torch.Tensor] = None class HabanaAttentionImpl(AttentionImpl): @@ -134,7 +135,7 @@ def __init__( sliding_window: Optional[int], kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len : Optional[int] = 4096, + max_seq_len: int = 4096, ) -> None: self.kv_cache_dtype = kv_cache_dtype self.num_heads = num_heads @@ -144,12 +145,13 @@ def __init__( self.sliding_window = sliding_window self.position_bias = None if alibi_slopes is not None: - alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.bfloat16) - self.position_bias = _make_alibi_bias(alibi_slopes, + alibi_slopes_tensor = torch.tensor(alibi_slopes, + dtype=torch.bfloat16) + self.position_bias = _make_alibi_bias(alibi_slopes_tensor, num_kv_heads, - alibi_slopes.dtype, + alibi_slopes_tensor.dtype, max_seq_len) - self.alibi_slopes = alibi_slopes + self.alibi_slopes = alibi_slopes_tensor assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -164,9 +166,9 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Optional[torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: HabanaAttentionMetadata, - kv_scale: float, + kv_scale: float = 1.0, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -192,11 +194,9 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - HabanaPagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - attn_metadata.is_prompt) + HabanaPagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, attn_metadata.slot_mapping, + self.kv_cache_dtype, attn_metadata.is_prompt) if attn_metadata.is_prompt: # Prompt run. @@ -204,11 +204,15 @@ def forward( # TODO: move this outside of model assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None: - attn_bias.add_(self.position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):]) - - query_shape = (batch_size, seq_len, self.num_heads, self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) + if self.alibi_slopes is not None and self.position_bias is not None: + attn_bias.add_(self.position_bias[:, :, + -attn_bias.size(2):, + -attn_bias.size(3):]) + + query_shape = (batch_size, seq_len, self.num_heads, + self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) out = ops.prompt_attention( query.view(query_shape), key.view(kv_shape), @@ -236,17 +240,9 @@ def forward( else: # Decoding run. output = HabanaPagedAttention.forward_decode( - query, - key_cache, - value_cache, - attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.position_bias, - kv_scale - ) + query, key_cache, value_cache, attn_metadata.block_tables, + attn_metadata.seq_lens_tensor, self.kv_cache_dtype, + self.num_kv_heads, self.scale, self.position_bias, kv_scale) # Reshape the output tensor. 
return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index c8ed500f7af1c..ed47b906168e5 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -54,24 +54,13 @@ def split_kv_cache( return key_cache, value_cache @staticmethod - def write_to_paged_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slot_mapping: torch.Tensor, - kv_cache_dtype: str, - is_prompt: bool - ) -> None: - cache_ops.reshape_and_cache( - key, - value, - key_cache, - value_cache, - slot_mapping, - kv_cache_dtype, - is_prompt - ) + def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, kv_cache_dtype: str, + is_prompt: bool) -> None: + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype, is_prompt) @staticmethod def forward_decode( @@ -115,7 +104,8 @@ def forward_prefix( alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], ) -> torch.Tensor: - raise NotImplementedError("forward_prefix is not implemented for HabanaPagedAttention") + raise NotImplementedError( + "forward_prefix is not implemented for HabanaPagedAttention") @staticmethod def swap_blocks( diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index fcc96fae0fb58..6a2ab5c59cf18 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -87,7 +87,7 @@ def get_attn_backend( logger.info("Using HabanaAttention backend.") from vllm.attention.backends.habana_attn import ( # noqa: F401 HabanaAttentionBackend) - return HabanaAttentionBackend + return HabanaAttentionBackend elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 233be75b47f5a..2bb082385c0f3 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -9,11 +9,12 @@ if is_hpu(): import habana_frameworks.torch as htorch + def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" if is_hpu(): # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge - # occuring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used # (which is required for tensor parallel HPUGraph inference) htorch.core.mark_step() return get_tp_group().all_reduce(input_) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index cbb30e39e11a4..a040e187eb0da 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -17,6 +17,7 @@ class HabanaExecutor(ExecutorBase): + def _init_executor(self) -> None: """Initialize the worker and load the model.""" self._init_worker() @@ -56,6 +57,7 @@ def _create_worker(self, wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) return wrapper.worker + def _init_worker(self): assert self.parallel_config.world_size == 1, ( "GPUExecutor only supports single GPU.") @@ -63,13 +65,14 @@ def _init_worker(self): self.driver_worker = self._create_worker() self.driver_worker.init_device() self.driver_worker.load_model() + def determine_num_available_blocks(self) -> 
Tuple[int, int]: """Determine the number of available KV blocks by invoking the underlying worker. """ return self.driver_worker.determine_num_available_blocks() - def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ # NOTE: This is logged in the executor because there can be >1 worker @@ -80,7 +83,8 @@ def initialize_cache(self, num_gpu_blocks : int, num_cpu_blocks) -> None: with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - logger.info(f"init_cache_engine took {cache_init_m.get_summary_string()}") + logger.info( + f"init_cache_engine took {cache_init_m.get_summary_string()}") def execute_model( self, @@ -89,41 +93,65 @@ def execute_model( # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none - log_graph_compilation_all = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' - log_graph_compilation = os.environ.get('VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', '0') != '0' or log_graph_compilation_all - log_cpu_fallbacks_all = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' - log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all + log_graph_compilation_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', + '0') != '0' or log_graph_compilation_all + log_cpu_fallbacks_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' + log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', + '0') != '0' or log_cpu_fallbacks_all if log_graph_compilation or log_cpu_fallbacks: from habana_frameworks.torch.hpu.metrics import metric_localcontext seq_group_metadata_list = execute_model_req.seq_group_metadata_list - is_prompt = any([seq_group_metadata.is_prompt for seq_group_metadata in seq_group_metadata_list]) - max_context_len = max([max([len(v.prompt_token_ids) + len(v.output_token_ids) for v in seq_group_metadata.seq_data.values()]) for seq_group_metadata in seq_group_metadata_list]) # whoa, that's some spicy stuff right here - max_num_blocks = ((max_context_len - 1) // self.cache_config.block_size) + 1 + is_prompt = any([ + seq_group_metadata.is_prompt + for seq_group_metadata in seq_group_metadata_list + ]) + max_context_len = max([ + max([ + len(v.prompt_token_ids) + len(v.output_token_ids) + for v in seq_group_metadata.seq_data.values() + ]) for seq_group_metadata in seq_group_metadata_list + ]) # whoa, that's some spicy stuff right here + max_num_blocks = ( + (max_context_len - 1) // self.cache_config.block_size) + 1 input_stats = f'is_prompt: {is_prompt}, num_seqs: {len(seq_group_metadata_list)} max_context_len: {max_context_len}, max_num_blocks {max_num_blocks}' - gc_ctx = metric_localcontext("graph_compilation") if log_graph_compilation else contextlib.nullcontext() - cpu_fallback_ctx = metric_localcontext("cpu_fallback") if log_cpu_fallbacks else contextlib.nullcontext() + gc_ctx = metric_localcontext( + "graph_compilation" + ) if log_graph_compilation else contextlib.nullcontext() 
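For reference, a minimal way to switch on the step-level logs parsed above (the flag names come from the comments in this method; any value other than '0' enables a flag):

    import os

    # Log engine steps that triggered a graph compilation (only when one occurred).
    os.environ["VLLM_HPU_LOG_STEP_GRAPH_COMPILATION"] = "1"
    # Log CPU fallbacks on every step, even when none occurred.
    os.environ["VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL"] = "1"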
+ cpu_fallback_ctx = metric_localcontext( + "cpu_fallback" + ) if log_cpu_fallbacks else contextlib.nullcontext() with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: output = self.driver_worker.execute_model(execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0) or log_graph_compilation_all: - logger.warning(f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}") - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: - logger.warning(f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}") - + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 + ) or log_graph_compilation_all: + logger.warning( + f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}" + ) + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > + 0) or log_cpu_fallbacks_all: + logger.warning( + f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}" + ) + return output output = self.driver_worker.execute_model(execute_model_req) return output - + def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") def remove_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") - def list_loras(self) -> List[int]: + def list_loras(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") - def pin_lora(self) -> List[int]: + def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") def check_health(self) -> None: diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index b9c800e85728b..b57536436bd49 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -283,10 +283,6 @@ def _check_if_any_actor_is_dead(self): raise RuntimeError("At least one Worker is dead. " f"Dead Workers: {dead_actors}. ") - def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: - """Wait for futures returned from _run_workers() with - async_run_remote_workers_only to complete.""" - ray.get(parallel_worker_tasks) class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4d048ae634457..176b95b720615 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -76,10 +76,12 @@ def initialize_ray_cluster( ignore_reinit_error=True, num_gpus=parallel_config.world_size) else: - ray.init(address=ray_address, ignore_reinit_error=True, - log_to_driver=not os.environ.get('VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') + ray.init(address=ray_address, + ignore_reinit_error=True, + log_to_driver=not os.environ.get( + 'VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') ray_accel_name = "HPU" if is_hpu() else "GPU" - + if parallel_config.placement_group: # Placement group is already set. return @@ -95,7 +97,8 @@ def initialize_ray_cluster( bundle_gpus = bundle.get(ray_accel_name, 0) if bundle_gpus > 1: raise ValueError( - f"Placement group bundle cannot have more than 1 {ray_accel_name}.") + f"Placement group bundle cannot have more than 1 {ray_accel_name}." 
+ ) if bundle_gpus: gpu_bundles += 1 if parallel_config.world_size > gpu_bundles: @@ -109,7 +112,9 @@ def initialize_ray_cluster( f"The number of required {ray_accel_name}s exceeds the total number of " f"available {ray_accel_name}s in the cluster.") # Create a new placement group - placement_group_specs = ([{ray_accel_name: 1}] * parallel_config.world_size) + placement_group_specs = ([{ + ray_accel_name: 1 + }] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 56aafd2a4d0a9..6457ad3c460f3 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -10,7 +10,13 @@ import habana_frameworks.torch as htorch -def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, is_prompt=False): +def reshape_and_cache(key, + value, + key_cache, + value_cache, + slot_mapping, + dtype, + is_prompt=False): block_size = key_cache.size(1) slot_mapping = slot_mapping.flatten() indices = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -20,8 +26,8 @@ def reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, dtype, i def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1,), dtype=torch.int32, device=src.device) - index_dst = torch.zeros((1,), dtype=torch.int32, device=dst.device) + index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) + index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) for src_idx, dst_idx in block_mapping.items(): index_src[0] = src_idx index_dst[0] = dst_idx @@ -32,15 +38,21 @@ def swap_blocks(src, dst, block_mapping): def copy_blocks(key_caches, value_caches, block_mapping): - index_src = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) - index_dst = torch.zeros((1,), dtype=torch.int32, device=key_caches[0].device) + index_src = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) + index_dst = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) for src, dsts in block_mapping.items(): index_src[0] = src for dst in dsts: index_dst[0] = dst for key_cache in key_caches: - key_cache.index_copy_(0, index_dst, key_cache.index_select(0, index_src)) + key_cache.index_copy_(0, index_dst, + key_cache.index_select(0, index_src)) for value_cache in value_caches: - value_cache.index_copy_(0, index_dst, value_cache.index_select(0, index_src)) + value_cache.index_copy_(0, index_dst, + value_cache.index_select(0, index_src)) if key_caches[0].device.type == 'hpu': htorch.core.mark_step() diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index d38b3731350be..798bee09fda4f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -23,22 +23,34 @@ def silu_and_mul(output, input): output.copy_(silu(x) * y) - def fetch_from_cache(cache, blocks, permutations): - return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))] + return [ + cache.index_select(0, blocks[:, i]).permute(permutations) + for i in range(blocks.size(1)) + ] @hpu_utils.with_mark_steps -def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block_tables, context_lens, block_size, alibi_slopes=None, kv_cache_dtype=None) -> None: +def paged_attention_v1(query, + key_cache, + value_cache, + head_mapping, + scale, + block_tables, + context_lens, + block_size, + alibi_slopes=None, + kv_cache_dtype=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = 
query.shape _, _, kv_heads, _ = key_cache.shape min_inf = torch.finfo(query.dtype).min - mask = (torch.arange(0, seq_len * block_size, dtype=torch.int32, device=key_cache.device) - .view(1, -1) - .expand(batch_size, -1) - .ge(context_lens.view(-1, 1)) - .view(batch_size, 1, 1, -1)) + mask = (torch.arange(0, + seq_len * block_size, + dtype=torch.int32, + device=key_cache.device).view(1, -1).expand( + batch_size, -1).ge(context_lens.view(-1, 1)).view( + batch_size, 1, 1, -1)) query.mul_(scale) query = query.unsqueeze(-2) keys = fetch_from_cache(key_cache, block_tables, (0, 2, 3, 1)) @@ -50,10 +62,9 @@ def paged_attention_v1(query, key_cache, value_cache, head_mapping, scale, block attn_weights = [torch.matmul(query, k) for k in keys] attn_weights = torch.cat(attn_weights, dim=-1) if alibi_slopes is not None: - attn_weights.add_(alibi_slopes[:,:,-attn_weights.size(2):, -attn_weights.size(3):]) - attn_weights = (attn_weights - .masked_fill(mask, min_inf) - .softmax(dim=-1)) + attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, + -attn_weights.size(3):]) + attn_weights = (attn_weights.masked_fill(mask, min_inf).softmax(dim=-1)) values = fetch_from_cache(value_cache, block_tables, (0, 2, 1, 3)) if PA_SPLIT_VALUE: @@ -82,15 +93,17 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): B, D = hidden_states.shape num_experts = w1.shape[0] routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros( - (1, B, D), dtype=hidden_states.dtype, device=hidden_states.device - ) - padded_weights = torch.zeros( - (B, num_experts), dtype=hidden_states.dtype, device=hidden_states.device - ) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, num_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) padded_weights.scatter_(-1, selected_experts, routing_weights) padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) @@ -100,7 +113,8 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): for expert_idx in range(num_experts): padded_weight = padded_weights[expert_idx] current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper(torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = silu_and_mul_wrapper( + torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) current_hidden_states_static = w_output * padded_weight final_hidden_states += current_hidden_states_static @@ -111,12 +125,12 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): @hpu_utils.with_mark_steps def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, ) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 30f96153cd4a2..16c956acdf817 
100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -9,18 +9,23 @@ import torch.nn as nn import habana_frameworks.torch.utils.experimental as htexp + def get_device_type(): return htexp._get_device_type() + def is_gaudi1(): return get_device_type() == htexp.synDeviceType.synDeviceGaudi + def is_gaudi2(): return get_device_type() == htexp.synDeviceType.synDeviceGaudi2 + def is_gaudi3(): return get_device_type() == htexp.synDeviceType.synDeviceGaudi3 + # TODO: remove this workaround when FusedRoPE properly works on Gaudi if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): try: @@ -34,10 +39,11 @@ def is_gaudi3(): def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] return torch.cat((-x2, x1), dim=-1) + def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -59,40 +65,55 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. """ - cos = cos[position_ids]#.unsqueeze(unsqueeze_dim) - sin = sin[position_ids]#.unsqueeze(unsqueeze_dim) + cos = cos[position_ids] #.unsqueeze(unsqueeze_dim) + sin = sin[position_ids] #.unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed class HpuRotaryEmbedding(nn.Module): - def __init__(self, head_size, rotary_dim, max_position_embeddings=2048, base=10000, is_neox_style=None, device='hpu'): + + def __init__(self, + head_size, + rotary_dim, + max_position_embeddings=2048, + base=10000, + is_neox_style=None, + device='hpu'): super().__init__() self.head_size = head_size self.dim = rotary_dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + inv_freq = 1.0 / (self.base**( + torch.arange(0, self.dim, 2).float().to(device) / self.dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) # Build here to make `torch.jit.trace` work. 
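To make the rotate_half docstring earlier in this file concrete, a tiny worked check (illustrative only; plain PyTorch, no HPU required):

    # For the last dim split into halves x1, x2, rotate_half returns cat(-x2, x1).
    import torch
    x = torch.tensor([1., 2., 3., 4.])          # halves: [1, 2] and [3, 4]
    x1, x2 = x[:2], x[2:]
    assert torch.equal(torch.cat((-x2, x1)), torch.tensor([-3., -4., 1., 2.]))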
- self._set_cos_sin_cache( - seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() - ) + self._set_cos_sin_cache(seq_len=max_position_embeddings, + device=self.inv_freq.device, + dtype=torch.get_default_dtype()) def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + t = torch.arange(self.max_seq_len_cached, + device=device, + dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) - self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + self.register_buffer("cos_cached", + emb.cos().to(dtype), + persistent=False) + self.register_buffer("sin_cached", + emb.sin().to(dtype), + persistent=False) + + def forward(self, positions: torch.Tensor, query: torch.Tensor, + key: torch.Tensor): if query.dim() == 2: query = query.unsqueeze(0) if key.dim() == 2: @@ -101,19 +122,31 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tenso positions = positions.unsqueeze(0) seq_len = key.shape[-2] if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, device=query.device, dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to(dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape((query.shape[0], query.shape[1], query.shape[2] // self.head_size, self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + self._set_cos_sin_cache(seq_len=seq_len, + device=query.device, + dtype=query.dtype) + + cos, sin = self.cos_cached[:seq_len].to( + dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) + query = query.reshape( + (query.shape[0], query.shape[1], query.shape[2] // self.head_size, + self.head_size)) + key = key.reshape((key.shape[0], key.shape[1], + key.shape[2] // self.head_size, self.head_size)) if query.device.type == "hpu" and FusedRoPE: if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) + cos = self.cos_cached[positions].unsqueeze(2).to( + dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to( + dtype=query.dtype) else: cos = cos[positions].unsqueeze(2) sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, 0), FusedRoPE.apply(key, cos, sin, 0) + query, key = FusedRoPE.apply(query, cos, sin, + 0), FusedRoPE.apply(key, cos, sin, 0) else: query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) - return query.reshape((query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape((key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) + return query.reshape( + (query.shape[0], query.shape[1], + query.shape[2] * query.shape[3])), key.reshape( + (key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 4ce9e2591c6b9..06f3690aded8b 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -7,7 +7,9 @@ import habana_frameworks.torch as htorch + def with_mark_steps(fn): + def wrapped(*args, **kwargs): 
htorch.core.mark_step() result = fn(*args, **kwargs) @@ -15,4 +17,5 @@ def wrapped(*args, **kwargs): del kwargs htorch.core.mark_step() return result - return wrapped \ No newline at end of file + + return wrapped diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 69f889ed1a1b8..b2641cf89bdc5 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -45,7 +45,7 @@ def forward_hpu(self, x: torch.Tensor) -> torch.Tensor: out = torch.empty(output_shape, dtype=x.dtype, device=x.device) ops.silu_and_mul(out, x) return out - + def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: from vllm._ipex_ops import ipex_ops as ops diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index d0d1577b26a10..975019bc9c24d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -13,6 +13,7 @@ print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None + class RMSNorm(CustomOp): """Root mean square normalization. @@ -86,7 +87,8 @@ def forward_hpu( orig_shape = x.shape residual += x.view(residual.shape) # Note: FusedRMSNorm requires 3D tensors as inputs - x = FusedRMSNorm.apply(residual.float(), self.weight.float(), self.variance_epsilon) + x = FusedRMSNorm.apply(residual.float(), self.weight.float(), + self.variance_epsilon) return x.to(orig_dtype).view(orig_shape), residual ops.fused_add_rms_norm( x, @@ -97,7 +99,8 @@ def forward_hpu( return x, residual if x.device.type == "hpu" and FusedRMSNorm: orig_dtype = x.dtype - x = FusedRMSNorm.apply(x.float(), self.weight.float(), self.variance_epsilon) + x = FusedRMSNorm.apply(x.float(), self.weight.float(), + self.variance_epsilon) return x.to(orig_dtype) out = torch.empty_like(x) ops.rms_norm( diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 39d142b158445..321de3491921d 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -92,8 +92,8 @@ def _prune_hidden_states( sampling_metadata: SamplingMetadata, ) -> torch.Tensor: if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) + return hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) else: return hidden_states diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c8c0225245f7d..ceaa2ddd3d553 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -32,6 +32,7 @@ if is_hpu(): from vllm.hpu.rotary_embed import HpuRotaryEmbedding + def _rotate_neox(x: torch.Tensor) -> torch.Tensor: x1 = x[..., :x.shape[-1] // 2] x2 = x[..., x.shape[-1] // 2:] @@ -763,11 +764,11 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: if is_hpu(): - rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style) + rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, + max_position, base, is_neox_style) else: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, - is_neox_style, dtype) + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, + base, is_neox_style, dtype) else: scaling_type = rope_scaling["type"] # The correct one should be "longrope" but keep "su" here diff --git a/vllm/model_executor/model_loader/loader.py 
b/vllm/model_executor/model_loader/loader.py index cade78114be42..ad146da72fb26 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -262,7 +262,8 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - load_device = torch.device(device_config.device) if not is_hpu() else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! + load_device = torch.device(device_config.device) if not is_hpu( + ) else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! with torch.device(load_device): model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config, @@ -284,7 +285,8 @@ def load_model(self, *, model_config: ModelConfig, if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() if is_hpu(): - model = model.to('hpu') # FIXME(kzawora): this is a nasty workaround!!! + model = model.to( + 'hpu') # FIXME(kzawora): this is a nasty workaround!!! return model.eval() diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 33acc63c3fc0c..0344f1c7c7a03 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -279,10 +279,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if is_hpu(): final_hidden_states = static_fused_moe(hidden_states, - self.w13_weight, - self.w2_weight, - router_logits, - self.top_k) + self.w13_weight, + self.w2_weight, + router_logits, self.top_k) else: final_hidden_states = fused_moe(hidden_states, self.w13_weight, @@ -301,8 +300,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: final_hidden_states = tensor_model_parallel_all_reduce( final_hidden_states) - return (final_hidden_states.view(batch_size, sequence_length, hidden_size) if is_hpu() - else final_hidden_states.view(num_tokens, hidden_size)) + return (final_hidden_states.view(batch_size, sequence_length, + hidden_size) if is_hpu() else + final_hidden_states.view(num_tokens, hidden_size)) class MixtralAttention(nn.Module): @@ -651,10 +651,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) - + if is_hpu(): torch.hpu.synchronize() + def all_close_1d(x: torch.Tensor) -> bool: assert len(x.shape) == 1 return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 7ff826cf4e18f..ea82a3a4041b7 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -510,7 +510,8 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float], dtype=torch.int, pin_memory=pin_memory, ) - idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support + idx_dtype = torch.long if not is_hpu( + ) else torch.int # Gaudi doesn't have full native int64 support sample_indices_t = torch.tensor( sample_indices, device="cpu", @@ -598,7 +599,8 @@ def _get_sequence_seeds( else: generator = random.Random(str((seed, ) + extra_entropy)) randint_fn = generator.randint - idx_dtype = torch.long if not is_hpu() else torch.int # Gaudi doesn't have full native int64 support + idx_dtype = torch.long if not is_hpu( + ) else torch.int # Gaudi doesn't have full native int64 support lo, hi = 
torch.iinfo(idx_dtype).min, torch.iinfo(idx_dtype).max # If the user/random sets seed = 0 but request should # have sampling, we need to change it to something diff --git a/vllm/utils.py b/vllm/utils.py index 2fb77a0fc431c..520332110fd1f 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -194,9 +194,12 @@ def is_neuron() -> bool: transformers_neuronx = None return transformers_neuronx is not None + @lru_cache(maxsize=None) def is_hpu() -> bool: - return importlib.util.find_spec('habana_frameworks') is not None + from importlib import util + return util.find_spec('habana_frameworks') is not None + @lru_cache(maxsize=None) def is_tpu() -> bool: @@ -506,18 +509,14 @@ def create_kv_caches_with_random( torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) scale = head_size**-0.5 - if is_hpu(): - key_cache_shape = (num_blocks, block_size, num_heads, head_size) - else: - x = 16 // torch.tensor([], dtype=torch_dtype).element_size() - key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) + x = 16 // torch.tensor([], dtype=torch_dtype).element_size() + key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) key_caches: List[torch.Tensor] = [] for _ in range(num_layers): key_cache = torch.empty(size=key_cache_shape, dtype=torch_dtype, device=device) - cache_dtype = str(cache_dtype) - if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: + if cache_dtype in ["auto", "half", "bfloat16", "float"]: key_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(key_cache, -scale, scale) @@ -526,16 +525,13 @@ def create_kv_caches_with_random( f"Does not support key cache of type {cache_dtype}") key_caches.append(key_cache) - if is_hpu(): - value_cache_shape = (num_blocks, block_size, num_heads, head_size) - else: - value_cache_shape = (num_blocks, num_heads, head_size, block_size) + value_cache_shape = (num_blocks, num_heads, head_size, block_size) value_caches: List[torch.Tensor] = [] for _ in range(num_layers): value_cache = torch.empty(size=value_cache_shape, dtype=torch_dtype, device=device) - if cache_dtype in ["auto", "half", "torch.float16", "torch.bfloat16", "torch.float32"]: + if cache_dtype in ["auto", "half", "bfloat16", "float"]: value_cache.uniform_(-scale, scale) elif cache_dtype == 'fp8': _generate_random_fp8(value_cache, -scale, scale) @@ -607,55 +603,70 @@ class HabanaMemoryProfiler: def __init__(self, device=None): self.device = device + @staticmethod def current_device_memory_usage() -> float: # Return the device memory usage in bytes. free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory - free_hpu_memory - + + @staticmethod def current_free_device_memory() -> float: # Return the device memory usage in bytes. free_hpu_memory, _ = torch.hpu.mem_get_info() return free_hpu_memory - + + @staticmethod def total_device_memory() -> float: # Return the device memory usage in bytes. _, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory + @staticmethod def current_host_memory_usage() -> float: # Return the host memory usage in bytes. - return HabanaMemoryProfiler.total_host_memory() - HabanaMemoryProfiler.current_free_host_memory() - + return HabanaMemoryProfiler.total_host_memory( + ) - HabanaMemoryProfiler.current_free_host_memory() + + @staticmethod def current_free_host_memory() -> float: # Return the host memory usage in bytes. 
return psutil.virtual_memory().available - + + @staticmethod def total_host_memory() -> float: # Return the host memory usage in bytes. return psutil.virtual_memory().total def get_summary_string(self): - if getattr(self, 'final_device_memory', None) is None or getattr(self, 'final_host_memory', None) is None: - raise RuntimeError("HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager") - return (f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " - f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") + if getattr(self, 'final_device_memory', None) is None or getattr( + self, 'final_host_memory', None) is None: + raise RuntimeError( + "HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager" + ) + return ( + f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " + f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)" + ) def __enter__(self): # Force garbage collection gc.collect() - self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage() - self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage() + self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage( + ) + self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage( + ) # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): # Force garbage collection gc.collect() - self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage() - self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage() + self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage( + ) + self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage( + ) self.consumed_device_memory = self.final_device_memory - self.initial_device_memory self.consumed_host_memory = self.final_host_memory - self.initial_host_memory - # Adapted from https://stackoverflow.com/a/49361727 @@ -663,7 +674,7 @@ def format_bytes(size): # 2**10 = 1024 power = 2**10 n = 0 - power_labels = {0 : '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} + power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} while abs(size) > power: size /= power n += 1 diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index ea53c3e99d9fa..961b5689e43a4 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -76,11 +76,11 @@ def _allocate_kv_cache( for _ in range(self.num_layers): if device == 'hpu': key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) + dtype=self.dtype, + device=device) value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) + dtype=self.dtype, + device=device) kv_layer = (key_cache, value_cache) kv_cache.append(kv_layer) else: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 21d34a3924c86..e8e7c35579b0f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1,3 +1,5 @@ +# mypy: ignore-errors + 
############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### @@ -16,8 +18,9 @@ import habana_frameworks.torch as htorch import contextlib from vllm.attention import (AttentionMetadata, get_attn_backend) -from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig, VisionLanguageConfig) +from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, + ModelConfig, ParallelConfig, SchedulerConfig, + VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger @@ -47,7 +50,11 @@ # example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 def read_bucket_settings(phase: str, dim: str, **defaults: Dict): params = ['min', 'step', 'max'] - values = [int(os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), defaults[p])) for p in params] + values = [ + int( + os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), + defaults[p])) for p in params + ] return values @@ -61,7 +68,8 @@ def warmup_range(config: Tuple[int, int, int]): def warmup_buckets(bs_bucket_config, seq_bucket_config): - buckets = itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config)) + buckets = itertools.product(warmup_range(bs_bucket_config), + warmup_range(seq_bucket_config)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -86,13 +94,17 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result -def subtuple(obj: object, typename: str, to_copy: List[str], to_override: Dict[str, object] = {}): +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Dict[str, object] = {}): if obj is None: return None fields = set(to_copy) | set(to_override.keys()) values = {f: to_override.get(f, getattr(obj, f)) for f in fields} if typename not in _TYPE_CACHE: - _TYPE_CACHE[typename] = collections.namedtuple(typename, ' '.join(fields)) + _TYPE_CACHE[typename] = collections.namedtuple(typename, + ' '.join(fields)) return _TYPE_CACHE[typename](**values) @@ -107,39 +119,44 @@ def align_workers(value, op): class HpuModelAdapter(): + def __init__(self, model): self.model = model - def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, + dtype): prefill_metadata = attn_metadata if prefill_metadata is None: return attn_metadata seq_lens_t = prefill_metadata.seq_lens_tensor - len_mask = (torch.arange(0, seq_len, device=device, dtype=torch.int32) - .view(1, seq_len) - .ge(seq_lens_t.unsqueeze(-1)) - .view(batch_size, 1, 1, seq_len)) - causal_mask = torch.triu( - torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), - diagonal=1 - ) + len_mask = (torch.arange(0, seq_len, device=device, + dtype=torch.int32).view(1, seq_len).ge( + seq_lens_t.unsqueeze(-1)).view( + batch_size, 1, 1, seq_len)) + causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), + device=device, + dtype=torch.bool), + diagonal=1) mask = causal_mask.logical_or(len_mask) - attn_bias = (torch.zeros_like(mask, dtype=dtype) - .masked_fill_(mask, -math.inf)) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) #FIXME: Restore sliding window support #if self.sliding_window is not None: attn_metadata = 
prefill_metadata._replace(attn_bias=attn_bias) return attn_metadata - def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') if 'bypass_hpu_graphs' in kwargs: - kwargs.pop('bypass_hpu_graphs') # required for PT eager + kwargs.pop('bypass_hpu_graphs') # required for PT eager input_ids = kwargs['input_ids'] - kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), input_ids.device, torch.bfloat16) + kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], + input_ids.size(0), + input_ids.size(1), + input_ids.device, + torch.bfloat16) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) @@ -282,17 +299,20 @@ def load_model(self) -> None: vision_language_config=self.vision_language_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, - cache_config=self.cache_config - ) - logger.info(f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}") + cache_config=self.cache_config) + logger.info( + f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}" + ) # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph(self.model) - logger.info(f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") - + logger.info( + f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") + self.model_memory_usage = m.consumed_device_memory - logger.info(f"Loading model weights took in total {m.get_summary_string()}") + logger.info( + f"Loading model weights took in total {m.get_summary_string()}") if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -316,19 +336,47 @@ def _use_graphs(self, batch_size, seq_len, is_prompt): return (batch_size, seq_len, is_prompt) in self.graphed_buckets def _setup_buckets(self) -> None: - self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min(self.max_num_seqs, 64)) - self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, step=128, max=self.max_num_seqs) - self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, max=1024) - self.decode_seq_bucket_cfg = read_bucket_settings('decode', 'seq', min=self.block_size, step=self.block_size, max=2048) + self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', + 'bs', + min=1, + step=32, + max=min( + self.max_num_seqs, + 64)) + self.decode_bs_bucket_cfg = read_bucket_settings('decode', + 'bs', + min=1, + step=128, + max=self.max_num_seqs) + self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', + 'seq', + min=self.block_size, + step=self.block_size, + max=1024) + self.decode_seq_bucket_cfg = read_bucket_settings('decode', + 'seq', + min=self.block_size, + step=self.block_size, + max=2048) self.graphed_buckets = set() - logger.info(f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}") - self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) - logger.info(f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}") + logger.info( + f"Prompt bucket config (min, step, max_warmup) 
bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}" + ) + self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, + self.prompt_seq_bucket_cfg) + logger.info( + f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}" + ) - logger.info(f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}") - self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) - logger.info(f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}") + logger.info( + f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}" + ) + self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, + self.decode_seq_bucket_cfg) + logger.info( + f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}" + ) def _prepare_prompt( self, @@ -451,7 +499,7 @@ def _prepare_prompt( max_query_len = max(query_lens) sum_query_len = sum(query_lens) - real_num_seqs = len(query_lens) + real_num_seqs = len(query_lens) assert max_query_len > 0 context_lens_tensor = torch.tensor(context_lens, @@ -468,7 +516,9 @@ def _prepare_prompt( multi_modal_input = None max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) - max_prompt_len = max(find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + max_prompt_len = max( + find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), + self.block_size) input_tokens = make_tensor_with_pad(input_tokens, max_prompt_len, @@ -521,7 +571,7 @@ def _prepare_prompt( use_cuda_graph=False, num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, - num_decode_tokens=0, + num_decode_tokens=0, slot_mapping=slot_mapping, ) return PreparePromptMetadata( @@ -685,7 +735,9 @@ def prepare_input_tensors( num_decode_tokens = len(decode_input_tokens) # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. - assert (num_prefills == 0 and num_decode_tokens > 0) or (num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" + assert (num_prefills == 0 and num_decode_tokens > 0) or ( + num_prefills > 0 and num_decode_tokens + == 0), "HPU does not support mixed batches!" 
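For intuition on the padding applied by find_bucket above, a simplified sketch of bucket rounding (assumed behaviour: round up to a multiple of the configured step, clamped to [min, max]; the real find_bucket in this file may apply a different policy):

    from typing import Tuple

    def round_up_to_bucket(value: int, cfg: Tuple[int, int, int]) -> int:
        # cfg mirrors the (min, step, max) triple read by read_bucket_settings.
        bmin, bstep, bmax = cfg
        bucket = max(bmin, -(-value // bstep) * bstep)  # ceil to a multiple of step
        return min(bucket, bmax)

    # e.g. with a decode seq config of (128, 128, 2048), a length of 513 pads to 640
    assert round_up_to_bucket(513, (128, 128, 2048)) == 640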
if num_decode_tokens > 0: input_tokens = decode_input_tokens input_positions = decode_input_positions @@ -694,12 +746,15 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - # FIXME: We need to adjust selected_token_indices to accomodate for padding + # FIXME: We need to adjust selected_token_indices to accommodate for padding max_len = input_tokens.size(1) paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] paddings = list(itertools.accumulate(paddings)) - paddings = torch.tensor(paddings, dtype=sampling_metadata.selected_token_indices.dtype, device=sampling_metadata.selected_token_indices.device) + paddings = torch.tensor( + paddings, + dtype=sampling_metadata.selected_token_indices.dtype, + device=sampling_metadata.selected_token_indices.device) sampling_metadata.selected_token_indices.add_(paddings) if self.lora_config: @@ -713,7 +768,8 @@ def prepare_input_tensors( if (prefill_attn_metadata is not None and decode_attn_metadata is not None): batch_type = BatchType.MIXED - raise NotImplementedError("Mixed batch is not supported on HPU") + raise NotImplementedError( + "Mixed batch is not supported on HPU") elif prefill_attn_metadata is not None: batch_type = BatchType.PREFILL else: @@ -782,15 +838,15 @@ def prepare_input_tensors( **metadata_dict) attn_metadata = prefill_attn_metadata if prefill_attn_metadata is not None else decode_attn_metadata -# attn_metadata = AttentionMetadata( -# num_prefills=num_prefills, -# slot_mapping=slot_mapping, -# num_prefill_tokens=num_prefill_tokens, -# num_decode_tokens=num_decode_tokens, -# prefill_metadata=prefill_attn_metadata, -# decode_metadata=decode_attn_metadata, -# kv_cache_dtype=self.kv_cache_dtype, -# ) + # attn_metadata = AttentionMetadata( + # num_prefills=num_prefills, + # slot_mapping=slot_mapping, + # num_prefill_tokens=num_prefill_tokens, + # num_decode_tokens=num_decode_tokens, + # prefill_metadata=prefill_attn_metadata, + # decode_metadata=decode_attn_metadata, + # kv_cache_dtype=self.kv_cache_dtype, + # ) return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, @@ -807,29 +863,26 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # Trimming metadata is required when using HPUGraphs. # Attention metadata is going to be hashed by PT bridge, and # appropriate HPUGraphs will be matched based on all inputs' hash. - - # Before you put more keys in here, make sure you know their - # value type and make sure you know how it's going to be hashed. - # You can find that information in input_hash function + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function # in habana_frameworks/torch/hpu/graphs.py. You can also hash # it manually with torch.hpu.graphs.input_hash(attention_metadata) - + # If you use primitive types here - they will get hashed based # on their value. You *will* get lots of excessive graph captures # (and an OOM eventually) if you decide to put something like - # seq_len int here. - # If you absolutely need a scalar, put it in a tensor. Tensors + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. 
Tensors # get hashed using their metadata, not their values: # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) # input_hash(123) != input_hash(321) # input_hash("abc") != input_hash("cba") - attention_metadata = subtuple(metadata, - 'TrimmedAttentionMetadata', - ['block_tables', - 'seq_lens_tensor', - 'attn_bias', - 'slot_mapping', - 'is_prompt']) + attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ + 'block_tables', 'seq_lens_tensor', 'attn_bias', 'slot_mapping', + 'is_prompt' + ]) return attention_metadata @torch.inference_mode() @@ -849,11 +902,12 @@ def execute_model( batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) + seq_group_metadata_list.extend(seq_group_metadata_list[0] + for _ in range(batch_size_padding)) with self.profiler.record_event('internal', 'prepare_input_tensors'): (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) + lora_requests, lora_mapping, multi_modal_input + ) = self.prepare_input_tensors(seq_group_metadata_list) is_prompt = attn_metadata.is_prompt if self.lora_config: @@ -877,12 +931,20 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward(**execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices, bypass_hpu_graphs=not use_graphs) + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. + selected_token_indices, + bypass_hpu_graphs=not use_graphs) # Compute the logits. - with self.profiler.record_event('internal', f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): + with self.profiler.record_event( + 'internal', + f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' + ): sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, sampling_metadata) + logits = self.model.compute_logits(hidden_states, + sampling_metadata) htorch.core.mark_step() # Only perform sampling in the driver worker. @@ -890,7 +952,10 @@ def execute_model( return None # Sample the next token. 
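Tying back to the hashing note in trim_attn_metadata above, a minimal sketch of the "put scalars in tensors" rule (hypothetical helper, not part of this patch):

    import torch

    # Tensors are hashed by their metadata (shape/dtype), not their values, so the
    # graph hash stays stable across different scalar values; a raw Python int
    # would be hashed by value and trigger a new graph capture for every value.
    def as_graph_stable_scalar(value: int) -> torch.Tensor:
        return torch.tensor([value], dtype=torch.int32)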
- with self.profiler.record_event('internal', f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}'): + with self.profiler.record_event( + 'internal', + f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' + ): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, @@ -903,12 +968,12 @@ def execute_model( self.profiler.end() event_end = self.profiler.get_timestamp_us() counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end-event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - seq_group_metadata_list=seq_group_metadata_list, + cache_config=self.cache_config, + duration=event_end - event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + seq_group_metadata_list=seq_group_metadata_list, is_prompt=is_prompt) self.profiler.record_counter(event_start, counters) @@ -945,12 +1010,16 @@ def profile_run(self) -> None: self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) - def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: + def warmup_scenario(self, batch_size, seq_len, is_prompt, + kv_caches) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 - seqs = [self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) for i in range(batch_size)] + seqs = [ + self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) + for i in range(batch_size) + ] torch.hpu.synchronize() for _ in range(times): self.execute_model(seqs, kv_caches) @@ -959,16 +1028,22 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: gc.collect() def log_warmup(self, phase, i, max_i, batch_size, seq_len): - free_mem = format_bytes(HabanaMemoryProfiler.current_free_device_memory()) - logger.info(f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}") + free_mem = format_bytes( + HabanaMemoryProfiler.current_free_device_memory()) + logger.info( + f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}" + ) def warmup_all_buckets(self, buckets, is_prompt, kv_caches): for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage() / HabanaMemoryProfiler.total_device_memory() - self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) + mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage( + ) / HabanaMemoryProfiler.total_device_memory() + self.log_warmup('Prompt' if is_prompt else 'Decode', i, + len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): + def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, + available_mem): total_batch_seq = 0.001 total_mem = 0 idx = 0 @@ -980,7 +1055,8 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): elif strategy == 'max_bs': ordering = lambda b: (-b[0], b[1]) else: - raise NotImplementedError(f'Unsupported graph allocation strategy: {strategy}') + raise NotImplementedError( + f'Unsupported graph allocation strategy: 
{strategy}') buckets = list(sorted(buckets, key=ordering)) for idx, (batch_size, seq_len) in enumerate(buckets): @@ -993,12 +1069,16 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem): self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - used_mem = align_workers(mem_prof.consumed_device_memory, torch.distributed.ReduceOp.MAX) + used_mem = align_workers(mem_prof.consumed_device_memory, + torch.distributed.ReduceOp.MAX) available_mem -= used_mem total_mem += used_mem total_batch_seq += batch_seq - graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) - logger.info(f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}') + graphed = list(c[:2] for c in self.graphed_buckets + if c[2] == is_prompt) + logger.info( + f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}' + ) @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: @@ -1012,49 +1092,68 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_all_buckets(self.decode_buckets, False, kv_caches) if not self.enforce_eager: - mem_margin = 1.0 - float(os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory() + mem_margin = 1.0 - float( + os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) + free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory( + ) free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) - prompt_graph_mem_ratio = float(os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + prompt_graph_mem_ratio = float( + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) prompt_available_memory = prompt_graph_mem_ratio * free_mem decode_available_memory = free_mem - prompt_available_memory prompt_strategy = 'min_tokens' - decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', 'max_bs') - self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, kv_caches, prompt_available_memory) - self.warmup_graphs(decode_strategy, self.decode_buckets, False, kv_caches, decode_available_memory) + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', + 'max_bs') + self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, + kv_caches, prompt_available_memory) + self.warmup_graphs(decode_strategy, self.decode_buckets, False, + kv_caches, decode_available_memory) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() elapsed_time = end_time - start_time - logger.info(f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory") + logger.info( + f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory" + ) self.profiler.end() @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() + def _maybe_wrap_in_hpu_graph(model): - return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter(model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) + return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( + model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) class HabanaProfilerCounterHelper(): + def __init__(self): self.niter = 0 self.average_real_throughput = None 
self.logged_once = False - - def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, real_batch_size, seq_group_metadata_list, is_prompt): + + def get_counter_dict(self, cache_config, duration, seq_len, + batch_size_padded, real_batch_size, + seq_group_metadata_list, is_prompt): throughput = batch_size_padded / (duration / 1e6) throughput_effective = real_batch_size / (duration / 1e6) - real_seq_lens = [len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] + real_seq_lens = [ + len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] real_max_seq_len = max(real_seq_lens) real_num_tokens = sum(real_seq_lens) padded_num_tokens = batch_size_padded * seq_len batch_token_utilization = real_num_tokens / padded_num_tokens if self.average_real_throughput is None: self.average_real_throughput = throughput_effective - else: # https://www.heikohoffmann.de/htmlthesis/node134.html - self.average_real_throughput = self.average_real_throughput + 1/(self.niter+1) * (throughput_effective-self.average_real_throughput) + else: # https://www.heikohoffmann.de/htmlthesis/node134.html + self.average_real_throughput = self.average_real_throughput + 1 / ( + self.niter + 1) * (throughput_effective - + self.average_real_throughput) phase = "prompt" if is_prompt else "decode" counters = { f'{phase}_bucket_batch_size': batch_size_padded, @@ -1067,30 +1166,41 @@ def get_counter_dict(self, cache_config, duration, seq_len, batch_size_padded, r 'average_real_throughput': self.average_real_throughput, 'engine_iteration': self.niter, } - self.niter += 1 + self.niter += 1 if is_prompt: - prompt_seq_lens = [len(seq_data.prompt_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values()] - prompt_bucket_in_throughput = (seq_len*batch_size_padded) / (duration / 1e6) - prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) - counters[f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput + prompt_seq_lens = [ + len(seq_data.prompt_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] + prompt_bucket_in_throughput = (seq_len * batch_size_padded) / ( + duration / 1e6) + prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) + counters[ + f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput # KV cache might not be created yet (e.g. 
for profiling run) if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0: - cache_num_blocks_used = [math.ceil(sl/cache_config.block_size) for sl in real_seq_lens] + cache_num_blocks_used = [ + math.ceil(sl / cache_config.block_size) for sl in real_seq_lens + ] cache_total_num_blocks_used = sum(cache_num_blocks_used) - num_cache_blocks = cache_config.num_gpu_blocks + num_cache_blocks = cache_config.num_gpu_blocks cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks - max_blocks_per_seq = math.ceil(seq_len/cache_config.block_size) - batch_block_utilization = cache_total_num_blocks_used / (batch_size_padded * max_blocks_per_seq) + max_blocks_per_seq = math.ceil(seq_len / cache_config.block_size) + batch_block_utilization = cache_total_num_blocks_used / ( + batch_size_padded * max_blocks_per_seq) counters['cache_num_blocks_used'] = cache_total_num_blocks_used counters['cache_num_free_blocks'] = cache_total_num_free_blocks counters['cache_computed_utilization'] = cache_computed_utilization - counters[f'{phase}_batch_block_utilization'] = batch_block_utilization + counters[ + f'{phase}_batch_block_utilization'] = batch_block_utilization if not self.logged_once: counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks - counters['const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization + counters[ + 'const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size self.logged_once = True return counters diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index e1c374124633f..8b53615805291 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -72,15 +72,16 @@ def __init__( "To be tested: vision language model with LoRA settings.") assert False, "To be tested: vision language model on HPU" - self.model_runner = HabanaModelRunner(model_config, - parallel_config, - scheduler_config, - device_config, - load_config=load_config, - cache_config=cache_config, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker) + self.model_runner = HabanaModelRunner( + model_config, + parallel_config, + scheduler_config, + device_config, + load_config=load_config, + cache_config=cache_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: CacheEngine @@ -129,14 +130,21 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() - graph_headroom = 1 - (float(os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) if not self.model_config.enforce_eager else 0) - num_hpu_blocks = int(free_hpu_memory * graph_headroom * self.cache_config.gpu_memory_utilization // cache_block_size) + graph_headroom = 1 - (float( + os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) + if not self.model_config.enforce_eager else 0) + num_hpu_blocks = int(free_hpu_memory * graph_headroom * + self.cache_config.gpu_memory_utilization // + cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() + + # NOTE(kzawora): Restore this once LoRA support is added + # if self.model_runner.lora_manager: + # self.model_runner.remove_all_loras() + gc.collect() return num_hpu_blocks, num_cpu_blocks @@ -159,9 +167,11 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self) -> None: assert self.cache_config.num_gpu_blocks is not None self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) + self.parallel_config, + self.device_config) self.hpu_cache = self.cache_engine.gpu_cache - htorch.hpu.synchronize() # we want to materialize cache tensors before we proceed with graph capture/execution + htorch.hpu.synchronize( + ) # we want to materialize cache tensors before we proceed with graph capture/execution def _warm_up_model(self) -> None: self.model_runner.warmup_model(self.hpu_cache) @@ -260,7 +270,6 @@ def _execute_model_non_driver(self) -> bool: self.model_runner.execute_model(None, self.hpu_cache) return True - def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") @@ -296,8 +305,11 @@ def init_worker_distributed_environment( local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank, backend='hccl') + init_distributed_environment(parallel_config.world_size, + rank, + distributed_init_method, + local_rank, + backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index 34221d2553909..9e181e27bde34 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -11,11 +11,13 @@ from vllm.logger import init_logger from vllm.utils import get_vllm_instance_id +from typing import List, Any logger = init_logger(__name__) class FileWriter(threading.Thread): + def __init__(self, filename, event_queue): super().__init__() self.filename = filename @@ -46,11 +48,11 @@ def run(self): class Profiler: - profiling_trace_events = queue.Queue() + profiling_trace_events: queue.Queue = queue.Queue() event_tid = {'counter': 1, 'external': 2, 'internal': 3} vllm_instance_id = get_vllm_instance_id() filename = f'server_events_{vllm_instance_id}.json' - event_cache = [] + event_cache: List[Any] = [] def __init__(self): self.enabled = os.getenv('VLLM_PROFILER_ENABLED', From 23a9797240304770d66528b74814cbdd780b1512 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 
18:15:15 +0300 Subject: [PATCH 070/819] ruff --fix --- vllm/attention/backends/habana_attn.py | 2 +- vllm/executor/habana_executor.py | 2 +- vllm/executor/ray_utils.py | 3 +-- vllm/hpu/cache_ops.py | 1 - vllm/hpu/ops.py | 3 +-- vllm/utils.py | 1 - vllm/worker/cache_engine.py | 2 +- vllm/worker/habana_model_runner.py | 1 - 8 files changed, 5 insertions(+), 10 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 09e717f61ac74..2fdb3d4f9c382 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -202,7 +202,7 @@ def forward( # Prompt run. if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias if self.alibi_slopes is not None and self.position_bias is not None: attn_bias.add_(self.position_bias[:, :, diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index a040e187eb0da..16245f70ec4d5 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -8,7 +8,7 @@ from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async, HabanaMemoryProfiler, format_bytes) + make_async, HabanaMemoryProfiler) import os import contextlib from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 176b95b720615..2284012ecc7a0 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -78,8 +78,7 @@ def initialize_ray_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True, - log_to_driver=not os.environ.get( - 'VLLM_RAY_DISABLE_LOG_TO_DRIVER', '0') != '0') + log_to_driver=os.environ.get("VLLM_RAY_DISABLE_LOG_TO_DRIVER", "0") == "0") ray_accel_name = "HPU" if is_hpu() else "GPU" if parallel_config.placement_group: diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 6457ad3c460f3..6f060b8280ea6 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### -from typing import Tuple import torch import habana_frameworks.torch as htorch diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 798bee09fda4f..875a54338913a 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -6,10 +6,9 @@ ############################################################################### import os import torch -import torch.nn as nn import torch.nn.functional as F import habana_frameworks.torch as htorch -from typing import List, Optional, Tuple +from typing import Optional import vllm.hpu.utils as hpu_utils diff --git a/vllm/utils.py b/vllm/utils.py index 520332110fd1f..72260a329f617 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -11,7 +11,6 @@ import threading import uuid import warnings -import importlib from collections import defaultdict from functools import lru_cache, partial, wraps from platform import uname diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 961b5689e43a4..2e6c374b1d51f 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -9,7 +9,7 @@ from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available, is_hpu) if is_hpu(): - import habana_frameworks.torch as htorch + pass logger = init_logger(__name__) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e8e7c35579b0f..c8232c0cba407 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -16,7 +16,6 @@ import operator import torch import habana_frameworks.torch as htorch -import contextlib from vllm.attention import (AttentionMetadata, get_attn_backend) from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, From 22ee71540b3c7beb9d243f38b8ce1de1f861655a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 18:58:45 +0300 Subject: [PATCH 071/819] line lengths fixes --- setup.py | 4 +++- vllm/attention/backends/habana_attn.py | 6 +++-- vllm/engine/async_llm_engine.py | 3 ++- vllm/executor/habana_executor.py | 32 ++++++++++++++----------- vllm/executor/ray_utils.py | 23 +++++++++--------- vllm/hpu/rotary_embed.py | 30 ++++++++++++++--------- vllm/model_executor/layers/layernorm.py | 3 ++- vllm/utils.py | 30 +++++++++++++---------- vllm/worker/habana_worker.py | 11 +++++---- vllm/worker/profiler.py | 7 +++--- 10 files changed, 87 insertions(+), 62 deletions(-) diff --git a/setup.py b/setup.py index ddf1cdf034c1b..897958d875284 100644 --- a/setup.py +++ b/setup.py @@ -207,7 +207,9 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: is_hpu_available = True - return is_hpu_available # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. Find the cause and fix it. + # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. + # Need to find the cause and fix it. + return is_hpu_available try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 2fdb3d4f9c382..4f34aed1c90f2 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -202,9 +202,11 @@ def forward( # Prompt run. 
if kv_cache is None or attn_metadata.block_tables.numel() == 0: # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, 'attn_bias must be set before calling model.forward!' + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and self.position_bias is not None: + if self.alibi_slopes is not None and \ + self.position_bias is not None: attn_bias.add_(self.position_bias[:, :, -attn_bias.size(2):, -attn_bias.size(3):]) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 55e0d337b4235..b85087739c7ee 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -396,7 +396,8 @@ def from_engine_args( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import RayHabanaExecutorAsync + from vllm.executor.ray_habana_executor import ( + RayHabanaExecutorAsync) executor_class = RayHabanaExecutorAsync else: from vllm.executor.habana_executor import HabanaExecutorAsync diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 16245f70ec4d5..06a214918f931 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -83,16 +83,16 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - logger.info( - f"init_cache_engine took {cache_init_m.get_summary_string()}") + msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" + logger.info(msg) def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! - # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any - # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
# noqa:E501 + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501 log_graph_compilation_all = os.environ.get( 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' log_graph_compilation = os.environ.get( @@ -117,25 +117,29 @@ def execute_model( ]) # whoa, that's some spicy stuff right here max_num_blocks = ( (max_context_len - 1) // self.cache_config.block_size) + 1 - input_stats = f'is_prompt: {is_prompt}, num_seqs: {len(seq_group_metadata_list)} max_context_len: {max_context_len}, max_num_blocks {max_num_blocks}' + input_stats = (f'is_prompt: {is_prompt}, ' + f'num_seqs: {len(seq_group_metadata_list)}, ' + f'max_context_len: {max_context_len}, ' + f'max_num_blocks {max_num_blocks}') gc_ctx = metric_localcontext( "graph_compilation" ) if log_graph_compilation else contextlib.nullcontext() cpu_fallback_ctx = metric_localcontext( "cpu_fallback" ) if log_cpu_fallbacks else contextlib.nullcontext() - with gc_ctx as gc_local_metric, cpu_fallback_ctx as cpu_fallback_local_metric: + with gc_ctx as gc_local_metric, \ + cpu_fallback_ctx as cpu_fallback_local_metric: output = self.driver_worker.execute_model(execute_model_req) if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 ) or log_graph_compilation_all: - logger.warning( - f"VLLM_HPU_STEP_GRAPH_COMPILATION: {gc_local_metric.stats()}, {input_stats}" - ) + msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " + f"{gc_local_metric.stats()}, {input_stats}") + logger.warning(msg) if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > 0) or log_cpu_fallbacks_all: - logger.warning( - f"VLLM_HPU_STEP_CPU_FALLBACK: {cpu_fallback_local_metric.stats()}, {input_stats}" - ) + msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " + f"{cpu_fallback_local_metric.stats()}, {input_stats}") + logger.warning(msg) return output diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 2284012ecc7a0..7961efdf4b516 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -78,8 +78,9 @@ def initialize_ray_cluster( else: ray.init(address=ray_address, ignore_reinit_error=True, - log_to_driver=os.environ.get("VLLM_RAY_DISABLE_LOG_TO_DRIVER", "0") == "0") - ray_accel_name = "HPU" if is_hpu() else "GPU" + log_to_driver=os.environ.get("VLLM_RAY_DISABLE_LOG_TO_DRIVER", + "0") == "0") + device = "HPU" if is_hpu() else "GPU" if parallel_config.placement_group: # Placement group is already set. @@ -93,27 +94,25 @@ def initialize_ray_cluster( # Verify that we can use the placement group. gpu_bundles = 0 for bundle in bundles: - bundle_gpus = bundle.get(ray_accel_name, 0) + bundle_gpus = bundle.get(device, 0) if bundle_gpus > 1: raise ValueError( - f"Placement group bundle cannot have more than 1 {ray_accel_name}." + f"Placement group bundle cannot have more than 1 {device}." 
) if bundle_gpus: gpu_bundles += 1 if parallel_config.world_size > gpu_bundles: raise ValueError( - f"The number of required {ray_accel_name}s exceeds the total number of " - f"available {ray_accel_name}s in the placement group.") + f"The number of required {device}s exceeds the total number of " + f"available {device}s in the placement group.") else: - num_gpus_in_cluster = ray.cluster_resources().get(ray_accel_name, 0) + num_gpus_in_cluster = ray.cluster_resources().get(device, 0) if parallel_config.world_size > num_gpus_in_cluster: raise ValueError( - f"The number of required {ray_accel_name}s exceeds the total number of " - f"available {ray_accel_name}s in the cluster.") + f"The number of required {device}s exceeds the total number of " + f"available {device}s in the cluster.") # Create a new placement group - placement_group_specs = ([{ - ray_accel_name: 1 - }] * parallel_config.world_size) + placement_group_specs = ([{device: 1}] * parallel_config.world_size) current_placement_group = ray.util.placement_group( placement_group_specs) # Wait until PG is ready - this will block until all diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 16c956acdf817..3c701df439535 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -29,7 +29,8 @@ def is_gaudi3(): # TODO: remove this workaround when FusedRoPE properly works on Gaudi if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): try: - from habana_frameworks.torch.hpex.kernels import RotaryPosEmbeddingHelperV1 as FusedRoPE + from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingHelperV1 as FusedRoPE) except ImportError: print("Not using HPU fused kernel for apply_rotary_pos_emb") FusedRoPE = None @@ -53,17 +54,23 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): cos (`torch.Tensor`): The cosine part of the rotary embedding. sin (`torch.Tensor`): The sine part of the rotary embedding. position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and key tensors. For example, this can be - used to pass offsetted position ids when working with a KV-cache. + The position indices of the tokens corresponding to the query and + key tensors. For example, this can be used to pass offsetted + position ids when working with a KV-cache. unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + The 'unsqueeze_dim' argument specifies the dimension along which to + unsqueeze cos[position_ids] and sin[position_ids] so that they can + be properly broadcasted to the dimensions of q and k. For example, + note that cos[position_ids] and sin[position_ids] have the shape + [batch_size, seq_len, head_dim]. Then, if q and k have the shape + [batch_size, heads, seq_len, head_dim], then setting + unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] + broadcastable to the shapes of q and k. 
Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set + unsqueeze_dim=2. Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + `tuple(torch.Tensor)` comprising of the query and key tensors rotated + using the Rotary Position Embedding. """ cos = cos[position_ids] #.unsqueeze(unsqueeze_dim) sin = sin[position_ids] #.unsqueeze(unsqueeze_dim) @@ -103,7 +110,8 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): dtype=self.inv_freq.dtype) freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation + # Different from paper, but it uses a different permutation in order + # to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) self.register_buffer("cos_cached", emb.cos().to(dtype), diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 975019bc9c24d..334c3d6c95c78 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -8,7 +8,8 @@ from vllm.utils import is_hpu if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/utils.py b/vllm/utils.py index 72260a329f617..53ced00dda766 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -640,32 +640,38 @@ def get_summary_string(self): if getattr(self, 'final_device_memory', None) is None or getattr( self, 'final_host_memory', None) is None: raise RuntimeError( - "HabanaMemoryProfiler.get_summary_string() can only be called after closing context manager" - ) + "HabanaMemoryProfiler.get_summary_string() can only be called " + "after closing context manager") return ( - f"{format_bytes(self.consumed_device_memory)} of device memory ({format_bytes(self.final_device_memory)}/{format_bytes(HabanaMemoryProfiler.total_device_memory())} used) and " - f"{format_bytes(self.consumed_host_memory)} of host memory ({format_bytes(self.final_host_memory)}/{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)" - ) + f"{format_bytes(self.consumed_device_memory)} of device memory " + f"({format_bytes(self.final_device_memory)}/" + f"({format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" + f" and {format_bytes(self.consumed_host_memory)} of host memory " + f"({format_bytes(self.final_host_memory)}/" + f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") def __enter__(self): # Force garbage collection gc.collect() - self.initial_device_memory = HabanaMemoryProfiler.current_device_memory_usage( - ) - self.initial_host_memory = HabanaMemoryProfiler.current_host_memory_usage( - ) + self.initial_device_memory = \ + HabanaMemoryProfiler.current_device_memory_usage() + self.initial_host_memory = \ + HabanaMemoryProfiler.current_host_memory_usage() # This allows us to call methods of the context manager if needed return self def __exit__(self, exc_type, exc_val, exc_tb): # Force garbage collection gc.collect() - self.final_device_memory = HabanaMemoryProfiler.current_device_memory_usage( + self.final_device_memory = \ + HabanaMemoryProfiler.current_device_memory_usage( ) self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage( ) - self.consumed_device_memory = self.final_device_memory - 
self.initial_device_memory - self.consumed_host_memory = self.final_host_memory - self.initial_host_memory + self.consumed_device_memory = \ + self.final_device_memory - self.initial_device_memory + self.consumed_host_memory = \ + self.final_host_memory - self.initial_host_memory # Adapted from https://stackoverflow.com/a/49361727 diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8b53615805291..382962ce9ea71 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -70,7 +70,7 @@ def __init__( if self.vision_language_config: assert not self.lora_config, ( "To be tested: vision language model with LoRA settings.") - assert False, "To be tested: vision language model on HPU" + raise AssertionError("To be tested: vision language model on HPU") self.model_runner = HabanaModelRunner( model_config, @@ -125,8 +125,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: self.model_runner.profile_run() torch.hpu.synchronize() - # At this point we should've allocated the maximum workspace for all recipes - # we will use the extra memory for graphs/blocks + # At this point we should've allocated the maximum workspace for all + # recipes we will use the extra memory for graphs/blocks free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() @@ -170,8 +170,9 @@ def _init_cache_engine(self) -> None: self.parallel_config, self.device_config) self.hpu_cache = self.cache_engine.gpu_cache - htorch.hpu.synchronize( - ) # we want to materialize cache tensors before we proceed with graph capture/execution + # we want to materialize cache tensors before we proceed with + # graph capture/execution + htorch.hpu.synchronize() def _warm_up_model(self) -> None: self.model_runner.warmup_model(self.hpu_cache) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index 9e181e27bde34..d5125019a4b8c 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -58,7 +58,8 @@ def __init__(self): self.enabled = os.getenv('VLLM_PROFILER_ENABLED', 'false').lower() == 'true' and int( os.getenv('RANK', '0')) == 0 - logger.info(f'Profiler enabled for: {self.vllm_instance_id}') + msg = f'Profiler enabled for: {self.vllm_instance_id}' + logger.info(msg) if self.enabled: # initialize the trace file (JSON Array Format) with open(self.filename, 'w') as outfile: @@ -107,8 +108,8 @@ def end(self): ts = self.get_timestamp_us() if not self.event_cache: logger.warning( - 'Profiler: end() call does not have matching start() call. Disabling profiler.' - ) + 'Profiler: end() call does not have matching start() call. 
' + 'Disabling profiler.') self.enabled = False return event = self.event_cache.pop() From bf3187138253823fdb0f1cb23bd0cab04e423c76 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:12:36 +0300 Subject: [PATCH 072/819] more line fixes + isort --- vllm/attention/backends/habana_attn.py | 2 +- vllm/config.py | 2 +- vllm/distributed/communication_op.py | 3 +- vllm/executor/habana_executor.py | 9 +- vllm/executor/ray_utils.py | 3 +- vllm/hpu/cache_ops.py | 2 +- vllm/hpu/ops.py | 5 +- vllm/hpu/rotary_embed.py | 2 +- vllm/model_executor/layers/layernorm.py | 5 +- .../model_executor/layers/rotary_embedding.py | 3 +- vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/sampling_metadata.py | 4 +- vllm/worker/cache_engine.py | 5 +- vllm/worker/habana_model_runner.py | 159 +++++++++++------- vllm/worker/habana_worker.py | 4 +- vllm/worker/profiler.py | 2 +- 17 files changed, 125 insertions(+), 89 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 4f34aed1c90f2..dd33d3526e907 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -6,8 +6,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch -import vllm.hpu.ops as ops +import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, diff --git a/vllm/config.py b/vllm/config.py index 38f8d4f2884d0..75a64732f8591 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -14,7 +14,7 @@ from vllm.tracing import is_otel_installed from vllm.transformers_utils.config import get_config, get_hf_text_config from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu, - is_hip, is_neuron, is_openvino, is_tpu, is_xpu, is_hpu, + is_hip, is_hpu, is_neuron, is_openvino, is_tpu, is_xpu, print_warning_once, update_environment_variables) if TYPE_CHECKING: diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 2bb082385c0f3..63c159fce3d71 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -3,9 +3,10 @@ import torch import torch.distributed +from vllm.utils import is_hpu + from .parallel_state import get_tp_group -from vllm.utils import is_hpu if is_hpu(): import habana_frameworks.torch as htorch diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 06a214918f931..b771b9e026970 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -2,15 +2,16 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### +import contextlib +import os from typing import Any, Dict, List, Optional, Set, Tuple + from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - make_async, HabanaMemoryProfiler) -import os -import contextlib +from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, + get_ip, get_open_port, make_async) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7961efdf4b516..ea57bc842c8e7 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -1,6 +1,7 @@ +import os import pickle from typing import List, Optional, Tuple -import os + from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.utils import get_ip, is_hip, is_hpu, is_xpu diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 6f060b8280ea6..d28a47271c6ac 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -5,8 +5,8 @@ # LICENSE file in the root directory of this source tree. ############################################################################### -import torch import habana_frameworks.torch as htorch +import torch def reshape_and_cache(key, diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 875a54338913a..bd737917cb919 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -5,10 +5,11 @@ # LICENSE file in the root directory of this source tree. ############################################################################### import os +from typing import Optional + +import habana_frameworks.torch as htorch import torch import torch.nn.functional as F -import habana_frameworks.torch as htorch -from typing import Optional import vllm.hpu.utils as hpu_utils diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 3c701df439535..26b19e8258285 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -5,9 +5,9 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### +import habana_frameworks.torch.utils.experimental as htexp import torch import torch.nn as nn -import habana_frameworks.torch.utils.experimental as htexp def get_device_type(): diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 334c3d6c95c78..57ada2ba8e3c4 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,10 +6,11 @@ from vllm.model_executor.custom_op import CustomOp from vllm.utils import is_hpu + if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index ceaa2ddd3d553..d706c70c82374 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,7 +28,8 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.utils import is_tpu, is_hpu +from vllm.utils import is_hpu, is_tpu + if is_hpu(): from vllm.hpu.rotary_embed import HpuRotaryEmbedding diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index ad146da72fb26..dc16897196601 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.interfaces import (supports_lora, supports_vision) from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import get_device_capability_stateless, is_tpu, is_hpu +from vllm.utils import get_device_capability_stateless, is_hpu, is_tpu logger = init_logger(__name__) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0344f1c7c7a03..07edd4711e04f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -52,7 +52,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import SamplerOutput -from vllm.utils import print_warning_once, is_hpu +from vllm.utils import is_hpu, print_warning_once if is_hpu(): from vllm.hpu.ops import static_fused_moe diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ea82a3a4041b7..a916d8596d2f0 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -7,8 +7,8 @@ from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata -from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - maybe_expand_dim, is_hpu) +from vllm.utils import (async_tensor_h2d, is_hpu, is_pin_memory_available, + maybe_expand_dim) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 2e6c374b1d51f..1c185e9b3a405 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,8 +6,9 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger 
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, - is_pin_memory_available, is_hpu) +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_hpu, + is_pin_memory_available) + if is_hpu(): pass diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c8232c0cba407..22d6368bacacc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -4,20 +4,21 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### -import time -from enum import IntEnum -from typing import List, NamedTuple, Optional, Set, Tuple, Dict - import collections import gc -import os -import math import itertools +import math import operator -import torch +import os +import time +from enum import IntEnum +from typing import Dict, List, NamedTuple, Optional, Set, Tuple + import habana_frameworks.torch as htorch -from vllm.attention import (AttentionMetadata, get_attn_backend) -from vllm.config import (DeviceConfig, LoadConfig, CacheConfig, LoRAConfig, +import torch + +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) from vllm.distributed import broadcast_tensor_dict @@ -30,8 +31,8 @@ from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata -from vllm.utils import (HabanaMemoryProfiler, is_pin_memory_available, - make_tensor_with_pad, format_bytes) +from vllm.utils import (HabanaMemoryProfiler, format_bytes, + is_pin_memory_available, make_tensor_with_pad) from .profiler import Profiler @@ -93,10 +94,10 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result -def subtuple(obj: object, - typename: str, - to_copy: List[str], - to_override: Dict[str, object] = {}): +def subtuple(obj: object, typename: str, to_copy: List[str], + to_override: Optional[Dict[str, object]]): + if to_override is None: + to_override = {} if obj is None: return None fields = set(to_copy) | set(to_override.keys()) @@ -261,7 +262,8 @@ def __init__( self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len - self.max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + self.max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens self.block_size = cache_config.block_size self.pin_memory = is_pin_memory_available() @@ -299,19 +301,21 @@ def load_model(self) -> None: parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, cache_config=self.cache_config) - logger.info( - f"Pre-loading model weights on {next(self.model.parameters()).device} took {m_getmodel.get_summary_string()}" - ) + msg = ("Pre-loading model weights on " + f"{next(self.model.parameters()).device} " + f"took {m_getmodel.get_summary_string()}") + logger.info(msg) - # FIXME: Running with disable_tensor_cache=True causes RuntimeErrors. This needs to be debugged + # FIXME: Running with disable_tensor_cache=True causes + # RuntimeErrors. 
This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph(self.model) - logger.info( - f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}") + msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" + logger.info(msg) self.model_memory_usage = m.consumed_device_memory - logger.info( - f"Loading model weights took in total {m.get_summary_string()}") + msg = f"Loading model weights took in total {m.get_summary_string()}" + logger.info(msg) if self.lora_config: assert hasattr(self.model, "supported_lora_modules" @@ -359,23 +363,26 @@ def _setup_buckets(self) -> None: max=2048) self.graphed_buckets = set() - logger.info( - f"Prompt bucket config (min, step, max_warmup) bs:{self.prompt_bs_bucket_cfg}, seq:{self.prompt_seq_bucket_cfg}" - ) + msg = ("Prompt bucket config (min, step, max_warmup) " + f"bs:{self.prompt_bs_bucket_cfg}, " + f"seq:{self.prompt_seq_bucket_cfg}") + logger.info(msg) self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) - logger.info( - f"Generated {len(self.prompt_buckets)} prompt buckets: {list(sorted(self.prompt_buckets))}" - ) - logger.info( - f"Decode bucket config (min, step, max_warmup) bs:{self.decode_bs_bucket_cfg}, seq:{self.decode_seq_bucket_cfg}" - ) + msg = (f"Generated {len(self.prompt_buckets)} " + f"prompt buckets: {list(sorted(self.prompt_buckets))}") + logger.info(msg) + + msg = ("Decode bucket config (min, step, max_warmup) " + f"bs:{self.decode_bs_bucket_cfg}, " + f"seq:{self.decode_seq_bucket_cfg}") + logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) - logger.info( - f"Generated {len(self.decode_buckets)} decode buckets: {list(sorted(self.decode_buckets))}" - ) + msg = ("Generated {len(self.decode_buckets)} decode buckets: " + f"{list(sorted(self.decode_buckets))}") + logger.info(msg) def _prepare_prompt( self, @@ -733,7 +740,9 @@ def prepare_input_tensors( num_prefill_tokens = len(input_tokens) num_decode_tokens = len(decode_input_tokens) - # NOTE(kzawora): Here we diverge from GPU code - we don't support mixed batches, so we either use decode or prefill inputs, without coalescing. + # NOTE(kzawora): Here we diverge from GPU code - we don't + # support mixed batches, so we either use decode or prefill + # inputs, without coalescing. assert (num_prefills == 0 and num_decode_tokens > 0) or ( num_prefills > 0 and num_decode_tokens == 0), "HPU does not support mixed batches!" 
@@ -745,7 +754,8 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - # FIXME: We need to adjust selected_token_indices to accommodate for padding + # FIXME: We need to adjust selected_token_indices to accommodate f + # or padding max_len = input_tokens.size(1) paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] @@ -836,7 +846,8 @@ def prepare_input_tensors( decode_attn_metadata = self.attn_backend.make_metadata( **metadata_dict) - attn_metadata = prefill_attn_metadata if prefill_attn_metadata is not None else decode_attn_metadata + attn_metadata = prefill_attn_metadata if \ + prefill_attn_metadata is not None else decode_attn_metadata # attn_metadata = AttentionMetadata( # num_prefills=num_prefills, # slot_mapping=slot_mapping, @@ -897,7 +908,8 @@ def execute_model( self.profiler.start('internal', base_event_name) real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else self.decode_bs_bucket_cfg + bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ + self.decode_bs_bucket_cfg batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() @@ -926,7 +938,11 @@ def execute_model( htorch.core.mark_step() if self.is_driver_worker: - model_event_name = f"model_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): @@ -938,9 +954,10 @@ def execute_model( # Compute the logits. with self.profiler.record_event( - 'internal', - f'compute_logits_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' - ): + 'internal', ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, sampling_metadata) @@ -952,9 +969,10 @@ def execute_model( # Sample the next token. 
with self.profiler.record_event( - 'internal', - f'sample_{"prompt" if is_prompt else "decode"}_bs{batch_size}_seq{seq_len}' - ): + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, @@ -1012,7 +1030,11 @@ def profile_run(self) -> None: def warmup_scenario(self, batch_size, seq_len, is_prompt, kv_caches) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - scenario_name = f"warmup_{'prompt' if is_prompt else 'decode'}_bs{batch_size}_seq{seq_len}_graphs{'T' if use_graphs else 'F'}" + scenario_name = ("warmup_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 seqs = [ @@ -1029,14 +1051,14 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) - logger.info( - f"[Warmup][{phase}][{i+1}/{max_i}] batch_size:{batch_size} seq_len:{seq_len} free_mem:{free_mem}" - ) + msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " + f"batch_size:{batch_size} " + f"seq_len:{seq_len} " + f"free_mem:{free_mem}") + logger.info(msg) def warmup_all_buckets(self, buckets, is_prompt, kv_caches): for i, (batch_size, seq_len) in enumerate(reversed(buckets)): - mem_usage = 100.0 * HabanaMemoryProfiler.current_device_memory_usage( - ) / HabanaMemoryProfiler.total_device_memory() self.log_warmup('Prompt' if is_prompt else 'Decode', i, len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) @@ -1075,9 +1097,11 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, total_batch_seq += batch_seq graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) - logger.info( - f'{phase} captured:{len(graphed)} ({100 * len(graphed) / num_candidates:.1f}%) used_mem:{format_bytes(total_mem)} buckets:{sorted(list(graphed))}' - ) + msg = (f'{phase} captured:{len(graphed)} ' + f'({100 * len(graphed) / num_candidates:.1f}%) ' + f'used_mem:{format_bytes(total_mem)} ' + f'buckets:{sorted(list(graphed))}') + logger.info(msg) @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: @@ -1093,8 +1117,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: if not self.enforce_eager: mem_margin = 1.0 - float( os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = mem_margin * HabanaMemoryProfiler.current_free_device_memory( - ) + free_mem = \ + mem_margin * HabanaMemoryProfiler.current_free_device_memory() free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float( os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) @@ -1111,9 +1135,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() elapsed_time = end_time - start_time - logger.info( - f"Warmup finished in {elapsed_time:.0f} secs, allocated {format_bytes(end_mem - start_mem)} of device memory" - ) + msg = ( + f"Warmup finished in {elapsed_time:.0f} secs, " + f"allocated {format_bytes(end_mem - start_mem)} of device memory") + logger.info(msg) self.profiler.end() @property @@ -1180,14 +1205,17 @@ def get_counter_dict(self, cache_config, duration, seq_len, counters[f'{phase}_real_in_throughput'] = 
prompt_real_in_throughput # KV cache might not be created yet (e.g. for profiling run) - if cache_config.num_gpu_blocks is not None and cache_config.num_gpu_blocks != 0: + if cache_config.num_gpu_blocks is not None and \ + cache_config.num_gpu_blocks != 0: cache_num_blocks_used = [ math.ceil(sl / cache_config.block_size) for sl in real_seq_lens ] cache_total_num_blocks_used = sum(cache_num_blocks_used) num_cache_blocks = cache_config.num_gpu_blocks - cache_total_num_free_blocks = num_cache_blocks - cache_total_num_blocks_used - cache_computed_utilization = cache_total_num_blocks_used / num_cache_blocks + cache_total_num_free_blocks = \ + num_cache_blocks - cache_total_num_blocks_used + cache_computed_utilization = \ + cache_total_num_blocks_used / num_cache_blocks max_blocks_per_seq = math.ceil(seq_len / cache_config.block_size) batch_block_utilization = cache_total_num_blocks_used / ( batch_size_padded * max_blocks_per_seq) @@ -1199,7 +1227,8 @@ def get_counter_dict(self, cache_config, duration, seq_len, if not self.logged_once: counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks counters[ - 'const_gpu_memory_utilization'] = cache_config.gpu_memory_utilization + 'const_gpu_memory_utilization'] = \ + cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size self.logged_once = True return counters diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 382962ce9ea71..63055bf4f2055 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -6,13 +6,13 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple -import torch import habana_frameworks.torch as htorch +import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig, SpeculativeConfig) + SpeculativeConfig, VisionLanguageConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py index d5125019a4b8c..48348de41f520 100644 --- a/vllm/worker/profiler.py +++ b/vllm/worker/profiler.py @@ -8,10 +8,10 @@ import threading import time from contextlib import contextmanager +from typing import Any, List from vllm.logger import init_logger from vllm.utils import get_vllm_instance_id -from typing import List, Any logger = init_logger(__name__) From b7d34afe72edcfb26d9556a459f1f22ddf5a0601 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:19:28 +0300 Subject: [PATCH 073/819] tiny fixes --- vllm/attention/backends/habana_attn.py | 3 ++- vllm/model_executor/layers/layernorm.py | 4 ++-- vllm/worker/habana_model_runner.py | 9 +++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index dd33d3526e907..7db5ab2eeeeff 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -144,6 +144,7 @@ def __init__( self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window self.position_bias = None + self.alibi_slopes = alibi_slopes if alibi_slopes is not None: alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) @@ -151,7 +152,7 @@ def __init__( num_kv_heads, alibi_slopes_tensor.dtype, max_seq_len) - self.alibi_slopes = alibi_slopes_tensor + self.alibi_slopes = alibi_slopes_tensor assert 
self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 57ada2ba8e3c4..8c45abf38da2d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 22d6368bacacc..58ec7302c1bc4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -94,8 +94,10 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return result -def subtuple(obj: object, typename: str, to_copy: List[str], - to_override: Optional[Dict[str, object]]): +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): if to_override is None: to_override = {} if obj is None: @@ -1230,5 +1232,4 @@ def get_counter_dict(self, cache_config, duration, seq_len, 'const_gpu_memory_utilization'] = \ cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size - self.logged_once = True - return counters + self.logged_once = Tru \ No newline at end of file From f1eee8d4005df3a56382835ca9168c8313be1cd7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:19:44 +0300 Subject: [PATCH 074/819] more tiny fixes --- vllm/model_executor/layers/layernorm.py | 4 ++-- vllm/worker/habana_model_runner.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8c45abf38da2d..57ada2ba8e3c4 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 58ec7302c1bc4..08dad66df1e76 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1232,4 +1232,5 @@ def get_counter_dict(self, cache_config, duration, seq_len, 'const_gpu_memory_utilization'] = \ cache_config.gpu_memory_utilization counters['const_block_size'] = cache_config.block_size - self.logged_once = Tru \ No newline at end of file + self.logged_once = True + return counters From 6f2f964c2501f9d0d23f46a702239c1bbb3e9aca Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:21:03 +0300 Subject: [PATCH 075/819] ?? 
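
The formatter and the import sorter appear to disagree on how this parenthesized alias should be wrapped, and each pass flips it back to the other form; trying the alternate wrapping once more. If neither form sticks, the likely fallback is dropping the redundant alias altogether, roughly:

    from habana_frameworks.torch.hpex.normalization import FusedRMSNorm
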
--- vllm/model_executor/layers/layernorm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 57ada2ba8e3c4..8c45abf38da2d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None From d932e8dcbd6e10750afb1f344262b56c382797c4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:24:07 +0300 Subject: [PATCH 076/819] can this finally work --- vllm/model_executor/layers/layernorm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8c45abf38da2d..57ada2ba8e3c4 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,8 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as FusedRMSNorm) except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None From 4431ed63bf77780df6b7a47c540fe6f8dea0f67e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:26:37 +0300 Subject: [PATCH 077/819] no, it did in fact not work --- vllm/model_executor/layers/layernorm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 57ada2ba8e3c4..67cef1b47f3bf 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -9,8 +9,7 @@ if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as FusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm except ImportError: print("Not using HPU fused kernel for RMSNorm") FusedRMSNorm = None From 262356b21cb40d30b868ad7dcd400b02ab5b40bb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:27:57 +0300 Subject: [PATCH 078/819] fix typos --- README_GAUDI.md | 2 +- docs/source/getting_started/gaudi-installation.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 3b72ad71069c4..1a1b2d9cc6e36 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -132,7 +132,7 @@ If you experience device out-of-memory issues or want to attempt inference at hi cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time. -- If this methon is not efficient, you can disable `HPUGraph` completely. With +- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. 
You can do that by adding `--enforce-eager` flag to server (for diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 90f97155e1d75..73b63b3f8d755 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -136,7 +136,7 @@ the below: cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time. -- If this methon is not efficient, you can disable ``HPUGraph`` +- If this method is not efficient, you can disable ``HPUGraph`` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding ``--enforce-eager`` flag to From 962c91dff4a88323f9b91f3b212e7461f9a38725 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 19:46:17 +0300 Subject: [PATCH 079/819] fix some mypy issues in habana model runner: --- vllm/worker/habana_model_runner.py | 40 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 08dad66df1e76..7effebd18f363 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1,5 +1,3 @@ -# mypy: ignore-errors - ############################################################################### # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### @@ -12,7 +10,7 @@ import os import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple +from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Any import habana_frameworks.torch as htorch import torch @@ -48,7 +46,7 @@ # dim is either 'bs' or 'seq' # param is either 'min', 'step' or 'max' # example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 -def read_bucket_settings(phase: str, dim: str, **defaults: Dict): +def read_bucket_settings(phase: str, dim: str, **defaults): params = ['min', 'step', 'max'] values = [ int( @@ -61,10 +59,11 @@ def read_bucket_settings(phase: str, dim: str, **defaults: Dict): def warmup_range(config: Tuple[int, int, int]): bmin, bstep, bmax = config base = itertools.repeat(2) - ramp_up = itertools.accumulate(base, func=operator.mul, initial=bmin) - ramp_up = itertools.takewhile(lambda x: x < bstep and x <= bmax, ramp_up) + ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ + ramp_up_acc) stable = range(bstep, bmax + 1, bstep) - return list(ramp_up) + list(stable) + return list(ramp_up_tw) + list(stable) def warmup_buckets(bs_bucket_config, seq_bucket_config): @@ -172,16 +171,16 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): - input_tokens: List[int] - input_positions: List[int] + input_tokens: List[List[int]] + input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] seq_lens: List[int] query_lens: List[int] - lora_index_mapping: List[int] - lora_prompt_mapping: List[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] multi_modal_input: Optional[torch.Tensor] - slot_mapping: List[int] + slot_mapping: List[List[int]] @classmethod def empty(cls): @@ -200,13 +199,13 @@ def empty(cls): class PrepareDecodeMetadata(NamedTuple): - 
input_tokens: List[int] - input_positions: List[int] + input_tokens: List[List[int]] + input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] lora_index_mapping: List[int] lora_prompt_mapping: List[int] lora_requests: Set[LoRARequest] - slot_mapping: List[int] + slot_mapping: List[List[int]] @classmethod def empty(cls): @@ -363,7 +362,7 @@ def _setup_buckets(self) -> None: min=self.block_size, step=self.block_size, max=2048) - self.graphed_buckets = set() + self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " f"bs:{self.prompt_bs_bucket_cfg}, " @@ -756,8 +755,8 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - # FIXME: We need to adjust selected_token_indices to accommodate f - # or padding + # FIXME: We need to adjust selected_token_indices to accommodate + # for padding max_len = input_tokens.size(1) paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] @@ -923,8 +922,9 @@ def execute_model( ) = self.prepare_input_tensors(seq_group_metadata_list) is_prompt = attn_metadata.is_prompt - if self.lora_config: - self.set_active_loras(lora_requests, lora_mapping) + # NOTE(kzawora): Need to restore this after adding LoRA + # if self.lora_config: + # self.set_active_loras(lora_requests, lora_mapping) batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) From eb1ee27915ba6c4f887449cc10962d9652cc9efd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 2 Jul 2024 20:00:29 +0300 Subject: [PATCH 080/819] re-enable mypy for habana model runner --- vllm/worker/habana_model_runner.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 7effebd18f363..fe574b4bdefa0 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -10,7 +10,8 @@ import os import time from enum import IntEnum -from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Any +from typing import (Any, Callable, Dict, List, NamedTuple, Optional, Set, + Tuple, Union) import habana_frameworks.torch as htorch import torch @@ -171,7 +172,7 @@ def sample(self, *args, **kwargs): class PreparePromptMetadata(NamedTuple): - input_tokens: List[List[int]] + input_tokens: torch.Tensor input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] seq_lens: List[int] @@ -199,11 +200,11 @@ def empty(cls): class PrepareDecodeMetadata(NamedTuple): - input_tokens: List[List[int]] + input_tokens: torch.Tensor input_positions: List[List[int]] attn_metadata: Optional[AttentionMetadata] - lora_index_mapping: List[int] - lora_prompt_mapping: List[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] @@ -603,8 +604,8 @@ def _prepare_decode( slot_mapping: List[List[int]] = [] seq_lens: List[int] = [] block_tables: List[List[int]] = [] - lora_index_mapping: List[int] = [] - lora_prompt_mapping: List[int] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] lora_requests: Set[LoRARequest] = set() if len(seq_group_metadata_list) == 0: @@ -903,6 +904,7 @@ def execute_model( kv_caches: List[torch.Tensor], ) -> Optional[SamplerOutput]: if self.is_driver_worker: + assert seq_group_metadata_list is not None event_start = self.profiler.get_timestamp_us() is_prompt = seq_group_metadata_list[0].is_prompt 
base_event_name = 'prompt' if is_prompt else 'decode' @@ -917,6 +919,7 @@ def execute_model( seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) with self.profiler.record_event('internal', 'prepare_input_tensors'): + assert seq_group_metadata_list is not None (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping, multi_modal_input ) = self.prepare_input_tensors(seq_group_metadata_list) @@ -1072,7 +1075,8 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, idx = 0 phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' num_candidates = len(buckets) - + ordering : Union[Callable[[Any], Tuple[Any, Any]], \ + Callable[[Any], Tuple[Any, Any, Any]]] if strategy == 'min_tokens': ordering = lambda b: (b[0] * b[1], b[1], b[0]) elif strategy == 'max_bs': From 6a5effbeee92725aad09ab57afd30a5511e4a5e0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 3 Jul 2024 18:59:15 +0300 Subject: [PATCH 081/819] habana components refactor --- vllm/attention/backends/habana_attn.py | 4 +- vllm/executor/ray_habana_executor.py | 119 ++- vllm/model_executor/layers/fused_moe/layer.py | 8 +- vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/models/mixtral.py | 6 +- vllm/worker/habana_model_runner.py | 704 +++++++++++------- vllm/worker/habana_worker.py | 176 ++--- vllm/worker/model_runner_base.py | 11 +- 8 files changed, 592 insertions(+), 438 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 7db5ab2eeeeff..98c16fdca4c3f 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -24,8 +24,8 @@ def get_impl_cls() -> Type["HabanaAttentionImpl"]: return HabanaAttentionImpl @staticmethod - def make_metadata(*args, **kwargs) -> "HabanaAttentionMetadata": - return HabanaAttentionMetadata(*args, **kwargs) + def get_metadata_cls() -> Type["AttentionMetadata"]: + return HabanaAttentionMetadata @staticmethod def get_kv_cache_shape( diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index b57536436bd49..9f57de8d2e060 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -11,7 +11,8 @@ from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, +from vllm.utils import (error_on_invalid_device_count_status, + get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) if ray is not None: @@ -28,9 +29,6 @@ class RayHabanaExecutor(DistributedGPUExecutor): def _init_executor(self) -> None: - assert (not self.speculative_config - ), "Speculative decoding not yet supported for RayGPU backend." - assert self.parallel_config.distributed_executor_backend == "ray" placement_group = self.parallel_config.placement_group @@ -48,7 +46,8 @@ def _init_executor(self) -> None: def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if self.parallel_config.tensor_parallel_size == 1: + if (self.parallel_config.tensor_parallel_size == 1 + and self.parallel_config.pipeline_parallel_size == 1): # For single GPU case, we use a ray worker with constrained memory. 
num_gpus = self.cache_config.gpu_memory_utilization else: @@ -112,6 +111,12 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] node_gpus[node_id].extend(gpu_ids) for node_id, gpu_ids in node_gpus.items(): node_gpus[node_id] = sorted(gpu_ids) @@ -128,9 +133,21 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) + error_on_invalid_device_count_status() + # Initialize the actual workers inside worker wrapper. init_worker_all_kwargs = [ self._get_worker_kwargs( @@ -146,10 +163,29 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", max_concurrent_workers=self.parallel_config. max_parallel_loading_workers) + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[RayWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[RayWorkerWrapper] = [] + + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + for tp_rank in range(self.parallel_config.tensor_parallel_size): + rank = (pp_rank * + self.parallel_config.tensor_parallel_size) + tp_rank + if rank == 0: + pass + elif rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(self.workers[rank - 1]) + else: + self.non_driver_workers.append(self.workers[rank - 1]) + def _driver_execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: """Run execute_model in the driver worker. Passing None will cause the driver to stop the model execution @@ -162,7 +198,7 @@ def _run_workers( self, method: str, *args, - async_run_remote_workers_only: bool = False, + async_run_tensor_parallel_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, @@ -173,10 +209,11 @@ def _run_workers( """Runs the given method on all workers. Can be used in the following ways: - - async_run_remote_workers_only: If True the method will be run only - in the remote workers, not the driver worker. It will also be - run asynchronously and return a list of futures rather than blocking - on the results. 
+ Args: + - async_run_tensor_parallel_workers_only: If True the method will be + run only in the remote TP workers, not the driver worker. + It will also be run asynchronously and return a list of futures + rather than blocking on the results. - args/kwargs: All workers share the same args/kwargs - all_args/all_kwargs: args/kwargs for each worker are specified individually @@ -186,7 +223,9 @@ def _run_workers( raise NotImplementedError( "max_concurrent_workers is not supported yet.") - count = len(self.workers) + count = len(self.workers) if not \ + async_run_tensor_parallel_workers_only \ + else len(self.non_driver_workers) all_worker_args = repeat(args, count) if all_args is None \ else islice(all_args, 1, None) all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ @@ -200,14 +239,17 @@ def _run_workers( ray_worker_outputs = [] else: # Start the ray workers first. + ray_workers = self.workers + if async_run_tensor_parallel_workers_only: + ray_workers = self.non_driver_workers ray_worker_outputs = [ worker.execute_method.remote(method, *worker_args, **worker_kwargs) for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) + ) in zip(ray_workers, all_worker_args, all_worker_kwargs) ] - if async_run_remote_workers_only: + if async_run_tensor_parallel_workers_only: # Just return futures return ray_worker_outputs @@ -254,7 +296,7 @@ def _compiled_ray_dag(self): f"required, but found {current_version}") from ray.dag import InputNode, MultiOutputNode - assert self.parallel_config.worker_use_ray + assert self.parallel_config.distributed_executor_backend == "ray" # Right now, compiled DAG requires at least 1 arg. We send # a dummy value for now. It will be fixed soon. @@ -266,23 +308,6 @@ def _compiled_ray_dag(self): ]) return forward_dag.experimental_compile() - def check_health(self) -> None: - """Raises an error if engine is unhealthy.""" - self._check_if_any_actor_is_dead() - - def _check_if_any_actor_is_dead(self): - if not self.workers: - return - - dead_actors = [] - for actor in self.workers: - actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access - if actor_state["State"] == "DEAD": - dead_actors.append(actor) - if dead_actors: - raise RuntimeError("At least one Worker is dead. " - f"Dead Workers: {dead_actors}. ") - class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): @@ -294,12 +319,32 @@ async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: - return await self.driver_exec_method("execute_model", - execute_model_req) + + async def _run_task_with_lock(task, lock, *args, **kwargs): + async with lock: + return await task(*args, **kwargs) + + tasks = [] + tasks.append( + asyncio.create_task( + _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], + "execute_model", execute_model_req))) + for pp_rank, driver_worker in enumerate(self.tp_driver_workers, + start=1): + tasks.append( + asyncio.create_task( + _run_task_with_lock(driver_worker.execute_method.remote, + self.pp_locks[pp_rank], + "execute_model", execute_model_req))) + + results = await asyncio.gather(*tasks) + + # Only the last PP stage has the final results. 
+ return results[-1] async def _start_worker_execution_loop(self): coros = [ worker.execute_method.remote("start_worker_execution_loop") - for worker in self.workers + for worker in self.non_driver_workers ] return await asyncio.gather(*coros) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 73cfcd7fc85f2..4641b7958f671 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -11,6 +11,10 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs +from vllm.utils import is_hpu + +if is_hpu(): + from vllm.hpu.ops import static_fused_moe logger = init_logger(__name__) @@ -64,7 +68,9 @@ def apply(self, router_logits: torch.Tensor, top_k: int, renormalize: bool = True) -> torch.Tensor: - + if is_hpu(): + return static_fused_moe(x, layer.w13_weight, layer.w2_weight, + router_logits, top_k) return fused_moe(x, layer.w13_weight, layer.w2_weight, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 65af5e1919de0..c808f5c9f75b9 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -36,7 +36,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_tpu, is_hpu +from vllm.utils import is_hpu, is_tpu logger = init_logger(__name__) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 0c35258ea202b..94cdf54f1d56d 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -45,10 +45,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput -from vllm.utils import print_warning_once, is_hpu +from vllm.utils import is_hpu, print_warning_once -if is_hpu(): - from vllm.hpu.ops import static_fused_moe from .interfaces import SupportsLoRA @@ -99,7 +97,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits, _ = self.gate(hidden_states) final_hidden_states = self.experts(hidden_states, router_logits) if is_hpu(): - return final_hidden_states.view(batch_size, sequence_length, + return final_hidden_states.view(batch_size, sequence_length, hidden_size) return final_hidden_states.view(num_tokens, hidden_size) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index fe574b4bdefa0..f3e2e976c1c5d 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -3,6 +3,7 @@ ############################################################################### import collections +import dataclasses import gc import itertools import math @@ -10,8 +11,8 @@ import os import time from enum import IntEnum -from typing import (Any, Callable, Dict, List, NamedTuple, Optional, Set, - Tuple, Union) +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Set, Tuple, Type, TypeVar, Union) import habana_frameworks.torch as htorch import torch @@ -20,7 +21,6 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig) -from vllm.distributed import broadcast_tensor_dict from 
vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -29,12 +29,22 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model from vllm.sampling_params import SamplingParams -from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata +from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData, + SequenceGroupMetadata) from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_pin_memory_available, make_tensor_with_pad) +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) from .profiler import Profiler +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + logger = init_logger(__name__) _PAD_SLOT_ID = 0 @@ -231,7 +241,97 @@ class BatchType(IntEnum): MIXED = 2 -class HabanaModelRunner: +TModelInputForHPU = TypeVar('TModelInputForHPU', bound="ModelInputForHPU") + + +@dataclasses.dataclass(frozen=True) +class ModelInputForHPU(ModelRunnerInputBase): + """ + This base class contains metadata needed for the base model forward pass + but not metadata for possible additional steps, e.g., sampling. Model + runners that run additional steps should subclass this method to add + additional fields. + """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None + attn_metadata: Optional["AttentionMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + real_batch_size: Optional[int] = None + batch_size_padded: Optional[int] = None + virtual_engine: int = 0 + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + "real_batch_size": self.real_batch_size, + "batch_size_padded": self.batch_size_padded, + "virtual_engine": self.virtual_engine + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForHPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForHPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU): + """ + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None + # Used for speculative decoding. We do not broadcast it because it is only + # used by the driver worker. 
+ is_prompt: Optional[bool] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForHPUWithSamplingMetadata": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + # FIXME(kzawora): this fails for whatever reason - why? + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): + """ + Helper class for shared methods between GPU model runners. + """ + _model_input_cls: Type[TModelInputForHPU] def __init__( self, @@ -382,7 +482,7 @@ def _setup_buckets(self) -> None: logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) - msg = ("Generated {len(self.decode_buckets)} decode buckets: " + msg = (f"Generated {len(self.decode_buckets)} decode buckets: " f"{list(sorted(self.decode_buckets))}") logger.info(msg) @@ -698,171 +798,154 @@ def _prepare_decode( def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, - Set[LoRARequest], LoRAMapping, torch.Tensor]: - if self.is_driver_worker: - prefill_reqs = [] - decode_reqs = [] - for seq_group_meta in seq_group_metadata_list: - if seq_group_meta.is_prompt: - prefill_reqs.append(seq_group_meta) - else: - decode_reqs.append(seq_group_meta) - - # Prepare input tensors. - ( - input_tokens, - input_positions, - prefill_attn_metadata, - seq_lens, - query_lens, + ) -> TModelInputForHPU: + if len(seq_group_metadata_list) == 0: + return self._model_input_cls() + + input_tokens = None + input_positions = None + lora_mapping = None + lora_requests = None + multi_modal_input = None + batch_type = None + seq_lens = None + query_lens = None + real_batch_size = None + batch_size_padded = None + + self.event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + + real_batch_size = len(seq_group_metadata_list) + bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ + self.decode_bs_bucket_cfg + batch_size_padded = find_bucket(real_batch_size, bucket_cfg) + batch_size_padding = batch_size_padded - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + seq_group_metadata_list.extend(seq_group_metadata_list[0] + for _ in range(batch_size_padding)) + + prefill_reqs = [] + decode_reqs = [] + for seq_group_meta in seq_group_metadata_list: + if seq_group_meta.is_prompt: + prefill_reqs.append(seq_group_meta) + else: + decode_reqs.append(seq_group_meta) + + # Prepare input tensors. 
+ ( + input_tokens, + input_positions, + prefill_attn_metadata, + seq_lens, + query_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_input, + slot_mapping, + ) = self._prepare_prompt(prefill_reqs) + ( + decode_input_tokens, + decode_input_positions, + decode_attn_metadata, + decode_lora_index_mapping, + decode_lora_prompt_mapping, + decode_lora_requests, + decode_slot_mapping, + ) = self._prepare_decode(decode_reqs) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + seq_lens, query_lens, + self.device, + self.pin_memory) + + if not self.scheduler_config.chunked_prefill_enabled: + assert (len(prefill_reqs) and len(decode_reqs)) == 0 + + num_prefills = len(seq_lens) + num_prefill_tokens = len(input_tokens) + num_decode_tokens = len(decode_input_tokens) + + # NOTE(kzawora): Here we diverge from GPU code - we don't + # support mixed batches, so we either use decode or prefill + # inputs, without coalescing. + assert (num_prefills == 0 and num_decode_tokens > 0) or ( + num_prefills > 0 + and num_decode_tokens == 0), "HPU does not support mixed batches!" + if num_decode_tokens > 0: + input_tokens = decode_input_tokens + input_positions = decode_input_positions + slot_mapping = decode_slot_mapping + lora_index_mapping = decode_lora_index_mapping + lora_prompt_mapping = decode_lora_prompt_mapping + lora_requests = decode_lora_requests + + # FIXME: We need to adjust selected_token_indices to accommodate + # for padding + max_len = input_tokens.size(1) + paddings = [max_len - s for s in seq_lens] + paddings = [0] + paddings[:-1] + paddings = list(itertools.accumulate(paddings)) + paddings = torch.tensor( + paddings, + dtype=sampling_metadata.selected_token_indices.dtype, + device=sampling_metadata.selected_token_indices.device) + sampling_metadata.selected_token_indices.add_(paddings) + + if self.lora_config: + lora_mapping = LoRAMapping( lora_index_mapping, lora_prompt_mapping, - lora_requests, - multi_modal_input, - slot_mapping, - ) = self._prepare_prompt(prefill_reqs) - ( - decode_input_tokens, - decode_input_positions, - decode_attn_metadata, - decode_lora_index_mapping, - decode_lora_prompt_mapping, - decode_lora_requests, - decode_slot_mapping, - ) = self._prepare_decode(decode_reqs) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.pin_memory) - - if not self.scheduler_config.chunked_prefill_enabled: - assert (len(prefill_reqs) and len(decode_reqs)) == 0 - - num_prefills = len(seq_lens) - num_prefill_tokens = len(input_tokens) - num_decode_tokens = len(decode_input_tokens) - - # NOTE(kzawora): Here we diverge from GPU code - we don't - # support mixed batches, so we either use decode or prefill - # inputs, without coalescing. - assert (num_prefills == 0 and num_decode_tokens > 0) or ( - num_prefills > 0 and num_decode_tokens - == 0), "HPU does not support mixed batches!" 
- if num_decode_tokens > 0: - input_tokens = decode_input_tokens - input_positions = decode_input_positions - slot_mapping = decode_slot_mapping - lora_index_mapping = decode_lora_index_mapping - lora_prompt_mapping = decode_lora_prompt_mapping - lora_requests = decode_lora_requests - - # FIXME: We need to adjust selected_token_indices to accommodate - # for padding - max_len = input_tokens.size(1) - paddings = [max_len - s for s in seq_lens] - paddings = [0] + paddings[:-1] - paddings = list(itertools.accumulate(paddings)) - paddings = torch.tensor( - paddings, - dtype=sampling_metadata.selected_token_indices.dtype, - device=sampling_metadata.selected_token_indices.device) - sampling_metadata.selected_token_indices.add_(paddings) - - if self.lora_config: - lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) - else: - lora_mapping = None - - if (prefill_attn_metadata is not None - and decode_attn_metadata is not None): - batch_type = BatchType.MIXED - raise NotImplementedError( - "Mixed batch is not supported on HPU") - elif prefill_attn_metadata is not None: - batch_type = BatchType.PREFILL - else: - batch_type = BatchType.DECODE - - metadata_dict = { - "input_tokens": input_tokens, - "input_positions": input_positions, - "selected_token_indices": - sampling_metadata.selected_token_indices, - "lora_requests": lora_requests, - "lora_mapping": lora_mapping, - "multi_modal_input": multi_modal_input, - "num_prefill_tokens": num_prefill_tokens, - "num_decode_tokens": num_decode_tokens, - "slot_mapping": slot_mapping, - "num_prefills": num_prefills, - "batch_type": batch_type, - } - if prefill_attn_metadata is not None: - metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) - else: - assert decode_attn_metadata is not None - metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) - broadcast_tensor_dict(metadata_dict, src=0) - - # Broadcast decode attn metadata for mixed batch type. - # The additional broadcast costs 300us overhead on 4 A10 GPUs. - # We can potentially reduce the overhead by coelescing tensors. - if batch_type == BatchType.MIXED: - assert decode_attn_metadata is not None - metadata_dict = decode_attn_metadata.asdict_zerocopy() - broadcast_tensor_dict(metadata_dict, src=0) - else: - metadata_dict = broadcast_tensor_dict(src=0) - input_tokens = metadata_dict.pop("input_tokens") - input_positions = metadata_dict.pop("input_positions") - selected_token_indices = metadata_dict.pop( - "selected_token_indices") - lora_mapping = metadata_dict.pop("lora_mapping") - lora_requests = metadata_dict.pop("lora_requests") - multi_modal_input = metadata_dict.pop("multi_modal_input") - batch_type = metadata_dict.pop("batch_type") - - # Create an attention metadata. - prefill_attn_metadata = None - decode_attn_metadata = None - if batch_type == BatchType.PREFILL or batch_type == BatchType.MIXED: - prefill_attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - else: - decode_attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) - sampling_metadata = SamplingMetadata( - seq_groups=None, - selected_token_indices=selected_token_indices, - categorized_sample_indices=None, - num_prompts=0, ) - - # if it is a mixed batch, decode attn_metadata is broadcasted - # separately. 
- if batch_type == BatchType.MIXED: - metadata_dict = broadcast_tensor_dict(src=0) - decode_attn_metadata = self.attn_backend.make_metadata( - **metadata_dict) + else: + lora_mapping = None + + if (prefill_attn_metadata is not None + and decode_attn_metadata is not None): + batch_type = BatchType.MIXED + raise NotImplementedError("Mixed batch is not supported on HPU") + elif prefill_attn_metadata is not None: + batch_type = BatchType.PREFILL + else: + batch_type = BatchType.DECODE + + metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + "multi_modal_input": multi_modal_input, + "num_prefill_tokens": num_prefill_tokens, + "num_decode_tokens": num_decode_tokens, + "slot_mapping": slot_mapping, + "num_prefills": num_prefills, + "batch_type": batch_type, + "seq_lens": seq_lens, + "query_lens": query_lens + } + if prefill_attn_metadata is not None: + metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) + else: + assert decode_attn_metadata is not None + metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) attn_metadata = prefill_attn_metadata if \ prefill_attn_metadata is not None else decode_attn_metadata - # attn_metadata = AttentionMetadata( - # num_prefills=num_prefills, - # slot_mapping=slot_mapping, - # num_prefill_tokens=num_prefill_tokens, - # num_decode_tokens=num_decode_tokens, - # prefill_metadata=prefill_attn_metadata, - # decode_metadata=decode_attn_metadata, - # kv_cache_dtype=self.kv_cache_dtype, - # ) - - return (input_tokens, input_positions, attn_metadata, - sampling_metadata, lora_requests, lora_mapping, - multi_modal_input) + + return self._model_input_cls(input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_input, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded) def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -897,110 +980,6 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ]) return attention_metadata - @torch.inference_mode() - def execute_model( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - kv_caches: List[torch.Tensor], - ) -> Optional[SamplerOutput]: - if self.is_driver_worker: - assert seq_group_metadata_list is not None - event_start = self.profiler.get_timestamp_us() - is_prompt = seq_group_metadata_list[0].is_prompt - base_event_name = 'prompt' if is_prompt else 'decode' - self.profiler.start('internal', base_event_name) - - real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ - self.decode_bs_bucket_cfg - batch_size_padded = find_bucket(real_batch_size, bucket_cfg) - batch_size_padding = batch_size_padded - real_batch_size - seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend(seq_group_metadata_list[0] - for _ in range(batch_size_padding)) - with self.profiler.record_event('internal', 'prepare_input_tensors'): - assert seq_group_metadata_list is not None - (input_tokens, input_positions, attn_metadata, sampling_metadata, - lora_requests, lora_mapping, multi_modal_input - ) = self.prepare_input_tensors(seq_group_metadata_list) - is_prompt = attn_metadata.is_prompt - - # NOTE(kzawora): Need to restore this after 
adding LoRA - # if self.lora_config: - # self.set_active_loras(lora_requests, lora_mapping) - - batch_size = input_tokens.size(0) - seq_len = self._seq_len(attn_metadata) - use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), - } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) - - htorch.core.mark_step() - if self.is_driver_worker: - model_event_name = ("model_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") - else: - model_event_name = 'model_executable' - with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( - **execute_model_kwargs, - selected_token_indices=sampling_metadata. - selected_token_indices, - bypass_hpu_graphs=not use_graphs) - - # Compute the logits. - with self.profiler.record_event( - 'internal', ('compute_logits_' - f'{"prompt" if is_prompt else "decode"}_bs' - f'{batch_size}_' - f'seq{seq_len}')): - sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, - sampling_metadata) - htorch.core.mark_step() - - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return None - - # Sample the next token. - with self.profiler.record_event( - 'internal', ('sample_' - f'{"prompt" if is_prompt else "decode"}_' - f'bs{batch_size}_' - f'seq{seq_len}')): - output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - output.outputs = output.outputs[:real_batch_size] - htorch.core.mark_step() - - if self.is_driver_worker and self.profiler.enabled: - # Stop recording 'execute_model' event - self.profiler.end() - event_end = self.profiler.get_timestamp_us() - counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end - event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - seq_group_metadata_list=seq_group_metadata_list, - is_prompt=is_prompt) - self.profiler.record_counter(event_start, counters) - - return output - def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) @@ -1048,7 +1027,8 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, ] torch.hpu.synchronize() for _ in range(times): - self.execute_model(seqs, kv_caches) + inputs = self.prepare_model_input(seqs) + self.execute_model(inputs, kv_caches) torch.hpu.synchronize() self.profiler.end() gc.collect() @@ -1163,19 +1143,28 @@ def __init__(self): self.niter = 0 self.average_real_throughput = None self.logged_once = False + self.real_seq_lens = [] + self.prompt_seq_lens = [] - def get_counter_dict(self, cache_config, duration, seq_len, - batch_size_padded, real_batch_size, - seq_group_metadata_list, is_prompt): - throughput = batch_size_padded / (duration / 1e6) - throughput_effective = real_batch_size / (duration / 1e6) - real_seq_lens = [ + def capture_seq_group_metadata_stats(self, seq_group_metadata_list): + self.real_seq_lens = [ len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) for seq_group_metadata in seq_group_metadata_list for seq_data in seq_group_metadata.seq_data.values() ] - real_max_seq_len = max(real_seq_lens) - 
real_num_tokens = sum(real_seq_lens) + self.prompt_seq_lens = [ + len(seq_data.prompt_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] + + def get_counter_dict(self, cache_config, duration, seq_len, + batch_size_padded, real_batch_size, is_prompt): + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + + real_max_seq_len = max(self.real_seq_lens) + real_num_tokens = sum(self.real_seq_lens) padded_num_tokens = batch_size_padded * seq_len batch_token_utilization = real_num_tokens / padded_num_tokens if self.average_real_throughput is None: @@ -1198,14 +1187,10 @@ def get_counter_dict(self, cache_config, duration, seq_len, } self.niter += 1 if is_prompt: - prompt_seq_lens = [ - len(seq_data.prompt_token_ids) - for seq_group_metadata in seq_group_metadata_list - for seq_data in seq_group_metadata.seq_data.values() - ] prompt_bucket_in_throughput = (seq_len * batch_size_padded) / ( duration / 1e6) - prompt_real_in_throughput = sum(prompt_seq_lens) / (duration / 1e6) + prompt_real_in_throughput = sum( + self.prompt_seq_lens) / (duration / 1e6) counters[ f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput counters[f'{phase}_real_in_throughput'] = prompt_real_in_throughput @@ -1214,7 +1199,8 @@ def get_counter_dict(self, cache_config, duration, seq_len, if cache_config.num_gpu_blocks is not None and \ cache_config.num_gpu_blocks != 0: cache_num_blocks_used = [ - math.ceil(sl / cache_config.block_size) for sl in real_seq_lens + math.ceil(sl / cache_config.block_size) + for sl in self.real_seq_lens ] cache_total_num_blocks_used = sum(cache_num_blocks_used) num_cache_blocks = cache_config.num_gpu_blocks @@ -1238,3 +1224,151 @@ def get_counter_dict(self, cache_config, duration, seq_len, counters['const_block_size'] = cache_config.block_size self.logged_once = True return counters + + +class HabanaModelRunner( + HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): + """ + GPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = ( + ModelInputForHPUWithSamplingMetadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForHPUWithSamplingMetadata: + return ( + ModelInputForHPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForHPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + The result tensors and data structure also batches input in prefill + -> decode order. For example, + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + If cuda graph is required, this API automatically pads inputs. 
+ """ + with self.profiler.record_event('internal', 'prepare_input_tensors'): + assert seq_group_metadata_list is not None + self.profiler_counter_helper.capture_seq_group_metadata_stats( + seq_group_metadata_list=seq_group_metadata_list) + model_input = self.prepare_input_tensors(seq_group_metadata_list) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, model_input.seq_lens, + model_input.query_lens, self.device, self.pin_memory) + assert model_input.attn_metadata is not None + is_prompt = model_input.attn_metadata.is_prompt + + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForHPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError( + "num_steps > 1 is not supported in HabanaModelRunner") + + # NOTE(kzawora): Need to restore this after adding LoRA + # if self.lora_config: + # self.set_active_loras(lora_requests, lora_mapping) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + multi_modal_input = model_input.multi_modal_kwargs + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + is_prompt = model_input.is_prompt + assert input_tokens is not None + assert input_positions is not None + assert attn_metadata is not None + assert is_prompt is not None + assert sampling_metadata is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors + } + if self.vision_language_config: + execute_model_kwargs.update({"image_input": multi_modal_input}) + + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. + selected_token_indices, + bypass_hpu_graphs=not use_graphs) + + # Compute the logits. + with self.profiler.record_event( + 'internal', ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + # Sample the next token. 
+ with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + output.outputs = output.outputs[:real_batch_size] + htorch.core.mark_step() + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + return [output] diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 63055bf4f2055..f91d6bc5cefa9 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -4,27 +4,26 @@ import gc import os -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import List, Optional, Set, Tuple -import habana_frameworks.torch as htorch +import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig, VisionLanguageConfig) -from vllm.distributed import (broadcast_tensor_dict, - ensure_model_parallel_initialized, +from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed -from vllm.sequence import ExecuteModelRequest, SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner -from vllm.worker.worker_base import WorkerBase +from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput -class HabanaWorker(WorkerBase): +class HabanaWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. The worker is responsible for @@ -72,20 +71,21 @@ def __init__( "To be tested: vision language model with LoRA settings.") raise AssertionError("To be tested: vision language model on HPU") - self.model_runner = HabanaModelRunner( + self.model_runner: HabanaModelRunner = HabanaModelRunner( model_config, parallel_config, scheduler_config, device_config, - load_config=load_config, cache_config=cache_config, + load_config=load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
- self.cache_engine: CacheEngine - self.hpu_cache: List[torch.Tensor] + self.cache_engine: List[CacheEngine] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.hpu_cache: Optional[List[List[torch.tensor]]] = None def init_device(self) -> None: if self.device_config.device.type == "hpu": @@ -164,112 +164,78 @@ def initialize_cache(self, num_gpu_blocks: int, self._init_cache_engine() self._warm_up_model() - def _init_cache_engine(self) -> None: + def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = CacheEngine(self.cache_config, self.model_config, - self.parallel_config, - self.device_config) - self.hpu_cache = self.cache_engine.gpu_cache - # we want to materialize cache tensors before we proceed with - # graph capture/execution - htorch.hpu.synchronize() + self.cache_engine = [ + CacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.hpu_cache = [ + self.cache_engine[ve].gpu_cache + for ve in range(self.parallel_config.pipeline_parallel_size) + ] def _warm_up_model(self) -> None: - self.model_runner.warmup_model(self.hpu_cache) + # NOTE(kzawora): We should use virtual engine index here + # for pipeline parallelism. Using 0 for now. + assert self.hpu_cache is not None + self.model_runner.warmup_model(self.hpu_cache[0]) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) - def cache_swap( - self, - blocks_to_swap_in: Dict[int, int], - blocks_to_swap_out: Dict[int, int], - blocks_to_copy: torch.Tensor, - ) -> None: - # Issue cache operations. - # TODO(woosuk): Profile swapping overhead and optimize if needed. - if blocks_to_swap_in: - self.cache_engine.swap_in(blocks_to_swap_in) - if blocks_to_swap_out: - self.cache_engine.swap_out(blocks_to_swap_out) - if blocks_to_copy.numel() > 0: - self.cache_engine.copy(blocks_to_copy) + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if execute_model_req is None: - seq_group_metadata_list = None - else: - seq_group_metadata_list = execute_model_req.seq_group_metadata_list + @property + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + return self.hpu_cache - if self.is_driver_worker: - assert seq_group_metadata_list is not None - assert execute_model_req is not None - num_seq_groups = len(seq_group_metadata_list) - blocks_to_swap_in = execute_model_req.blocks_to_swap_in - blocks_to_swap_out = execute_model_req.blocks_to_swap_out - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + virtual_engine = execute_model_req.virtual_engine + num_seq_groups = len(execute_model_req.seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. 
+ blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", dtype=torch.int64).view(-1, 2) - data: Dict[str, Any] = { - "num_seq_groups": num_seq_groups, - "blocks_to_swap_in": blocks_to_swap_in, - "blocks_to_swap_out": blocks_to_swap_out, - "blocks_to_copy": blocks_to_copy, - } - broadcast_tensor_dict(data, src=0) - else: - data = broadcast_tensor_dict(src=0) - num_seq_groups = data["num_seq_groups"] - blocks_to_swap_in = data["blocks_to_swap_in"] - blocks_to_swap_out = data["blocks_to_swap_out"] - blocks_to_copy = data["blocks_to_copy"] - - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return [] - - output = self.model_runner.execute_model(seq_group_metadata_list, - self.hpu_cache) - return [output] + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + virtual_engine=virtual_engine, + ) @torch.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. - - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. - """ - while self._execute_model_non_driver(): - pass - - def _execute_model_non_driver(self) -> bool: - """Execute model in parallel worker. - - Returns True iff there are remaining sequences to process. - """ - assert not self.is_driver_worker - data = broadcast_tensor_dict(src=0) - if not data: - return False - - num_seq_groups = data.get("num_seq_groups", 0) - blocks_to_swap_in = data.get("blocks_to_swap_in") - blocks_to_swap_out = data.get("blocks_to_swap_out") - blocks_to_copy = data.get("blocks_to_copy") - self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy) - - # If there is no input, we don't need to execute the model. - if num_seq_groups == 0: - return False - - self.model_runner.execute_model(None, self.hpu_cache) - return True + def execute_worker(self, worker_input: WorkerInput) -> None: + virtual_engine = worker_input.virtual_engine + # Issue cache operations. 
+ if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine[virtual_engine].swap_in( + worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine[virtual_engine].swap_out( + worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index bc0960fa16221..4990511789e11 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -1,7 +1,7 @@ import dataclasses from abc import ABC, abstractmethod from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar) + TypeVar, Union, get_args, get_origin) import torch @@ -39,9 +39,14 @@ def _init_attn_metadata_from_tensor_dict( valid_attn_kwargs = {} for field in dataclasses.fields(attn_backend.get_metadata_cls()): val = tensor_dict.pop(field.name, None) - if val is not None: + # NOTE(kzawora): None is a valid value if type is optional. If + # we don't check against it, we will crash by not assigning + # Optional types without default value, even if they are + # broadcasted properly. + is_field_optional = get_origin(field.type) is Union and \ + type(None) in get_args(field.type) + if val is not None or (val is None and is_field_optional): valid_attn_kwargs[field.name] = val - attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) tensor_dict["attn_metadata"] = attn_metadata return tensor_dict From bca41a146ba511a463811104e55f535755eaeac7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 3 Jul 2024 19:24:55 +0300 Subject: [PATCH 082/819] fix is_prompt for mixtral --- vllm/worker/habana_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f3e2e976c1c5d..6669c00a49647 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1297,12 +1297,12 @@ def execute_model( multi_modal_input = model_input.multi_modal_kwargs real_batch_size = model_input.real_batch_size batch_size_padded = model_input.batch_size_padded - is_prompt = model_input.is_prompt assert input_tokens is not None assert input_positions is not None + assert sampling_metadata is not None assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt assert is_prompt is not None - assert sampling_metadata is not None batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) From 717c0ce6b96369bf0526f3b98cf5f7fc662e5b0b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:10:19 +0300 Subject: [PATCH 083/819] restore HPU autodetection --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index 897958d875284..de37558d738c4 100644 --- a/setup.py +++ b/setup.py @@ -206,10 +206,6 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: - is_hpu_available = True - # FIXME(kzawora): HPU autodetection sporadically fails on certain clients. - # Need to find the cause and fix it. 
- return is_hpu_available try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): From 8a4c5c19469edcc7c0b4b51b43b4a103ad93a287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kuligowski?= Date: Thu, 4 Jul 2024 13:14:42 +0200 Subject: [PATCH 084/819] SiLU memory leak in fwd --- vllm/model_executor/layers/activation.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index b2641cf89bdc5..5bfdba67b443d 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -37,15 +37,6 @@ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: ops.silu_and_mul(out, x) return out - def forward_hpu(self, x: torch.Tensor) -> torch.Tensor: - import vllm.hpu.ops as ops - - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - ops.silu_and_mul(out, x) - return out - def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: from vllm._ipex_ops import ipex_ops as ops From c5cd04aabbda514b8690f2bef34793868baae393 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:17:05 +0300 Subject: [PATCH 085/819] add WA for model loader --- .../layers/vocab_parallel_embedding.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index d70eb1c2704b4..e1212ab8b6376 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -12,7 +12,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs - +from vllm.utils import is_hpu DEFAULT_VOCAB_PADDING_SIZE = 64 @@ -327,8 +327,15 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - param[loaded_weight.shape[0]:].data.fill_(0) + + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so + # we're using a workaround. Remove this when fixed in HPU PT bridge. 
+ if is_hpu(): + padded_weight = torch.cat([loaded_weight, torch.zeros(param.shape[0] - loaded_weight.shape[0], *loaded_weight.shape[1:])]) + param.data.copy_(padded_weight) + else: + param[:loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0]:].data.fill_(0) def forward(self, input_): if self.tp_size > 1: From 9efb594ed5a0af4a71dd4ed5727aeff733743dcd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:22:50 +0300 Subject: [PATCH 086/819] remove hpu model loader WA --- vllm/model_executor/model_loader/loader.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 2bdb7438f6eaf..96a29cabe87e1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -263,9 +263,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - load_device = torch.device(device_config.device) if not is_hpu( - ) else 'cpu' # FIXME(kzawora): this is a nasty workaround!!! - with torch.device(load_device): + with torch.device(torch.device(device_config.device)): model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config) @@ -285,9 +283,6 @@ def load_model(self, *, model_config: ModelConfig, # to use quant_method. if hasattr(module, "process_weights_after_loading"): module.process_weights_after_loading() - if is_hpu(): - model = model.to( - 'hpu') # FIXME(kzawora): this is a nasty workaround!!! return model.eval() From def464e3d70a5554c12758995ae7c57dc02065f6 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:43:01 +0300 Subject: [PATCH 087/819] fix hpu autodetection (again) --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index de37558d738c4..bc71888fc933a 100644 --- a/setup.py +++ b/setup.py @@ -206,6 +206,7 @@ def build_extensions(self) -> None: def _is_hpu() -> bool: + is_hpu_available = True try: subprocess.run(["hl-smi"], capture_output=True, check=True) except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): From 30f36f03ca269b5b1bb35266171797bfe8b0229b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 14:43:37 +0300 Subject: [PATCH 088/819] fix VLM configs in hpu components --- vllm/executor/habana_executor.py | 2 +- vllm/worker/habana_model_runner.py | 14 +++++++------- vllm/worker/habana_worker.py | 14 ++++++-------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index b771b9e026970..012872c72d974 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -43,7 +43,7 @@ def _get_worker_kwargs( rank=rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - vision_language_config=self.vision_language_config, + multimodal_config=self.multimodal_config, is_driver_worker=rank == 0, ) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6669c00a49647..8a698306fe2c5 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -20,7 +20,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - VisionLanguageConfig) + 
MultiModalConfig) from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -344,7 +344,7 @@ def __init__( lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - vision_language_config: Optional[VisionLanguageConfig] = None, + multimodal_config: Optional[MultiModalConfig] = None, ): self.model_config = model_config self.parallel_config = parallel_config @@ -370,7 +370,7 @@ def __init__( self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype - self.vision_language_config = vision_language_config + self.multimodal_config = multimodal_config self.attn_backend = get_attn_backend( self.model_config.get_num_attention_heads(self.parallel_config), @@ -399,7 +399,7 @@ def load_model(self) -> None: device_config=self.device_config, load_config=self.load_config, lora_config=self.lora_config, - vision_language_config=self.vision_language_config, + multimodal_config=self.multimodal_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, cache_config=self.cache_config) @@ -615,7 +615,7 @@ def _prepare_prompt( device=self.device) if multi_modal_input_list: - assert self.vision_language_config, ( + assert self.multimodal_config, ( "Multi-modal inputs are only supported by " "vision language models.") multi_modal_input = torch.cat(multi_modal_input_list, @@ -1313,8 +1313,8 @@ def execute_model( "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors } - if self.vision_language_config: - execute_model_kwargs.update({"image_input": multi_modal_input}) + if multi_modal_input is not None: + execute_model_kwargs.update(multi_modal_input) htorch.core.mark_step() if self.is_driver_worker: diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index f91d6bc5cefa9..bde037b990a96 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -12,7 +12,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, - SpeculativeConfig, VisionLanguageConfig) + SpeculativeConfig, MultiModalConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest @@ -43,7 +43,7 @@ def __init__( rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - vision_language_config: Optional[VisionLanguageConfig] = None, + multimodal_config: Optional[MultiModalConfig] = None, speculative_config: Optional[SpeculativeConfig] = None, is_driver_worker: bool = False, ) -> None: @@ -65,11 +65,7 @@ def __init__( # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.vision_language_config = vision_language_config - if self.vision_language_config: - assert not self.lora_config, ( - "To be tested: vision language model with LoRA settings.") - raise AssertionError("To be tested: vision language model on HPU") + self.multimodal_config = multimodal_config self.model_runner: HabanaModelRunner = HabanaModelRunner( model_config, @@ -80,7 +76,9 @@ def __init__( load_config=load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker) + multimodal_config=self.multimodal_config, + is_driver_worker=is_driver_worker + ) # Uninitialized cache engine. 
Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] From 1dd85025015f5433568d6b3c0525fe629b14f37d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 18:42:40 +0300 Subject: [PATCH 089/819] fix hpu autodetection --- setup.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bc71888fc933a..8c585cc822d86 100644 --- a/setup.py +++ b/setup.py @@ -212,7 +212,14 @@ def _is_hpu() -> bool: except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): if not os.path.exists('/dev/accel/accel0') and not os.path.exists( '/dev/accel/accel_controlD0'): - is_hpu_available = False + # last resort... + try: + output = subprocess.check_output( + 'lsmod | grep habanalabs | wc -l', shell=True) + is_hpu_available = int(output) > 0 + except (ValueError, FileNotFoundError, PermissionError, + subprocess.CalledProcessError): + is_hpu_available = False return is_hpu_available From 0836502ee679101ddf250a4e2069eff141756184 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 18:03:47 +0300 Subject: [PATCH 090/819] Remove invasive ALiBi changes --- vllm/attention/backends/abstract.py | 1 - vllm/attention/backends/habana_attn.py | 2 ++ vllm/attention/layer.py | 3 +-- vllm/model_executor/models/mpt.py | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 55d9a43b35652..40768532f59c2 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -116,7 +116,6 @@ def __init__( sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len: Optional[int] = 4096, ) -> None: raise NotImplementedError diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 98c16fdca4c3f..6b1695ba3fd52 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -146,6 +146,8 @@ def __init__( self.position_bias = None self.alibi_slopes = alibi_slopes if alibi_slopes is not None: + # FIXME(kzawora): Need a general method to set max_seq_len on + # per-model basis. 
alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) self.position_bias = _make_alibi_bias(alibi_slopes_tensor, diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 8e796e7a50d59..dfe93be462184 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -34,7 +34,6 @@ def __init__( cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, blocksparse_params: Optional[Dict[str, Any]] = None, - max_seq_len: Optional[int] = 4096, ) -> None: super().__init__() if cache_config is not None: @@ -82,7 +81,7 @@ def __init__( impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, max_seq_len) + blocksparse_params) def forward( self, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 7a753bf96c9d9..7d658b39e6794 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -111,8 +111,7 @@ def __init__( alibi_slopes=alibi_slopes, num_kv_heads=self.num_kv_heads, cache_config=cache_config, - quant_config=quant_config, - max_seq_len=config.max_seq_len) + quant_config=quant_config) def forward( self, From a2f361ccd4cc78a4c166a0e8ca4adf414a6ac652 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 4 Jul 2024 18:52:05 +0300 Subject: [PATCH 091/819] add VLLM_TARGET_DEVICE='hpu' --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8c585cc822d86..e0a0d648dc0a6 100644 --- a/setup.py +++ b/setup.py @@ -220,7 +220,7 @@ def _is_hpu() -> bool: except (ValueError, FileNotFoundError, PermissionError, subprocess.CalledProcessError): is_hpu_available = False - return is_hpu_available + return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" def _is_cuda() -> bool: From 08ba3880d27b3fd6e29847b8ba0de73431c95b3d Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski Date: Mon, 15 Jul 2024 17:50:58 +0300 Subject: [PATCH 092/819] Added docstring and assertion to warmup_range --- vllm/worker/habana_model_runner.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 8a698306fe2c5..15ae8aa8e7b53 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -52,12 +52,14 @@ _TYPE_CACHE = {} -# Read bucketing configuration from env variables -# phase is either 'prompt' or 'decode' -# dim is either 'bs' or 'seq' -# param is either 'min', 'step' or 'max' -# example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 def read_bucket_settings(phase: str, dim: str, **defaults): + """Read bucketing configuration from env variables. + + phase is either 'prompt' or 'decode' + dim is either 'bs' or 'block' + param is either 'min', 'step' or 'max' + example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 + """ params = ['min', 'step', 'max'] values = [ int( @@ -68,7 +70,19 @@ def read_bucket_settings(phase: str, dim: str, **defaults): def warmup_range(config: Tuple[int, int, int]): + """Generate a warmup range. + + Start from bmin and multiply by 2 until you reach bstep. + Then, increase the values in the range by the value of bstep until you reach bmax. + + Example: + bmin = 2, bstep = 32, bmax = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => return ramp_up + stable => (2, 4, 8, 16, 32, 64) + """ bmin, bstep, bmax = config + assert bmin <= bmax, "Min. 
batch size cannot be greater than max. batch size. If you want to skip warmup, set VLLM_SKIP_WARMUP=true" base = itertools.repeat(2) ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ From 6bed24889bd4d94d87d8c0839a917371d67e23fa Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 19:50:16 +0300 Subject: [PATCH 093/819] fix api mismatches --- vllm/attention/backends/habana_attn.py | 3 ++- vllm/executor/habana_executor.py | 14 ++++++++++++++ .../layers/vocab_parallel_embedding.py | 9 +++++++-- vllm/worker/habana_worker.py | 3 +-- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 6b1695ba3fd52..a26b2f42333d0 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -9,7 +9,7 @@ import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionMetadata) + AttentionMetadata, AttentionType) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) from vllm.logger import init_logger @@ -172,6 +172,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: HabanaAttentionMetadata, kv_scale: float = 1.0, + attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 012872c72d974..8750c3b00dd9e 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -9,6 +9,7 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, get_ip, get_open_port, make_async) @@ -159,6 +160,19 @@ def list_loras(self) -> Set[int]: def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + def check_health(self) -> None: # GPUExecutor will always be healthy as long as # it's running. diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index e1212ab8b6376..7860ec511571b 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -13,6 +13,7 @@ QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs from vllm.utils import is_hpu + DEFAULT_VOCAB_PADDING_SIZE = 64 @@ -328,10 +329,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. 
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so # we're using a workaround. Remove this when fixed in HPU PT bridge. if is_hpu(): - padded_weight = torch.cat([loaded_weight, torch.zeros(param.shape[0] - loaded_weight.shape[0], *loaded_weight.shape[1:])]) + padded_weight = torch.cat([ + loaded_weight, + torch.zeros(param.shape[0] - loaded_weight.shape[0], + *loaded_weight.shape[1:]) + ]) param.data.copy_(padded_weight) else: param[:loaded_weight.shape[0]].data.copy_(loaded_weight) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bde037b990a96..49bf1b6d7d2ef 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -77,8 +77,7 @@ def __init__( lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, multimodal_config=self.multimodal_config, - is_driver_worker=is_driver_worker - ) + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CacheEngine] From 03dbee5ba5bc70a3a5aa2ba2190e3586b6f095c5 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 20:02:28 +0300 Subject: [PATCH 094/819] add assert for attn type --- vllm/attention/backends/habana_attn.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index a26b2f42333d0..2f620a8e98947 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -185,6 +185,11 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HabanaAttentionImpl") batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape From 8c58a6634c3a5d46950d9e6fd811ebaffea8ac80 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 20:52:07 +0300 Subject: [PATCH 095/819] multi-hpu fixes --- vllm/executor/ray_habana_executor.py | 2 +- vllm/worker/habana_worker.py | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 9f57de8d2e060..1e03fbdab32c2 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -49,7 +49,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", if (self.parallel_config.tensor_parallel_size == 1 and self.parallel_config.pipeline_parallel_size == 1): # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization + num_gpus = 1 else: # Otherwise, the ray workers are allocated with a full GPU. 
num_gpus = 1 diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 49bf1b6d7d2ef..ad87179a1147d 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,12 +11,13 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig, + ModelConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, MultiModalConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed +from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner @@ -44,7 +45,8 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, multimodal_config: Optional[MultiModalConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, ) -> None: self.model_config = model_config @@ -246,6 +248,19 @@ def list_loras(self) -> Set[int]: def pin_lora(self, lora_id: int) -> bool: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError("LoRA is not implemented for HPU backend.") + @property def max_model_len(self) -> int: return self.model_config.max_model_len From d7afbf2804cd2366208ffcc17259b5ba1e8f90d7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 21:15:52 +0300 Subject: [PATCH 096/819] minor formatting stuff --- vllm/executor/ray_habana_executor.py | 9 +++++++++ vllm/model_executor/model_loader/loader.py | 2 +- vllm/worker/habana_worker.py | 7 ++++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 1e03fbdab32c2..96b08a4dd3895 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -319,6 +319,15 @@ async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: + if self.pp_locks is None: + # This locks each pipeline parallel stage so multiple virtual + # engines can't execute on the same stage at the same time + # We create the locks here to avoid creating them in the constructor + # which uses a different asyncio loop. 
+ self.pp_locks = [ + asyncio.Lock() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] async def _run_task_with_lock(task, lock, *args, **kwargs): async with lock: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 96a29cabe87e1..294dbb91e735d 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -36,7 +36,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_hpu, is_tpu +from vllm.utils import is_tpu logger = init_logger(__name__) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index ad87179a1147d..6be229e037d06 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,8 +11,9 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, MultiModalConfig) + ModelConfig, MultiModalConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.lora.request import LoRARequest @@ -45,7 +46,7 @@ def __init__( distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, multimodal_config: Optional[MultiModalConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, ) -> None: From 2b2549ca3de4f230138cf2e6afe391aa3acdc4bc Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 15 Jul 2024 21:16:04 +0300 Subject: [PATCH 097/819] fix sampling metadata for prefill --- vllm/worker/habana_model_runner.py | 42 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 15ae8aa8e7b53..15ac0035228cc 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -19,8 +19,8 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig, - MultiModalConfig) + ModelConfig, MultiModalConfig, ParallelConfig, + SchedulerConfig) from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -73,7 +73,8 @@ def warmup_range(config: Tuple[int, int, int]): """Generate a warmup range. Start from bmin and multiply by 2 until you reach bstep. - Then, increase the values in the range by the value of bstep until you reach bmax. + Then, increase the values in the range by the value of bstep until you + reach bmax. Example: bmin = 2, bstep = 32, bmax = 64 @@ -82,7 +83,9 @@ def warmup_range(config: Tuple[int, int, int]): => return ramp_up + stable => (2, 4, 8, 16, 32, 64) """ bmin, bstep, bmax = config - assert bmin <= bmax, "Min. batch size cannot be greater than max. batch size. If you want to skip warmup, set VLLM_SKIP_WARMUP=true" + assert bmin <= bmax, ("Min. batch size cannot be greater than max. " + "batch size. 
If you want to skip warmup, " + "set VLLM_SKIP_WARMUP=true") base = itertools.repeat(2) ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ @@ -812,9 +815,9 @@ def _prepare_decode( def prepare_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], - ) -> TModelInputForHPU: + ) -> Tuple[TModelInputForHPU, SamplingMetadata]: if len(seq_group_metadata_list) == 0: - return self._model_input_cls() + return self._model_input_cls(), None input_tokens = None input_positions = None @@ -950,16 +953,17 @@ def prepare_input_tensors( attn_metadata = prefill_attn_metadata if \ prefill_attn_metadata is not None else decode_attn_metadata - return self._model_input_cls(input_tokens=input_tokens, - seq_lens=seq_lens, - query_lens=query_lens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_requests=lora_requests, - lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_input, - real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded) + return self._model_input_cls( + input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_input, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1277,10 +1281,8 @@ def prepare_model_input( assert seq_group_metadata_list is not None self.profiler_counter_helper.capture_seq_group_metadata_stats( seq_group_metadata_list=seq_group_metadata_list) - model_input = self.prepare_input_tensors(seq_group_metadata_list) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, model_input.seq_lens, - model_input.query_lens, self.device, self.pin_memory) + model_input, sampling_metadata = self.prepare_input_tensors( + seq_group_metadata_list) assert model_input.attn_metadata is not None is_prompt = model_input.attn_metadata.is_prompt From e911fd8270e67facd58c36fe0b0414779699b949 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 16 Jul 2024 12:28:46 +0300 Subject: [PATCH 098/819] bump ray version for hpu --- requirements-hpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 339fe989bdb7a..e0f03c8464c7b 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for HPU code -ray == 2.23.0 +ray == 2.32.0 triton pandas tabulate From bf349c58b25e82877e2505d05e75bdbf431240a1 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 18 Jul 2024 15:23:45 +0300 Subject: [PATCH 099/819] split k scale and v scale in habana attn --- vllm/attention/backends/habana_attn.py | 6 ++++-- vllm/attention/ops/habana_paged_attn.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 2f620a8e98947..33b6e2e538b13 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -171,7 +171,8 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: HabanaAttentionMetadata, - kv_scale: float = 1.0, + k_scale: float = 1.0, + v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. 
@@ -253,7 +254,8 @@ def forward( output = HabanaPagedAttention.forward_decode( query, key_cache, value_cache, attn_metadata.block_tables, attn_metadata.seq_lens_tensor, self.kv_cache_dtype, - self.num_kv_heads, self.scale, self.position_bias, kv_scale) + self.num_kv_heads, self.scale, self.position_bias, k_scale, + v_scale) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index ed47b906168e5..7dd701c7a0cdf 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -73,7 +73,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - kv_scale: float, + k_scale: float, + v_scale: float, ) -> torch.Tensor: block_size = value_cache.shape[1] return ops.paged_attention_v1( From 8e231a58fcc89a1b82254469ad39b265a50374bd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 15:58:03 +0200 Subject: [PATCH 100/819] Add workaround for RuntimeError: Invalid inputs for scatter_nd_onnx (#107) --- vllm/hpu/cache_ops.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index d28a47271c6ac..14824945aa53a 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -16,12 +16,31 @@ def reshape_and_cache(key, slot_mapping, dtype, is_prompt=False): + num_blocks = key_cache.size(0) block_size = key_cache.size(1) slot_mapping = slot_mapping.flatten() indices = torch.div(slot_mapping, block_size, rounding_mode="floor") offsets = torch.fmod(slot_mapping, block_size) - key_cache.index_put_((indices, offsets), key) - value_cache.index_put_((indices, offsets), value) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. + # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. 
+ num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + key_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + key[start_idx:end_idx]) + value_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + value[start_idx:end_idx]) def swap_blocks(src, dst, block_mapping): From f7dc5545dc994c0d3a37a9c4eb33190f6ac45018 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 15:58:15 +0200 Subject: [PATCH 101/819] Refactor forward_hpu of RMSNorm (#128) --- vllm/model_executor/layers/layernorm.py | 52 +++++++++---------------- 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 67cef1b47f3bf..f1b7a73d22d52 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,13 +6,15 @@ from vllm.model_executor.custom_op import CustomOp from vllm.utils import is_hpu - +from vllm.logger import init_logger +logger = init_logger(__name__) if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as HPUFusedRMSNorm except ImportError: - print("Not using HPU fused kernel for RMSNorm") - FusedRMSNorm = None + logger.warning("Could not import HPU FusedRMSNorm kernel. " + "vLLM will use forward_native implementation of RMSNorm.") + HPUFusedRMSNorm = None class RMSNorm(CustomOp): @@ -80,37 +82,21 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm._ipex_ops import ipex_ops as ops - + if HPUFusedRMSNorm is None: + return self.forward_native(x, residual) if residual is not None: - if x.device.type == "hpu" and FusedRMSNorm: - orig_dtype = x.dtype - orig_shape = x.shape - residual += x.view(residual.shape) - # Note: FusedRMSNorm requires 3D tensors as inputs - x = FusedRMSNorm.apply(residual.float(), self.weight.float(), - self.variance_epsilon) - return x.to(orig_dtype).view(orig_shape), residual - ops.fused_add_rms_norm( - x, - residual, - self.weight.data, - self.variance_epsilon, - ) - return x, residual - if x.device.type == "hpu" and FusedRMSNorm: orig_dtype = x.dtype - x = FusedRMSNorm.apply(x.float(), self.weight.float(), - self.variance_epsilon) - return x.to(orig_dtype) - out = torch.empty_like(x) - ops.rms_norm( - out, - x, - self.weight.data, - self.variance_epsilon, - ) - return out + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: HPUFusedRMSNorm requires 3D tensors as inputs + x = HPUFusedRMSNorm.apply(residual.float(), self.weight.float(), + self.variance_epsilon) + return x.to(orig_dtype).view(orig_shape), residual + + orig_dtype = x.dtype + x = HPUFusedRMSNorm.apply(x.float(), self.weight.float(), + self.variance_epsilon) + return x.to(orig_dtype) def forward_xpu( self, From 19993b7490a7c26aa5640b4f50607158754ddda0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 15:58:29 +0200 Subject: [PATCH 102/819] Refactor & re-enable HPU RoPE for Gaudi1 (#129) * Re-enable FusedRoPE for Gaudi1 * add fallback impl of rope --- vllm/hpu/rotary_embed.py | 102 +++++------------- .../model_executor/layers/rotary_embedding.py | 8 +- 2 files changed, 33 insertions(+), 77 deletions(-) diff --git a/vllm/hpu/rotary_embed.py 
b/vllm/hpu/rotary_embed.py index 26b19e8258285..8bc93cdf5c444 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -5,80 +5,25 @@ # LICENSE file in the root directory of this source tree. ############################################################################### -import habana_frameworks.torch.utils.experimental as htexp import torch import torch.nn as nn +from vllm.utils import is_hpu +from vllm.logger import init_logger +logger = init_logger(__name__) -def get_device_type(): - return htexp._get_device_type() - - -def is_gaudi1(): - return get_device_type() == htexp.synDeviceType.synDeviceGaudi - - -def is_gaudi2(): - return get_device_type() == htexp.synDeviceType.synDeviceGaudi2 - - -def is_gaudi3(): - return get_device_type() == htexp.synDeviceType.synDeviceGaudi3 - - -# TODO: remove this workaround when FusedRoPE properly works on Gaudi -if not is_gaudi1() and (is_gaudi2() or is_gaudi3()): +if is_hpu(): try: from habana_frameworks.torch.hpex.kernels import ( RotaryPosEmbeddingHelperV1 as FusedRoPE) except ImportError: - print("Not using HPU fused kernel for apply_rotary_pos_emb") - FusedRoPE = None + logger.warning("Could not import HPU FusedRoPE kernel. " + "vLLM will use forward_native implementation of RoPE.") + FusedRoPE = None else: FusedRoPE = None -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. - sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`): - The position indices of the tokens corresponding to the query and - key tensors. For example, this can be used to pass offsetted - position ids when working with a KV-cache. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to - unsqueeze cos[position_ids] and sin[position_ids] so that they can - be properly broadcasted to the dimensions of q and k. For example, - note that cos[position_ids] and sin[position_ids] have the shape - [batch_size, seq_len, head_dim]. Then, if q and k have the shape - [batch_size, heads, seq_len, head_dim], then setting - unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] - broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set - unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated - using the Rotary Position Embedding. 
- """ - cos = cos[position_ids] #.unsqueeze(unsqueeze_dim) - sin = sin[position_ids] #.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - class HpuRotaryEmbedding(nn.Module): def __init__(self, @@ -87,7 +32,8 @@ def __init__(self, max_position_embeddings=2048, base=10000, is_neox_style=None, - device='hpu'): + device='hpu', + RoPEFallback=None): super().__init__() self.head_size = head_size @@ -102,6 +48,14 @@ def __init__(self, self._set_cos_sin_cache(seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()) + if FusedRoPE is None: + assert RoPEFallback is not None, "HPU FusedRoPE kernel could not be imported, and fallback RoPE implementation was not provided!" + self.fallback_impl = RoPEFallback(head_size, + rotary_dim, + max_position_embeddings, + base, + is_neox_style, + dtype=torch.get_default_dtype()) def _set_cos_sin_cache(self, seq_len, device, dtype): self.max_seq_len_cached = seq_len @@ -122,6 +76,8 @@ def _set_cos_sin_cache(self, seq_len, device, dtype): def forward(self, positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor): + if FusedRoPE is None: + return self.fallback_impl(positions, query, key) if query.dim() == 2: query = query.unsqueeze(0) if key.dim() == 2: @@ -141,19 +97,15 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, self.head_size)) key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) - if query.device.type == "hpu" and FusedRoPE: - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to( - dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to( - dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, - 0), FusedRoPE.apply(key, cos, sin, 0) + + if len(positions[0]) == 1: + cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) + sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) else: - query, key = apply_rotary_pos_emb(query, key, cos, sin, positions) + cos = cos[positions].unsqueeze(2) + sin = sin[positions].unsqueeze(2) + query, key = FusedRoPE.apply(query, cos, sin, + 0), FusedRoPE.apply(key, cos, sin, 0) return query.reshape( (query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape( diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index d706c70c82374..e7c97a6cf75cc 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -765,8 +765,12 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: if is_hpu(): - rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, - max_position, base, is_neox_style) + rotary_emb = HpuRotaryEmbedding(head_size, + rotary_dim, + max_position, + base, + is_neox_style, + RoPEFallback=RotaryEmbedding) else: rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, is_neox_style, dtype) From 03e3ce38d1fdef86fad30c5ee1bed27fce22a842 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 16:45:32 +0200 Subject: [PATCH 103/819] formatting fixes (#132) --- vllm/hpu/rotary_embed.py | 7 +++++-- vllm/model_executor/layers/layernorm.py | 17 ++++++++++------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 8bc93cdf5c444..e44bfa2f6210c 
100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -7,8 +7,9 @@ import torch import torch.nn as nn -from vllm.utils import is_hpu + from vllm.logger import init_logger +from vllm.utils import is_hpu logger = init_logger(__name__) @@ -49,7 +50,9 @@ def __init__(self, device=self.inv_freq.device, dtype=torch.get_default_dtype()) if FusedRoPE is None: - assert RoPEFallback is not None, "HPU FusedRoPE kernel could not be imported, and fallback RoPE implementation was not provided!" + assert RoPEFallback is not None, ( + "HPU FusedRoPE kernel could not be imported, and " + "fallback RoPE implementation was not provided!") self.fallback_impl = RoPEFallback(head_size, rotary_dim, max_position_embeddings, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index f1b7a73d22d52..e00cb9ca6e1ac 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,16 +4,19 @@ import torch import torch.nn as nn +from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp from vllm.utils import is_hpu -from vllm.logger import init_logger + logger = init_logger(__name__) if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as HPUFusedRMSNorm + from habana_frameworks.torch.hpex.normalization import ( + FusedRMSNorm as HPUFusedRMSNorm) except ImportError: - logger.warning("Could not import HPU FusedRMSNorm kernel. " - "vLLM will use forward_native implementation of RMSNorm.") + logger.warning( + "Could not import HPU FusedRMSNorm kernel. " + "vLLM will use forward_native implementation of RMSNorm.") HPUFusedRMSNorm = None @@ -90,12 +93,12 @@ def forward_hpu( residual += x.view(residual.shape) # Note: HPUFusedRMSNorm requires 3D tensors as inputs x = HPUFusedRMSNorm.apply(residual.float(), self.weight.float(), - self.variance_epsilon) + self.variance_epsilon) return x.to(orig_dtype).view(orig_shape), residual - + orig_dtype = x.dtype x = HPUFusedRMSNorm.apply(x.float(), self.weight.float(), - self.variance_epsilon) + self.variance_epsilon) return x.to(orig_dtype) def forward_xpu( From a0646da3d1685847a90c894429a9da3572cbd063 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 17:40:34 +0200 Subject: [PATCH 104/819] Address upstream PR code review comments (#133) * formatting fixes * Upstream CR update --- .../getting_started/gaudi-installation.rst | 62 ++++++++++++------- .../model_executor/layers/logits_processor.py | 2 + .../layers/vocab_parallel_embedding.py | 5 +- vllm/worker/cache_engine.py | 5 +- 4 files changed, 45 insertions(+), 29 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 73b63b3f8d755..a9f3ebdf274f6 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -1,8 +1,7 @@ -vLLM with IntelÂź GaudiÂź 2 AI Accelerators +vLLM with IntelÂź GaudiÂź AI Accelerators ========================================= -This README provides instructions on running vLLM with Intel Gaudi -devices. +This README provides instructions on running vLLM with Intel Gaudi devices. Requirements and Installation ============================= @@ -13,17 +12,13 @@ to set up the environment. To achieve the best performance, please follow the methods outlined in the `Optimizing Training Platform Guide `__. -.. note:: - In this release (1.16.0), we are only targeting functionality - and accuracy. 
Performance will be improved in next releases. - Requirements ------------ - OS: Ubuntu 22.04 LTS - Python: 3.10 -- Intel Gaudi 2 accelerator -- Intel Gaudi software version 1.16.0 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.16.0 or newer To verify that the Intel Gaudi software was correctly installed, run: @@ -49,20 +44,30 @@ Use the following commands to run a Docker image: .. code:: console - $ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest + $ docker pull vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest -Build and Install vLLM-fork +Build and Install vLLM --------------------------- -To build and install vLLM-fork from source, run: +To build and install vLLM from source, run: + +.. code:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python setup.py develop + + +Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: .. code:: console $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork - # git checkout v0.4.2-Gaudi-1.16.0 - $ pip install -e . # This may take 5-10 minutes. + $ git checkout habana_main + $ python setup.py develop + Supported Features ================== @@ -72,13 +77,12 @@ Supported Features - Online inference via `OpenAI-Compatible Server `__ - HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi 2 accelerators +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding - Tensor parallelism support for multi-card inference -- Inference with `HPU - Graphs `__ +- Inference with `HPU Graphs `__ for accelerating low-batch latency and throughput Unsupported Features @@ -94,7 +98,7 @@ Supported Configurations ======================== The following configurations have been validated to be function with -Gaudi devices. Configurations that are not listed may or may not work. +Gaudi2 devices. Configurations that are not listed may or may not work. - `meta-llama/Llama-2-7b `__ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 @@ -102,12 +106,24 @@ Gaudi devices. Configurations that are not listed may or may not work. 
- `meta-llama/Llama-2-7b-chat-hf `__ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling - `meta-llama/Llama-2-70b `__ - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Llama-2-70b-chat-hf `__ - with tensor parallelism 8x HPU, BF16 datatype with random or greedy - sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `mistralai/Mistral-7B-Instruct-v0.3 `__ + on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- `mistralai/Mixtral-8x7B-Instruct-v0.1 `__ + with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling Performance Tips ================ diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index e87ecbe40fdca..3b4fc88a8ca51 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -93,6 +93,8 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: + # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, + # profile_run) we might not have selected_token_indices, so we skip pruning. if sampling_metadata.selected_token_indices is not None: return hidden_states.index_select( 0, sampling_metadata.selected_token_indices) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 7860ec511571b..6cf79d462bfe0 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -329,9 +329,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, so - # we're using a workaround. Remove this when fixed in HPU PT bridge. if is_hpu(): + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, + # so we're using a workaround. Remove this when fixed in + # HPU PT bridge. 
padded_weight = torch.cat([ loaded_weight, torch.zeros(param.shape[0] - loaded_weight.shape[0], diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index d2b1891c7e28c..93be2f4c321fe 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,12 +6,9 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_hpu, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) -if is_hpu(): - pass - logger = init_logger(__name__) From a642c0cfc4f59041830acda3aae93060d8dc5aff Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 17:47:43 +0200 Subject: [PATCH 105/819] Whitespace fix (#134) * formatting fixes * Upstream CR update * whitespace fix --- vllm/model_executor/layers/logits_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 3b4fc88a8ca51..cce8f99af5a6c 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -93,7 +93,7 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, + # NOTE(kzawora): This is needed for Gaudi - in some scenarios (warmup, # profile_run) we might not have selected_token_indices, so we skip pruning. if sampling_metadata.selected_token_indices is not None: return hidden_states.index_select( From 58236e7f745bdd2759ea06fe79f8beaab67bf1be Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 20:14:51 +0300 Subject: [PATCH 106/819] formatting --- vllm/executor/ray_utils.py | 9 ++++----- vllm/model_executor/sampling_metadata.py | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index b52324648865a..507dc04f48123 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,7 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_tpu, is_hpu, is_xpu +from vllm.utils import get_ip, is_hip, is_hpu, is_tpu, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -87,18 +87,17 @@ def initialize_ray_cluster( ignore_reinit_error=True, num_gpus=parallel_config.world_size) else: - ray.init(address=ray_address, - ignore_reinit_error=True) + ray.init(address=ray_address, ignore_reinit_error=True) if parallel_config.placement_group: # Placement group is already set. 
return - device_str = "GPU" + device_str = "GPU" if is_tpu(): device_str = "TPU" elif is_hpu(): - device_str = "HPU" + device_str = "HPU" # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 98e77c22b3875..4687eb5c39fa9 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -8,8 +8,8 @@ from vllm.model_executor.layers.ops.sample import get_num_triton_sampler_splits from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import SequenceData, SequenceGroupMetadata -from vllm.utils import (async_tensor_h2d, is_pin_memory_available, - make_tensor_with_pad, maybe_expand_dim, is_hpu) +from vllm.utils import (async_tensor_h2d, is_hpu, is_pin_memory_available, + make_tensor_with_pad, maybe_expand_dim) _SAMPLING_EPS = 1e-5 _SEED_0_REPLACEMENT = 3403598558 From cc01748cd73fee1c912866f8c18b4d7e43d81a1b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 20:24:33 +0300 Subject: [PATCH 107/819] align to changes in make_tensor_with_pad --- vllm/worker/habana_model_runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 15ac0035228cc..11a7630b3cebe 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -646,19 +646,19 @@ def _prepare_prompt( self.block_size) input_tokens = make_tensor_with_pad(input_tokens, - max_prompt_len, + max_len=max_prompt_len, pad=0, dtype=torch.long, device=self.device) input_positions = make_tensor_with_pad(input_positions, - max_prompt_len, + max_len=max_prompt_len, pad=0, dtype=torch.long, device=self.device) slot_mapping = make_tensor_with_pad(slot_mapping, - max_prompt_len, + max_len=max_prompt_len, pad=_PAD_SLOT_ID, dtype=torch.long, device=self.device) From bf86cb1c19b5fc1334bdb7bce86baf331871936d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 29 Jul 2024 20:25:20 +0300 Subject: [PATCH 108/819] update ray_habana_executor --- vllm/executor/ray_habana_executor.py | 306 +++++++++++++++++---------- 1 file changed, 198 insertions(+), 108 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 96b08a4dd3895..a616e59b3be60 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -1,6 +1,5 @@ import asyncio import os -import pickle from collections import defaultdict from itertools import islice, repeat from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple @@ -11,7 +10,8 @@ from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (error_on_invalid_device_count_status, +from vllm.utils import (_run_task_with_lock, + error_on_invalid_device_count_status, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, make_async) @@ -28,8 +28,31 @@ class RayHabanaExecutor(DistributedGPUExecutor): + uses_ray: bool = True + def _init_executor(self) -> None: - assert self.parallel_config.distributed_executor_backend == "ray" + self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + # If the env var is set, it uses the Ray's compiled DAG API + # which optimizes the control plane overhead. 
+ # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. + # Currently, this requires USE_RAY_SPMD_WORKER=True. + self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG + # If the env var is set, then we do not distinguish between the + # "driver worker" vs other workers. Also, the rank 0 worker will + # be executed in a remote Ray worker. Currently this requires + # USE_RAY_COMPILED_DAG=True. + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if self.use_ray_compiled_dag: + assert self.use_ray_spmd_worker, ( + "VLLM_USE_RAY_COMPILED_DAG=1 requires " + "VLLM_USE_RAY_SPMD_WORKER=1") + if self.use_ray_spmd_worker: + # TODO: Support SPMD worker for non-DAG Ray executor. + assert self.use_ray_compiled_dag, ( + "VLLM_USE_RAY_SPMD_WORKER=1 requires " + "VLLM_USE_RAY_COMPILED_DAG=1") + + assert self.uses_ray placement_group = self.parallel_config.placement_group # Disable Ray usage stats collection. @@ -40,19 +63,19 @@ def _init_executor(self) -> None: # Create the parallel GPU workers. self._init_workers_ray(placement_group) - self.forward_dag = None - if USE_RAY_COMPILED_DAG: - self.forward_dag = self._compiled_ray_dag() + def _get_worker_wrapper_args(self) -> Dict[str, Any]: + worker_module_name = "vllm.worker.habana_worker" + worker_class_name = "HabanaWorker" + + return dict( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + trust_remote_code=self.model_config.trust_remote_code, + ) def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = 1 - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 + num_gpus = 1 # The driver dummy worker does not actually use any resources. # It holds the resource for the driver worker. @@ -62,6 +85,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # Create the workers. driver_ip = get_ip() + worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): if not bundle.get("HPU", 0): continue @@ -70,33 +94,30 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", placement_group_capture_child_tasks=True, placement_group_bundle_index=bundle_id, ) + worker = ray.remote( num_cpus=0, num_gpus=0, resources={'HPU': num_gpus}, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, - )(RayWorkerWrapper).remote( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", - trust_remote_code=self.model_config.trust_remote_code, - ) + )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) - worker_ip = ray.get(worker.get_node_ip.remote()) - if worker_ip == driver_ip and self.driver_dummy_worker is None: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = RayWorkerWrapper( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", - trust_remote_code=self.model_config.trust_remote_code, - ) - else: - # Else, added to the list of workers. 
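# A minimal standalone sketch of the invariant the asserts above enforce:
# the two environment flags wired up here are only valid together. Reading
# them directly from os.environ is an assumption made for this example.
import os

def check_ray_spmd_flags() -> None:
    use_compiled_dag = os.environ.get("VLLM_USE_RAY_COMPILED_DAG", "0") == "1"
    use_spmd_worker = os.environ.get("VLLM_USE_RAY_SPMD_WORKER", "0") == "1"
    if use_compiled_dag and not use_spmd_worker:
        raise ValueError("VLLM_USE_RAY_COMPILED_DAG=1 requires "
                         "VLLM_USE_RAY_SPMD_WORKER=1")
    if use_spmd_worker and not use_compiled_dag:
        raise ValueError("VLLM_USE_RAY_SPMD_WORKER=1 requires "
                         "VLLM_USE_RAY_COMPILED_DAG=1")

check_ray_spmd_flags()  # no-op when both flags are unset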
+ if self.use_ray_spmd_worker: self.workers.append(worker) + else: + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + **worker_wrapper_kwargs) + else: + # Else, added to the list of workers. + self.workers.append(worker) - if self.driver_dummy_worker is None: + if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: raise ValueError( "Ray does not allocate any GPUs on the driver node. Consider " "adjusting the Ray placement group or running the driver on a " @@ -106,11 +127,32 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", use_dummy_driver=True) - node_workers = defaultdict(list) - node_gpus = defaultdict(list) - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) + # the order in `worker_node_and_gpu_ids` does not necessarily match + # the machine boundaries. We need to make sure that workers in the + # same node are assigned consecutive ranks. + # examples: + # [('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [1]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [2]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [3]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [1]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [2]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [3])] # noqa + + # initialize worker ranks with -1 (unassigned) + worker_ranks = [-1 for x in worker_node_and_gpu_ids] + current_rank = 0 + while -1 in worker_ranks: + # whenever we find an unassigned worker, find the node + index = worker_ranks.index(-1) + current_node_id = worker_node_and_gpu_ids[index][0] + # assign ranks to all workers in the same node + for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): + if node_id == current_node_id: + worker_ranks[i] = current_rank + current_rank += 1 + # with the above example, worker_ranks will be [0, 4, 5, 6, 7, 1, 2, 3] + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for worker_rank, (node_id, gpu_ids) in zip(worker_ranks, + worker_node_and_gpu_ids): + node_workers[node_id].append(worker_rank) # `gpu_ids` can be a list of strings or integers. # convert them to integers for consistency. # NOTE: gpu_ids can be larger than 9 (e.g. 
16 GPUs), @@ -129,6 +171,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 'true' }, ) for (node_id, _) in worker_node_and_gpu_ids] self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) @@ -154,7 +197,8 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", local_rank=node_workers[node_id].index(rank), rank=rank, distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ) for rank, (node_id, + _) in zip(worker_ranks, worker_node_and_gpu_ids) ] self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) @@ -172,16 +216,14 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # broadcasted to. self.non_driver_workers: List[RayWorkerWrapper] = [] - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - for tp_rank in range(self.parallel_config.tensor_parallel_size): - rank = (pp_rank * - self.parallel_config.tensor_parallel_size) + tp_rank - if rank == 0: - pass - elif rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(self.workers[rank - 1]) - else: - self.non_driver_workers.append(self.workers[rank - 1]) + # Enforce rank order for correct rank to return final output. + for rank, worker in sorted(zip(worker_ranks[1:], self.workers)): + # We need to skip the driver worker, which we + # do by skipping worker_ranks[0] which is always 0. + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) def _driver_execute_model( self, execute_model_req: Optional[ExecuteModelRequest] @@ -191,9 +233,23 @@ def _driver_execute_model( Passing None will cause the driver to stop the model execution loop running in each of the remote workers. """ + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") return self.driver_worker.execute_method("execute_model", execute_model_req) + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return super().execute_model(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) + + outputs = ray.get(self.forward_dag.execute(execute_model_req)) + return outputs[0] + def _run_workers( self, method: str, @@ -203,7 +259,6 @@ def _run_workers( all_kwargs: Optional[List[Dict[str, Any]]] = None, use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, - use_ray_compiled_dag: bool = False, **kwargs, ) -> Any: """Runs the given method on all workers. Can be used in the following @@ -218,6 +273,10 @@ def _run_workers( - all_args/all_kwargs: args/kwargs for each worker are specified individually """ + if self.use_ray_spmd_worker: + assert not async_run_tensor_parallel_workers_only, ( + "async_run_tensor_parallel_workers_only is not supported for " + "spmd mode.") if max_concurrent_workers: raise NotImplementedError( @@ -226,99 +285,125 @@ def _run_workers( count = len(self.workers) if not \ async_run_tensor_parallel_workers_only \ else len(self.non_driver_workers) + # If using SPMD worker, all workers are the same, so we should execute + # the args on all workers. Otherwise, we skip the first worker's args + # because those args will go to the driver worker. 
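# Standalone illustration of the rank assignment performed in
# _init_workers_ray above: Ray does not report workers grouped by node, so
# ranks are reassigned such that workers on the same node receive
# consecutive ranks. Node ids are shortened here for readability.
from typing import List, Tuple

def assign_node_consecutive_ranks(
        worker_node_and_gpu_ids: List[Tuple[str, List[int]]]) -> List[int]:
    worker_ranks = [-1] * len(worker_node_and_gpu_ids)
    current_rank = 0
    while -1 in worker_ranks:
        # Whenever an unassigned worker is found, look up its node and give
        # every worker on that node the next consecutive ranks.
        index = worker_ranks.index(-1)
        current_node_id = worker_node_and_gpu_ids[index][0]
        for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
            if node_id == current_node_id:
                worker_ranks[i] = current_rank
                current_rank += 1
    return worker_ranks

workers = [("node-a", [0]), ("node-b", [0]), ("node-b", [1]), ("node-b", [2]),
           ("node-b", [3]), ("node-a", [1]), ("node-a", [2]), ("node-a", [3])]
assert assign_node_consecutive_ranks(workers) == [0, 4, 5, 6, 7, 1, 2, 3]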
+ first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1 all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 1, None) + else islice(all_args, first_worker_args_index, None) all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 1, None) - - if use_ray_compiled_dag: - # Right now, compiled DAG can only accept a single - # input. TODO(sang): Fix it. - assert self.forward_dag is not None - output_channels = self.forward_dag.execute(1) - ray_worker_outputs = [] - else: - # Start the ray workers first. - ray_workers = self.workers - if async_run_tensor_parallel_workers_only: - ray_workers = self.non_driver_workers - ray_worker_outputs = [ - worker.execute_method.remote(method, *worker_args, - **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(ray_workers, all_worker_args, all_worker_kwargs) - ] + else islice(all_kwargs, first_worker_args_index, None) + + # Start the ray workers first. + ray_workers = self.workers + if async_run_tensor_parallel_workers_only: + ray_workers = self.non_driver_workers + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(ray_workers, all_worker_args, all_worker_kwargs) + ] if async_run_tensor_parallel_workers_only: # Just return futures return ray_worker_outputs - driver_args = args if all_args is None else all_args[0] - driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] - - # Start the driver worker after all the ray workers. - if not use_dummy_driver: - driver_worker_output = self.driver_worker.execute_method( - method, *driver_args, **driver_kwargs) - else: - assert self.driver_dummy_worker is not None - driver_worker_output = ray.get( - self.driver_dummy_worker.execute_method.remote( - method, *driver_args, **driver_kwargs)) + driver_worker_output = [] + # In SPMD mode, the driver worker is the same as any other worker, + # so we only explicitly execute on the driver worker if using a + # non-SPMD worker class. + if not self.use_ray_spmd_worker: + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. + if not use_dummy_driver: + driver_worker_output = [ + self.driver_worker.execute_method(method, *driver_args, + **driver_kwargs) + ] + else: + assert self.driver_dummy_worker is not None + driver_worker_output = [ + ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + ] + # Get the results of the ray workers. if self.workers: - if use_ray_compiled_dag: - try: - ray_worker_outputs = [ - pickle.loads(chan.begin_read()) - for chan in output_channels - ] - finally: - # Has to call end_read in order to reuse the DAG. 
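# A small sketch, separate from the diff, of how _run_workers splits the
# per-worker argument lists: in SPMD mode every Ray worker (rank 0 included)
# gets its own entry, while otherwise the first entry is reserved for the
# in-process driver worker and only the rest is shipped to remote workers.
from itertools import islice

def split_args(all_args, use_ray_spmd_worker: bool):
    first_worker_args_index = 0 if use_ray_spmd_worker else 1
    remote_args = list(islice(all_args, first_worker_args_index, None))
    driver_args = None if use_ray_spmd_worker else all_args[0]
    return driver_args, remote_args

assert split_args([("a",), ("b",), ("c",)], use_ray_spmd_worker=False) == \
    (("a",), [("b",), ("c",)])
assert split_args([("a",), ("b",), ("c",)], use_ray_spmd_worker=True) == \
    (None, [("a",), ("b",), ("c",)])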
- for chan in output_channels: - chan.end_read() - else: - ray_worker_outputs = ray.get(ray_worker_outputs) + ray_worker_outputs = ray.get(ray_worker_outputs) - return [driver_worker_output] + ray_worker_outputs + return driver_worker_output + ray_worker_outputs def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: """Wait for futures returned from _run_workers() with async_run_remote_workers_only to complete.""" ray.get(parallel_worker_tasks) - def _compiled_ray_dag(self): + def _compiled_ray_dag(self, enable_asyncio: bool): import pkg_resources - required_version = "2.9" - current_version = pkg_resources.get_distribution("ray").version + from packaging import version + + required_version = version.parse("2.32") + current_version = version.parse( + pkg_resources.get_distribution("ray").version) if current_version < required_version: raise ValueError(f"Ray version {required_version} or greater is " f"required, but found {current_version}") from ray.dag import InputNode, MultiOutputNode - assert self.parallel_config.distributed_executor_backend == "ray" + assert self.parallel_config.use_ray # Right now, compiled DAG requires at least 1 arg. We send # a dummy value for now. It will be fixed soon. with InputNode() as input_data: forward_dag = MultiOutputNode([ - worker.execute_model_compiled_dag_remote. - bind( # type: ignore[attr-defined] + worker.execute_model_spmd.bind( # type: ignore[attr-defined] input_data) for worker in self.workers ]) - return forward_dag.experimental_compile() + return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + + def __del__(self): + if self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.driver_exec_method = make_async(self.driver_worker.execute_method) + self.pp_locks: Optional[List[asyncio.Lock]] = None + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if not self.use_ray_compiled_dag: + self.driver_exec_method = make_async( + self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return await super().execute_model_async(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=True) + + dag_future = await self.forward_dag.execute_async(execute_model_req) + outputs = await dag_future + return outputs[0] async def _driver_execute_model_async( self, execute_model_req: Optional[ExecuteModelRequest] = None ) -> List[SamplerOutput]: + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") + if not self.tp_driver_workers: + return await self.driver_exec_method("execute_model", + execute_model_req) if self.pp_locks is None: # This locks each pipeline parallel stage so multiple virtual # engines can't execute on the same stage at the same time @@ -329,15 +414,11 @@ async def _driver_execute_model_async( for _ in range(self.parallel_config.pipeline_parallel_size) ] - async def _run_task_with_lock(task, lock, *args, **kwargs): - async with lock: - return await task(*args, **kwargs) - - tasks = [] - tasks.append( + tasks = [ asyncio.create_task( _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], - "execute_model", execute_model_req))) + "execute_model", 
execute_model_req)) + ] for pp_rank, driver_worker in enumerate(self.tp_driver_workers, start=1): tasks.append( @@ -352,8 +433,17 @@ async def _run_task_with_lock(task, lock, *args, **kwargs): return results[-1] async def _start_worker_execution_loop(self): + assert not self.use_ray_spmd_worker, ( + "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1") coros = [ worker.execute_method.remote("start_worker_execution_loop") for worker in self.non_driver_workers ] return await asyncio.gather(*coros) + + def __del__(self): + if self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) From cf7cf029def29c40bc470440e7a1e27c5e18ddc7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 10:26:33 +0200 Subject: [PATCH 109/819] Add torch.compile support (#48) * Remove usage of wrap_in_hpu_graph in PT eager * Add torch.compile support * Update habana_model_runner.py * format.sh pass * do not warmup graphs in non-lazy backend --- vllm/worker/habana_model_runner.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 15ac0035228cc..57a3cf18658d9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -149,8 +149,12 @@ def align_workers(value, op): class HpuModelAdapter(): - def __init__(self, model): + def __init__(self, model, enforce_eager): self.model = model + if not htorch.utils.internal.is_lazy() and not enforce_eager: + self.model = torch.compile(self.model, + backend='hpu_backend', + dynamic=False) def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): @@ -428,7 +432,8 @@ def load_model(self) -> None: # FIXME: Running with disable_tensor_cache=True causes # RuntimeErrors. 
This needs to be debugged with HabanaMemoryProfiler() as m_wrap: - self.model = _maybe_wrap_in_hpu_graph(self.model) + self.model = _maybe_wrap_in_hpu_graph( + self.model, enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -1118,7 +1123,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) self.warmup_all_buckets(self.decode_buckets, False, kv_caches) - if not self.enforce_eager: + if not self.enforce_eager and htorch.utils.internal.is_lazy(): mem_margin = 1.0 - float( os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) free_mem = \ @@ -1150,9 +1155,11 @@ def vocab_size(self) -> int: return self.model_config.get_vocab_size() -def _maybe_wrap_in_hpu_graph(model): +def _maybe_wrap_in_hpu_graph(*args, **kwargs): return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( - model)) if htorch.utils.internal.is_lazy() else HpuModelAdapter(model) + *args, ** + kwargs)) if htorch.utils.internal.is_lazy() else HpuModelAdapter( + *args, **kwargs) class HabanaProfilerCounterHelper(): From 2f675f32170685e2f526ec07413bfb05bf39eaf6 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 15:35:53 +0300 Subject: [PATCH 110/819] use_ray fix --- vllm/executor/habana_executor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 8750c3b00dd9e..f5cf26b687053 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -20,6 +20,8 @@ class HabanaExecutor(ExecutorBase): + uses_ray: bool = False + def _init_executor(self) -> None: """Initialize the worker and load the model.""" self._init_worker() From 16af1c7a8dbdafaab49c9e922e1f7a8ffdf1b89b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 15:36:06 +0300 Subject: [PATCH 111/819] formatting fixes --- vllm/executor/ray_habana_executor.py | 1 - vllm/model_executor/layers/layernorm.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index a616e59b3be60..9e0a89cbeb8aa 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -171,7 +171,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), - "PT_HPU_ENABLE_LAZY_COLLECTIVES": 'true' }, ) for (node_id, _) in worker_node_and_gpu_ids] self._run_workers("update_environment_variables", all_args=all_args_to_update_environment_variables) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index e00cb9ca6e1ac..01429d2fcbd17 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -11,8 +11,9 @@ logger = init_logger(__name__) if is_hpu(): try: - from habana_frameworks.torch.hpex.normalization import ( - FusedRMSNorm as HPUFusedRMSNorm) + from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as + HPUFusedRMSNorm + ) except ImportError: logger.warning( "Could not import HPU FusedRMSNorm kernel. 
" From e5e59a121eb87039c75f661e9557171dcf7f638e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 15:36:20 +0300 Subject: [PATCH 112/819] make block size 128 default on gaudi --- vllm/engine/arg_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index cb0148cce0fe9..e4b223a1b505f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -12,7 +12,7 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, is_hpu if TYPE_CHECKING: from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( @@ -52,7 +52,9 @@ class EngineArgs: pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None - block_size: int = 16 + # NOTE(kzawora): default block size for Gaudi should be 128 + # smaller sizes still work, but very inefficiently + block_size: int = 16 if not is_hpu() else 128 enable_prefix_caching: bool = False disable_sliding_window: bool = False use_v2_block_manager: bool = False From 030a2cb8446e9282c56abf6f37adc02a8f230474 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 30 Jul 2024 16:21:47 +0300 Subject: [PATCH 113/819] Add constraints for HPU UnquantizedFusedMoEMethod --- vllm/model_executor/layers/fused_moe/layer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index fcffe7cc76db7..b49bf40d4746e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -108,6 +108,10 @@ def forward_hpu(self, x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, router_logits: torch.Tensor, top_k: int, renormalize: bool, use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int]): + assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' + assert num_expert_group is None, ('num_expert_group is ' + 'not supported on HPU') + assert topk_group is None, 'topk_group is not supported on HPU' return static_fused_moe(x, w1, w2, router_logits, top_k) def forward_cpu(self, *args, **kwargs): From 2ccf56e468fdd6576425ea0fd8f0ee9fa8c80619 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 31 Jul 2024 10:28:02 +0200 Subject: [PATCH 114/819] Remove redundant torch.device (#139) --- vllm/model_executor/model_loader/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index cbe9ebf35f4dd..bbe49655020da 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -276,7 +276,7 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(torch.device(device_config.device)): + with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config, scheduler_config) From bc1af91125e763cb84d5ecfb5387aae0631c7bde Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 31 Jul 2024 10:28:16 +0200 Subject: [PATCH 115/819] Add functools.wraps decorator to with_mark_steps (#138) * Add 
functools.wraps decorator to with_mark_steps * i cant use functools.wraps properly it seems --- vllm/hpu/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 06f3690aded8b..b7b435c50c295 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -5,11 +5,14 @@ # LICENSE file in the root directory of this source tree. ############################################################################### +from functools import wraps + import habana_frameworks.torch as htorch def with_mark_steps(fn): + @wraps(fn) def wrapped(*args, **kwargs): htorch.core.mark_step() result = fn(*args, **kwargs) From 5c7187d8e726c72689abc26a59a6cece757c3e22 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 31 Jul 2024 10:30:02 +0200 Subject: [PATCH 116/819] Add HPU platform and HpuCommunicator for TP (#136) * Add HPU platform and HpuCommunicator for TP * remove print * whoopsie I forgot to add vllm/platforms/__init__.py * format.sh --- vllm/distributed/communication_op.py | 10 ---- .../device_communicators/hpu_communicator.py | 48 +++++++++++++++++++ vllm/distributed/parallel_state.py | 20 ++++++++ vllm/platforms/__init__.py | 5 +- vllm/platforms/hpu.py | 17 +++++++ vllm/platforms/interface.py | 4 ++ 6 files changed, 93 insertions(+), 11 deletions(-) create mode 100644 vllm/distributed/device_communicators/hpu_communicator.py create mode 100644 vllm/platforms/hpu.py diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py index 63c159fce3d71..32394a07b00b9 100644 --- a/vllm/distributed/communication_op.py +++ b/vllm/distributed/communication_op.py @@ -3,21 +3,11 @@ import torch import torch.distributed -from vllm.utils import is_hpu - from .parallel_state import get_tp_group -if is_hpu(): - import habana_frameworks.torch as htorch - def tensor_model_parallel_all_reduce(input_: torch.Tensor) -> torch.Tensor: """All-reduce the input tensor across model parallel group.""" - if is_hpu(): - # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge - # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used - # (which is required for tensor parallel HPUGraph inference) - htorch.core.mark_step() return get_tp_group().all_reduce(input_) diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py new file mode 100644 index 0000000000000..cc9b19ce022b5 --- /dev/null +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -0,0 +1,48 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_hpu(): + import habana_frameworks.torch as htorch # noqa: F401 + + +class HpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_hpu(): + self.disabled = True + return + self.disabled = False + self.group = group + self.world_size = dist.get_world_size(self.group) + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) + htorch.core.mark_step() + dist.all_reduce(x, group=self.group) + return x + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + if dim < 0: + # Convert negative dim to positive. 
+ dim += x.dim() + input_size = x.size() + # Allocate output tensor. + output_tensor = torch.empty((world_size, ) + input_size, + dtype=x.dtype, + device=x.device) + # All-gather. + htorch.core.mark_step() + dist.all_gather_into_tensor(output_tensor, x, group=self.group) + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (world_size * + input_size[dim], ) + + input_size[dim + 1:]) + return output_tensor diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 4116b1729d188..4971216d450d1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -134,6 +134,7 @@ def __init__( use_pynccl: bool, use_custom_allreduce: bool, use_tpu_communicator: bool, + use_hpu_communicator: bool, use_message_queue_broadcaster: bool = False, ): @@ -166,6 +167,7 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce self.use_tpu_communicator = use_tpu_communicator + self.use_hpu_communicator = use_hpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -198,6 +200,12 @@ def __init__( if use_tpu_communicator and self.world_size > 1: self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.hpu_communicator import ( + HpuCommunicator) + self.hpu_communicator: Optional[HpuCommunicator] + if use_hpu_communicator and self.world_size > 1: + self.hpu_communicator = HpuCommunicator(group=self.device_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -303,6 +311,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: if tpu_comm is not None and not tpu_comm.disabled: return tpu_comm.all_reduce(input_) + # For HPUs, use HPU communicator. + hpu_comm = self.hpu_communicator + if hpu_comm is not None and not hpu_comm.disabled: + return hpu_comm.all_reduce(input_) + if ca_comm is not None: out = ca_comm.custom_all_reduce(input_) if out is not None: @@ -330,6 +343,11 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: if tpu_comm is not None and not tpu_comm.disabled: return tpu_comm.all_gather(input_, dim) + # For HPUs, use HPU communicator. + hpu_comm = self.hpu_communicator + if hpu_comm is not None and not hpu_comm.disabled: + return hpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. 
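# Shape check, separate from the diff, for the reshape performed by
# HpuCommunicator.all_gather above: torch.stack stands in for
# dist.all_gather_into_tensor here, and the gathered dimension grows by
# world_size.
import torch

def fake_all_gather(x: torch.Tensor, world_size: int, dim: int = -1):
    if dim < 0:
        dim += x.dim()
    # Stand-in for the collective: every "rank" contributes the same tensor.
    output_tensor = torch.stack([x] * world_size, dim=0)
    output_tensor = output_tensor.movedim(0, dim)
    input_size = x.size()
    return output_tensor.reshape(input_size[:dim] +
                                 (world_size * input_size[dim], ) +
                                 input_size[dim + 1:])

out = fake_all_gather(torch.randn(4, 8), world_size=2, dim=-1)
assert out.shape == (4, 16)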
dim += input_.dim() @@ -748,6 +766,7 @@ def init_world_group(ranks: List[int], local_rank: int, use_pynccl=False, use_custom_allreduce=False, use_tpu_communicator=False, + use_hpu_communicator=False, ) @@ -767,6 +786,7 @@ def init_model_parallel_group( use_pynccl=True, use_custom_allreduce=use_custom_allreduce, use_tpu_communicator=True, + use_hpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, ) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index eac917786bd6b..8ca674af8d479 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -2,7 +2,7 @@ import torch -from vllm.utils import is_tpu +from vllm.utils import is_hpu, is_tpu from .interface import Platform, PlatformEnum, UnspecifiedPlatform @@ -17,6 +17,9 @@ elif is_tpu(): from .tpu import TpuPlatform current_platform = TpuPlatform() +elif is_hpu(): + from .hpu import HpuPlatform + current_platform = HpuPlatform() else: current_platform = UnspecifiedPlatform() diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py new file mode 100644 index 0000000000000..45f2b95e704d6 --- /dev/null +++ b/vllm/platforms/hpu.py @@ -0,0 +1,17 @@ +from typing import Tuple + +import torch + +from .interface import Platform, PlatformEnum + + +class HpuPlatform(Platform): + _enum = PlatformEnum.HPU + + @staticmethod + def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + raise RuntimeError("HPU does not have device capability.") + + @staticmethod + def inference_mode(): + return torch.no_grad() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 0760f9554fb78..3c7b4dc858327 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum): CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() + HPU = enum.auto() UNSPECIFIED = enum.auto() @@ -23,6 +24,9 @@ def is_rocm(self) -> bool: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU + def is_hpu(self) -> bool: + return self._enum == PlatformEnum.HPU + @staticmethod def get_device_capability(device_id: int = 0) -> Tuple[int, int]: raise NotImplementedError From 667c7f3c0808edd9d204872f34addf5b22bed134 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 5 Aug 2024 10:15:43 +0200 Subject: [PATCH 117/819] Re-enable FusedRoPE (#145) --- vllm/hpu/rotary_embed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index e44bfa2f6210c..30a88d68a24af 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -20,7 +20,7 @@ except ImportError: logger.warning("Could not import HPU FusedRoPE kernel. 
" "vLLM will use forward_native implementation of RoPE.") - FusedRoPE = None + FusedRoPE = None else: FusedRoPE = None From 14c20a333c908fa3991fa18ef970023464eb752f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 5 Aug 2024 11:00:34 +0200 Subject: [PATCH 118/819] Overhaul HPU memory management in HPUGraph capture (#147) * Log more HPU memory metrics during vLLM startup * Overhaul memory management in HPUGraph capture * fix percentage in decode buckets --- vllm/utils.py | 2 +- vllm/worker/habana_model_runner.py | 102 +++++++++++++++++++++++------ vllm/worker/habana_worker.py | 42 +++++++++--- 3 files changed, 118 insertions(+), 28 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index c1d0f37eb154f..8a1bc5de03eb7 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -664,7 +664,7 @@ def get_summary_string(self): return ( f"{format_bytes(self.consumed_device_memory)} of device memory " f"({format_bytes(self.final_device_memory)}/" - f"({format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" + f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" f" and {format_bytes(self.consumed_host_memory)} of host memory " f"({format_bytes(self.final_host_memory)}/" f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 8a220e2ef0171..cf91c69069ed6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -409,7 +409,7 @@ def __init__( # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() - + self._mem_margin: Optional[int] = None self._setup_buckets() def load_model(self) -> None: @@ -1071,10 +1071,15 @@ def warmup_all_buckets(self, buckets, is_prompt, kv_caches): len(buckets), batch_size, seq_len) self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) - def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, - available_mem): - total_batch_seq = 0.001 - total_mem = 0 + def warmup_graphs(self, + strategy, + buckets, + is_prompt, + kv_caches, + available_mem, + starting_mem=0, + total_batch_seq=0.001): + total_mem = starting_mem idx = 0 phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' num_candidates = len(buckets) @@ -1088,14 +1093,18 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, raise NotImplementedError( f'Unsupported graph allocation strategy: {strategy}') buckets = list(sorted(buckets, key=ordering)) - + captured_all = True for idx, (batch_size, seq_len) in enumerate(buckets): # Graph memory usage is proportional to seq dimension in a batch batch_seq = batch_size * seq_len if is_prompt else batch_size mem_estimate = batch_seq / total_batch_seq * total_mem if mem_estimate >= available_mem: + captured_all = False + continue + graphed_bucket = (batch_size, seq_len, is_prompt) + if graphed_bucket in self.graphed_buckets: continue - self.graphed_buckets.add((batch_size, seq_len, is_prompt)) + self.graphed_buckets.add(graphed_bucket) self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) with HabanaMemoryProfiler() as mem_prof: self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) @@ -1104,6 +1113,12 @@ def warmup_graphs(self, strategy, buckets, is_prompt, kv_caches, available_mem -= used_mem total_mem += used_mem total_batch_seq += batch_seq + + return total_mem, total_batch_seq, captured_all + + def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): + num_candidates = len(buckets) + phase = f'Graph/{"Prompt" if is_prompt else 
"Decode"}' graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) msg = (f'{phase} captured:{len(graphed)} ' @@ -1124,22 +1139,63 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_all_buckets(self.decode_buckets, False, kv_caches) if not self.enforce_eager and htorch.utils.internal.is_lazy(): - mem_margin = 1.0 - float( - os.environ.get('VLLM_GRAPH_MEM_MARGIN', '0.02')) - free_mem = \ - mem_margin * HabanaMemoryProfiler.current_free_device_memory() - free_mem = align_workers(free_mem, torch.distributed.ReduceOp.MIN) + assert self.mem_margin is not None, \ + ("HabanaWorker.determine_num_available_blocks needs " + "to be called before warming up the model.") + free_mem = HabanaMemoryProfiler.current_free_device_memory() + graph_free_mem = free_mem - self.mem_margin + graph_free_mem = align_workers(graph_free_mem, + torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float( os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) - prompt_available_memory = prompt_graph_mem_ratio * free_mem - decode_available_memory = free_mem - prompt_available_memory - prompt_strategy = 'min_tokens' + prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem + decode_available_memory = graph_free_mem - prompt_available_memory + msg = (f"Using {format_bytes(graph_free_mem)}" + f"/{format_bytes(free_mem)} " + "of free device memory for HPUGraphs, " + f"{format_bytes(prompt_available_memory)} for prompt and " + f"{format_bytes(decode_available_memory)} for decode " + f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") + logger.info(msg) + prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', + 'min_tokens') decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', 'max_bs') - self.warmup_graphs(prompt_strategy, self.prompt_buckets, True, - kv_caches, prompt_available_memory) - self.warmup_graphs(decode_strategy, self.decode_buckets, False, - kv_caches, decode_available_memory) + mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ + self.warmup_graphs( + prompt_strategy, self.prompt_buckets, True, kv_caches, + prompt_available_memory) + mem_post_decode, decode_batch_seq, decode_captured_all = \ + self.warmup_graphs( + decode_strategy, self.decode_buckets, False, kv_caches, + decode_available_memory) + + # Not all prompt buckets were captured, but all decode buckets were + # captured and we have some free graph-allocated space left. + # Let's try to use it for capturing more prompt buckets. + if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not prompt_captured_all \ + and decode_captured_all: + mem_post_prompt, _, prompt_captured_all = self.warmup_graphs( + prompt_strategy, self.prompt_buckets, True, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_prompt, prompt_batch_seq) + + # Not all decode buckets were captured, but all prompt buckets were + # captured and we have some free graph-allocated space left. + # Let's try to use it for capturing more decode buckets. 
+ if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not decode_captured_all \ + and prompt_captured_all: + mem_post_decode, _, _ = self.warmup_graphs( + decode_strategy, self.decode_buckets, False, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_decode, decode_batch_seq) + + self.log_graph_warmup_summary(self.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary(self.decode_buckets, False, + mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() @@ -1154,6 +1210,14 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: def vocab_size(self) -> int: return self.model_config.get_vocab_size() + @property + def mem_margin(self) -> Optional[int]: + return self._mem_margin + + @mem_margin.setter + def mem_margin(self, value): + self._mem_margin = value + def _maybe_wrap_in_hpu_graph(*args, **kwargs): return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 6be229e037d06..f3fdc4dcc63c6 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -16,14 +16,18 @@ SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) +from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest +from vllm.utils import HabanaMemoryProfiler, format_bytes from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput +logger = init_logger(__name__) + class HabanaWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. @@ -122,20 +126,37 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. 
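# Worked example with made-up numbers mirroring the memory budgeting added
# in this patch: determine_num_available_blocks() (below) bounds usable
# memory by gpu_memory_utilization and reserves a VLLM_GRAPH_RESERVED_MEM
# slice for HPUGraphs before carving the rest into KV cache blocks, and
# warmup_model() (above) later splits the remaining free memory between
# prompt and decode graphs according to VLLM_GRAPH_PROMPT_RATIO.
GiB = 1024**3

# KV cache sizing
free_hpu_memory = 80 * GiB            # free memory after the profiling run
gpu_memory_utilization = 0.9
graph_reserved_mem = 0.4              # VLLM_GRAPH_RESERVED_MEM default
cache_block_size = 2 * 1024 * 1024    # bytes per KV cache block (example)
available_hpu_memory = free_hpu_memory * gpu_memory_utilization      # 72 GiB
hpu_memory_margin = free_hpu_memory * (1 - gpu_memory_utilization)   # 8 GiB
cache_size_bytes = available_hpu_memory * (1 - graph_reserved_mem)   # 43.2 GiB
num_hpu_blocks = int(cache_size_bytes // cache_block_size)           # 22118

# HPUGraph capture budget
free_mem_after_warmup = 20 * GiB
graph_free_mem = free_mem_after_warmup - hpu_memory_margin           # 12 GiB
prompt_graph_mem_ratio = 0.5          # VLLM_GRAPH_PROMPT_RATIO default
prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem    # 6 GiB
decode_available_memory = graph_free_mem - prompt_available_memory   # 6 GiB

# A candidate bucket is captured only while its estimated cost still fits.
total_mem, total_batch_seq = 3 * GiB, 4096.0  # stats of buckets captured so far
batch_size, seq_len = 4, 1024                 # candidate prompt bucket
mem_estimate = batch_size * seq_len / total_batch_seq * total_mem    # 3 GiB
assert mem_estimate < prompt_available_memory  # captured; otherwise skipped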
- self.model_runner.profile_run() - torch.hpu.synchronize() - + with HabanaMemoryProfiler() as m: + self.model_runner.profile_run() + torch.hpu.synchronize() + msg = ("Model profiling run " + f"took {m.get_summary_string()}") + logger.info(msg) # At this point we should've allocated the maximum workspace for all # recipes we will use the extra memory for graphs/blocks free_hpu_memory = torch.hpu.mem_get_info()[0] cache_block_size = self.get_cache_block_size_bytes() - graph_headroom = 1 - (float( + graph_reserved_mem = (float( os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) if not self.model_config.enforce_eager else 0) - num_hpu_blocks = int(free_hpu_memory * graph_headroom * - self.cache_config.gpu_memory_utilization // - cache_block_size) + graph_headroom = 1 - graph_reserved_mem + available_hpu_memory = free_hpu_memory * \ + self.cache_config.gpu_memory_utilization + hpu_memory_margin = free_hpu_memory * ( + 1 - self.cache_config.gpu_memory_utilization) + self.model_runner.mem_margin = hpu_memory_margin + cache_size_bytes = available_hpu_memory * graph_headroom + graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom) + msg = ( + f"Free device memory: {format_bytes(free_hpu_memory)}, " + f"{format_bytes(available_hpu_memory)} usable " + f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization})," + f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs " + f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), " + f"{format_bytes(cache_size_bytes)} reserved for KV cache") + logger.info(msg) + num_hpu_blocks = int(cache_size_bytes // cache_block_size) num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) num_hpu_blocks = max(num_hpu_blocks, 0) @@ -161,7 +182,12 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._init_cache_engine() + with HabanaMemoryProfiler() as m: + self._init_cache_engine() + torch.hpu.synchronize() + msg = ("Initializing cache engine " + f"took {m.get_summary_string()}") + logger.info(msg) self._warm_up_model() def _init_cache_engine(self): From b43c7f915a28ac246aae39901a893eb9e8bed45b Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Tue, 6 Aug 2024 10:53:52 +0200 Subject: [PATCH 119/819] Allocate blocks from id=1 for HPU (#160) --- vllm/core/block/cpu_gpu_block_allocator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 5287cd9c1bfb3..a4805e4f342f9 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,7 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device +from vllm.utils import Device, is_hpu class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -52,7 +52,9 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. 
""" - block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) + # For HPU block ids cannot be equal to 0 + start_id = 1 if is_hpu() else 0 + block_ids = list(range(start_id, num_gpu_blocks + num_cpu_blocks)) gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] From 37ca17f0097dae0a03fee6936062871ec49e2351 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 6 Aug 2024 15:55:24 +0200 Subject: [PATCH 120/819] Revert "Allocate blocks from id=1 for HPU (#160)" (#163) This reverts commit b43c7f915a28ac246aae39901a893eb9e8bed45b. --- vllm/core/block/cpu_gpu_block_allocator.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index a4805e4f342f9..5287cd9c1bfb3 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,7 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device, is_hpu +from vllm.utils import Device class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -52,9 +52,7 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. """ - # For HPU block ids cannot be equal to 0 - start_id = 1 if is_hpu() else 0 - block_ids = list(range(start_id, num_gpu_blocks + num_cpu_blocks)) + block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] From 1f348b85459be2b12f9e86be95ef5a7179f641cf Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Mon, 12 Aug 2024 14:54:04 +0200 Subject: [PATCH 121/819] Reimplement silu_and_mul for mixtral (#167) * Reimplement silu and mul in mixtral * Typo fix --- vllm/hpu/ops.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bd737917cb919..3748eb3544dd1 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -16,13 +16,6 @@ PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') -def silu_and_mul(output, input): - d = input.shape[-1] // 2 - silu = torch.nn.SiLU().to(input.device) - x, y = torch.split(input, d, dim=-1) - output.copy_(silu(x) * y) - - def fetch_from_cache(cache, blocks, permutations): return [ cache.index_select(0, blocks[:, i]).permute(permutations) @@ -81,12 +74,9 @@ def paged_attention_v1(query, return attn_weights.squeeze(-2) -def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: +def silu_and_mul(x: torch.Tensor) -> torch.Tensor: d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - silu_and_mul(out, x) - return out + return F.silu(x[..., :d]) * x[..., d:] def static_fused_moe(hidden_states, w1, w2, score, topk): @@ -111,13 +101,10 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): htorch.core.mark_step() for expert_idx in range(num_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper( - torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(hidden_states, w1[expert_idx].transpose(0, 1)) + w_output = silu_and_mul(w_output) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - current_hidden_states_static = w_output * padded_weight - final_hidden_states 
+= current_hidden_states_static + final_hidden_states += w_output * padded_weights[expert_idx] htorch.core.mark_step() return final_hidden_states.view(-1, D) From d29191000b11b960ca29b65d5876f05756d27ac0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 13 Aug 2024 12:14:31 +0200 Subject: [PATCH 122/819] Enable GitHub Actions static checks for habana_main (#177) * Update target branch * format layernorm.py * adjust format.sh & vllm.hpu.ops * fix layernorm once for all --- .github/workflows/clang-format.yml | 6 +++--- .github/workflows/mypy.yaml | 8 +++++--- .github/workflows/ruff.yml | 6 +++--- .github/workflows/yapf.yml | 6 +++--- format.sh | 1 + vllm/hpu/ops.py | 16 +++++++++++++--- vllm/model_executor/layers/layernorm.py | 12 +----------- 7 files changed, 29 insertions(+), 26 deletions(-) diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index e9b6e28fa6bcb..9d40813a98d7a 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -2,13 +2,13 @@ name: clang-format on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: clang-format: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 5780f09a646cb..c2674b914f485 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -2,13 +2,13 @@ name: mypy on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: ruff: @@ -50,4 +50,6 @@ jobs: mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml + mypy vllm/hpu --config-file pyproject.toml + diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 773def58fd966..a2b7aa2549af9 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -2,13 +2,13 @@ name: ruff on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: ruff: diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 04f307bcf8b0e..4e0d67c5b59d6 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -2,13 +2,13 @@ name: yapf on: # Trigger the workflow on push or pull request, - # but only for the main branch + # but only for the habana_main branch push: branches: - - main + - habana_main pull_request: branches: - - main + - habana_main jobs: yapf: runs-on: ubuntu-latest diff --git a/format.sh b/format.sh index 5ad6d6f2938bb..fbfc27a68bb3d 100755 --- a/format.sh +++ b/format.sh @@ -113,6 +113,7 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml +mypy vllm/hpu --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 3748eb3544dd1..7a40e6e720259 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -12,6 +12,16 @@ import torch.nn.functional as F import vllm.hpu.utils as hpu_utils +from vllm.logger import init_logger + 
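# Quick sanity check, separate from the diff, of the silu_and_mul variant
# reimplemented above for the Mixtral MoE path: the last dimension is split
# in half, the first half is gated with SiLU and multiplied element-wise
# with the second, so the output keeps half of the input's last dimension.
import torch
import torch.nn.functional as F

def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 2 * 11008)        # e.g. tokens x (2 * intermediate size)
out = silu_and_mul(x)
assert out.shape == (4, 11008)
assert torch.equal(out, F.silu(x[:, :11008]) * x[:, 11008:])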
+logger = init_logger() +HPUFusedRMSNorm = None +try: + from habana_frameworks.torch.hpex.normalization import FusedRMSNorm + HPUFusedRMSNorm = FusedRMSNorm +except ImportError: + logger.warning("Could not import HPU FusedRMSNorm kernel. " + "vLLM will use forward_native implementation of RMSNorm.") PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') @@ -52,8 +62,7 @@ def paged_attention_v1(query, keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] mask = mask.unsqueeze(2) - attn_weights = [torch.matmul(query, k) for k in keys] - attn_weights = torch.cat(attn_weights, dim=-1) + attn_weights = torch.cat([torch.matmul(query, k) for k in keys], dim=-1) if alibi_slopes is not None: attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, -attn_weights.size(3):]) @@ -128,7 +137,8 @@ def prompt_attention( query = query.unflatten(1, (kv_heads, -1)) key = key.unflatten(1, (kv_heads, 1)) value = value.unflatten(1, (kv_heads, 1)) - attn_bias = attn_bias.unsqueeze(2) + if attn_bias is not None: + attn_bias = attn_bias.unsqueeze(2) attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) if attn_bias is not None: attn_weights.add_(attn_bias) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 01429d2fcbd17..55cbbabd7da44 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -6,19 +6,8 @@ from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -from vllm.utils import is_hpu logger = init_logger(__name__) -if is_hpu(): - try: - from habana_frameworks.torch.hpex.normalization import (FusedRMSNorm as - HPUFusedRMSNorm - ) - except ImportError: - logger.warning( - "Could not import HPU FusedRMSNorm kernel. " - "vLLM will use forward_native implementation of RMSNorm.") - HPUFusedRMSNorm = None class RMSNorm(CustomOp): @@ -86,6 +75,7 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm.hpu.ops import HPUFusedRMSNorm if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: From 66eae9e75e6e70a69eeefbe24e8a1f0499524a3b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 13 Aug 2024 13:46:19 +0200 Subject: [PATCH 123/819] remove reminder_comment.yml (#179) --- .github/workflows/reminder_comment.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/reminder_comment.yml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 390c88bb65308..0000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: PR Reminder Comment Bot -on: - pull_request_target: - types: [opened] - -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@v6 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. 
\n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From c0984334c495762b10ee37dc817afad9fec0ef57 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 13 Aug 2024 13:46:33 +0200 Subject: [PATCH 124/819] Fix logger initialization in ops.py (#178) --- vllm/hpu/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 7a40e6e720259..c8f00c1cbd59d 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -14,7 +14,7 @@ import vllm.hpu.utils as hpu_utils from vllm.logger import init_logger -logger = init_logger() +logger = init_logger(__name__) HPUFusedRMSNorm = None try: from habana_frameworks.torch.hpex.normalization import FusedRMSNorm From 6f047d864ba3f7b409eeaedfd1e92f61389d31da Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 14 Aug 2024 14:53:48 +0200 Subject: [PATCH 125/819] 1.17 documentation update (#172) --- .../getting_started/gaudi-installation.rst | 234 +++++++++++++++++- 1 file changed, 230 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index a9f3ebdf274f6..7af291d62efc6 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -18,7 +18,7 @@ Requirements - OS: Ubuntu 22.04 LTS - Python: 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.16.0 or newer +- Intel Gaudi software version 1.17.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -44,8 +44,8 @@ Use the following commands to run a Docker image: .. code:: console - $ docker pull vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.2/ubuntu22.04/habanalabs/pytorch-installer-2.2.2:latest + $ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest Build and Install vLLM --------------------------- @@ -112,6 +112,12 @@ Gaudi2 devices. Configurations that are not listed may or may not work. - `meta-llama/Meta-Llama-3-8B-Instruct `__ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling - `meta-llama/Llama-2-70b `__ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Llama-2-70b-chat-hf `__ @@ -120,14 +126,187 @@ Gaudi2 devices. Configurations that are not listed may or may not work. 
with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `meta-llama/Meta-Llama-3-70B-Instruct `__ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - `mistralai/Mistral-7B-Instruct-v0.3 `__ on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling - `mistralai/Mixtral-8x7B-Instruct-v0.1 `__ with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling -Performance Tips +Performance Tuning ================ +Execution modes +------------ + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. + +.. list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode + +.. warning:: + In 1.17.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. + + +Bucketing mechanism +------------ + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. + +.. note:: + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +.. 
code-block:: + + INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + +``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +.. code-block:: + + min = 2, step = 32, max = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) + +Example (without ramp-up) + +.. code-block:: + + min = 128, step = 128, max = 512 + => ramp_up = () + => stable = (128, 256, 384, 512) + => buckets = ramp_up + stable => (128, 256, 384, 512) + + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +.. warning:: + If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. + +.. 
note:: + Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. + +Warmup +------------ + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +.. code-block:: + + INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + ... + INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + ... + INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +.. tip:: + Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. + +HPU Graph capture +------------ + +`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). +Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. +Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. +Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. 
+With its default value (``VLLM_GRAPH_RESERVED_MEM=0.4``), 40% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 60% will be utilized for KV cache. +Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints. +Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +.. note:: + ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: +- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode +- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt + +When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. + + +.. note:: + ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. + + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +.. 
code-block:: + + INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + ... + INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) + INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + ... + INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + ... 
+ INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + + +Recommended vLLM Parameters +------------ + - We recommend running inference on Gaudi 2 with ``block_size`` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine @@ -137,6 +316,53 @@ Performance Tips of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. +Environment variables +------------ + +**Diagnostic and profiling knobs:** + +- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
+ +**Performance tuning knobs:** + +- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.4`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default +- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default +- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default +- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - ``{phase}`` is either ``PROMPT`` or ``DECODE`` + - ``{dim}`` is either ``BS`` or ``SEQ`` + - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` + - Default values: + + - Prompt: + - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``32`` + - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` + - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``1024`` + + - Decode: + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``128`` + - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` + - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``2048`` + + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default +- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs + Troubleshooting: Tweaking HPU Graphs ==================================== From 1e0e492e1400114f9156d61ffdd73585181ed119 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 14 Aug 2024 15:06:19 +0200 Subject: [PATCH 126/819] Readme 1.17 update (#186) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
PR Checklist

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain code quality and improves the efficiency of the review process.

PR Title and Classification

Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following:

  • [Bugfix] for bug fixes.
  • [CI/Build] for build or continuous integration improvements.
  • [Doc] for documentation fixes and improvements.
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • [Frontend] for changes to the vLLM frontend (e.g., OpenAI API server, LLM class, etc.).
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.

Note: If the PR spans more than one category, please include all relevant prefixes.

Code Quality

The PR needs to meet the following code quality standards:

  • We adhere to Google Python style guide and Google C++ style guide.
  • Pass all linter checks. Please use format.sh to format your code.
  • The code needs to be well-documented so that future contributors can easily understand it.
  • Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.
  • Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.

Notes for Large Changes

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not proceed with the PR.

What to Expect for the Reviews

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient, and to make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

  • After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
  • After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.

Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!
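
The README changes in this patch (below) describe the `min`/`step`/`max` bucketing scheme in prose. As a rough, purely illustrative sketch of those semantics - a power-of-two ramp-up below `step`, followed by multiples of `step` up to `max` - the ranges could be generated as follows; the helper name is hypothetical and is not part of this patch:

``` {.python}
def bucket_range(bmin: int, step: int, bmax: int):
    """Illustrative only: generate warmup bucket values from (min, step, max)."""
    # Ramp-up phase: multiply `bmin` by consecutive powers of two until `step` is reached.
    ramp_up = []
    value = bmin
    while value < step and value <= bmax:
        ramp_up.append(value)
        value *= 2
    # Stable phase: multiples of `step` from `step` up to and including `bmax`.
    stable = list(range(step, bmax + 1, step))
    return ramp_up + stable

# Matches the documented examples:
assert bucket_range(2, 32, 64) == [2, 4, 8, 16, 32, 64]      # with ramp-up
assert bucket_range(128, 128, 512) == [128, 256, 384, 512]   # without ramp-up
```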

--- README_GAUDI.md | 497 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 435 insertions(+), 62 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 1a1b2d9cc6e36..a569d6314acf8 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -1,25 +1,25 @@ -# vLLM with IntelÂź GaudiÂź 2 AI Accelerators +vLLM with IntelÂź GaudiÂź AI Accelerators +======================================= -This README provides instructions on running vLLM with Intel Gaudi devices. +This README provides instructions on running vLLM with Intel Gaudi +devices. Requirements and Installation -============================== +============================= -Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the environment. To achieve the best performance, please follow the methods outlined in the -[Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). - -> [!NOTE] -> In this release (1.16.0), we are only targeting functionality and -> accuracy. Performance will be improved in next releases. +Please follow the instructions provided in the [Gaudi Installation +Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) +to set up the environment. To achieve the best performance, please +follow the methods outlined in the [Optimizing Training Platform +Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). Requirements -------------- +------------ - OS: Ubuntu 22.04 LTS - Python: 3.10 -- Intel Gaudi 2 accelerator -- Intel Gaudi software version 1.16.0 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.17.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -29,41 +29,50 @@ $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, ha $ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed ``` -Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. +Refer to [Intel Gaudi Software Stack +Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) +for more details. Run Docker Image ------------------- +---------------- -It is highly recommended to use the latest Docker image from Intel -Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the [Intel Gaudi +documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) +for more details. 
Use the following commands to run a Docker image: ``` {.console} -$ docker pull vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.16.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest - ``` +$ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +``` -Build and Install vLLM-fork ------------------------------ +Build and Install vLLM +---------------------- -To build and install vLLM-fork from source, run: +Currently, the latest features and performance optimizations are +developed in Gaudi\'s [vLLM-fork](https://github.com/HabanaAI/vllm-fork) +and we periodically upstream them to vLLM main repo. To install latest +[HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the +following: ``` {.console} $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork -# git checkout v0.4.2-Gaudi-1.16.0 -$ pip install -e . # This may take 5-10 minutes. +$ git checkout habana_main +$ python setup.py develop ``` Supported Features ================== -- [Offline batched inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) -- Online inference via [OpenAI-Compatible Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- [Offline batched + inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) +- Online inference via [OpenAI-Compatible + Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi 2 - accelerators +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding @@ -72,7 +81,6 @@ Supported Features Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput - Unsupported Features ==================== @@ -82,11 +90,11 @@ Unsupported Features - Quantization (AWQ, FP8 E5M2, FP8 E4M3) - Prefill chunking (mixed-batch inferencing) - Supported Configurations ======================== -The following configurations have been validated to be function with Gaudi devices. Configurations that are not listed may or may not work. +The following configurations have been validated to be function with +Gaudi2 devices. Configurations that are not listed may or may not work. 
- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 @@ -94,47 +102,412 @@ The following configurations have been validated to be function with Gaudi devic - [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling - [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) - with tensor parallelism on 8x HPU, BF16 datatype with random - or greedy sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling - [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) - with tensor parallelism 8x HPU, BF16 datatype with random - or greedy sampling + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) + with tensor parallelism on 8x HPU, BF16 datatype with random or + greedy sampling - [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) - on single HPU or with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling + on single HPU or with tensor parallelism on 2x HPU, BF16 datatype + with random or greedy sampling - [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) - with tensor parallelism 2x HPU, BF16 datatype with random or greedy sampling + with tensor parallelism on 2x HPU, BF16 datatype with random or + greedy sampling + +Performance Tuning +================ +Execution modes +----------------------------- +Currently in vLLM for HPU we support four execution modes, depending on +selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment +variable), and `--enforce-eager` flag. 
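
For illustration only (the model name and prompt are placeholders), offline inference can be pinned to one of the modes from the table below by setting `PT_HPU_LAZY_MODE` before vLLM is imported and choosing `enforce_eager` accordingly:

``` {.python}
import os

# Select the HPU PyTorch Bridge backend before vLLM (and the Habana frameworks) are imported.
os.environ["PT_HPU_LAZY_MODE"] = "1"   # 1 = lazy backend, 0 = eager/torch.compile backend

from vllm import LLM, SamplingParams

# With PT_HPU_LAZY_MODE=1 and enforce_eager=False, vLLM runs with HPU Graphs enabled.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enforce_eager=False)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=32))
print(outputs[0].outputs[0].text)
```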
-Performance Tips -================ +| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | +|--- |--- |--- | +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | + + +> [!WARNING] +> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly +> experimental and should be only used for validating functional +> correctness. Their performance will be improved in the next releases. +> For obtaining the best performance in 1.17.0, please use HPU Graphs, or +> PyTorch lazy mode. + +Bucketing mechanism +----------------------------- + +Intel Gaudi accelerators work best when operating on models with fixed +tensor shapes. [Intel Gaudi Graph +Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) +is responsible for generating optimized binary code that implements the +given model topology on Gaudi. In its default configuration, the +produced binary code may be heavily dependent on input and output tensor +shapes, and can require graph recompilation when encountering +differently shaped tensors within the same topology. While the resulting +binaries utilize Gaudi efficiently, the compilation itself may introduce +a noticeable overhead in end-to-end execution. In a dynamic inference +serving scenario, there is a need to minimize the number of graph +compilations and reduce the risk of graph compilation occurring during +server runtime. Currently it is achieved by \"bucketing\" model\'s +forward pass across two dimensions - `batch_size` and `sequence_length`. + +> [!NOTE] +> Bucketing allows us to reduce the number of required graphs +> significantly, but it does not handle any graph compilation and device +> code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - `min`, `step` and +`max`. They can be set separately for prompt and decode phase, and for +batch size and sequence length dimension. These parameters can be +observed in logs during vLLM startup: + +``` {.} +INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +``` -- We recommend running inference on Gaudi 2 with - `block_size` of 128 for BF16 data type. Using default - values (16, 32) might lead to sub-optimal performance due to Matrix - Multiplication Engine under-utilization (see [Gaudi +`min` determines the lowest value of the bucket. 
`step` determines the +interval between buckets, and `max` determines the upper bound of the +bucket. Furthermore, interval between `min` and `step` has special +handling - `min` gets multiplied by consecutive powers of two, until +`step` gets reached. We call this the ramp-up phase and it is used for +handling lower batch sizes with minimum wastage, while allowing larger +padding on larger batch sizes. + +Example (with ramp-up) + +``` {.} +min = 2, step = 32, max = 64 +=> ramp_up = (2, 4, 8, 16) +=> stable = (32, 64) +=> buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) +``` + +Example (without ramp-up) + +``` {.} +min = 128, step = 128, max = 512 +=> ramp_up = () +=> stable = (128, 256, 384, 512) +=> buckets = ramp_up + stable => (128, 256, 384, 512) +``` + +In the logged scenario, 24 buckets were generated for prompt (prefill) +runs, and 48 buckets for decode runs. Each bucket corresponds to a +separate optimized device binary for a given model with specified tensor +shapes. Whenever a batch of requests is processed, it is padded across +batch and sequence length dimension to the smallest possible bucket. + +> [!WARNING] +> If a request exceeds maximum bucket size in any dimension, it will be +> processed without padding, and its processing may require a graph +> compilation, potentially significantly increasing end-to-end latency. +> The boundaries of the buckets are user-configurable via environment +> variables, and upper bucket boundaries can be increased to avoid such +> scenario. + +As an example, if a request of 3 sequences, with max sequence length of +412 comes in to an idle vLLM server, it will be padded executed as +`(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be +padded to 4 (closest batch\_size dimension higher than 3), and max +sequence length will be padded to 512 (closest sequence length dimension +higher than 412). After prefill stage, it will be executed as `(4, 512)` +decode bucket and will continue as that bucket until either batch +dimension changes (due to request being finished) - in which case it +will become a `(2, 512)` bucket, or context length increases above 512 +tokens, in which case it will become `(4, 640)` bucket. + +> [!NOTE] +> Bucketing is transparent to a client - padding in sequence length +> dimension is never returned to the client, and padding in batch +> dimension does not create new requests. + +Warmup +------ + +Warmup is an optional, but highly recommended step occurring before vLLM +server starts listening. It executes a forward pass for each bucket with +dummy data. The goal is to pre-compile all graphs and not incur any +graph compilation overheads within bucket boundaries during server +runtime. Each warmup step is logged during vLLM startup: + +``` {.} +INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +... 
+INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +... +INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +``` + +This example uses the same buckets as in *Bucketing mechanism* section. +Each output line corresponds to execution of a single bucket. When +bucket is executed for the first time, its graph is compiled and can be +reused later on, skipping further graph compilations. + +> [!TIP] +> Compiling all the buckets might take some time and can be turned off +> with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if +> you do that, you may face graph compilations once executing a given +> bucket for the first time. It is fine to disable warmup for development, +> but it\'s highly recommended to enable it in deployment. + +HPU Graph capture +----------------------------- + +[HPU +Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) +are currently the most performant execution method of vLLM on Intel +Gaudi. When HPU Graphs are enabled, execution graphs will be traced +(recorded) ahead of time (after performing warmup), to be later replayed +during inference, significantly reducing host overheads. Recording can +take large amounts of memory, which needs to be taken into account when +allocating KV cache. Enabling HPU Graphs will impact the number of +available KV cache blocks, but vLLM provides user-configurable variables +to control memory management. + +When HPU Graphs are being used, they share the common memory pool +(\"usable memory\") as KV cache, determined by `gpu_memory_utilization` +flag (`0.9` by default). Before KV cache gets allocated, model weights +are loaded onto the device, and a forward pass of the model is executed +on dummy data, to estimate memory usage. Only after that, +`gpu_memory_utilization` flag is utilized - at its default value, will +mark 90% of free device memory at that point as usable. Next, KV cache +gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of +memory reserved for HPU Graphs capture. With its default value +(`VLLM_GRAPH_RESERVED_MEM=0.4`), 40% of usable memory will be reserved +for graph capture (later referred to as \"usable graph memory\"), and +the remaining 60% will be utilized for KV cache. Environment variable +`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory +reserved for prefill and decode graphs. By default +(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory +constraints. Lower value corresponds to less usable graph memory +reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will +reserve 20% of usable graph memory for prefill graphs, and 80% of usable +graph memory for decode graphs. + +> [!NOTE] +> `gpu_memory_utilization` does not correspond to the absolute memory +> usage across HPU. 
It specifies the memory margin after loading the model +> and performing a profile run. If device has 100 GiB of total memory, and +> 50 GiB of free memory after loading model weights and executing +> profiling run, `gpu_memory_utilization` at its default value will mark +> 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total +> device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt +and decode stages separately. Strategy affects the order of capturing +graphs. There are two strategies implemented: - `max_bs` - graph capture +queue will sorted in descending order by their batch sizes. Buckets with +equal batch sizes are sorted by sequence length in ascending order (e.g. +`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, +`(1,256)`), default strategy for decode - `min_tokens` - graph capture +queue will be sorted in ascending order by the number of tokens each +graph processes (`batch_size*sequence_length`), default strategy for +prompt + +When there\'s large amount of requests pending, vLLM scheduler will +attempt to fill the maximum batch size for decode as soon as possible. +When a request is finished, decode batch size decreases. When that +happens, vLLM will attempt to schedule a prefill iteration for requests +in the waiting queue, to fill the decode batch size to its previous +state. This means that in a full load scenario, decode batch size is +often at its maximum, which makes large batch size HPU Graphs crucial to +capture, as reflected by `max_bs` strategy. On the other hand, prefills +will be executed most frequently with very low batch sizes (1-4), which +is reflected in `min_tokens` strategy. + +> [!NOTE] +> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by +> graphs for each stage (prefill and decode). vLLM will first attempt to +> use up entirety of usable prefill graph memory (usable graph memory \* +> `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it +> will attempt do the same for decode graphs and usable decode graph +> memory pool. If one stage is fully captured, and there is unused memory +> left within usable graph memory pool, vLLM will attempt further graph +> capture for the other stage, until no more HPU Graphs can be captured +> without exceeding reserved memory pool. The behavior on that mechanism +> can be observed in the example below. 
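
For orientation, the memory split reported in the log excerpt below can be reproduced with simple arithmetic. The sketch below is purely illustrative and just plugs in the values from that example (79.16 GiB free device memory, `gpu_memory_utilization=0.5`, `VLLM_GRAPH_RESERVED_MEM=0.4`, `VLLM_GRAPH_PROMPT_RATIO=0.5`); the logged numbers may differ slightly due to rounding and the point at which free memory is measured:

``` {.python}
free_device_mem = 79.16                          # GiB free after loading weights and the profiling run
usable_mem = free_device_mem * 0.5               # gpu_memory_utilization=0.5  -> 39.58 GiB usable
graph_mem = usable_mem * 0.4                     # VLLM_GRAPH_RESERVED_MEM=0.4 -> ~15.83 GiB for HPU Graphs
kv_cache_mem = usable_mem - graph_mem            # remainder                   -> ~23.75 GiB for KV cache
prompt_graph_mem = graph_mem * 0.5               # VLLM_GRAPH_PROMPT_RATIO=0.5 -> ~7.9 GiB for prompt graphs
decode_graph_mem = graph_mem - prompt_graph_mem  # remainder                   -> ~7.9 GiB for decode graphs
```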
+ +Each described step is logged by vLLM server, as follows (negative +values correspond to memory being released): + +``` {.} +INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +... +INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) +INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +... +INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +... 
+INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +``` + +Recommended vLLM Parameters +----------------------------- + +- We recommend running inference on Gaudi 2 with `block_size` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). - For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs enabled. - If you encounter out-of-memory issues, see troubleshooting section. + of 128 or 256 and max context length of 2048 with HPU Graphs + enabled. If you encounter out-of-memory issues, see troubleshooting + section. + +Environment variables +----------------------------- + +**Diagnostic and profiling knobs:** + +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be + enabled. Resulting JSON traces can be viewed in + [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled + by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph + compilations per each vLLM engine step, only when there was any - + highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. + Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph + compilations per each vLLM engine step, always, even if there were + none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks + per each vLLM engine step, only when there was any. Disabled by + default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu + fallbacks per each vLLM engine step, always, even if there were + none. Disabled by default. 
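
All of the knobs above are plain environment variables, so they can be
exported in the shell before starting the server, or set from Python
before vLLM is imported for offline runs. A minimal sketch (the model
name is a placeholder; any supported model works):

``` {.python}
import os

# Set these before vLLM (and the HPU PyTorch bridge) is imported.
os.environ["VLLM_PROFILER_ENABLED"] = "true"
os.environ["VLLM_HPU_LOG_STEP_GRAPH_COMPILATION"] = "true"
os.environ["PT_HPU_METRICS_GC_DETAILS"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-hf")  # placeholder model
outputs = llm.generate(["The capital of France is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```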
+ +**Performance tuning knobs:** + +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by + default +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for + HPUGraph capture, `0.4` by default +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory + dedicated for prompt graphs, `0.5` by default +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt + graph capture, `min_tokens` or `max_bs`, `min_tokens` by default +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode + graph capture, `min_tokens` or `max_bs`, `max_bs` by default +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment + variables configuring ranges of bucketing mechanism + - `{phase}` is either `PROMPT` or `DECODE` + - `{dim}` is either `BS` or `SEQ` + - `{param}` is either `MIN`, `STEP` or `MAX` + - Default values: + - Prompt: + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): + `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): + `block_size` + - sequence length step + (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): + `1024` + + - Decode: + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): + `128` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): + `max_num_seqs` + - sequence length min (`VLLM_DECODE_SEQ_BUCKET_MIN`): + `block_size` + - sequence length step + (`VLLM_DECODE_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_SEQ_BUCKET_MAX`): + `2048` + +Additionally, there are HPU PyTorch Bridge environment variables +impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be + used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is + default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor + parallel inference with HPU Graphs Troubleshooting: Tweaking HPU Graphs ==================================== -If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: - -- Tweak `gpu_memory_utilization` knob. It - will decrease the allocation of KV cache, leaving some headroom for - capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. - It attempts to allocate \~90% of HBM left for KV cache after short - profiling run. Note that decreasing reduces the number of KV - cache blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. - -- If this method is not efficient, you can disable `HPUGraph` completely. With - HPU Graphs disabled, you are trading latency and throughput at lower - batches for potentially higher throughput on higher batches. You can do - that by adding `--enforce-eager` flag to server (for - online inference), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the allocation + of KV cache, leaving some headroom for capturing graphs with larger + batch size. By default `gpu_memory_utilization` is set to 0.9. 
It
+  attempts to allocate \~90% of HBM left for KV cache after short
+  profiling run. Note that decreasing reduces the number of KV cache
+  blocks you have available, and therefore reduces the effective
+  maximum number of tokens you can handle at a given time.
+- If this method is not efficient, you can disable `HPUGraph`
+  completely. With HPU Graphs disabled, you are trading latency and
+  throughput at lower batches for potentially higher throughput on
+  higher batches. You can do that by adding `--enforce-eager` flag to
+  server (for online inference), or by passing `enforce_eager=True`
+  argument to LLM constructor (for offline inference).

From b0112c3a9a075e83f5bb98127586d925402f3614 Mon Sep 17 00:00:00 2001
From: Nir David <124874956+nirda7@users.noreply.github.com>
Date: Wed, 14 Aug 2024 19:34:25 +0300
Subject: [PATCH 127/819] Support FP8 INC in vLLM (#144)
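
A minimal offline-inference sketch of how the INC FP8 path added in this
patch can be exercised. The model name and `QUANT_CONFIG` path are
placeholders, the JSON schema of the config file is defined by Intel
Neural Compressor (not shown here), and `weights_load_device` is assumed
to be forwarded by `LLM` like any other engine argument:

```python
import os

# Placeholder path; the measurement/quantization JSON schema comes from
# Intel Neural Compressor.
os.environ["QUANT_CONFIG"] = "/path/to/inc_quant_config.json"

from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",  # placeholder model
    quantization="inc",                # INC quantization method added below
    kv_cache_dtype="fp8_inc",          # FP8 KV cache on HPU
    weights_load_device="cpu",         # new knob: device used to load weights
)

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)

# In measurement mode, flush INC calibration data before exiting.
llm.finish_measurements()
```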
--- README_GAUDI.md | 3 +- .../getting_started/gaudi-installation.rst | 3 +- vllm/attention/backends/habana_attn.py | 26 +++- vllm/attention/ops/habana_paged_attn.py | 10 ++ vllm/config.py | 8 +- vllm/engine/arg_utils.py | 14 ++- vllm/engine/llm_engine.py | 6 +- vllm/entrypoints/llm.py | 3 + vllm/executor/habana_executor.py | 9 ++ vllm/executor/ray_habana_executor.py | 3 + vllm/hpu/cache_ops.py | 31 +++++ vllm/hpu/ops.py | 33 +++-- vllm/hpu/utils.py | 40 ++++++ vllm/model_executor/layers/layernorm.py | 11 +- vllm/model_executor/layers/linear.py | 10 +- .../layers/quantization/__init__.py | 2 + .../model_executor/layers/quantization/inc.py | 115 ++++++++++++++++++ vllm/model_executor/model_loader/loader.py | 22 ++-- vllm/model_executor/models/llama.py | 6 + vllm/utils.py | 1 + vllm/worker/cache_engine.py | 4 +- vllm/worker/habana_model_runner.py | 57 ++++++++- vllm/worker/habana_worker.py | 21 ++++ 23 files changed, 387 insertions(+), 51 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/inc.py diff --git a/README_GAUDI.md b/README_GAUDI.md index a569d6314acf8..9ea30a2e43f69 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -26,7 +26,8 @@ To verify that the Intel Gaudi software was correctly installed, run: ``` {.console} $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed -$ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural-compressor is installed ``` Refer to [Intel Gaudi Software Stack diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 7af291d62efc6..ddbac022a8d9d 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -26,7 +26,8 @@ To verify that the Intel Gaudi software was correctly installed, run: $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed - $ pip list | habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml, habana-media-loader and habana_quantization_toolkit are installed + $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed + $ pip list | grep neural # verify that neural_compressor is installed Refer to `Intel Gaudi Software Stack Verification `__ diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 33b6e2e538b13..7a867e79b203d 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -12,6 +12,8 @@ AttentionMetadata, AttentionType) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) +from vllm.hpu import cache_ops +from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache from vllm.logger import init_logger logger = init_logger(__name__) @@ -108,7 +110,7 @@ def __post_init__(self): self.attn_bias: 
Optional[torch.Tensor] = None -class HabanaAttentionImpl(AttentionImpl): +class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): """ If the input tensors contain prompt tokens, the layout is as follows: |<--------------- num_prefill_tokens ----------------->| @@ -137,10 +139,16 @@ def __init__( blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, ) -> None: + super(AttentionImpl, self).__init__() self.kv_cache_dtype = kv_cache_dtype self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) + self.matmul_qk = Matmul() + self.softmax = Softmax() + self.matmul_av = Matmul() + self.k_cache = VLLMKVCache() + self.v_cache = VLLMKVCache() self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window self.position_bias = None @@ -204,9 +212,13 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - HabanaPagedAttention.write_to_paged_cache( - key, value, key_cache, value_cache, attn_metadata.slot_mapping, - self.kv_cache_dtype, attn_metadata.is_prompt) + num_kv_cache_passes, num_slots_available, indices, offsets = \ + cache_ops.prepare_to_cache(key_cache, + attn_metadata.slot_mapping) + key_cache = self.k_cache(key, key_cache, num_kv_cache_passes, + num_slots_available, indices, offsets) + value_cache = self.v_cache(value, value_cache, num_kv_cache_passes, + num_slots_available, indices, offsets) if attn_metadata.is_prompt: # Prompt run. @@ -232,6 +244,9 @@ def forward( attn_bias=attn_bias, p=0.0, scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, ) output = out.reshape(batch_size, seq_len, hidden_size) else: @@ -255,7 +270,8 @@ def forward( query, key_cache, value_cache, attn_metadata.block_tables, attn_metadata.seq_lens_tensor, self.kv_cache_dtype, self.num_kv_heads, self.scale, self.position_bias, k_scale, - v_scale) + v_scale, self.matmul_qk, self.softmax, self.matmul_av, + self.k_cache, self.v_cache) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 7dd701c7a0cdf..9602886299c47 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -75,6 +75,11 @@ def forward_decode( alibi_slopes: Optional[torch.Tensor], k_scale: float, v_scale: float, + matmul_qk_op, + softmax_op, + matmul_av_op, + k_cache_cls, + v_cache_cls, ) -> torch.Tensor: block_size = value_cache.shape[1] return ops.paged_attention_v1( @@ -88,6 +93,11 @@ def forward_decode( block_size, alibi_slopes, kv_cache_dtype, + matmul_qk_op, + softmax_op, + matmul_av_op, + k_cache_cls, + v_cache_cls, ) @staticmethod diff --git a/vllm/config.py b/vllm/config.py index f16bea16fe646..6acb70ad047b2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -474,12 +474,13 @@ def _verify_args(self) -> None: def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": pass - elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2"): + elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"): logger.info( "Using fp8 data type to store kv cache. It reduces the GPU " "memory footprint and boosts the performance. " "Meanwhile, it may cause accuracy drop without a proper " - "scaling factor") + "scaling factor. 
" + "Intel Gaudi (HPU) supports fp8 (using fp8_inc).") else: raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}") @@ -600,11 +601,12 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. - + device: Device on which weights are loaded. """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO download_dir: Optional[str] = None + device: Optional[str] = None model_loader_extra_config: Optional[Union[str, dict]] = field( default_factory=dict) ignore_patterns: Optional[Union[List[str], str]] = None diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e4b223a1b505f..d6c544750afea 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -38,6 +38,7 @@ class EngineArgs: trust_remote_code: bool = False download_dir: Optional[str] = None load_format: str = 'auto' + weights_load_device: Optional[str] = None dtype: str = 'auto' kv_cache_dtype: str = 'auto' quantization_param_path: Optional[str] = None @@ -205,6 +206,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'section for more information.\n' '* "bitsandbytes" will load the weights using bitsandbytes ' 'quantization.\n') + parser.add_argument("--weights-load-device", + type=str, + default=EngineArgs.weights_load_device, + choices=["cuda", "neuron", "hpu", "cpu"], + help='Device on which weights are loaded.') parser.add_argument( '--dtype', type=str, @@ -223,11 +229,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], default=EngineArgs.kv_cache_dtype, help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') + 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3). 
' + 'Intel Gaudi (HPU) supports fp8 (using fp8_inc).') parser.add_argument( '--quantization-param-path', type=nullable_str, @@ -835,9 +842,12 @@ def create_engine_config(self, ) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path + device = device_config.device if self.weights_load_device is None else \ + self.weights_load_device load_config = LoadConfig( load_format=self.load_format, download_dir=self.download_dir, + device=device, model_loader_extra_config=self.model_loader_extra_config, ignore_patterns=self.ignore_patterns, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3f7e0a7a4dc53..f8b9c48bc9589 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -182,7 +182,7 @@ def __init__( "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " + "weights_load_device=%s, enforce_eager=%s, kv_cache_dtype=%s, " "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, observability_config=%r, " "seed=%d, served_model_name=%s, use_v2_block_manager=%s, " @@ -206,6 +206,7 @@ def __init__( parallel_config.pipeline_parallel_size, parallel_config.disable_custom_all_reduce, model_config.quantization, + load_config.device, model_config.enforce_eager, cache_config.cache_dtype, model_config.quantization_param_path, @@ -853,6 +854,9 @@ def _process_model_outputs( request_outputs.append(request_output) return request_outputs + def finish_measurements(self): + self.model_executor.finish_measurements() + def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 62309ed345b1d..fc9f118ff14b2 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -173,6 +173,9 @@ def set_tokenizer( self.llm_engine.tokenizer.tokenizer = get_cached_tokenizer( tokenizer) + def finish_measurements(self): + self.llm_engine.finish_measurements() + @overload # LEGACY: single (prompt + optional token ids) def generate( self, diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index f5cf26b687053..80f8037a2d043 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -90,6 +90,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" logger.info(msg) + def finish_measurements(self): + self.driver_worker.finish_measurements() + def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: @@ -180,6 +183,12 @@ def check_health(self) -> None: # it's running. 
return + def shutdown(self) -> None: + self.driver_worker.shutdown_inc() + + def __del__(self): + self.shutdown() + class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 9e0a89cbeb8aa..17e3414a96b57 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -237,6 +237,9 @@ def _driver_execute_model( return self.driver_worker.execute_method("execute_model", execute_model_req) + def finish_measurements(self): + self._run_workers("finish_measurements") + def execute_model( self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 14824945aa53a..98f109accea06 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -43,6 +43,37 @@ def reshape_and_cache(key, value[start_idx:end_idx]) +def prepare_to_cache(cache, slot_mapping): + num_blocks = cache.size(0) + block_size = cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. + # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. + num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + + return num_kv_cache_passes, num_slots_available, indices, offsets + + +def insert_or_update_cache(input, cache, num_kv_cache_passes, + num_slots_available, block_indices, block_offsets): + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + cache.index_put_((block_indices[start_idx:end_idx], + block_offsets[start_idx:end_idx]), + input[start_idx:end_idx]) + + def swap_blocks(src, dst, block_mapping): index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index c8f00c1cbd59d..23f6964723d3f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -11,7 +11,6 @@ import torch import torch.nn.functional as F -import vllm.hpu.utils as hpu_utils from vllm.logger import init_logger logger = init_logger(__name__) @@ -33,7 +32,6 @@ def fetch_from_cache(cache, blocks, permutations): ] -@hpu_utils.with_mark_steps def paged_attention_v1(query, key_cache, value_cache, @@ -43,7 +41,12 @@ def paged_attention_v1(query, context_lens, block_size, alibi_slopes=None, - kv_cache_dtype=None) -> None: + kv_cache_dtype=None, + matmul_qk_op=torch.matmul, + softmax_op=torch.softmax, + matmul_av_op=torch.matmul, + k_cache_cls=None, + v_cache_cls=None) -> None: seq_len = block_tables.size(1) batch_size, query_heads, _ = query.shape _, _, kv_heads, _ = key_cache.shape @@ -56,19 +59,23 @@ def paged_attention_v1(query, batch_size, 1, 1, -1)) query.mul_(scale) query = query.unsqueeze(-2) - keys = fetch_from_cache(key_cache, block_tables, (0, 2, 3, 1)) + fetch_keys = fetch_from_cache if k_cache_cls is None else \ + k_cache_cls.fetch_from_cache + keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 
1)) if query_heads != kv_heads: query = query.unflatten(1, (kv_heads, -1)) keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] mask = mask.unsqueeze(2) - attn_weights = torch.cat([torch.matmul(query, k) for k in keys], dim=-1) + attn_weights = torch.cat([matmul_qk_op(query, k) for k in keys], dim=-1) if alibi_slopes is not None: attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, -attn_weights.size(3):]) - attn_weights = (attn_weights.masked_fill(mask, min_inf).softmax(dim=-1)) + attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) - values = fetch_from_cache(value_cache, block_tables, (0, 2, 1, 3)) + fetch_values = fetch_from_cache if v_cache_cls is None else \ + v_cache_cls.fetch_from_cache + values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) if PA_SPLIT_VALUE: attn_weights = attn_weights.split(block_size, dim=-1) else: @@ -76,7 +83,7 @@ def paged_attention_v1(query, attn_weights = [attn_weights] if query_heads != kv_heads: values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [torch.matmul(a, v) for a, v in zip(attn_weights, values)] + attn_weights = [matmul_av_op(a, v) for a, v in zip(attn_weights, values)] if query_heads != kv_heads: attn_weights = [a.flatten(1, 2) for a in attn_weights] attn_weights = sum(attn_weights) @@ -119,7 +126,6 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): return final_hidden_states.view(-1, D) -@hpu_utils.with_mark_steps def prompt_attention( query: torch.Tensor, key: torch.Tensor, @@ -127,6 +133,9 @@ def prompt_attention( attn_bias: Optional[torch.Tensor] = None, p: float = 0.0, scale: Optional[float] = None, + matmul_qk_op=torch.matmul, + softmax_op=torch.softmax, + matmul_av_op=torch.matmul, ) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) @@ -139,11 +148,11 @@ def prompt_attention( value = value.unflatten(1, (kv_heads, 1)) if attn_bias is not None: attn_bias = attn_bias.unsqueeze(2) - attn_weights = torch.matmul(query * scale, key.transpose(-1, -2)) + attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) if attn_bias is not None: attn_weights.add_(attn_bias) - attn_weights = torch.softmax(attn_weights, dim=-1) - attn_weights = torch.matmul(attn_weights, value) + attn_weights = softmax_op(attn_weights, dim=-1) + attn_weights = matmul_av_op(attn_weights, value) if query_heads != kv_heads: attn_weights = attn_weights.flatten(1, 2) attn_weights = attn_weights.transpose(1, 2) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index b7b435c50c295..3d9c7cb1c4c22 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -8,6 +8,9 @@ from functools import wraps import habana_frameworks.torch as htorch +import torch + +from vllm.hpu.cache_ops import insert_or_update_cache def with_mark_steps(fn): @@ -22,3 +25,40 @@ def wrapped(*args, **kwargs): return result return wrapped + + +class Matmul(torch.nn.Module): + + def __init__(self): + super(Matmul, self).__init__() + + def forward(self, x, y): + return torch.matmul(x, y) + + +class Softmax(torch.nn.Module): + + def __init__(self): + super().__init__() + + def forward(self, x, dim=None, inv_head=None): + return torch.softmax(x, dim) + + +class VLLMKVCache(torch.nn.Module): + + def __init__(self): + super(VLLMKVCache, self).__init__() + + def forward(self, input, cache, num_kv_cache_passes, num_slots_available, + block_indices, block_offset): + insert_or_update_cache(input, cache, num_kv_cache_passes, + num_slots_available, block_indices, + block_offset) + return cache + + def fetch_from_cache(self, 
cache, blocks, permutations): + return [ + cache.index_select(0, blocks[:, i]).permute(permutations) + for i in range(blocks.size(1)) + ] diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 55cbbabd7da44..c12668c14887d 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -79,18 +79,15 @@ def forward_hpu( if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: - orig_dtype = x.dtype orig_shape = x.shape residual += x.view(residual.shape) # Note: HPUFusedRMSNorm requires 3D tensors as inputs - x = HPUFusedRMSNorm.apply(residual.float(), self.weight.float(), + x = HPUFusedRMSNorm.apply(residual, self.weight, self.variance_epsilon) - return x.to(orig_dtype).view(orig_shape), residual + return x.view(orig_shape), residual - orig_dtype = x.dtype - x = HPUFusedRMSNorm.apply(x.float(), self.weight.float(), - self.variance_epsilon) - return x.to(orig_dtype) + x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon) + return x def forward_xpu( self, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index b6e280ae65049..10c8a95f838da 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -273,6 +273,7 @@ def __init__(self, quant_config, prefix) self.gather_output = gather_output + self.collective_func = tensor_model_parallel_all_gather # Divide the weight matrix along the last dimension. tp_size = get_tensor_model_parallel_world_size() @@ -334,7 +335,7 @@ def forward(self, input_): output_parallel = self.quant_method.apply(self, input_, bias) if self.gather_output: # All-gather across the partitions. - output = tensor_model_parallel_all_gather(output_parallel) + output = self.collective_func(output_parallel) else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None @@ -723,6 +724,7 @@ def __init__(self, self.input_is_parallel = input_is_parallel self.reduce_results = reduce_results + self.collective_func = tensor_model_parallel_all_reduce # Divide the weight matrix along the last dimension. self.tp_rank = get_tensor_model_parallel_rank() @@ -770,7 +772,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param_data.shape == loaded_weight.shape param_data.copy_(loaded_weight) - def forward(self, input_): + def resolve_input(self, input_): if self.input_is_parallel: input_parallel = input_ else: @@ -778,6 +780,10 @@ def forward(self, input_): splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.tp_size) input_parallel = splitted_input[tp_rank].contiguous() + return input_parallel + + def forward(self, input_): + input_parallel = self.resolve_input(input_) # Matrix multiply. 
assert self.quant_method is not None diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index bd574512e3431..7590d3e980275 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -18,6 +18,7 @@ GPTQMarlinConfig) from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) +from vllm.model_executor.layers.quantization.inc import INCConfig from vllm.model_executor.layers.quantization.marlin import MarlinConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig @@ -37,6 +38,7 @@ "squeezellm": SqueezeLLMConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, + "inc": INCConfig, } diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py new file mode 100644 index 0000000000000..f6718ec2ac9e7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/inc.py @@ -0,0 +1,115 @@ +from typing import Any, Dict, List, Optional + +import torch +import torch.nn.functional as F +from torch.nn.parameter import Parameter + +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.utils import set_weight_attrs + +ACTIVATION_SCHEMES = ["static", "dynamic"] + +logger = init_logger(__name__) + + +class INCConfig(QuantizationConfig): + """Config class for FP8.""" + + def __init__( + self, + is_checkpoint_fp8_serialized: bool = False, + activation_scheme: str = "dynamic", + ) -> None: + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + if is_checkpoint_fp8_serialized: + logger.warning("Detected fp8 checkpoint. Please note that the " + "format is experimental and subject to change.") + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError( + f"Unsupported activation scheme {activation_scheme}") + self.activation_scheme = activation_scheme + + @classmethod + def get_name(cls) -> str: + return "inc" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "INCConfig": + quant_method = cls.get_from_keys(config, ["quant_method"]) + is_checkpoint_fp8_serialized = ("fp8" in quant_method) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, + activation_scheme=activation_scheme) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["INCLinearMethod"]: + if isinstance(layer, LinearBase): + return INCLinearMethod(self) + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + @classmethod + def get_min_capability(cls) -> int: + # The AWQ kernel only supports Turing or newer GPUs. + return 75 + + @staticmethod + def get_config_filenames() -> List[str]: + return [] + + +class INCLinearMethod(LinearMethodBase): + """Linear method for FP8. + Supports loading FP8 checkpoints with static weight scale and + dynamic/static activation scale. + Also supports loading quantized FP16/BF16 model checkpoints with dynamic + activation scaling. The weight scaling factor will be initialized after + the model weights are loaded. + Limitations: + 1. Only support per-tensor quantization due to torch._scaled_mm support. + 2. 
Only support float8_e4m3fn data type due to the limitation of + torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) + + Args: + quant_config: The quantization config. + """ + + def __init__(self, + quant_config: INCConfig, + separate_bias_add: bool = False): + self.separate_bias_add = separate_bias_add + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + output_size_per_partition = sum(output_partition_sizes) + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + weight = layer.weight + if self.separate_bias_add: + if bias is not None: + return F.linear(x, weight) + bias + return F.linear(x, weight) + return F.linear(x, weight, bias) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index bbe49655020da..06048d97088e1 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -37,7 +37,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_tpu +from vllm.utils import is_hpu, is_tpu logger = init_logger(__name__) @@ -48,14 +48,15 @@ def _get_quantization_config( """Get the quantization config.""" if model_config.quantization is not None: quant_config = get_quant_config(model_config, load_config) - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - if capability < quant_config.get_min_capability(): - raise ValueError( - f"The quantization method {model_config.quantization} is not " - "supported for the current GPU. " - f"Minimum capability: {quant_config.get_min_capability()}. " - f"Current capability: {capability}.") + if not is_hpu(): + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + if capability < quant_config.get_min_capability(): + raise ValueError( + f"The quantization method {model_config.quantization} " + "is not supported for the current GPU. " + f"Minimum capability: {quant_config.get_min_capability()}. 
" + f"Current capability: {capability}.") supported_dtypes = quant_config.get_supported_act_dtypes() if model_config.dtype not in supported_dtypes: raise ValueError( @@ -276,10 +277,11 @@ def load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(device_config.device): + with torch.device(self.load_config.device): model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config, scheduler_config) + logger.info("Loading weights on %s ...", self.load_config.device) model.load_weights( self._get_weights_iterator(model_config.model, model_config.revision, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 306d22e42ed1d..676a51ce67f96 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -48,6 +48,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SamplerOutput from vllm.utils import is_hip @@ -317,6 +318,9 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer( @@ -326,6 +330,8 @@ def forward( attn_metadata, residual, ) + if current_platform.is_hpu(): + htorch.core.mark_step() if not get_pp_group().is_last_rank: return IntermediateTensors({ diff --git a/vllm/utils.py b/vllm/utils.py index 8a1bc5de03eb7..fe84253feb172 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -39,6 +39,7 @@ "fp8": torch.uint8, "fp8_e4m3": torch.uint8, "fp8_e5m2": torch.uint8, + "fp8_inc": torch.float8_e4m3fn, } TORCH_DTYPE_TO_NUMPY_DTYPE = { diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 93be2f4c321fe..ec0b8c2369210 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -91,9 +91,11 @@ def _allocate_kv_cache( # null block in CpuGpuBlockAllocator requires at least that # block to be zeroed-out. # We zero-out everything for simplicity. 
+ dtype = torch.uint8 if self.dtype == torch.float8_e4m3fn else \ + self.dtype kv_cache.append( torch.zeros(kv_cache_shape, - dtype=self.dtype, + dtype=dtype, pin_memory=pin_memory, device=device)) return kv_cache diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index cf91c69069ed6..72aba42ae8553 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -182,8 +182,8 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') - if 'bypass_hpu_graphs' in kwargs: - kwargs.pop('bypass_hpu_graphs') # required for PT eager + if 'warmup_mode' in kwargs: + kwargs.pop('warmup_mode') input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], input_ids.size(0), @@ -413,6 +413,9 @@ def __init__( self._setup_buckets() def load_model(self) -> None: + import habana_frameworks.torch.core as htcore + if self.model_config.quantization == 'inc': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model( @@ -429,6 +432,26 @@ def load_model(self) -> None: f"took {m_getmodel.get_summary_string()}") logger.info(msg) + if self.model_config.quantization == 'inc': + logger.info("Preparing model with INC..") + with HabanaMemoryProfiler() as m_inc: + from neural_compressor.torch.quantization import ( + FP8Config, convert, prepare) + config = FP8Config.from_json_file( + os.getenv("QUANT_CONFIG", "")) + if config.measure: + self.model = prepare(self.model, config) + elif config.quantize: + self.model = convert(self.model, config) + htcore.hpu_initialize(self.model, + mark_only_scales_as_const=True) + logger.info("Preparing model with INC took %s", + m_inc.get_summary_string()) + else: + self.model = self.model.to("hpu") + htcore.mark_step() + torch.hpu.synchronize() + # FIXME: Running with disable_tensor_cache=True causes # RuntimeErrors. This needs to be debugged with HabanaMemoryProfiler() as m_wrap: @@ -1051,7 +1074,7 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches) + self.execute_model(inputs, kv_caches, warmup_mode=True) torch.hpu.synchronize() self.profiler.end() gc.collect() @@ -1362,6 +1385,10 @@ def prepare_model_input( is_prompt=is_prompt, virtual_engine=virtual_engine) + def finish_measurements(self): + from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(self.model.model) + @torch.inference_mode() def execute_model( self, @@ -1369,6 +1396,7 @@ def execute_model( kv_caches: List[torch.Tensor], intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, + warmup_mode=False, ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( @@ -1402,6 +1430,11 @@ def execute_model( } if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update({ + "bypass_hpu_graphs": not use_graphs, + "warmup_mode": warmup_mode + }) htorch.core.mark_step() if self.is_driver_worker: @@ -1415,9 +1448,8 @@ def execute_model( with self.profiler.record_event('internal', model_event_name): hidden_states = self.model.forward( **execute_model_kwargs, - selected_token_indices=sampling_metadata. 
- selected_token_indices, - bypass_hpu_graphs=not use_graphs) + selected_token_indices=sampling_metadata.selected_token_indices + ) # Compute the logits. with self.profiler.record_event( @@ -1459,3 +1491,16 @@ def execute_model( is_prompt=is_prompt) self.profiler.record_counter(self.event_start, counters) return [output] + + def shutdown_inc(self): + print('inc shutdown') + if (model_config := getattr(self, "model_config", None)) and \ + getattr(model_config, "quantization", None) == 'inc': + print('inc shutdown start') + from neural_compressor.torch.quantization import ( + finalize_calibration) + finalize_calibration(self.model.model) + print('inc shutdown') + + def __del__(self): + self.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index f3fdc4dcc63c6..87122c03d3c8f 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -91,6 +91,16 @@ def __init__( # Initialize gpu_cache as embedding models don't initialize kv_caches self.hpu_cache: Optional[List[List[torch.tensor]]] = None + def _set_env_vars(self): + local_rank = self.local_rank + if self.parallel_config.world_size == 1: + local_rank = -1 + import os + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["ID"] = str(local_rank) + os.environ["WORLD_SIZE"] = str(self.parallel_config.world_size) + os.environ["RANK"] = str(self.rank) + def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") @@ -99,6 +109,8 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. + if self.model_config.quantization == 'inc': + self._set_env_vars() init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) @@ -211,6 +223,9 @@ def _warm_up_model(self) -> None: # the model initialization and profiling. set_random_seed(self.model_config.seed) + def finish_measurements(self): + self.model_runner.finish_measurements() + @property def do_metadata_broadcast(self) -> bool: return self.parallel_config.tensor_parallel_size > 1 @@ -288,6 +303,12 @@ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: def list_prompt_adapters(self) -> Set[int]: raise NotImplementedError("LoRA is not implemented for HPU backend.") + def shutdown_inc(self): + self.model_runner.shutdown_inc() + + def __del__(self): + self.shutdown_inc() + @property def max_model_len(self) -> int: return self.model_config.max_model_len From 8185d760325a7699c5c07f7cd0e28d443a36051b Mon Sep 17 00:00:00 2001 From: Mohit Deopujari Date: Sun, 18 Aug 2024 23:30:38 -0700 Subject: [PATCH 128/819] [Doc][BugFix] Update setup instructions and reference links (#191) 1. Replaced the non-working setup instruction with the correct command. 2. Fixed broken links and updated references in documentation. --- README_GAUDI.md | 6 +++--- .../getting_started/gaudi-installation.rst | 17 ++++------------- docs/source/getting_started/quickstart.rst | 2 +- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 9ea30a2e43f69..91bcbe49405eb 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -62,16 +62,16 @@ following: $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main -$ python setup.py develop +$ pip install -e . 
``` Supported Features ================== - [Offline batched - inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) + inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) - Online inference via [OpenAI-Compatible - Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) + Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index ddbac022a8d9d..b3234d10b3115 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -30,7 +30,7 @@ To verify that the Intel Gaudi software was correctly installed, run: $ pip list | grep neural # verify that neural_compressor is installed Refer to `Intel Gaudi Software Stack -Verification `__ +Verification `__ for more details. Run Docker Image @@ -51,15 +51,6 @@ Use the following commands to run a Docker image: Build and Install vLLM --------------------------- -To build and install vLLM from source, run: - -.. code:: console - - $ git clone https://github.com/vllm-project/vllm.git - $ cd vllm - $ python setup.py develop - - Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: .. code:: console @@ -67,16 +58,16 @@ Currently, the latest features and performance optimizations are developed in Ga $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main - $ python setup.py develop + $ pip install -e . Supported Features ================== - `Offline batched - inference `__ + inference `__ - Online inference via `OpenAI-Compatible - Server `__ + Server `__ - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 89bdc247c5e8e..8cfde76adf5fa 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -9,7 +9,7 @@ This guide shows how to use vLLM to: * build an API server for a large language model; * start an OpenAI-compatible API server. -Be sure to complete the :ref:`installation instructions ` before continuing with this guide. +Be sure to complete the `Gaudi installation instructions `_ before continuing with this guide. .. 
note:: From f7dd91d88e6b9e68479af0817431949f665507a7 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Mon, 19 Aug 2024 00:46:21 -0700 Subject: [PATCH 129/819] split gptbigcode forward (#194) --- vllm/model_executor/models/gpt_bigcode.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index fc4e13bbb0e68..3ae3c8c8f712c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -39,6 +39,7 @@ VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SamplerOutput from .interfaces import SupportsLoRA @@ -224,9 +225,14 @@ def forward( position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds + if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(len(self.h)): layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) + if current_platform.is_hpu(): + htorch.core.mark_step() hidden_states = self.ln_f(hidden_states) return hidden_states From 275e3250ba6ed8cc13b2d6e4928db73df420e64b Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Mon, 19 Aug 2024 11:43:41 -0700 Subject: [PATCH 130/819] Enable FusedSDPA for prompt attention with VLLM_PROMPT_USE_FUSEDSDPA (#168) --- vllm/attention/backends/habana_attn.py | 29 +++++++++---- vllm/hpu/ops.py | 58 ++++++++++++++++++++------ vllm/worker/habana_model_runner.py | 7 ++-- 3 files changed, 70 insertions(+), 24 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 7a867e79b203d..2259630fa10b7 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -2,6 +2,7 @@ # Copyright (C) 2024 Habana Labs, Ltd. an Intel Company ############################################################################### +import os from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type @@ -166,6 +167,12 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + if self.prefill_usefusedsdpa: + assert alibi_slopes is None, \ + 'Prefill with FusedSDPA not supported with alibi slopes!' + suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: raise ValueError( @@ -223,15 +230,18 @@ def forward( if attn_metadata.is_prompt: # Prompt run. if kv_cache is None or attn_metadata.block_tables.numel() == 0: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ - 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and \ - self.position_bias is not None: - attn_bias.add_(self.position_bias[:, :, - -attn_bias.size(2):, - -attn_bias.size(3):]) + if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward!' 
+ attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None and \ + self.position_bias is not None: + attn_bias.add_(self.position_bias[:, :, + -attn_bias.size(2):, + -attn_bias.size(3):]) + else: + attn_bias = None query_shape = (batch_size, seq_len, self.num_heads, self.head_size) @@ -247,6 +257,7 @@ def forward( matmul_qk_op=self.matmul_qk, softmax_op=self.softmax, matmul_av_op=self.matmul_av, + valid_seq_lengths=attn_metadata.seq_lens_tensor, ) output = out.reshape(batch_size, seq_len, hidden_size) else: diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 23f6964723d3f..2af5634a8d1a6 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -21,6 +21,13 @@ except ImportError: logger.warning("Could not import HPU FusedRMSNorm kernel. " "vLLM will use forward_native implementation of RMSNorm.") +HPUFusedSDPA = None +try: + from habana_frameworks.torch.hpex.kernels import FusedSDPA + HPUFusedSDPA = FusedSDPA +except ImportError: + logger.warning("Could not import HPU FusedSDPA kernel. " + "vLLM will use native implementation.") PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') @@ -126,6 +133,21 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): return final_hidden_states.view(-1, D) +#TODO: remove after fusedsdpa fix for query_head != kv_head +def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). + The kv go from (batch, num_key_value_heads, seqlen, head_dim) to + (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = kv.shape + if n_rep == 1: + return kv + kv = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, + head_dim) + return kv.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + def prompt_attention( query: torch.Tensor, key: torch.Tensor, @@ -136,24 +158,36 @@ def prompt_attention( matmul_qk_op=torch.matmul, softmax_op=torch.softmax, matmul_av_op=torch.matmul, + valid_seq_lengths: Optional[torch.Tensor] = None, ) -> torch.Tensor: query = query.transpose(1, 2) key = key.transpose(1, 2) value = value.transpose(1, 2) query_heads = query.size(1) kv_heads = key.size(1) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) + if attn_bias is not None or HPUFusedSDPA is None: + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + if attn_bias is not None: + attn_bias = attn_bias.unsqueeze(2) + attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) if attn_bias is not None: - attn_bias = attn_bias.unsqueeze(2) - attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = softmax_op(attn_weights, dim=-1) - attn_weights = matmul_av_op(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) + attn_weights.add_(attn_bias) + attn_weights = softmax_op(attn_weights, dim=-1) + attn_weights = matmul_av_op(attn_weights, value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + else: + #TODO: remove after fusedsdpa fix for query_heads != kv_heads + if query_heads != kv_heads: + key = repeat_kv(key, int(query_heads // kv_heads)) + value = repeat_kv(value, int(query_heads // kv_heads)) + softmax_mode = 'fast' + recompute_mode = True + 
attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True, + scale, softmax_mode, recompute_mode, + valid_seq_lengths, 'right') attn_weights = attn_weights.transpose(1, 2) return attn_weights diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 72aba42ae8553..e52b61539b540 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -151,6 +151,9 @@ class HpuModelAdapter(): def __init__(self, model, enforce_eager): self.model = model + self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -159,7 +162,7 @@ def __init__(self, model, enforce_eager): def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): prefill_metadata = attn_metadata - if prefill_metadata is None: + if prefill_metadata is None or self.prefill_use_fusedsdpa: return attn_metadata seq_lens_t = prefill_metadata.seq_lens_tensor @@ -599,7 +602,6 @@ def _prepare_prompt( # actual prompt lens context_lens.append(context_len) query_lens.append(seq_len - context_len) - input_tokens.append(prompt_tokens) # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. @@ -672,7 +674,6 @@ def _prepare_prompt( max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, From 55ea6589c7dfa27ca4f07271cf73166971f9f8fe Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 20 Aug 2024 18:35:38 +0530 Subject: [PATCH 131/819] Enable LoRA support for HPU (#170) This PR enables LoRA support in HPU. * Implemented custom BGMV for LoRA modules using index-select operator. 
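A toy sketch of that index-select dispatch, with shapes shrunk and values made up; the actual kernels land in vllm/hpu/ops.py further down in this patch:
```python
# Toy illustration of the index-select BGMV used for LoRA on HPU.
# Assumption: a zero-valued adapter is appended at the end of the stacked
# LoRA-A/LoRA-B tensors so "no LoRA" samples select an all-zero weight.
import torch

max_loras, rank, hidden, out_dim, bs = 2, 4, 8, 8, 3
wa_t_all = torch.randn(max_loras + 1, 1, rank, hidden)
wb_t_all = torch.randn(max_loras + 1, 1, out_dim, rank)
wa_t_all[-1].zero_()
wb_t_all[-1].zero_()

indices = torch.tensor([0, -1, 1])            # -1 means "no LoRA"
indices = indices % wa_t_all.size(0)          # -1 wraps to the zero slot
wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)

x = torch.randn(bs, hidden)
y = (x.unsqueeze(1) @ wa @ wb).squeeze(1)     # per-sample LoRA delta
print(y[1].abs().sum())                       # sample 1 used the zero slot -> 0
```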
* Support for both single and multi card scenarios has been tested --------- Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Co-authored-by: Himangshu Lahkar --- examples/lora_inference_hpu.py | 47 ++++++ tests/conftest.py | 8 ++ tests/lora/conftest.py | 8 +- tests/lora/test_llama_hpu.py | 100 +++++++++++++ tests/lora/test_lora_hpu.py | 221 +++++++++++++++++++++++++++++ tests/lora/test_multilora_hpu.py | 130 +++++++++++++++++ tests/lora/utils.py | 11 +- vllm/executor/habana_executor.py | 27 ++-- vllm/hpu/ops.py | 75 ++++++++++ vllm/lora/layers.py | 31 +++- vllm/lora/models.py | 44 ++++-- vllm/utils.py | 6 + vllm/worker/habana_model_runner.py | 205 ++++++++++++++++++++------ vllm/worker/habana_worker.py | 29 ++-- 14 files changed, 848 insertions(+), 94 deletions(-) create mode 100644 examples/lora_inference_hpu.py create mode 100644 tests/lora/test_llama_hpu.py create mode 100644 tests/lora/test_lora_hpu.py create mode 100644 tests/lora/test_multilora_hpu.py diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py new file mode 100644 index 0000000000000..b8154a29a82bb --- /dev/null +++ b/examples/lora_inference_hpu.py @@ -0,0 +1,47 @@ +from huggingface_hub import snapshot_download + +from vllm import LLM, SamplingParams +from vllm.lora.request import LoRARequest + +sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") + +llm = LLM(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_num_seqs=2, + dtype='bfloat16') + +sampling_params = SamplingParams(temperature=0, + max_tokens=1024, + stop=["[/assistant]"]) + +prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /Ë©okiru/ [ĂČkĂŹÉœÉŻÌ]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 +] + +expected_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /Ë©okiru/' [ĂČkĂŹÉœÉŻÌ] AND accented_mora = 'low tone mora with a gloss of /Ë©okiru/' [ĂČkĂŹÉœÉŻÌ] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 +] + +outputs = llm.generate(prompts, + sampling_params, + lora_request=LoRARequest("sql_adapter", 1, + sql_lora_path)) + +for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + match = expected_output[i] == generated_text + if not match: + print( + f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}" # noqa: E501 + ) diff --git a/tests/conftest.py b/tests/conftest.py index 59510075b0063..cfb7cf56b519a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -590,9 +590,17 @@ def caplog_vllm(temporary_enable_log_propagate, caplog): yield caplog +def is_hpu(): + from importlib import util + return util.find_spec('habana_frameworks') is not None + + @pytest.fixture(scope="session") def num_gpus_available(): """Get number of GPUs without initializing the CUDA context in current process.""" + if is_hpu(): + return torch.hpu.device_count() + return cuda_device_count_stateless() diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0bcae5b0c96dc..3e4c8be6dbaa3 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -48,13 +48,19 @@ class ContextInfo(TypedDict): }] +def is_hpu(): + from importlib import util + return util.find_spec('habana_frameworks') is not None + + def cleanup(): destroy_model_parallel() destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - torch.cuda.empty_cache() + if not is_hpu(): + torch.cuda.empty_cache() ray.shutdown() diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py new file mode 100644 index 0000000000000..dfd551f2ca043 --- /dev/null +++ b/tests/lora/test_llama_hpu.py @@ -0,0 +1,100 @@ +from multiprocessing import Process +from typing import List + +from conftest import cleanup + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n 
question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /Ë©okiru/ [ĂČkĂŹÉœÉŻÌ]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def _test_llama_lora(sql_lora_files, tp_size): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + dtype='float32', + tensor_parallel_size=tp_size) + + expected_no_lora_output = [ + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kĂČt]? 
[/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kĂČt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 + ] + expected_lora_output = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /Ë©okiru/' [ĂČkĂŹÉœÉŻÌ] AND accented_mora = 'low tone mora with a gloss of /Ë©okiru/' [ĂČkĂŹÉœÉŻÌ] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 + ] + + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output + + print("removing lora") + cleanup() + + +def test_llama_lora_1x(sql_lora_files): + p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_lora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 
0 + + +def test_llama_lora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0 diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py new file mode 100644 index 0000000000000..ddbab66e166b3 --- /dev/null +++ b/tests/lora/test_lora_hpu.py @@ -0,0 +1,221 @@ +import pytest +import torch + +from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice + +from .utils import DummyLoRAManager + +TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4] +QKV_TENSOR_SIZES = [ + (8192, 1024, 1024), + (8192 // 8, 1024 // 8, 1024 // 8), + (4096, 4096, 4096), + (4096 // 2, 4096 // 2, 4096 // 2), +] +BATCH_SIZES = [8, 32, 256] +RANKS = [8] +DTYPES = [torch.bfloat16] +TOLERANCES = { + torch.float16: (5e-3, 5e-3), + torch.bfloat16: (3e-2, 2e-2), +} +MAX_LORAS = 8 + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora(m, n, k, rank, dtype) -> None: + manager = DummyLoRAManager() + + module_name = "module" + weight = torch.rand([m, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name, weight, rank=rank) + lora = manager.get_module_lora(module_name) + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = input @ lora.lora_a @ lora.lora_b * lora.scaling + + lora_a_stack = torch.zeros(MAX_LORAS + 1, + 1, + lora.lora_a.shape[1], + lora.lora_a.shape[0], + device="hpu", + dtype=dtype) + lora_b_stack = torch.zeros(MAX_LORAS + 1, + 1, + lora.lora_b.shape[1], + lora.lora_b.shape[0], + device="hpu", + dtype=dtype) + for i in range(MAX_LORAS): + lora_a_stack[i][0] = lora.lora_a.T + lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + _apply_lora(input, lora_a_stack, lora_b_stack, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), + output) + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora(input, lora_a_stack, lora_b_stack, + torch.full((len(input), ), -1, device="hpu"), output) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("m", TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: + if m % 2 != 0: + pytest.skip("m must be divisible by 2") + if m // 2 not in TENSOR_SIZES: + pytest.skip("m//2 must be in TENSOR_SIZES") + + manager = DummyLoRAManager() + + module_name = "module" + weight = torch.rand([m // 2, n], device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "1", weight, rank=rank) + lora_1 = manager.get_module_lora(module_name + "1") + manager.init_random_lora(module_name + "2", weight, rank=rank) + lora_2 = manager.get_module_lora(module_name + "2") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling, + input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_1.lora_a.shape[1], + lora_1.lora_a.shape[0], + device="hpu", + dtype=dtype) 
for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_1.lora_b.shape[1], + lora_1.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(MAX_LORAS): + lora_a_stacks[0][i][0] = lora_1.lora_a.T + lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T + lora_a_stacks[1][i][0] = lora_2.lora_a.T + lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T + + output = torch.zeros(k, m, device="hpu", dtype=dtype) + _apply_lora_packed_nslice( + input, lora_a_stacks, lora_b_stacks, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, + (m // 2, m // 2)) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + torch.full((len(input), ), -1, device="hpu"), + output, (m // 2, m // 2)) + assert torch.allclose(torch.zeros_like(output), output) + + manager.reset_lora() + + +@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES) +@pytest.mark.parametrize("n", TENSOR_SIZES) +@pytest.mark.parametrize("k", BATCH_SIZES) +@pytest.mark.parametrize("rank", RANKS) +@pytest.mark.parametrize("dtype", DTYPES) +def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: + manager = DummyLoRAManager() + + module_name = "module" + weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype) + weight_kv = torch.empty(qkv[1], n, device="hpu", dtype=dtype) + + manager.init_random_lora(module_name + "q", weight_q, rank=rank) + lora_q = manager.get_module_lora(module_name + "q") + manager.init_random_lora(module_name + "k", weight_kv, rank=rank) + lora_k = manager.get_module_lora(module_name + "k") + manager.init_random_lora(module_name + "v", weight_kv, rank=rank) + lora_v = manager.get_module_lora(module_name + "v") + + input = torch.rand(k, n, device="hpu", dtype=dtype) + expected = torch.cat([ + input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling, + input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling, + input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling + ], + dim=1) + + lora_a_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_q.lora_a.shape[1], + lora_q.lora_a.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_k.lora_a.shape[1], + lora_k.lora_a.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + lora_b_stacks = [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_q.lora_b.shape[1], + lora_q.lora_b.shape[0], + device="hpu", + dtype=dtype) + ] + [ + torch.zeros(MAX_LORAS + 1, + 1, + lora_k.lora_b.shape[1], + lora_k.lora_b.shape[0], + device="hpu", + dtype=dtype) for i in range(2) + ] + for i in range(MAX_LORAS): + lora_a_stacks[0][i][0] = lora_q.lora_a.T + lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T + lora_a_stacks[1][i][0] = lora_k.lora_a.T + lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T + lora_a_stacks[2][i][0] = lora_v.lora_a.T + lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T + + output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) + _apply_lora_packed_nslice( + input, lora_a_stacks, lora_b_stacks, + torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, + (qkv[0], qkv[1], qkv[2])) + + rtol, atol = TOLERANCES[dtype] + assert torch.allclose(expected, output, rtol=rtol, atol=atol) + + output[:] = 0 + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, + torch.full((len(input), ), -1, device="hpu"), + output, (qkv[0], qkv[1], qkv[2])) + assert torch.allclose(torch.zeros_like(output), 
output) + + manager.reset_lora() diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py new file mode 100644 index 0000000000000..edca64fd5a2ae --- /dev/null +++ b/tests/lora/test_multilora_hpu.py @@ -0,0 +1,130 @@ +from multiprocessing import Process +from typing import List, Optional, Tuple + +from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams +from vllm.lora.request import LoRARequest + + +def create_test_prompts( + lora_path: str +) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]: + """Create a list of test prompts with their sampling parameters. + + 2 requests for base model, 4 requests for the LoRA. We define 2 + different LoRA adapters (using the same model for demo purposes). + """ + return [ + ("A robot may not injure a human being", + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128), None), + ("To be or not to be,", + SamplingParams(temperature=0.8, + top_k=5, + presence_penalty=0.2, + max_tokens=128), None), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0.0, + logprobs=1, + prompt_logprobs=1, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora2", 2, lora_path)), + ( + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? 
[/user] [assistant]", # noqa: E501 + SamplingParams(temperature=0, + max_tokens=128, + stop_token_ids=[32003]), + LoRARequest("sql-lora", 1, lora_path)), + ] + + +def process_requests(engine: LLMEngine, + test_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]]): + """Continuously process a list of prompts and handle the outputs.""" + request_id = 0 + result = {} + + while test_prompts or engine.has_unfinished_requests(): + if test_prompts: + prompt, sampling_params, lora_request = test_prompts.pop(0) + engine.add_request(str(request_id), + prompt, + sampling_params, + lora_request=lora_request) + request_id += 1 + + request_outputs: List[RequestOutput] = engine.step() + + for request_output in request_outputs: + if request_output.finished: + result[ + request_output.request_id] = request_output.outputs[0].text + return result + + +expected_output = [ + " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194", # noqa: E501 + " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. 
In Hamlet's case, the question is whether or not to be a good person, and he is torn between the", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' " # noqa: E501 +] + + +def _test_llama_multilora(sql_lora_files, tp_size): + """Main function that sets up and runs the prompt processing.""" + engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf", + enable_lora=True, + max_loras=2, + max_lora_rank=8, + max_num_seqs=16, + dtype='float32', + tensor_parallel_size=tp_size) + engine = LLMEngine.from_engine_args(engine_args) + test_prompts = create_test_prompts(sql_lora_files) + results = process_requests(engine, test_prompts) + generated_texts = [results[key] for key in sorted(results)] + assert generated_texts == expected_output + + +def test_llama_multilora_1x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_multilora_2x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) + p.start() + p.join() + assert p.exitcode == 0 + + +def test_llama_multilora_4x(sql_lora_files): + # Work-around to resolve stalling issue in multi-card scenario + p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) + p.start() + p.join() + assert p.exitcode == 0 diff --git a/tests/lora/utils.py b/tests/lora/utils.py index b73cf5bf55324..6ed985e72e6b3 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -3,6 +3,7 @@ import torch from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.utils import get_device class DummyLoRAManager: @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + device=get_device()), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=get_device()), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=get_device()) self.set_module_lora(module_name, lora) return lora @@ -53,8 +54,8 @@ def init_lora(self, module_name, rank=rank, lora_alpha=1, - lora_a=torch.rand([input_dim, rank], device="cuda"), - lora_b=torch.rand([rank, output_dim], device="cuda"), + lora_a=torch.rand([input_dim, rank], device=get_device()), + lora_b=torch.rand([rank, output_dim], device=get_device()), embeddings_tensor=embeddings_tensor, ) self.set_module_lora(module_name, lora) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 80f8037a2d043..baeaec5afa371 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -154,29 +154,36 @@ def execute_model( return output def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." 
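For context, these executor hooks are what the engine-level LoRA calls land on. A rough usage sketch, assuming the engine-level add_lora/list_loras/remove_lora helpers present in this tree and reusing the adapter from the example above:
```python
# Rough sketch: driving the LoRA hooks wired up in this diff through the
# engine API. Adapter and model follow the example added by this patch.
from huggingface_hub import snapshot_download

from vllm import EngineArgs, LLMEngine
from vllm.lora.request import LoRARequest

sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")

engine = LLMEngine.from_engine_args(
    EngineArgs(model="meta-llama/Llama-2-7b-hf",
               enable_lora=True,
               max_loras=2,
               dtype="bfloat16"))

lora = LoRARequest("sql-lora", 1, sql_lora_path)
assert engine.add_lora(lora)       # executor forwards to the driver worker
print(engine.list_loras())         # -> {1}
assert engine.remove_lora(1)
```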
+ return self.driver_worker.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.driver_worker.list_loras() def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def check_health(self) -> None: # GPUExecutor will always be healthy as long as diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 2af5634a8d1a6..662c53486b4ca 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -191,3 +191,78 @@ def prompt_attention( valid_seq_lengths, 'right') attn_weights = attn_weights.transpose(1, 2) return attn_weights + + +def dispatch_bgmv_linear( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices + stacked into single tensors, assuming same rank. HPU handles no-LoRA + requests using zero valued A and B tensors. These zero valued tensors are + appended at the end of `wa_t_all` and `wb_t_all` during initialization. For + custom BGMV, the corresponding `wa` and `wb` for each batch is created + based on the lora_index of each sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The `wa` tensor for a batch of size batch_Size will have + a shape of (batch_size, num_layers, hidden_dim, lora_rank) + + This method avoids for-loop as well as graph breaks. 
+ """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out @ wb + out = out.squeeze(1) + y += out * scale + + +def dispatch_bgmv_embedding( + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + indices: torch.LongTensor, + layer_idx: int, + scale: float, +): + """ + `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor + assuming same rank. HPU handles no-LoRA requests using zero valued A + tensor. This zero valued tensor is appended at the end of `wa_t_all` during + initialization. For custom BGMV, the corresponding wa for each batch is + created based on the lora_index of the sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The wa tensor for a batch of size batch_Size will have a + shape of (batch_size, num_layers, lora_rank, hidden_dim) + + + This method avoids for-loop as well as graph breaks. + """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' + max_loras = wa_t_all.size(0) + # Wrap-around for negative indices + indices = indices % max_loras + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + + x = x.unsqueeze(1) + out = x @ wa + out = out.squeeze(1) + y += out * scale \ No newline at end of file diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 87de285a373a2..4a45f3fda88f1 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -27,6 +27,10 @@ LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) +from vllm.utils import is_hpu + +if is_hpu(): + from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear if TYPE_CHECKING: pass @@ -89,7 +93,11 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) + if is_hpu(): + dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, + indices, 0, 1.0) + else: + add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) return output.view_as(org_output) @@ -127,9 +135,15 @@ def _apply_lora_packed_nslice( indices = indices.view(-1) offset_left = 0 for slice_idx in range(len(output_slices)): - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, offset_left, - output_slices[slice_idx]) + if is_hpu(): + dispatch_bgmv_linear( + output[:, offset_left:offset_left + output_slices[slice_idx]], + x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], + indices, 0, 1.0) + else: + add_lora_slice(output, x, lora_a_stacked[slice_idx], + lora_b_stacked[slice_idx], indices, 0, 1.0, + offset_left, output_slices[slice_idx]) offset_left += output_slices[slice_idx] return output.view_as(org_output) @@ -330,8 +344,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) - bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + if is_hpu(): + 
dispatch_bgmv_embedding(full_output, full_lora_a_embeddings, + self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) + else: + bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked, + self.indices[:self.indices_len[0]], 0, 1.0) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e1ede7d4d710a..30d2fd9502977 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.utils import is_pin_memory_available +from vllm.utils import get_device, is_hpu, is_pin_memory_available logger = init_logger(__name__) @@ -93,7 +93,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=get_device(), dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -118,9 +118,9 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", + device=get_device(), dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -131,10 +131,10 @@ def convert_mapping( sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = ( - torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + - (sampler_indices_padded * len(sampler_indices_padded))) + sampler_indices_padded = (torch.arange( + 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + + (sampler_indices_padded * + len(sampler_indices_padded))) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: @@ -424,20 +424,20 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None self.base_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.sampler_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.embeddings_indices = torch.empty(2, self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) self.long_lora_indices = torch.empty(self.max_num_batched_tokens, dtype=torch.long, - device="cuda") + device=get_device()) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} @@ -465,11 +465,25 @@ def __init__( @property def capacity(self) -> int: - return self.lora_config.max_cpu_loras + if is_hpu(): + # HPU handles no LoRA requests using zero valued A and B tensors. + # These zero valued tensors are appended at the end of A and B, + # making total number of loras to be lora_config.max_cpu_loras + 1. 
+ # This demands the total number of max_cpu_loras to be + # lora_config.max_cpu_loras + 1 + return self.lora_config.max_cpu_loras + 1 + else: + return self.lora_config.max_cpu_loras @property def lora_slots(self) -> int: - return self.lora_config.max_loras + if is_hpu(): + # HPU handles no LoRA requests using zero valued A and B tensors. + # These zero valued tensors are appended at the end of A and B, + # making total number of loras to be lora_config.max_cpu_loras + 1. + return self.lora_config.max_loras + 1 + else: + return self.lora_config.max_loras @property def adapter_slots(self) -> int: diff --git a/vllm/utils.py b/vllm/utils.py index fe84253feb172..fa6e132dd3522 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -970,6 +970,12 @@ def cuda_device_count_stateless() -> int: return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES) +def get_device() -> str: + if is_hpu(): + return "hpu" + return "cuda" + + def error_on_invalid_device_count_status(): cache_entries = 0 with contextlib.suppress(Exception): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e52b61539b540..d129bb5cbc0ca 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -435,6 +435,23 @@ def load_model(self) -> None: f"took {m_getmodel.get_summary_string()}") logger.info(msg) + if self.lora_config: + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr(self.model, "embedding_modules" + ), "Model does not have embedding_modules" + assert hasattr( + self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, + self.vocab_size, self.lora_config, self.device, + self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + if self.model_config.quantization == 'inc': logger.info("Preparing model with INC..") with HabanaMemoryProfiler() as m_inc: @@ -467,35 +484,26 @@ def load_model(self) -> None: msg = f"Loading model weights took in total {m.get_summary_string()}" logger.info(msg) - if self.lora_config: - assert hasattr(self.model, "supported_lora_modules" - ) and self.model.supported_lora_modules, ( - "Model does not support LoRA") - assert hasattr( - self.model, - "embedding_modules"), "Model does not have embedding_modules" - assert hasattr(self.model, "embedding_padding_modules" - ), "Model does not have embedding_padding_modules" - self.lora_manager = LRUCacheWorkerLoRAManager( - self.scheduler_config.max_num_seqs, - self.scheduler_config.max_num_batched_tokens, self.vocab_size, - self.lora_config, self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) - self.model = self.lora_manager.create_lora_manager(self.model) - def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False return (batch_size, seq_len, is_prompt) in self.graphed_buckets + def _is_valid_bucket(self, bucket): + return bucket[0] * bucket[1] <= self.max_num_batched_tokens + def _setup_buckets(self) -> None: + max_bucket_cfg = 64 + if self.lora_config and \ + max_bucket_cfg > self.max_num_batched_tokens // self.block_size: + max_bucket_cfg = self.max_num_batched_tokens // self.block_size self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', 'bs', min=1, step=32, max=min( 
self.max_num_seqs, - 64)) + max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', min=1, @@ -520,6 +528,12 @@ def _setup_buckets(self) -> None: self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg) + if self.lora_config: + self.prompt_buckets[:] = [ + bucket for bucket in self.prompt_buckets + if self._is_valid_bucket(bucket) + ] + msg = (f"Generated {len(self.prompt_buckets)} " f"prompt buckets: {list(sorted(self.prompt_buckets))}") logger.info(msg) @@ -530,6 +544,11 @@ def _setup_buckets(self) -> None: logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg) + if self.lora_config: + self.decode_buckets[:] = [ + bucket for bucket in self.decode_buckets + if self._is_valid_bucket(bucket) + ] msg = (f"Generated {len(self.decode_buckets)} decode buckets: " f"{list(sorted(self.decode_buckets))}") logger.info(msg) @@ -606,16 +625,6 @@ def _prepare_prompt( # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. input_positions.append(list(range(context_len, seq_len))) - lora_id = seq_group_metadata.lora_int_id - - if lora_id > 0: - lora_requests.add(seq_group_metadata.lora_request) - - lora_index_mapping += [lora_id] * (seq_len - context_len) - lora_prompt_mapping.append( - [lora_id] * - (seq_len - context_len - if seq_group_metadata.sampling_params.prompt_logprobs else 1)) if seq_group_metadata.multi_modal_data: multi_modal_input_list.append( @@ -674,6 +683,20 @@ def _prepare_prompt( max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + + for seq_group_metadata, context_len in zip(seq_group_metadata_list, + context_lens): + lora_id = seq_group_metadata.lora_int_id + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (max_prompt_len - context_len) + lora_prompt_mapping.extend( + [lora_id] * + (max_prompt_len - context_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, @@ -1027,7 +1050,11 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: ]) return attention_metadata - def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): + def create_dummy_seq_group_metadata(self, + group_id, + seq_len, + is_prompt, + lora_request=None): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) if is_prompt: @@ -1042,34 +1069,78 @@ def create_dummy_seq_group_metadata(self, group_id, seq_len, is_prompt): output_token_ids = [1] * output_len seq_data = SequenceData(prompt_token_ids) seq_data.output_token_ids = output_token_ids - return SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=(output_len == 0), - seq_data={group_id: seq_data}, - sampling_params=sampling_params, - block_tables=block_tables, - ) + return SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers max_batch_size = self.prompt_bs_bucket_cfg[-1] max_seq_len = self.prompt_seq_bucket_cfg[-1] - - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) - - def warmup_scenario(self, batch_size, seq_len, 
is_prompt, - kv_caches) -> None: + if self.lora_config: + max_seq_len = self.max_num_batched_tokens // max_batch_size + + self.warmup_scenario(max_batch_size, + max_seq_len, + True, + kv_caches, + is_profile_run=True) + + def warmup_scenario(self, + batch_size, + seq_len, + is_prompt, + kv_caches, + is_profile_run=False) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" f"bs{batch_size}_" f"seq{seq_len}_" f"graphs{'T' if use_graphs else 'F'}") + max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config and is_profile_run: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs else 1 + if self.lora_config and not is_profile_run: + lora_mapping = LoRAMapping( + [0] * batch_size * seq_len, + [0] * batch_size * seq_len, + ) + self.set_active_loras(set(), lora_mapping) seqs = [ - self.create_dummy_seq_group_metadata(i, seq_len, is_prompt) + self.create_dummy_seq_group_metadata( + i, + seq_len, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] torch.hpu.synchronize() @@ -1080,6 +1151,37 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt, self.profiler.end() gc.collect() + def remove_all_loras(self): + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.remove_all_adapters() + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() + def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) @@ -1403,9 +1505,11 @@ def execute_model( raise ValueError( "num_steps > 1 is not supported in HabanaModelRunner") - # NOTE(kzawora): Need to restore 
this after adding LoRA - # if self.lora_config: - # self.set_active_loras(lora_requests, lora_mapping) + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) input_tokens = model_input.input_tokens input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata @@ -1452,6 +1556,19 @@ def execute_model( selected_token_indices=sampling_metadata.selected_token_indices ) + if self.lora_config: + from vllm.lora.layers import VocabParallelEmbeddingWithLoRA + property = vars(self.model.model) + model = list(property['_modules'].values())[0] + property = vars(model) + modules = list(property['_modules'].values()) + for module in modules: + if isinstance(module, VocabParallelEmbeddingWithLoRA): + for i in range(0, 4): + module.indices_len[ + i] = sampling_metadata.selected_token_indices.numel( + ) + # Compute the logits. with self.profiler.record_event( 'internal', ('compute_logits_' diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 87122c03d3c8f..9d083915041fe 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -174,9 +174,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: num_hpu_blocks = max(num_hpu_blocks, 0) num_cpu_blocks = max(num_cpu_blocks, 0) - # NOTE(kzawora): Restore this once LoRA support is added - # if self.model_runner.lora_manager: - # self.model_runner.remove_all_loras() + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() gc.collect() return num_hpu_blocks, num_cpu_blocks @@ -279,29 +278,33 @@ def execute_worker(self, worker_input: WorkerInput) -> None: self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.add_lora(lora_request) def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") - - def list_loras(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.remove_lora(lora_id) def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() def add_prompt_adapter( self, prompt_adapter_request: PromptAdapterRequest) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def list_prompt_adapters(self) -> Set[int]: - raise NotImplementedError("LoRA is not implemented for HPU backend.") + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") def shutdown_inc(self): self.model_runner.shutdown_inc() From 1f1e98199cc570baa3f406e3cfff0e4b95ec14d8 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 21 
Aug 2024 09:33:09 +0300 Subject: [PATCH 132/819] Handle compile-mode unwrap bug for indices length fix in LoRA --- vllm/worker/habana_model_runner.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d129bb5cbc0ca..7f7f15bea86fa 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1441,6 +1441,15 @@ def get_counter_dict(self, cache_config, duration, seq_len, return counters +def unwrap_model(model): + if isinstance(model, torch._dynamo.eval_frame.OptimizedModule): + return unwrap_model(model._orig_mod) + else: + model = list(vars(model)['_modules'].values())[0] + modules = list(vars(model)['_modules'].values()) + return modules + + class HabanaModelRunner( HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ @@ -1558,13 +1567,10 @@ def execute_model( if self.lora_config: from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - property = vars(self.model.model) - model = list(property['_modules'].values())[0] - property = vars(model) - modules = list(property['_modules'].values()) + modules = unwrap_model(self.model.model) for module in modules: if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, 4): + for i in range(0, len(module.indices_len)): module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) From db02be889957ab94897ccc8d95181d8bb422f92a Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 22 Aug 2024 16:15:26 +0000 Subject: [PATCH 133/819] Add docker hpu for serving Signed-off-by: Chendi.Xue --- Dockerfile.hpu | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 Dockerfile.hpu diff --git a/Dockerfile.hpu b/Dockerfile.hpu new file mode 100644 index 0000000000000..b9acec2b85be4 --- /dev/null +++ b/Dockerfile.hpu @@ -0,0 +1,18 @@ +FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-hpu.txt + +ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file From aefd336798248d519ddc4cc5662c9aa03a9dbfad Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 27 Aug 2024 14:42:57 +0200 Subject: [PATCH 134/819] Ensure buckets do not exceed the batch token limit (#206) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR ensures we don't capture buckets that are above the specified token budget (as set by `max_num_batched_tokens` argument) Example for token budget of 2048 (`--max-num-batched-tokens 2048`): ``` $ python vllm_test.py --max-num-batched-tokens 2048 WARNING 08-27 14:48:55 _custom_ops.py:14] Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'") /usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py:366: UserWarning: torch.distributed.reduce_op is deprecated, please use torch.distributed.ReduceOp instead warnings.warn( No chat template is set for this tokenizer, falling back to a default class-level template. This is very error-prone, because models are often trained with templates different from the class default! 
Default chat templates are a legacy feature and will be removed in Transformers v4.43, at which point any code depending on them will stop working. We recommend setting a valid chat template before then to ensure that this model continues working without issues. INFO 08-27 14:48:56 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='facebook/opt-125m', speculative_config=None, tokenizer='facebook/opt-125m', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, weights_load_device=hpu, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=hpu, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=facebook/opt-125m, use_v2_block_manager=False, enable_prefix_caching=False) generation_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 1.91MB/s] INFO 08-27 14:48:57 profiler.py:62] Profiler enabled for: vllm-instance-d356a015eeb349f7a4650e00bf6ce976 WARNING 08-27 14:48:57 utils.py:566] Pin memory is not supported on HPU. INFO 08-27 14:48:57 selector.py:85] Using HabanaAttention backend. INFO 08-27 14:48:57 habana_model_runner.py:532] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024] INFO 08-27 14:48:57 habana_model_runner.py:545] Generated 23 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 14:48:57 habana_model_runner.py:550] Decode bucket config (min, step, max_warmup) bs:[1, 128, 256], seq:[128, 128, 2048] INFO 08-27 14:48:57 habana_model_runner.py:561] Generated 31 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] ============================= HABANA PT BRIDGE CONFIGURATION =========================== PT_HPU_LAZY_MODE = 1 PT_RECIPE_CACHE_PATH = PT_CACHE_FOLDER_DELETE = 0 PT_HPU_RECIPE_CACHE_CONFIG = PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807 PT_HPU_LAZY_ACC_PAR_MODE = 1 PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0 ---------------------------: System Configuration :--------------------------- Num CPU Cores : 160 CPU RAM : 1056398260 KB ------------------------------------------------------------------------------ INFO 08-27 14:49:00 selector.py:85] Using HabanaAttention backend. INFO 08-27 14:49:00 loader.py:284] Loading weights on hpu ... 
INFO 08-27 14:49:00 weight_utils.py:224] Using model weights format ['*.bin'] pytorch_model.bin: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 251M/251M [00:06<00:00, 35.9MB/s] Loading pt checkpoint shards: 0% Completed | 0/1 [00:00 None: f"seq:{self.prompt_seq_bucket_cfg}") logger.info(msg) self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, - self.prompt_seq_bucket_cfg) + self.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.prompt_buckets[:] = [ @@ -543,7 +550,8 @@ def _setup_buckets(self) -> None: f"seq:{self.decode_seq_bucket_cfg}") logger.info(msg) self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, - self.decode_seq_bucket_cfg) + self.decode_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.decode_buckets[:] = [ bucket for bucket in self.decode_buckets From 2ab316db5f9f5f2944cbac68132769411e4833de Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Mon, 19 Aug 2024 05:43:28 +0000 Subject: [PATCH 135/819] Initial commit --- tests/samplers/test_sampler.py | 61 +++++++++++++- vllm/model_executor/layers/sampler.py | 112 +++++++++++++++++++++++++- 2 files changed, 170 insertions(+), 3 deletions(-) mode change 100644 => 100755 vllm/model_executor/layers/sampler.py diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 9572588ce6e53..9d0ecb820548e 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -7,7 +7,7 @@ import torch from transformers import GenerationConfig, GenerationMixin -from vllm.model_executor.layers.sampler import Sampler +from vllm.model_executor.layers.sampler import ApplyToppTopkScalar, Sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_random_seed from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata @@ -700,3 +700,62 @@ def test_sampling_params(sampling_params: List[SamplingParams]): assert tokens1[0] == tokens2[1] assert tokens1[1] == tokens2[0] + + +def test_topk_topk_scalar(): + obj1 = ApplyToppTopkScalar(2) + assert ApplyToppTopkScalar._padded_k == 0 + x = torch.tensor([[9, 9, 8, 8, 8, 8, 7, 7, 7.0], + [10, 10, 9, 9, 9, 8, 5, 5, 5]]) + + retval1 = obj1(x, p=0.9, k=5) + ninf = -float("inf") + expected1 = torch.tensor([[9., 9., 8., 8., 8., 8., ninf, ninf, ninf], + [10., 10., 9., 9., 9., ninf, ninf, ninf, ninf]]) + assert torch.all(retval1 == expected1).item() + assert ApplyToppTopkScalar._padded_k == 9 + + obj2 = ApplyToppTopkScalar(2) + assert obj2._padded_k == 9 + + x = torch.tensor([[2, 2, 9, 9, 2, 2, 1, 1, 1.0], + [10, 9, 9, 5, 9, 9, 5, 9, 10]]) + retval2 = obj2(x, p=0.9, k=5) + expected2 = torch.tensor( + [[ninf, ninf, 9., 9., ninf, ninf, ninf, ninf, ninf], + [10., ninf, 9., ninf, 9., 9., ninf, 9., 10.]]) + assert torch.all(retval2 == expected2).item() + assert obj2._padded_k == 9 + + retval3 = obj2(x, p=1.0, k=5) + expected3 = torch.tensor([[2., 2., 9., 9., 2., 2., ninf, ninf, ninf], + [10., 9., 9., ninf, 9., 9., ninf, 9., 10.]]) + + assert torch.all(retval3 == expected3).item() + + # this should not be done in general, doing it here for testing purposes + ApplyToppTopkScalar._padded_k = 0 + x = torch.tensor([[1, 1, 1, 9, 8, 1, 1, 1, 1.0], + [2, 1, 2, 2, 1, 1, 1, 1, 1]]) + obj3 = ApplyToppTopkScalar(2) + retval4 = obj3(x, p=0.9, k=2) + expected4 = torch.tensor( + [[ninf, ninf, ninf, 9., 8., ninf, ninf, ninf, ninf], + [2., ninf, 2., 2., ninf, ninf, ninf, ninf, ninf]]) + assert 
torch.all(retval4 == expected4).item() + assert obj3._padded_k == 4 + y = torch.tensor([[8, 8, 8, 9, 8, 1, 1, 1, 1.0], + [2, 1, 2, 2, 1, 1, 1, 1, 1]]) + retval5 = obj3(y, p=0.9, k=2) + assert obj3._padded_k == 8 + expected5 = torch.tensor([[8., 8., 8., 9., 8., ninf, ninf, ninf, ninf], + [2., ninf, 2., 2., ninf, ninf, ninf, ninf, + ninf]]) + assert torch.all(retval5 == expected5).item() + y = torch.tensor([[8, 8, 8, 9, 8, 8, 1, 1, 1.0], + [2, 1, 2, 2, 3, 1, 1, 1, 1]]) + retval6 = obj3(y, p=0.9, k=2) + expected6 = torch.tensor([[8., 8., 8., 9., 8., 8., ninf, ninf, ninf], + [2., ninf, 2., 2., 3., ninf, ninf, ninf, ninf]]) + assert torch.all(retval6 == expected6).item() + assert obj3._padded_k == 8 diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py old mode 100644 new mode 100755 index 6632b1c434582..6cb8971534cd3 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,5 +1,6 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import math from math import inf from typing import Dict, List, Optional, Tuple @@ -77,6 +78,13 @@ def _init_sampling_tensors( self._do_penalties = do_penalties self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p + self._top_p_scalar = sampling_tensors.top_ps[0].item() + self._top_k_scalar = sampling_tensors.top_ks[0].item() + scalar_p = torch.all(sampling_tensors.top_ps == self._top_p_scalar) + scalar_k = torch.all(sampling_tensors.top_ks == self._top_k_scalar) + self._scalar_p_and_k = (scalar_p and scalar_k).item() + if self._scalar_p_and_k and self._do_top_p_top_k: + self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) def forward( self, @@ -122,8 +130,13 @@ def forward( logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) + if self._scalar_p_and_k: + logits = self._apply_top_k_top_p_opt(logits, + self._top_p_scalar, + self._top_k_scalar) + else: + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -198,6 +211,101 @@ def _get_bin_counts_and_mask( return bin_counts, mask +class ApplyToppTopkScalar(): + """ + The original implementation of _apply_top_k_top_p is more general + as it uses vector topp, topk + However in a lot of cases, topp and topk is same for all batch elements + For such "scalar" topp, topk cases, we can use this class + + The main optimizations in this class is: + Use topk instead of sort, which is much faster especially for small k. 
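
(Editorial aside, not part of the patch: a minimal standalone sketch of the boundary case the docstring describes next. Plain `torch.topk` drops values tied at the k-th position, which is why `_padded_k` is grown in increments.)

```
import torch

# Hypothetical logits matching the docstring example below.
logits = torch.tensor([[9., 9., 8., 8., 8., 8., 7., 7., 7.]])

vals, idx = torch.topk(logits, k=5, dim=1, sorted=True)
print(vals)        # tensor([[9., 9., 8., 8., 8.]]) - one tied "8" is lost
kth = vals[:, -1]  # smallest surviving value, i.e. the boundary (8.0)
dups = (logits == kth.unsqueeze(1)).sum(dim=1)
print(dups)        # tensor([4]) - four "8"s tie on the boundary,
                   # so top-k must be retried with a padded k
```
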
+ However just using topk might not suffice in cases as shown below + Consider a tensor: 9 9 8 8 8 8 7 7 7 + Topk, with k=5, on this yields 9 9 8 8 8 + The value "8" is on the boundary, hence the last "8" gets snipped off + However the original implementation accepts all the "8"s, + so it should output: + 9 9 8 8 8 8 (6 values, even though k=5) + To ensure these semantics, we perform topk with _padded_k elements + If we find more boundary elements left over, + then we keep incrementing _padded_k + and in future calls use the expanded value of __padded_k + + The increments to _padded_k should be done + with value > 1 to prevent excessive recompilations + due to dynamic shapes (the output shape of the topk) + + The main logic of this is in __call__ + This is a class instead of a function, just to keep track of + the monotonic non-decreasing state _padded_k + """ + _padded_k = 0 + + def __init__(self, increment: int): + self._increment = increment + + def __call__(self, logits: torch.Tensor, p: float, k: int): + if k > ApplyToppTopkScalar._padded_k: + ApplyToppTopkScalar._padded_k = min(k + self._increment, + logits.shape[1]) + + vals, idx = torch.topk(logits, k=ApplyToppTopkScalar._padded_k, \ + dim=1, sorted=True) + + # this "if" checks if we have bucketed so much that + # we have padded k upto shape of logits + if ApplyToppTopkScalar._padded_k != logits.shape[1]: + smallest_of_top_k = vals[:, k - 1] + num_duplicates_of_smallest_of_topk = torch.sum( + logits == smallest_of_top_k.unsqueeze(1), 1) + max_num_duplicates_of_smallest_of_topk = torch.max( + num_duplicates_of_smallest_of_topk).item() + + # there are n repeats for a border + # (border meaning the smallest value of the top k). + # we do not know if only 1 or 2 or (n-1) + # of them lie outside the kth border, + # so we choose to conservatively increase by n-1 + # when num_duplicates > _padded_k - k + if max_num_duplicates_of_smallest_of_topk - 1 > ( + ApplyToppTopkScalar._padded_k - k): + incr = int( + math.ceil((max_num_duplicates_of_smallest_of_topk - 1) / + self._increment) * self._increment) + # this while loop should be traversed at most twice, + # because we dont increment by self._increment and retry + # instead we compute incr in one go + ApplyToppTopkScalar._padded_k = min( + ApplyToppTopkScalar._padded_k + incr, logits.shape[1]) + + # recompute topk with expanded padded_k + vals, idx = torch.topk(logits, \ + k=ApplyToppTopkScalar._padded_k, \ + dim=1, sorted=True) + + idx = torch.fliplr(idx) + vals = torch.fliplr(vals) + + top_k_smallest_val_idx = vals.size(1) - k + top_k_mask = vals[:, top_k_smallest_val_idx].unsqueeze(1) + top_k_mask = vals < top_k_mask + vals.masked_fill_(top_k_mask, -float("inf")) + + probs_sort = vals.softmax(dim=-1) + probs_sum = probs_sort.cumsum(dim=-1) + top_p_mask = probs_sum <= (1 - p) + top_p_mask[:, -1] = False + vals.masked_fill_(top_p_mask, -float("inf")) + + new_logits = torch.full(logits.shape, + -float("inf"), + device=logits.device) + new_logits.scatter_(1, idx, vals.to(new_logits.dtype)) + + return new_logits + + def _apply_min_tokens_penalty( logits: torch.Tensor, sampling_metadata: SamplingMetadata, From 9abadba502916eeb0432c6a8c300e09d0c3a5a48 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 28 Aug 2024 11:39:33 +0200 Subject: [PATCH 136/819] Make max_num_batched_tokens behavior more verbose, add legacy mode (#208) Addressing issues from https://github.com/HabanaAI/vllm-fork/pull/207 Now, filtering behavior is more verbose, handling common errors and displaying numbers of 
omitted buckets due to token budget (in debug log level, buckets are printed): ``` INFO 08-27 20:57:27 profiler.py:62] Profiler enabled for: vllm-instance-1ab4f6c4d726480d8825044cf74e9af1 WARNING 08-27 20:57:27 utils.py:566] Pin memory is not supported on HPU. INFO 08-27 20:57:27 selector.py:85] Using HabanaAttention backend. INFO 08-27 20:57:27 habana_model_runner.py:563] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024] INFO 08-27 20:57:27 habana_model_runner.py:576] Generated 23 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 20:57:27 habana_model_runner.py:581] Omitted 33 prompt buckets due to exceeded token budget (max_num_batched_tokens=2048) INFO 08-27 20:57:27 habana_model_runner.py:589] Decode bucket config (min, step, max_warmup) bs:[1, 128, 256], seq:[128, 128, 2048] INFO 08-27 20:57:27 habana_model_runner.py:600] Generated 31 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 20:57:27 habana_model_runner.py:605] Omitted 113 decode buckets due to exceeded token budget (max_num_batched_tokens=2048) ``` Legacy mode was also added, which throws a nasty error message whenever token budget is set too low, but then it omits filtering and works as it did previously (ran with ``VLLM_DECODE_BS_BUCKET_MIN=128 VLLM_DECODE_SEQ_BUCKET_MIN=1024 python vllm_test.py --max-num-batched-tokens=2048``): ``` INFO 08-27 21:01:02 profiler.py:62] Profiler enabled for: vllm-instance-51f60d3978d347e992436f1dc0aa4702 WARNING 08-27 21:01:02 utils.py:566] Pin memory is not supported on HPU. INFO 08-27 21:01:02 selector.py:85] Using HabanaAttention backend. INFO 08-27 21:01:02 habana_model_runner.py:563] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 64], seq:[128, 128, 1024] INFO 08-27 21:01:02 habana_model_runner.py:576] Generated 23 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (8, 128), (8, 256), (16, 128)] INFO 08-27 21:01:02 habana_model_runner.py:581] Omitted 33 prompt buckets due to exceeded token budget (max_num_batched_tokens=2048) INFO 08-27 21:01:02 habana_model_runner.py:589] Decode bucket config (min, step, max_warmup) bs:[128, 128, 256], seq:[1024, 128, 2048] ERROR 08-27 21:01:02 habana_model_runner.py:128] The current bucketing configuration (min, step, max_warmup): bs:[128, 128, 256], seq:[1024, 128, 2048] cannot be used with specified max_num_batched_tokens (2048), as the smallest bucket (16384) would exceed token budget. Please increase max_num_batched_tokens or decrease bucket minimum Ignoring max_num_batched_tokens at risk of out-of-memory errors. 
INFO 08-27 21:01:02 habana_model_runner.py:600] Generated 32 decode buckets: [(128, 128), (128, 256), (128, 384), (128, 512), (128, 640), (128, 768), (128, 896), (128, 1024), (128, 1152), (128, 1280), (128, 1408), (128, 1536), (128, 1664), (128, 1792), (128, 1920), (128, 2048), (256, 128), (256, 256), (256, 384), (256, 512), (256, 640), (256, 768), (256, 896), (256, 1024), (256, 1152), (256, 1280), (256, 1408), (256, 1536), (256, 1664), (256, 1792), (256, 1920), (256, 2048)] INFO 08-27 21:01:02 habana_model_runner.py:605] Omitted 0 decode buckets due to exceeded token budget (max_num_batched_tokens=2048) ``` --- vllm/worker/habana_model_runner.py | 70 +++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 62a9e814a5ac4..6627ba1ea5643 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -96,14 +96,44 @@ def warmup_range(config: Tuple[int, int, int]): def warmup_buckets(bs_bucket_config, seq_bucket_config, max_num_batched_tokens): - buckets = itertools.product(warmup_range(bs_bucket_config), - warmup_range(seq_bucket_config)) + buckets = list( + itertools.product(warmup_range(bs_bucket_config), + warmup_range(seq_bucket_config))) + if len(buckets) == 0: + msg = ("No buckets could be captured with following config " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config}") + raise ValueError(msg) + # Remove buckets exceeding batch token budget - filtered_buckets = filter( - lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, - buckets) - return list( + filtered_buckets = list( + filter(lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, + buckets)) + + if len(filtered_buckets) == 0: + # legacy case - we can handle this if we ignore max_num_batched_tokens + min_bucket_bs, min_bucket_seq = min(buckets, + key=lambda b: (b[0] * b[1])) + min_reqd_budget = min_bucket_bs * min_bucket_seq + msg = ( + "The current bucketing configuration " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config} cannot be used with specified " + f"max_num_batched_tokens ({max_num_batched_tokens}), as the " + f"smallest bucket ({min_reqd_budget}) would exceed token budget. 
" + "Please increase max_num_batched_tokens or decrease bucket minimum " + "Ignoring max_num_batched_tokens at risk of out-of-memory errors.") + logger.error(msg) + return list(sorted(buckets, key=lambda b: + (b[0] * b[1], b[1], b[0]))), [] + + captured_buckets = list( sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + omitted_buckets = list( + sorted([x for x in buckets if x not in filtered_buckets])) + return captured_buckets, omitted_buckets def next_pow2(value: int): @@ -531,9 +561,9 @@ def _setup_buckets(self) -> None: f"bs:{self.prompt_bs_bucket_cfg}, " f"seq:{self.prompt_seq_bucket_cfg}") logger.info(msg) - self.prompt_buckets = warmup_buckets(self.prompt_bs_bucket_cfg, - self.prompt_seq_bucket_cfg, - self.max_num_batched_tokens) + self.prompt_buckets, prompt_omitted_buckets = warmup_buckets( + self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.prompt_buckets[:] = [ @@ -545,13 +575,21 @@ def _setup_buckets(self) -> None: f"prompt buckets: {list(sorted(self.prompt_buckets))}") logger.info(msg) + msg = (f"Omitted {len(prompt_omitted_buckets)} " + "prompt buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" + logger.debug(msg) + msg = ("Decode bucket config (min, step, max_warmup) " f"bs:{self.decode_bs_bucket_cfg}, " f"seq:{self.decode_seq_bucket_cfg}") logger.info(msg) - self.decode_buckets = warmup_buckets(self.decode_bs_bucket_cfg, - self.decode_seq_bucket_cfg, - self.max_num_batched_tokens) + self.decode_buckets, decode_omitted_buckets = warmup_buckets( + self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg, + self.max_num_batched_tokens) if self.lora_config: self.decode_buckets[:] = [ bucket for bucket in self.decode_buckets @@ -561,6 +599,14 @@ def _setup_buckets(self) -> None: f"{list(sorted(self.decode_buckets))}") logger.info(msg) + msg = (f"Omitted {len(decode_omitted_buckets)} " + "decode buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted decode buckets: {list(sorted(decode_omitted_buckets))}" + logger.debug(msg) + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], From 972f3bc8f0a1a11ab84a0edc59bc9e009e29d003 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 29 Aug 2024 00:42:19 +0000 Subject: [PATCH 137/819] remove arctic gpu hardcode Signed-off-by: Chendi.Xue --- vllm/model_executor/models/arctic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 49e57a847e847..6d92e7597eabf 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -131,14 +131,14 @@ def __init__(self, torch.empty(self.num_experts, 2 * self.intermediate_size, self.hidden_size, - device="cuda", - dtype=self.params_dtype)) + dtype=self.params_dtype), + , requires_grad=False) self.w2s = nn.Parameter( torch.empty(self.num_experts, self.hidden_size, self.intermediate_size, - device="cuda", - dtype=self.params_dtype)) + dtype=self.params_dtype), + requires_grad=False) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, }) From 778d7e64dcaf2728e9688b1c8d18bed600dab243 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 29 Aug 2024 00:42:50 +0000 Subject: [PATCH 138/819] remove dbrx gpu hardcode Signed-off-by: 
Chendi.Xue --- vllm/model_executor/models/dbrx.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index d758333b22388..463003d0bba7b 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -86,17 +86,15 @@ def __init__( self.num_total_experts, 2 * self.intermediate_size, self.d_model, - device="cuda", dtype=self.params_dtype, - )) + ), requires_grad=False) self.w2s = nn.Parameter( torch.empty( self.num_total_experts, self.d_model, self.intermediate_size, - device="cuda", dtype=self.params_dtype, - )) + ), requires_grad=False) set_weight_attrs( self.ws, From 17cd6251924ef66246eeca224bb2cb09da23217b Mon Sep 17 00:00:00 2001 From: Vivek Goel Date: Thu, 29 Aug 2024 11:23:05 +0530 Subject: [PATCH 139/819] Update paddings computed to adjust selected_token_indices (#210) Fixes assert seen when "prompt_logprobs is not None" and BS > 1. Assert was due to shape of paddings being added to matching sampling_metadata.selected_token_indices shape for the case where prompt_logprobs is configured. --- vllm/worker/habana_model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6627ba1ea5643..a975dba6f5136 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1012,8 +1012,13 @@ def prepare_input_tensors( paddings = [max_len - s for s in seq_lens] paddings = [0] + paddings[:-1] paddings = list(itertools.accumulate(paddings)) + paddings_prompt_logprobs = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + if seq_group_metadata.sampling_params.prompt_logprobs is not None \ + and seq_group_metadata.is_prompt: + paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) paddings = torch.tensor( - paddings, + paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, dtype=sampling_metadata.selected_token_indices.dtype, device=sampling_metadata.selected_token_indices.device) sampling_metadata.selected_token_indices.add_(paddings) From f3f1f93b6af654771c20b943d556167f9765a8a8 Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Fri, 30 Aug 2024 10:35:53 +0300 Subject: [PATCH 140/819] Port not warmed-up configurations log warnings --- vllm/worker/habana_model_runner.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a975dba6f5136..133706c18aed6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -448,6 +448,7 @@ def __init__( # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() + self.seen_configs = set() self._mem_margin: Optional[int] = None self._setup_buckets() @@ -1560,6 +1561,14 @@ def finish_measurements(self): from neural_compressor.torch.quantization import finalize_calibration finalize_calibration(self.model.model) + def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): + cfg = (batch_size, seq_len, is_prompt) + seen = cfg in self.seen_configs + self.seen_configs.add(cfg) + if not seen and not warmup_mode: + phase = 'prompt' if is_prompt else 'decode' + logger.warning(f'Configuration: ({phase}, {batch_size}, {seq_len}) was not warmed-up!') + @torch.inference_mode() def execute_model( self, @@ -1594,6 +1603,7 @@ def execute_model( batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) 
use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, @@ -1605,8 +1615,7 @@ def execute_model( execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): execute_model_kwargs.update({ - "bypass_hpu_graphs": not use_graphs, - "warmup_mode": warmup_mode + "bypass_hpu_graphs": not use_graphs }) htorch.core.mark_step() From fd38e5d2fa7a6fb6f8c11dfb5bf8ee801b90451b Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Fri, 30 Aug 2024 11:47:58 +0300 Subject: [PATCH 141/819] Formating for log warnings --- vllm/worker/habana_model_runner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 133706c18aed6..0100076aec8e2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -448,7 +448,7 @@ def __init__( # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() - self.seen_configs = set() + self.seen_configs: set = set() self._mem_margin: Optional[int] = None self._setup_buckets() @@ -1567,7 +1567,8 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): self.seen_configs.add(cfg) if not seen and not warmup_mode: phase = 'prompt' if is_prompt else 'decode' - logger.warning(f'Configuration: ({phase}, {batch_size}, {seq_len}) was not warmed-up!') + logger.warning('Configuration: (', phase, ', ', batch_size, ', ', + seq_len, ') was not warmed-up!') @torch.inference_mode() def execute_model( @@ -1614,9 +1615,7 @@ def execute_model( if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({ - "bypass_hpu_graphs": not use_graphs - }) + execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) htorch.core.mark_step() if self.is_driver_worker: From a032ea2781583756f1fca8bdaa6284fa2693b841 Mon Sep 17 00:00:00 2001 From: Liran Bachar Date: Sun, 1 Sep 2024 12:23:16 +0300 Subject: [PATCH 142/819] support loading autofp8 checkpoint fix gaudi2 weight range to +=240 avoid cuda code in hpu path replace _scaled_mm with hpu op --- vllm/_custom_ops/__init__.py | 75 +++++ .../_cuda_ops.py} | 0 vllm/_custom_ops/_hpu_ops.py | 317 ++++++++++++++++++ vllm/{ => _custom_ops}/_ipex_ops.py | 0 .../compressed_tensors/compressed_tensors.py | 5 +- .../schemes/compressed_tensors_w8a8_fp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 19 +- .../layers/quantization/utils/w8a8_utils.py | 41 ++- vllm/model_executor/models/llama.py | 7 + vllm/utils.py | 58 +--- vllm/worker/habana_model_runner.py | 3 +- vllm/worker/habana_worker.py | 3 +- 12 files changed, 458 insertions(+), 72 deletions(-) create mode 100644 vllm/_custom_ops/__init__.py rename vllm/{_custom_ops.py => _custom_ops/_cuda_ops.py} (100%) create mode 100644 vllm/_custom_ops/_hpu_ops.py rename vllm/{ => _custom_ops}/_ipex_ops.py (100%) diff --git a/vllm/_custom_ops/__init__.py b/vllm/_custom_ops/__init__.py new file mode 100644 index 0000000000000..2411a1465c187 --- /dev/null +++ b/vllm/_custom_ops/__init__.py @@ -0,0 +1,75 @@ + +from functools import lru_cache + +@lru_cache(maxsize=None) +def is_hip() -> bool: + return torch.version.hip is not None + + +@lru_cache(maxsize=None) +def is_cpu() -> bool: + from importlib.metadata import PackageNotFoundError, version + try: + return "cpu" in version("vllm") + except 
PackageNotFoundError: + return False + + +@lru_cache(maxsize=None) +def is_openvino() -> bool: + from importlib.metadata import PackageNotFoundError, version + try: + return "openvino" in version("vllm") + except PackageNotFoundError: + return False + + +@lru_cache(maxsize=None) +def is_neuron() -> bool: + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + + +@lru_cache(maxsize=None) +def is_hpu() -> bool: + from importlib import util + return util.find_spec('habana_frameworks') is not None + + +@lru_cache(maxsize=None) +def is_tpu() -> bool: + try: + import libtpu + except ImportError: + libtpu = None + return libtpu is not None + + +@lru_cache(maxsize=None) +def is_xpu() -> bool: + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() + +if is_xpu(): + from ._ipex_ops import * +elif is_hpu(): + from ._hpu_ops import * +else: + from ._cuda_ops import * \ No newline at end of file diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops/_cuda_ops.py similarity index 100% rename from vllm/_custom_ops.py rename to vllm/_custom_ops/_cuda_ops.py diff --git a/vllm/_custom_ops/_hpu_ops.py b/vllm/_custom_ops/_hpu_ops.py new file mode 100644 index 0000000000000..d553540f9e25a --- /dev/null +++ b/vllm/_custom_ops/_hpu_ops.py @@ -0,0 +1,317 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. 
+############################################################################### +import os +from typing import Optional, Tuple + +import habana_frameworks.torch as htorch +import torch +import torch.nn.functional as F + +import vllm.hpu.utils as hpu_utils + +PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') + + +def silu_and_mul(output, input): + d = input.shape[-1] // 2 + silu = torch.nn.SiLU().to(input.device) + x, y = torch.split(input, d, dim=-1) + output.copy_(silu(x) * y) + + +def fetch_from_cache(cache, blocks, permutations): + return [ + cache.index_select(0, blocks[:, i]).permute(permutations) + for i in range(blocks.size(1)) + ] + + +def paged_attention_v1(query, + key_cache, + value_cache, + head_mapping, + scale, + block_tables, + context_lens, + block_size, + alibi_slopes=None, + kv_cache_dtype=None, + qk_matmul_op=torch.matmul, + softmax_op=torch.softmax, + av_matmul_op=torch.matmul, + k_cache_cls=None, + v_cache_cls=None) -> None: + seq_len = block_tables.size(1) + batch_size, query_heads, _ = query.shape + _, _, kv_heads, _ = key_cache.shape + min_inf = torch.finfo(query.dtype).min + mask = (torch.arange(0, + seq_len * block_size, + dtype=torch.int32, + device=key_cache.device).view(1, -1).expand( + batch_size, -1).ge(context_lens.view(-1, 1)).view( + batch_size, 1, 1, -1)) + query.mul_(scale) + query = query.unsqueeze(-2) + fetch_keys = fetch_from_cache if k_cache_cls is None else k_cache_cls.fetch_from_cache + keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1)) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] + mask = mask.unsqueeze(2) + + attn_weights = [qk_matmul_op(query, k) for k in keys] + attn_weights = torch.cat(attn_weights, dim=-1) + if alibi_slopes is not None: + attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, + -attn_weights.size(3):]) + attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) + + fetch_values = fetch_from_cache if v_cache_cls is None else k_cache_cls.fetch_from_cache + values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) + if PA_SPLIT_VALUE: + attn_weights = attn_weights.split(block_size, dim=-1) + else: + values = [torch.cat(values, dim=-2)] + attn_weights = [attn_weights] + if query_heads != kv_heads: + values = [v.unflatten(1, (kv_heads, 1)) for v in values] + attn_weights = [av_matmul_op(a, v) for a, v in zip(attn_weights, values)] + if query_heads != kv_heads: + attn_weights = [a.flatten(1, 2) for a in attn_weights] + attn_weights = sum(attn_weights) + return attn_weights.squeeze(-2) + + +def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: + d = x.shape[-1] // 2 + output_shape = (x.shape[:-1] + (d, )) + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + silu_and_mul(out, x) + return out + + +def static_fused_moe(hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + num_experts = w1.shape[0] + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, num_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = 
padded_weights.reshape(-1, B, w1.shape[0]) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + + htorch.core.mark_step() + + for expert_idx in range(num_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = silu_and_mul_wrapper( + torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) + w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + htorch.core.mark_step() + + return final_hidden_states.view(-1, D) + + +def prompt_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_bias: Optional[torch.Tensor] = None, + p: float = 0.0, + scale: Optional[float] = None, + qk_matmul_op = torch.matmul, + softmax_op = torch.softmax, + av_matmul_op = torch.matmul, +) -> torch.Tensor: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + query_heads = query.size(1) + kv_heads = key.size(1) + if query_heads != kv_heads: + query = query.unflatten(1, (kv_heads, -1)) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + attn_bias = attn_bias.unsqueeze(2) + attn_weights = qk_matmul_op(query * scale, key.transpose(-1, -2)) + if attn_bias is not None: + attn_weights.add_(attn_bias) + attn_weights = softmax_op(attn_weights, dim=-1) + attn_weights = av_matmul_op(attn_weights, value) + if query_heads != kv_heads: + attn_weights = attn_weights.flatten(1, 2) + attn_weights = attn_weights.transpose(1, 2) + return attn_weights + + + + +def reshape_and_cache(key, + value, + key_cache, + value_cache, + slot_mapping, + dtype, + is_prompt=False): + num_blocks = key_cache.size(0) + block_size = key_cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. + # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. + num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + key_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + key[start_idx:end_idx]) + value_cache.index_put_( + (indices[start_idx:end_idx], offsets[start_idx:end_idx]), + value[start_idx:end_idx]) + + +def prepare_to_cache(cache, slot_mapping): + num_blocks = cache.size(0) + block_size = cache.size(1) + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + offsets = torch.fmod(slot_mapping, block_size) + num_slots_requested = slot_mapping.size(0) + num_slots_available = num_blocks * block_size + # NOTE(kzawora): HPU PT bridge crashes with + # RuntimeError: Invalid inputs for scatter_nd_onnx + # on index_put when num_slots_requested > num_slots_available. + # This case might occur when we have little kv cache blocks and + # lots of padding, or are doing warmup. 
+ # This loop is a workaround for this issue. Please remove it + # once key_cache.index_put_(indices, offsets), key) works. + num_kv_cache_passes = torch.div(num_slots_requested, + num_slots_available).ceil().int().item() + + return num_kv_cache_passes, num_slots_available, indices, offsets + + +def insert_or_update_cache(input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offsets): + for i in range(num_kv_cache_passes): + start_idx = i * num_slots_available + end_idx = (i + 1) * num_slots_available + cache.index_put_( + (block_indices[start_idx:end_idx], block_offsets[start_idx:end_idx]), + input[start_idx:end_idx]) + + +def swap_blocks(src, dst, block_mapping): + index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) + index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) + for src_idx, dst_idx in block_mapping.items(): + index_src[0] = src_idx + index_dst[0] = dst_idx + dst.index_put_([index_dst], src.index_select(0, index_src)) + if dst.device.type == 'hpu': + htorch.core.mark_step() + torch.hpu.synchronize() + + +def copy_blocks(key_caches, value_caches, block_mapping): + index_src = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) + index_dst = torch.zeros((1, ), + dtype=torch.int32, + device=key_caches[0].device) + for src, dsts in block_mapping.items(): + index_src[0] = src + for dst in dsts: + index_dst[0] = dst + for key_cache in key_caches: + key_cache.index_copy_(0, index_dst, + key_cache.index_select(0, index_src)) + for value_cache in value_caches: + value_cache.index_copy_(0, index_dst, + value_cache.index_select(0, index_src)) + if key_caches[0].device.type == 'hpu': + htorch.core.mark_step() + + +# fp8 +def scaled_fp8_quant( + input: torch.Tensor, + scale: Optional[torch.Tensor] = None, + batch_dim_padding: Optional[int] = None, + scale_ub: Optional[torch.Tensor] = None, + use_per_token_if_dynamic: bool = False, +) -> Tuple[torch.Tensor, torch.Tensor]: + + """ + Quantize input tensor to FP8 and return quantized tensor and scale. + + This function supports both static and dynamic quantization: If you + provide the scale, it will use static scaling and if you omit it, + the scale will be determined dynamically. The function also allows + optional padding of the output tensor for downstream kernels that + will benefit from padding. + + Args: + input: The input tensor to be quantized to FP8 + scale: Optional scaling factor for the FP8 quantization + scale_ub: Optional upper bound for scaling factor in dynamic + per token case + batch_dim_padding: If specified, pad the first dimension + of the output to at least this value. + use_per_token_if_dynamic: Whether to do per_tensor or per_token + in the dynamic quantization case. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and + scaling factor. 
+ """ + if batch_dim_padding: + shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:]) + output = torch.empty(shape, + device=input.device, + dtype=torch.float8_e4m3fn) + else: + output = torch.empty_like(input, dtype=torch.float8_e4m3fn) + if scale is None: + raise "dynamic scaled_fp8_quant not implemented for HPU" + #TODO: calculate scale to match gaudi2 240 range instead of 448 + if use_per_token_if_dynamic: + scale = torch.empty((input.numel() // input.shape[-1], 1), + device=input.device, + dtype=torch.float32) + torch.ops._C.dynamic_per_token_scaled_fp8_quant( + output, input, scale, scale_ub) + else: + scale = torch.zeros(1, device=input.device, dtype=torch.float32) + torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) + else: + output = torch.ops.hpu.cast_to_fp8_v2(input, 1/scale, False, False, dtype=torch.float8_e4m3fn)[0] + + return output, scale diff --git a/vllm/_ipex_ops.py b/vllm/_custom_ops/_ipex_ops.py similarity index 100% rename from vllm/_ipex_ops.py rename to vllm/_custom_ops/_ipex_ops.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 39d00bd5733ff..badb29af1f5f6 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -233,7 +233,7 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if torch.cuda.is_available() else True if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -306,7 +306,8 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - self._check_scheme_supported(scheme.get_min_capability()) + if torch.cuda.is_available(): + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index cc9d71db140c2..631774994b5c0 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -21,7 +21,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() + self.cutlass_fp8_supported = cutlass_fp8_supported() if torch.cuda.is_available() else False @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index c829cb836ee4c..8e2ed041adf0b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -112,13 +112,18 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 + + if torch.cuda.is_available(): + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + self.use_marlin = capability < 89 + else: + self.cutlass_fp8_supported = False + self.use_marlin = False def create_weights( self, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 20100c76bd690..de5cd810b2a94 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -6,6 +6,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +if current_platform.is_hpu(): + import habana_frameworks.torch.utils.experimental as htexp def cutlass_fp8_supported() -> bool: @@ -18,8 +20,17 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - fake_qweight = tensor.to(torch.float16) + dtype = torch.float16 + device = tensor.device + if current_platform.is_hpu(): + dtype = torch.bfloat16 + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + #dequant on cpu to avoid nan on gaudi2 + tensor = tensor.to('cpu') + + fake_qweight = tensor.to(dtype).to(device) dq_weight = fake_qweight * inv_scale + return dq_weight @@ -76,6 +87,9 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. 
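
(Editorial aside, not part of the patch: a minimal numeric sketch, with a hypothetical weight value, of why the Gaudi2 branch below rescales by `448/240`. OCP E4M3 checkpoints assume a representable range up to 448, while this patch treats Gaudi2's FP8 range as topping out at 240, so the weight scale must grow by the same ratio for dequantization to stay unchanged.)

```
# FP8 maxima: OCP E4M3 (checkpoint assumption) vs. Gaudi2's FP8 range per this patch.
E4M3_MAX, GAUDI2_MAX = 448.0, 240.0

w = 10.0                      # hypothetical weight value
scale = w / E4M3_MAX          # checkpoint scale: quantized value hits the E4M3 max
q_ref = w / scale             # 448.0 -> representable in E4M3, out of range on Gaudi2

scale_hpu = scale * (E4M3_MAX / GAUDI2_MAX)   # the 448/240 adjustment from the patch
q_hpu = w / scale_hpu         # 240.0 -> fits Gaudi2's narrower FP8 range
assert abs(q_hpu * scale_hpu - w) < 1e-6      # dequantized weight is unchanged
```
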
max_w_scale = weight_scale.max() + if current_platform.is_hpu() and htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + max_w_scale = max_w_scale * (448/240) + # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize @@ -147,12 +161,25 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output, _ = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + if current_platform.is_hpu(): + #hpu does not support torch._scaled_mm (SW-197036) + output = torch.ops.hpu.fp8_gemm_v2(qinput, + False, + weight, + False, + None, + input.dtype, + x_scale, + weight_scale, + None, + False) + else: + output, _ = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) return torch.narrow(output, 0, 0, input.shape[0]) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 676a51ce67f96..f02609aa9ff3b 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -54,6 +54,9 @@ from .interfaces import SupportsLoRA from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from vllm.platforms import current_platform +if current_platform.is_hpu(): + import habana_frameworks.torch.core as htcore class LlamaMLP(nn.Module): @@ -518,6 +521,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) + #Avoid OOM due to large graph when loading weights + if current_platform.is_hpu(): + htcore.mark_step() + # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state diff --git a/vllm/utils.py b/vllm/utils.py index fa6e132dd3522..661d5d62e069b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -176,69 +176,25 @@ def clear(self): def is_hip() -> bool: - return torch.version.hip is not None + return ops.is_hip() - -@lru_cache(maxsize=None) def is_cpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "cpu" in version("vllm") - except PackageNotFoundError: - return False - + return ops.is_cpu() -@lru_cache(maxsize=None) def is_openvino() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "openvino" in version("vllm") - except PackageNotFoundError: - return False - + return ops.is_openvino() -@lru_cache(maxsize=None) def is_neuron() -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None + return ops.is_neuron() - -@lru_cache(maxsize=None) def is_hpu() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - + return ops.is_hpu() -@lru_cache(maxsize=None) def is_tpu() -> bool: - try: - import libtpu - except ImportError: - libtpu = None - return libtpu is not None + return ops.is_tpu() - -@lru_cache(maxsize=None) def is_xpu() -> bool: - from importlib.metadata import version - is_xpu_flag = "xpu" in version("vllm") - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() + return ops.is_xpu() @lru_cache(maxsize=None) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a975dba6f5136..a2c7a96757faa 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -453,8 +453,7 @@ def __init__( def load_model(self) -> None: import habana_frameworks.torch.core as htcore - if self.model_config.quantization == 'inc': - htcore.hpu_set_env() + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model( diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..bf285c93cdd47 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -109,8 +109,7 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. - if self.model_config.quantization == 'inc': - self._set_env_vars() + self._set_env_vars() init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) From 221eb5600f7523c957ebad318e54d908af6c8332 Mon Sep 17 00:00:00 2001 From: Liran Bachar Date: Sun, 1 Sep 2024 13:57:58 +0300 Subject: [PATCH 143/819] Revert "support loading autofp8 checkpoint" This reverts commit a032ea2781583756f1fca8bdaa6284fa2693b841. 
--- .../_cuda_ops.py => _custom_ops.py} | 0 vllm/_custom_ops/__init__.py | 75 ----- vllm/_custom_ops/_hpu_ops.py | 317 ------------------ vllm/{_custom_ops => }/_ipex_ops.py | 0 .../compressed_tensors/compressed_tensors.py | 5 +- .../schemes/compressed_tensors_w8a8_fp8.py | 2 +- .../model_executor/layers/quantization/fp8.py | 19 +- .../layers/quantization/utils/w8a8_utils.py | 41 +-- vllm/model_executor/models/llama.py | 7 - vllm/utils.py | 58 +++- vllm/worker/habana_model_runner.py | 3 +- vllm/worker/habana_worker.py | 3 +- 12 files changed, 72 insertions(+), 458 deletions(-) rename vllm/{_custom_ops/_cuda_ops.py => _custom_ops.py} (100%) delete mode 100644 vllm/_custom_ops/__init__.py delete mode 100644 vllm/_custom_ops/_hpu_ops.py rename vllm/{_custom_ops => }/_ipex_ops.py (100%) diff --git a/vllm/_custom_ops/_cuda_ops.py b/vllm/_custom_ops.py similarity index 100% rename from vllm/_custom_ops/_cuda_ops.py rename to vllm/_custom_ops.py diff --git a/vllm/_custom_ops/__init__.py b/vllm/_custom_ops/__init__.py deleted file mode 100644 index 2411a1465c187..0000000000000 --- a/vllm/_custom_ops/__init__.py +++ /dev/null @@ -1,75 +0,0 @@ - -from functools import lru_cache - -@lru_cache(maxsize=None) -def is_hip() -> bool: - return torch.version.hip is not None - - -@lru_cache(maxsize=None) -def is_cpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "cpu" in version("vllm") - except PackageNotFoundError: - return False - - -@lru_cache(maxsize=None) -def is_openvino() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "openvino" in version("vllm") - except PackageNotFoundError: - return False - - -@lru_cache(maxsize=None) -def is_neuron() -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None - - -@lru_cache(maxsize=None) -def is_hpu() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - - -@lru_cache(maxsize=None) -def is_tpu() -> bool: - try: - import libtpu - except ImportError: - libtpu = None - return libtpu is not None - - -@lru_cache(maxsize=None) -def is_xpu() -> bool: - from importlib.metadata import version - is_xpu_flag = "xpu" in version("vllm") - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() - -if is_xpu(): - from ._ipex_ops import * -elif is_hpu(): - from ._hpu_ops import * -else: - from ._cuda_ops import * \ No newline at end of file diff --git a/vllm/_custom_ops/_hpu_ops.py b/vllm/_custom_ops/_hpu_ops.py deleted file mode 100644 index d553540f9e25a..0000000000000 --- a/vllm/_custom_ops/_hpu_ops.py +++ /dev/null @@ -1,317 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
-############################################################################### -import os -from typing import Optional, Tuple - -import habana_frameworks.torch as htorch -import torch -import torch.nn.functional as F - -import vllm.hpu.utils as hpu_utils - -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') - - -def silu_and_mul(output, input): - d = input.shape[-1] // 2 - silu = torch.nn.SiLU().to(input.device) - x, y = torch.split(input, d, dim=-1) - output.copy_(silu(x) * y) - - -def fetch_from_cache(cache, blocks, permutations): - return [ - cache.index_select(0, blocks[:, i]).permute(permutations) - for i in range(blocks.size(1)) - ] - - -def paged_attention_v1(query, - key_cache, - value_cache, - head_mapping, - scale, - block_tables, - context_lens, - block_size, - alibi_slopes=None, - kv_cache_dtype=None, - qk_matmul_op=torch.matmul, - softmax_op=torch.softmax, - av_matmul_op=torch.matmul, - k_cache_cls=None, - v_cache_cls=None) -> None: - seq_len = block_tables.size(1) - batch_size, query_heads, _ = query.shape - _, _, kv_heads, _ = key_cache.shape - min_inf = torch.finfo(query.dtype).min - mask = (torch.arange(0, - seq_len * block_size, - dtype=torch.int32, - device=key_cache.device).view(1, -1).expand( - batch_size, -1).ge(context_lens.view(-1, 1)).view( - batch_size, 1, 1, -1)) - query.mul_(scale) - query = query.unsqueeze(-2) - fetch_keys = fetch_from_cache if k_cache_cls is None else k_cache_cls.fetch_from_cache - keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1)) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] - mask = mask.unsqueeze(2) - - attn_weights = [qk_matmul_op(query, k) for k in keys] - attn_weights = torch.cat(attn_weights, dim=-1) - if alibi_slopes is not None: - attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, - -attn_weights.size(3):]) - attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) - - fetch_values = fetch_from_cache if v_cache_cls is None else k_cache_cls.fetch_from_cache - values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) - if PA_SPLIT_VALUE: - attn_weights = attn_weights.split(block_size, dim=-1) - else: - values = [torch.cat(values, dim=-2)] - attn_weights = [attn_weights] - if query_heads != kv_heads: - values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [av_matmul_op(a, v) for a, v in zip(attn_weights, values)] - if query_heads != kv_heads: - attn_weights = [a.flatten(1, 2) for a in attn_weights] - attn_weights = sum(attn_weights) - return attn_weights.squeeze(-2) - - -def silu_and_mul_wrapper(x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - silu_and_mul(out, x) - return out - - -def static_fused_moe(hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - num_experts = w1.shape[0] - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, num_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = 
padded_weights.reshape(-1, B, w1.shape[0]) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - - htorch.core.mark_step() - - for expert_idx in range(num_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = silu_and_mul_wrapper( - torch.matmul(current_state_static, w1[expert_idx].transpose(0, 1))) - w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - current_hidden_states_static = w_output * padded_weight - final_hidden_states += current_hidden_states_static - htorch.core.mark_step() - - return final_hidden_states.view(-1, D) - - -def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, - qk_matmul_op = torch.matmul, - softmax_op = torch.softmax, - av_matmul_op = torch.matmul, -) -> torch.Tensor: - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - query_heads = query.size(1) - kv_heads = key.size(1) - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - attn_bias = attn_bias.unsqueeze(2) - attn_weights = qk_matmul_op(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = softmax_op(attn_weights, dim=-1) - attn_weights = av_matmul_op(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) - attn_weights = attn_weights.transpose(1, 2) - return attn_weights - - - - -def reshape_and_cache(key, - value, - key_cache, - value_cache, - slot_mapping, - dtype, - is_prompt=False): - num_blocks = key_cache.size(0) - block_size = key_cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. - # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - key_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - key[start_idx:end_idx]) - value_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - value[start_idx:end_idx]) - - -def prepare_to_cache(cache, slot_mapping): - num_blocks = cache.size(0) - block_size = cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. 
- # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() - - return num_kv_cache_passes, num_slots_available, indices, offsets - - -def insert_or_update_cache(input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offsets): - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - cache.index_put_( - (block_indices[start_idx:end_idx], block_offsets[start_idx:end_idx]), - input[start_idx:end_idx]) - - -def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) - index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) - for src_idx, dst_idx in block_mapping.items(): - index_src[0] = src_idx - index_dst[0] = dst_idx - dst.index_put_([index_dst], src.index_select(0, index_src)) - if dst.device.type == 'hpu': - htorch.core.mark_step() - torch.hpu.synchronize() - - -def copy_blocks(key_caches, value_caches, block_mapping): - index_src = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - index_dst = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - for src, dsts in block_mapping.items(): - index_src[0] = src - for dst in dsts: - index_dst[0] = dst - for key_cache in key_caches: - key_cache.index_copy_(0, index_dst, - key_cache.index_select(0, index_src)) - for value_cache in value_caches: - value_cache.index_copy_(0, index_dst, - value_cache.index_select(0, index_src)) - if key_caches[0].device.type == 'hpu': - htorch.core.mark_step() - - -# fp8 -def scaled_fp8_quant( - input: torch.Tensor, - scale: Optional[torch.Tensor] = None, - batch_dim_padding: Optional[int] = None, - scale_ub: Optional[torch.Tensor] = None, - use_per_token_if_dynamic: bool = False, -) -> Tuple[torch.Tensor, torch.Tensor]: - - """ - Quantize input tensor to FP8 and return quantized tensor and scale. - - This function supports both static and dynamic quantization: If you - provide the scale, it will use static scaling and if you omit it, - the scale will be determined dynamically. The function also allows - optional padding of the output tensor for downstream kernels that - will benefit from padding. - - Args: - input: The input tensor to be quantized to FP8 - scale: Optional scaling factor for the FP8 quantization - scale_ub: Optional upper bound for scaling factor in dynamic - per token case - batch_dim_padding: If specified, pad the first dimension - of the output to at least this value. - use_per_token_if_dynamic: Whether to do per_tensor or per_token - in the dynamic quantization case. - - Returns: - Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP8 and - scaling factor. 
- """ - if batch_dim_padding: - shape = (max(batch_dim_padding, input.shape[0]), *input.shape[1:]) - output = torch.empty(shape, - device=input.device, - dtype=torch.float8_e4m3fn) - else: - output = torch.empty_like(input, dtype=torch.float8_e4m3fn) - if scale is None: - raise "dynamic scaled_fp8_quant not implemented for HPU" - #TODO: calculate scale to match gaudi2 240 range instead of 448 - if use_per_token_if_dynamic: - scale = torch.empty((input.numel() // input.shape[-1], 1), - device=input.device, - dtype=torch.float32) - torch.ops._C.dynamic_per_token_scaled_fp8_quant( - output, input, scale, scale_ub) - else: - scale = torch.zeros(1, device=input.device, dtype=torch.float32) - torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) - else: - output = torch.ops.hpu.cast_to_fp8_v2(input, 1/scale, False, False, dtype=torch.float8_e4m3fn)[0] - - return output, scale diff --git a/vllm/_custom_ops/_ipex_ops.py b/vllm/_ipex_ops.py similarity index 100% rename from vllm/_custom_ops/_ipex_ops.py rename to vllm/_ipex_ops.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index badb29af1f5f6..39d00bd5733ff 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -233,7 +233,7 @@ def _get_scheme_from_parts( if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if torch.cuda.is_available() else True + CompressedTensorsW8A8Fp8.get_min_capability(), error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -306,8 +306,7 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - if torch.cuda.is_available(): - self._check_scheme_supported(scheme.get_min_capability()) + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 631774994b5c0..cc9d71db140c2 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -21,7 +21,7 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() if torch.cuda.is_available() else False + self.cutlass_fp8_supported = cutlass_fp8_supported() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 8e2ed041adf0b..c829cb836ee4c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -112,18 +112,13 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - - if torch.cuda.is_available(): - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - capability = current_platform.get_device_capability() - capability = capability[0] * 10 + capability[1] - self.use_marlin = capability < 89 - else: - self.cutlass_fp8_supported = False - self.use_marlin = False + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the Marlin + # kernel for fast weight-only FP8 quantization + capability = current_platform.get_device_capability() + capability = capability[0] * 10 + capability[1] + self.use_marlin = capability < 89 def create_weights( self, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index de5cd810b2a94..20100c76bd690 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -6,8 +6,6 @@ from vllm import _custom_ops as ops from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -if current_platform.is_hpu(): - import habana_frameworks.torch.utils.experimental as htexp def cutlass_fp8_supported() -> bool: @@ -20,17 +18,8 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - dtype = torch.float16 - device = tensor.device - if current_platform.is_hpu(): - dtype = torch.bfloat16 - if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: - #dequant on cpu to avoid nan on gaudi2 - tensor = tensor.to('cpu') - - fake_qweight = tensor.to(dtype).to(device) + fake_qweight = tensor.to(torch.float16) dq_weight = fake_qweight * inv_scale - return dq_weight @@ -87,9 +76,6 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. 
max_w_scale = weight_scale.max() - if current_platform.is_hpu() and htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: - max_w_scale = max_w_scale * (448/240) - # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize @@ -161,25 +147,12 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - if current_platform.is_hpu(): - #hpu does not support torch._scaled_mm (SW-197036) - output = torch.ops.hpu.fp8_gemm_v2(qinput, - False, - weight, - False, - None, - input.dtype, - x_scale, - weight_scale, - None, - False) - else: - output, _ = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + output, _ = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) return torch.narrow(output, 0, 0, input.shape[0]) else: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f02609aa9ff3b..676a51ce67f96 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -54,9 +54,6 @@ from .interfaces import SupportsLoRA from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers -from vllm.platforms import current_platform -if current_platform.is_hpu(): - import habana_frameworks.torch.core as htcore class LlamaMLP(nn.Module): @@ -521,10 +518,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) - #Avoid OOM due to large graph when loading weights - if current_platform.is_hpu(): - htcore.mark_step() - # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should # make sure to leave KV cache scale factors in a known good (dummy) state diff --git a/vllm/utils.py b/vllm/utils.py index 661d5d62e069b..fa6e132dd3522 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -176,25 +176,69 @@ def clear(self): def is_hip() -> bool: - return ops.is_hip() + return torch.version.hip is not None + +@lru_cache(maxsize=None) def is_cpu() -> bool: - return ops.is_cpu() + from importlib.metadata import PackageNotFoundError, version + try: + return "cpu" in version("vllm") + except PackageNotFoundError: + return False + +@lru_cache(maxsize=None) def is_openvino() -> bool: - return ops.is_openvino() + from importlib.metadata import PackageNotFoundError, version + try: + return "openvino" in version("vllm") + except PackageNotFoundError: + return False + +@lru_cache(maxsize=None) def is_neuron() -> bool: - return ops.is_neuron() + try: + import transformers_neuronx + except ImportError: + transformers_neuronx = None + return transformers_neuronx is not None + +@lru_cache(maxsize=None) def is_hpu() -> bool: - return ops.is_hpu() + from importlib import util + return util.find_spec('habana_frameworks') is not None + +@lru_cache(maxsize=None) def is_tpu() -> bool: - return ops.is_tpu() + try: + import libtpu + except ImportError: + libtpu = None + return libtpu is not None + +@lru_cache(maxsize=None) def is_xpu() -> bool: - return ops.is_xpu() + from importlib.metadata import version + is_xpu_flag = "xpu" in version("vllm") + # vllm is not build with xpu + if not is_xpu_flag: + return False + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + _import_ipex = True + except ImportError as e: + logger.warning("Import Error for IPEX: %s", e.msg) + _import_ipex = False + # ipex dependency is not ready + if not _import_ipex: + logger.warning("not found ipex lib") + return False + return hasattr(torch, "xpu") and torch.xpu.is_available() @lru_cache(maxsize=None) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a2c7a96757faa..a975dba6f5136 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -453,7 +453,8 @@ def __init__( def load_model(self) -> None: import habana_frameworks.torch.core as htcore - htcore.hpu_set_env() + if self.model_config.quantization == 'inc': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model( diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index bf285c93cdd47..9d083915041fe 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -109,7 +109,8 @@ def init_device(self) -> None: raise RuntimeError( f"Not support device type: {self.device_config.device}") # Initialize the distributed environment. 
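# The helpers restored in vllm/utils.py above probe the environment lazily and
# memoize the answer with functools.lru_cache, so the import / package-metadata
# lookups run at most once per process. A standalone sketch of the same pattern
# (illustrative only, mirroring the is_hpu() implementation shown in the diff):
from functools import lru_cache
from importlib import util

@lru_cache(maxsize=None)
def is_hpu() -> bool:
    # find_spec() only checks whether habana_frameworks is installed;
    # the cached result makes every later call effectively free.
    return util.find_spec('habana_frameworks') is not None

is_hpu()  # first call probes the environment
is_hpu()  # served from the cache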
- self._set_env_vars() + if self.model_config.quantization == 'inc': + self._set_env_vars() init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method, self.local_rank) From c899aef31c064523daa5c38746d203dc148518cc Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Mon, 2 Sep 2024 12:54:54 +0300 Subject: [PATCH 144/819] warmup_mode kward restore --- vllm/worker/habana_model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 0100076aec8e2..241980f32f097 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1615,7 +1615,10 @@ def execute_model( if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) + execute_model_kwargs.update({ + "bypass_hpu_graphs": not use_graphs, + "warmup_mode": warmup_mode + }) htorch.core.mark_step() if self.is_driver_worker: From 4eedfb91c8ef33a601b9e203a7ad8048d854222f Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz Date: Mon, 2 Sep 2024 14:24:59 +0300 Subject: [PATCH 145/819] change format --- vllm/worker/habana_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 241980f32f097..dec1b65858eb4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1567,8 +1567,8 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): self.seen_configs.add(cfg) if not seen and not warmup_mode: phase = 'prompt' if is_prompt else 'decode' - logger.warning('Configuration: (', phase, ', ', batch_size, ', ', - seq_len, ') was not warmed-up!') + logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", + phase, batch_size, seq_len) @torch.inference_mode() def execute_model( From 1dccf88380cbbb1c73e033009b2485f39cf6fde1 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 21 Aug 2024 13:43:53 +0300 Subject: [PATCH 146/819] POC for bgmv --- vllm/decode.py | 3 +++ vllm/hpu/ops.py | 30 ++++++++++++++++++++++++------ vllm/worker/habana_model_runner.py | 3 +++ 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 vllm/decode.py diff --git a/vllm/decode.py b/vllm/decode.py new file mode 100644 index 0000000000000..bab140559e321 --- /dev/null +++ b/vllm/decode.py @@ -0,0 +1,3 @@ +def init(): + global is_decode + is_decode = False \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 662c53486b4ca..67f6bef2ab9a4 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -12,6 +12,7 @@ import torch.nn.functional as F from vllm.logger import init_logger +import vllm.decode as decode logger = init_logger(__name__) HPUFusedRMSNorm = None @@ -222,13 +223,30 @@ def dispatch_bgmv_linear( max_loras = wa_t_all.size(0) # Wrap-around for negative indices indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + if decode.is_decode: + wa = wa_t_all[:, 0, :, :].transpose(0, 2) + wb = wb_t_all[:, 0, :, :].transpose(1, 2) + wa_shape = wa.shape + wb_shape = wb.shape + wa = wa.reshape(wa_shape[0], wa_shape[1] * wa_shape[2]) + wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + out = x @ wa + mask = torch.zeros(out.shape[0], out.shape[1], 
dtype=out.dtype) + for i in range(out.shape[0]): + if indices[i] < 0: + continue + start_pos = indices[i] * wa_shape[1] + mask[i, start_pos : start_pos : start_pos + wa_shape[1]] = 1 + out = out * mask.to('hpu') + out = out@wb + else: + wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - x = x.unsqueeze(1) - out = x @ wa - out = out @ wb - out = out.squeeze(1) + x = x.unsqueeze(1) + out = x @ wa + out = out @ wb + out = out.squeeze(1) y += out * scale diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dec1b65858eb4..d81aea05caecd 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -33,6 +33,7 @@ SequenceGroupMetadata) from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_pin_memory_available, make_tensor_with_pad) +import vllm.decode as decode from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -627,6 +628,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() + decode.is_decode = False for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -834,6 +836,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() + decode.is_decode = True for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt From c8e49552730c30d5f07d085694b72e597edef089 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 26 Aug 2024 15:15:52 +0300 Subject: [PATCH 147/819] Prompt mask implementation --- vllm/decode.py | 4 ++-- vllm/hpu/ops.py | 18 +++++---------- vllm/worker/habana_model_runner.py | 36 +++++++++++++++++++++++++++--- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/vllm/decode.py b/vllm/decode.py index bab140559e321..1cf8ea1cdbe11 100644 --- a/vllm/decode.py +++ b/vllm/decode.py @@ -1,3 +1,3 @@ def init(): - global is_decode - is_decode = False \ No newline at end of file + global mask + mask = None \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 67f6bef2ab9a4..ed3fab733bc59 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -223,21 +223,15 @@ def dispatch_bgmv_linear( max_loras = wa_t_all.size(0) # Wrap-around for negative indices indices = indices % max_loras - if decode.is_decode: - wa = wa_t_all[:, 0, :, :].transpose(0, 2) - wb = wb_t_all[:, 0, :, :].transpose(1, 2) + if decode.mask is not None: + wa = wa_t_all[:, 0, :, :] + wb = wb_t_all[:, 0, :, :].transpose(0, 1) wa_shape = wa.shape wb_shape = wb.shape - wa = wa.reshape(wa_shape[0], wa_shape[1] * wa_shape[2]) - wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) + wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) out = x @ wa - mask = torch.zeros(out.shape[0], out.shape[1], dtype=out.dtype) - for i in range(out.shape[0]): - if indices[i] < 0: - continue - start_pos = indices[i] * wa_shape[1] - mask[i, start_pos : start_pos : start_pos + wa_shape[1]] = 1 - out = out * mask.to('hpu') + out = out * decode.mask out = out@wb else: wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d81aea05caecd..e78cf86a54ad1 100644 --- a/vllm/worker/habana_model_runner.py +++ 
b/vllm/worker/habana_model_runner.py @@ -628,7 +628,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - decode.is_decode = False + decode.mask = None for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -741,18 +741,33 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + if self.lora_config: + decode.mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, + (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + counter = 0 for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) + start_row = counter * max_prompt_len + end_row = start_row + max_prompt_len + start_col = (lora_id - 1) * self.lora_config.max_lora_rank + end_col = start_col + self.lora_config.max_lora_rank + decode.mask[start_row:end_row, start_col:end_col] = ones + counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( [lora_id] * (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + if decode.mask is not None: + decode.mask = decode.mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -836,7 +851,14 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - decode.is_decode = True + decode.mask = None + + if self.lora_config: + decode.mask = torch.zeros(len(seq_group_metadata_list), + (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + counter = 0 for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -847,6 +869,10 @@ def _prepare_decode( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) + start_pos = (lora_id - 1) * self.lora_config.max_lora_rank + end_pos = start_pos + self.lora_config.max_lora_rank + decode.mask[counter, start_pos:end_pos] = ones + counter = counter + 1 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] @@ -875,6 +901,8 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) + if decode.mask is not None: + decode.mask = decode.mask.to('hpu') input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -1152,6 +1180,7 @@ def profile_run(self) -> None: True, kv_caches, is_profile_run=True) + return def warmup_scenario(self, batch_size, @@ -1206,7 +1235,7 @@ def warmup_scenario(self, if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] - torch.hpu.synchronize() + #torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) self.execute_model(inputs, kv_caches, warmup_mode=True) @@ -1647,6 +1676,7 @@ def execute_model( module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) + decode.mask = None # Compute the logits. 
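# The _prepare_prompt/_prepare_decode changes above encode LoRA routing as a
# dense mask: one row per (padded) token, and each adapter id owns a contiguous
# block of max_lora_rank columns. A small self-contained sketch of that layout
# (toy sizes, illustrative only -- not part of the patch itself):
import torch

max_loras, max_lora_rank, max_prompt_len = 2, 4, 3
seq_lora_ids = [1, 2]  # lora_id per sequence; 0 means "no LoRA"

mask = torch.zeros(len(seq_lora_ids) * max_prompt_len,
                   (max_loras + 1) * max_lora_rank)
ones = torch.ones(max_prompt_len, max_lora_rank)
for i, lora_id in enumerate(seq_lora_ids):
    if lora_id > 0:
        rows = slice(i * max_prompt_len, (i + 1) * max_prompt_len)
        cols = slice((lora_id - 1) * max_lora_rank, lora_id * max_lora_rank)
        mask[rows, cols] = ones
# Every token row now selects only the rank-columns of its own adapter, so a
# single batched matmul can apply different LoRAs to different sequences.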
with self.profiler.record_event( From 9df71977a62da0f5c0bd475c96112d32d2b7afbd Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 28 Aug 2024 11:09:40 +0300 Subject: [PATCH 148/819] Multi Lora Fix --- vllm/hpu/ops.py | 3 ++- vllm/worker/habana_model_runner.py | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index ed3fab733bc59..9e17339b0e574 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -222,7 +222,6 @@ def dispatch_bgmv_linear( assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) # Wrap-around for negative indices - indices = indices % max_loras if decode.mask is not None: wa = wa_t_all[:, 0, :, :] wb = wb_t_all[:, 0, :, :].transpose(0, 1) @@ -231,9 +230,11 @@ def dispatch_bgmv_linear( wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) out = x @ wa + assert(out.shape == decode.mask.shape) out = out * decode.mask out = out@wb else: + indices = indices % max_loras wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e78cf86a54ad1..152d7f3000572 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -758,7 +758,7 @@ def _prepare_prompt( start_col = (lora_id - 1) * self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank decode.mask[start_row:end_row, start_col:end_col] = ones - counter = counter + 1 + counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( @@ -872,7 +872,7 @@ def _prepare_decode( start_pos = (lora_id - 1) * self.lora_config.max_lora_rank end_pos = start_pos + self.lora_config.max_lora_rank decode.mask[counter, start_pos:end_pos] = ones - counter = counter + 1 + counter = counter + 1 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] @@ -1180,7 +1180,6 @@ def profile_run(self) -> None: True, kv_caches, is_profile_run=True) - return def warmup_scenario(self, batch_size, From 234ffdc637eac9707c1afce7766f8d2445dc5289 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 28 Aug 2024 13:51:41 +0300 Subject: [PATCH 149/819] HPU Graph Fix for decode mask --- vllm/worker/habana_model_runner.py | 47 +++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 152d7f3000572..9d7881768a0d0 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -230,10 +230,11 @@ def forward(self, *args, **kwargs): input_ids.size(1), input_ids.device, torch.bfloat16) + decode.mask = kwargs.pop('mask') hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) - return hidden_states + return hidden_states, decode.mask def compute_logits(self, *args, **kwargs): return self.model.compute_logits(*args, **kwargs) @@ -253,6 +254,7 @@ class PreparePromptMetadata(NamedTuple): lora_requests: Set[LoRARequest] multi_modal_input: Optional[torch.Tensor] slot_mapping: List[List[int]] + mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -267,6 +269,7 @@ def empty(cls): lora_requests=set(), 
multi_modal_input=None, slot_mapping=[], + mask = None ) @@ -278,6 +281,7 @@ class PrepareDecodeMetadata(NamedTuple): lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] + mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -289,6 +293,7 @@ def empty(cls): lora_prompt_mapping=[], lora_requests=set(), slot_mapping=[], + mask=None, ) @@ -324,6 +329,7 @@ class ModelInputForHPU(ModelRunnerInputBase): real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 + mask: Optional[torch.Tensor] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -334,7 +340,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "multi_modal_kwargs": self.multi_modal_kwargs, "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, - "virtual_engine": self.virtual_engine + "virtual_engine": self.virtual_engine, + "mask": mask } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -368,6 +375,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, + "mask": self.mask } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -628,7 +636,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - decode.mask = None + mask = None for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -742,7 +750,7 @@ def _prepare_prompt( self.block_size) if self.lora_config: - decode.mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, + mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -757,7 +765,7 @@ def _prepare_prompt( end_row = start_row + max_prompt_len start_col = (lora_id - 1) * self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank - decode.mask[start_row:end_row, start_col:end_col] = ones + mask[start_row:end_row, start_col:end_col] = ones counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) @@ -766,8 +774,8 @@ def _prepare_prompt( (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - if decode.mask is not None: - decode.mask = decode.mask.to('hpu') + if mask is not None: + mask = mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -834,6 +842,7 @@ def _prepare_prompt( lora_requests=lora_requests, multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, + mask=mask, ) def _prepare_decode( @@ -851,10 +860,10 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - decode.mask = None + mask = None if self.lora_config: - decode.mask = torch.zeros(len(seq_group_metadata_list), + mask = torch.zeros(len(seq_group_metadata_list), (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -871,7 +880,7 @@ def _prepare_decode( lora_requests.add(seq_group_metadata.lora_request) start_pos = (lora_id - 1) * 
self.lora_config.max_lora_rank end_pos = start_pos + self.lora_config.max_lora_rank - decode.mask[counter, start_pos:end_pos] = ones + mask[counter, start_pos:end_pos] = ones counter = counter + 1 for seq_id in seq_ids: @@ -901,8 +910,8 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - if decode.mask is not None: - decode.mask = decode.mask.to('hpu') + if mask is not None: + mask = mask.to('hpu') input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -948,6 +957,7 @@ def _prepare_decode( lora_prompt_mapping=lora_prompt_mapping, lora_requests=lora_requests, slot_mapping=slot_mapping, + mask=mask, ) def prepare_input_tensors( @@ -1002,6 +1012,7 @@ def prepare_input_tensors( lora_requests, multi_modal_input, slot_mapping, + mask, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1011,6 +1022,7 @@ def prepare_input_tensors( decode_lora_prompt_mapping, decode_lora_requests, decode_slot_mapping, + decode_mask, ) = self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1037,6 +1049,7 @@ def prepare_input_tensors( lora_index_mapping = decode_lora_index_mapping lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests + mask = decode_mask # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1106,7 +1119,8 @@ def prepare_input_tensors( lora_mapping=lora_mapping, multi_modal_kwargs=multi_modal_input, real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded), sampling_metadata + batch_size_padded=batch_size_padded, + mask=mask), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1651,6 +1665,11 @@ def execute_model( "warmup_mode": warmup_mode }) + if model_input.mask is not None: + execute_model_kwargs.update({ + "mask": model_input.mask + }) + htorch.core.mark_step() if self.is_driver_worker: model_event_name = ("model_" @@ -1661,7 +1680,7 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( + hidden_states, _ = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices ) From 0a15fb85b963b6104464dbcb54a13c14bce60fdd Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 28 Aug 2024 14:16:07 +0300 Subject: [PATCH 150/819] Fix crash for LoRA disabled --- vllm/hpu/ops.py | 12 +++-- vllm/worker/habana_model_runner.py | 79 +++++++++++++++--------------- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 9e17339b0e574..67e3737f9d8d8 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -230,13 +230,15 @@ def dispatch_bgmv_linear( wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) out = x @ wa - assert(out.shape == decode.mask.shape) + assert (out.shape == decode.mask.shape) out = out * decode.mask - out = out@wb + out = out @ wb else: indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + wa = torch.index_select(wa_t_all, 0, + indices)[:, 0, :, :].transpose(-1, -2) + wb = torch.index_select(wb_t_all, 0, + indices)[:, 0, :, :].transpose(-1, -2) x = x.unsqueeze(1) out = x @ wa @@ 
-278,4 +280,4 @@ def dispatch_bgmv_embedding( x = x.unsqueeze(1) out = x @ wa out = out.squeeze(1) - y += out * scale \ No newline at end of file + y += out * scale diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9d7881768a0d0..07aeaffbcab82 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -258,19 +258,17 @@ class PreparePromptMetadata(NamedTuple): @classmethod def empty(cls): - return PreparePromptMetadata( - input_tokens=[], - input_positions=[], - attn_metadata=None, - seq_lens=[], - query_lens=[], - lora_index_mapping=[], - lora_prompt_mapping=[], - lora_requests=set(), - multi_modal_input=None, - slot_mapping=[], - mask = None - ) + return PreparePromptMetadata(input_tokens=[], + input_positions=[], + attn_metadata=None, + seq_lens=[], + query_lens=[], + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + multi_modal_input=None, + slot_mapping=[], + mask=None) class PrepareDecodeMetadata(NamedTuple): @@ -637,6 +635,7 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() mask = None + counter = 0 for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -751,10 +750,12 @@ def _prepare_prompt( if self.lora_config: mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) - counter = 0 + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(max_prompt_len, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id @@ -773,7 +774,7 @@ def _prepare_prompt( [lora_id] * (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - + if mask is not None: mask = mask.to('hpu') @@ -861,13 +862,16 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() mask = None + counter = 0 if self.lora_config: mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) - counter = 0 + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt @@ -1109,18 +1113,17 @@ def prepare_input_tensors( attn_metadata = prefill_attn_metadata if \ prefill_attn_metadata is not None else decode_attn_metadata - return self._model_input_cls( - input_tokens=input_tokens, - seq_lens=seq_lens, - query_lens=query_lens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_requests=lora_requests, - lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_input, - real_batch_size=real_batch_size, - batch_size_padded=batch_size_padded, - mask=mask), sampling_metadata + return self._model_input_cls(input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + 
attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_input, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded, + mask=mask), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1655,7 +1658,8 @@ def execute_model( "positions": input_positions, "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors + "intermediate_tensors": intermediate_tensors, + "mask": model_input.mask } if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) @@ -1665,11 +1669,6 @@ def execute_model( "warmup_mode": warmup_mode }) - if model_input.mask is not None: - execute_model_kwargs.update({ - "mask": model_input.mask - }) - htorch.core.mark_step() if self.is_driver_worker: model_event_name = ("model_" From 038e36b2af58e4ca54e38cd544c04419b90de49c Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 2 Sep 2024 13:26:24 +0300 Subject: [PATCH 151/819] Remove Global variable --- vllm/decode.py | 3 -- vllm/hpu/ops.py | 53 ++++++++++++++++++++---------- vllm/worker/habana_model_runner.py | 20 +++++------ 3 files changed, 46 insertions(+), 30 deletions(-) delete mode 100644 vllm/decode.py diff --git a/vllm/decode.py b/vllm/decode.py deleted file mode 100644 index 1cf8ea1cdbe11..0000000000000 --- a/vllm/decode.py +++ /dev/null @@ -1,3 +0,0 @@ -def init(): - global mask - mask = None \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 67e3737f9d8d8..bbbb46c32a378 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -12,7 +12,6 @@ import torch.nn.functional as F from vllm.logger import init_logger -import vllm.decode as decode logger = init_logger(__name__) HPUFusedRMSNorm = None @@ -194,6 +193,18 @@ def prompt_attention( return attn_weights +class LoraMask: + lora_mask = None + + @staticmethod + def setLoraMask(mask): + LoraMask.lora_mask = mask + + @staticmethod + def getLoraMask(): + return LoraMask.lora_mask + + def dispatch_bgmv_linear( y: torch.Tensor, x: torch.Tensor, @@ -207,33 +218,41 @@ def dispatch_bgmv_linear( `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices stacked into single tensors, assuming same rank. HPU handles no-LoRA requests using zero valued A and B tensors. These zero valued tensors are - appended at the end of `wa_t_all` and `wb_t_all` during initialization. For - custom BGMV, the corresponding `wa` and `wb` for each batch is created - based on the lora_index of each sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The `wa` tensor for a batch of size batch_Size will have - a shape of (batch_size, num_layers, hidden_dim, lora_rank) - - This method avoids for-loop as well as graph breaks. + appended at the end of `wa_t_all` and `wb_t_all` during initialization. """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' max_loras = wa_t_all.size(0) # Wrap-around for negative indices - if decode.mask is not None: + mask = LoraMask.getLoraMask() + if mask is not None: + """ + We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] + and w_b_t_all to [num_layers * lora_rank, hidden_dim]. 
We also + have a loraMask of shape [batch_size, num_layers * lora_rank] + """ wa = wa_t_all[:, 0, :, :] - wb = wb_t_all[:, 0, :, :].transpose(0, 1) + wb = wb_t_all[:, 0, :, :].transpose(1, 2) wa_shape = wa.shape wb_shape = wb.shape wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) - wb = wb.reshape(wb_shape[0], wb_shape[1] * wb_shape[2]).transpose(0, 1) + wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) out = x @ wa - assert (out.shape == decode.mask.shape) - out = out * decode.mask + assert (out.shape == mask.shape) + out = out * mask out = out @ wb else: + """For custom BGMV, the corresponding `wa` and `wb` for each batch is + created based on the lora_index of each sample. + + For example: + `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, + hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles + no-LoRA case. The `wa` tensor for a batch of size batch_Size will have + a shape of (batch_size, num_layers, hidden_dim, lora_rank) + + This method avoids for-loop as well as graph breaks. + """ indices = indices % max_loras wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 07aeaffbcab82..98e65220edded 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -22,6 +22,7 @@ ModelConfig, MultiModalConfig, ParallelConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group +from vllm.hpu.ops import LoraMask as LoraMask from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest @@ -33,7 +34,6 @@ SequenceGroupMetadata) from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_pin_memory_available, make_tensor_with_pad) -import vllm.decode as decode from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -230,11 +230,11 @@ def forward(self, *args, **kwargs): input_ids.size(1), input_ids.device, torch.bfloat16) - decode.mask = kwargs.pop('mask') + LoraMask.setLoraMask(kwargs.pop('mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) - return hidden_states, decode.mask + return hidden_states def compute_logits(self, *args, **kwargs): return self.model.compute_logits(*args, **kwargs) @@ -339,7 +339,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, - "mask": mask + "mask": self.mask } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -634,8 +634,6 @@ def _prepare_prompt( if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() - mask = None - counter = 0 for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -748,6 +746,8 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) + mask: torch.Tensor = None + counter = 0 if self.lora_config: mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, (self.lora_config.max_loras + 1) * @@ -861,7 +861,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - mask = None + mask: torch.Tensor = None counter = 0 if self.lora_config: @@ -1251,7 
+1251,7 @@ def warmup_scenario(self, if dummy_lora_requests_per_seq else None) for i in range(batch_size) ] - #torch.hpu.synchronize() + torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) self.execute_model(inputs, kv_caches, warmup_mode=True) @@ -1679,7 +1679,7 @@ def execute_model( else: model_event_name = 'model_executable' with self.profiler.record_event('internal', model_event_name): - hidden_states, _ = self.model.forward( + hidden_states = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata.selected_token_indices ) @@ -1693,7 +1693,7 @@ def execute_model( module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) - decode.mask = None + LoraMask.setLoraMask(None) # Compute the logits. with self.profiler.record_event( From ab369e3734ace66fbf94623c55869e9642431b06 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Mon, 2 Sep 2024 13:50:16 +0300 Subject: [PATCH 152/819] Remove limitation on max_num_batched_tokens when using LoRA --- vllm/config.py | 2 +- vllm/lora/layers.py | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 6acb70ad047b2..7aa3977a497ea 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1326,7 +1326,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - if scheduler_config.max_num_batched_tokens > 65528: + if not is_hpu() and scheduler_config.max_num_batched_tokens > 65528: raise ValueError( "Due to limitations of the custom LoRA CUDA kernel, " "max_num_batched_tokens must be <= 65528 when " diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 4a45f3fda88f1..aa01e9fb77af2 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -327,6 +327,17 @@ def set_mapping( def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 embedding_len = self.indices_len[3] + # NOTE(vgoel): These asserts can be skipped when upstreaming. + # Can be removed from vllm-fork also once lora functionality + # on Gaudi stabilizes. 
+ if is_hpu(): + emb_len = embedding_len + x_shape = x.shape + ind_shape = self.embeddings_indices[1].shape + assert embedding_len == x.shape[0] * x.shape[1], \ + f"Extra Info: {emb_len}, {x_shape}, {ind_shape}" + assert embedding_len <= self.embeddings_indices[1].shape[0], \ + f"Extra Info: {emb_len}, {x.shape}, {ind_shape}" indices = self.embeddings_indices[1][:embedding_len].view_as(x) full_lora_a_embeddings = F.embedding( x + indices, From b4f6a295c3e663872a8dadb0caca0fd21995e105 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Tue, 3 Sep 2024 13:24:12 +0200 Subject: [PATCH 153/819] Remove mark step from static MoE loop (#231) Removes unnecessary mark step from MoE OP loop to speed up computation --- vllm/hpu/ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 662c53486b4ca..f3ffe7edbc49d 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -128,7 +128,6 @@ def static_fused_moe(hidden_states, w1, w2, score, topk): w_output = silu_and_mul(w_output) w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) final_hidden_states += w_output * padded_weights[expert_idx] - htorch.core.mark_step() return final_hidden_states.view(-1, D) From 733524ae2fe163b69335aab95a493acf451b0ddb Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 3 Sep 2024 15:26:27 +0000 Subject: [PATCH 154/819] Add newline at EOF Signed-off-by: Chendi.Xue --- Dockerfile.hpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.hpu b/Dockerfile.hpu index b9acec2b85be4..ab714cdac4670 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -15,4 +15,4 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] From fb98cad144e9654abcc698c4b56d793d1d56cce7 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Tue, 3 Sep 2024 16:30:17 +0000 Subject: [PATCH 155/819] Remove requires_grad=False Signed-off-by: Chendi.Xue --- vllm/model_executor/models/arctic.py | 6 ++---- vllm/model_executor/models/dbrx.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 6d92e7597eabf..603579d41946e 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -131,14 +131,12 @@ def __init__(self, torch.empty(self.num_experts, 2 * self.intermediate_size, self.hidden_size, - dtype=self.params_dtype), - , requires_grad=False) + dtype=self.params_dtype)) self.w2s = nn.Parameter( torch.empty(self.num_experts, self.hidden_size, self.intermediate_size, - dtype=self.params_dtype), - requires_grad=False) + dtype=self.params_dtype)) set_weight_attrs(self.ws, { "weight_loader": self.weight_loader, }) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 463003d0bba7b..e3a45b26d909b 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -86,15 +86,13 @@ def __init__( self.num_total_experts, 2 * self.intermediate_size, self.d_model, - dtype=self.params_dtype, - ), requires_grad=False) + dtype=self.params_dtype)) self.w2s = nn.Parameter( torch.empty( self.num_total_experts, self.d_model, self.intermediate_size, - dtype=self.params_dtype, - ), requires_grad=False) + dtype=self.params_dtype)) set_weight_attrs( self.ws, From 
49ffde681d48263f6b1181604bfe5c56049c6f45 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 4 Sep 2024 06:59:38 +0300 Subject: [PATCH 156/819] Change mask to lora_mask --- vllm/worker/habana_model_runner.py | 63 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 98e65220edded..4b65a7ef46721 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -230,7 +230,7 @@ def forward(self, *args, **kwargs): input_ids.size(1), input_ids.device, torch.bfloat16) - LoraMask.setLoraMask(kwargs.pop('mask')) + LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) hidden_states = hidden_states.index_select(0, selected_token_indices) @@ -254,7 +254,7 @@ class PreparePromptMetadata(NamedTuple): lora_requests: Set[LoRARequest] multi_modal_input: Optional[torch.Tensor] slot_mapping: List[List[int]] - mask: Optional[torch.Tensor] + lora_mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -268,7 +268,7 @@ def empty(cls): lora_requests=set(), multi_modal_input=None, slot_mapping=[], - mask=None) + lora_mask=None) class PrepareDecodeMetadata(NamedTuple): @@ -279,7 +279,7 @@ class PrepareDecodeMetadata(NamedTuple): lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] - mask: Optional[torch.Tensor] + lora_mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -291,7 +291,7 @@ def empty(cls): lora_prompt_mapping=[], lora_requests=set(), slot_mapping=[], - mask=None, + lora_mask=None, ) @@ -327,7 +327,7 @@ class ModelInputForHPU(ModelRunnerInputBase): real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 - mask: Optional[torch.Tensor] = None + lora_mask: Optional[torch.Tensor] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -339,7 +339,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, - "mask": self.mask + "lora_mask": self.lora_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -373,7 +373,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, - "mask": self.mask + "lora_mask": self.lora_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -746,13 +746,14 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - mask: torch.Tensor = None + lora_mask: torch.Tensor = None counter = 0 if self.lora_config: - mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras + 1) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_mask = torch.zeros(len(seq_group_metadata_list) * + max_prompt_len, + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -766,7 +767,7 @@ def _prepare_prompt( end_row = start_row + max_prompt_len start_col = (lora_id - 1) * 
self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank - mask[start_row:end_row, start_col:end_col] = ones + lora_mask[start_row:end_row, start_col:end_col] = ones counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) @@ -775,8 +776,8 @@ def _prepare_prompt( (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - if mask is not None: - mask = mask.to('hpu') + if lora_mask is not None: + lora_mask = lora_mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -843,7 +844,7 @@ def _prepare_prompt( lora_requests=lora_requests, multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, - mask=mask, + lora_mask=lora_mask, ) def _prepare_decode( @@ -861,14 +862,14 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - mask: torch.Tensor = None + lora_mask: torch.Tensor = None counter = 0 if self.lora_config: - mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_mask = torch.zeros(len(seq_group_metadata_list), + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) ones = torch.ones(1, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -884,7 +885,7 @@ def _prepare_decode( lora_requests.add(seq_group_metadata.lora_request) start_pos = (lora_id - 1) * self.lora_config.max_lora_rank end_pos = start_pos + self.lora_config.max_lora_rank - mask[counter, start_pos:end_pos] = ones + lora_mask[counter, start_pos:end_pos] = ones counter = counter + 1 for seq_id in seq_ids: @@ -914,8 +915,8 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - if mask is not None: - mask = mask.to('hpu') + if lora_mask is not None: + lora_mask = lora_mask.to('hpu') input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -961,7 +962,7 @@ def _prepare_decode( lora_prompt_mapping=lora_prompt_mapping, lora_requests=lora_requests, slot_mapping=slot_mapping, - mask=mask, + lora_mask=lora_mask, ) def prepare_input_tensors( @@ -1016,7 +1017,7 @@ def prepare_input_tensors( lora_requests, multi_modal_input, slot_mapping, - mask, + lora_mask, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1026,7 +1027,7 @@ def prepare_input_tensors( decode_lora_prompt_mapping, decode_lora_requests, decode_slot_mapping, - decode_mask, + decode_lora_mask, ) = self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1053,7 +1054,7 @@ def prepare_input_tensors( lora_index_mapping = decode_lora_index_mapping lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - mask = decode_mask + lora_mask = decode_lora_mask # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1123,7 +1124,7 @@ def prepare_input_tensors( multi_modal_kwargs=multi_modal_input, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded, - mask=mask), sampling_metadata + lora_mask=lora_mask), sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1659,7 +1660,7 @@ def execute_model( "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors, - "mask": model_input.mask + 
"lora_mask": model_input.lora_mask } if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) From 538c8f15f759ee7b18f6b738c74a00d6f304ba3a Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar Date: Wed, 4 Sep 2024 12:21:04 +0300 Subject: [PATCH 157/819] Move compute_logits to Mask Based Implementation --- vllm/hpu/ops.py | 53 +++++++--------------------- vllm/worker/habana_model_runner.py | 55 +++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bbbb46c32a378..1ee56610d9ee5 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -219,50 +219,23 @@ def dispatch_bgmv_linear( stacked into single tensors, assuming same rank. HPU handles no-LoRA requests using zero valued A and B tensors. These zero valued tensors are appended at the end of `wa_t_all` and `wb_t_all` during initialization. + We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] + and w_b_t_all to [num_layers * lora_rank, hidden_dim]. We also + have a loraMask of shape [batch_size, num_layers * lora_rank] """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wa_t_all.size(0) - # Wrap-around for negative indices mask = LoraMask.getLoraMask() - if mask is not None: - """ - We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] - and w_b_t_all to [num_layers * lora_rank, hidden_dim]. We also - have a loraMask of shape [batch_size, num_layers * lora_rank] - """ - wa = wa_t_all[:, 0, :, :] - wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wa_shape = wa.shape - wb_shape = wb.shape - wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) - wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) - out = x @ wa - assert (out.shape == mask.shape) - out = out * mask - out = out @ wb - else: - """For custom BGMV, the corresponding `wa` and `wb` for each batch is - created based on the lora_index of each sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The `wa` tensor for a batch of size batch_Size will have - a shape of (batch_size, num_layers, hidden_dim, lora_rank) - - This method avoids for-loop as well as graph breaks. 
- """ - indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, - indices)[:, 0, :, :].transpose(-1, -2) - wb = torch.index_select(wb_t_all, 0, - indices)[:, 0, :, :].transpose(-1, -2) - - x = x.unsqueeze(1) - out = x @ wa - out = out @ wb - out = out.squeeze(1) + wa = wa_t_all[:, 0, :, :] + wb = wb_t_all[:, 0, :, :].transpose(1, 2) + wa_shape = wa.shape + wb_shape = wb.shape + wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) + wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + out = x @ wa + assert (out.shape == mask.shape) + out = out * mask + out = out @ wb y += out * scale diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4b65a7ef46721..e03c9167ad308 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -255,6 +255,7 @@ class PreparePromptMetadata(NamedTuple): multi_modal_input: Optional[torch.Tensor] slot_mapping: List[List[int]] lora_mask: Optional[torch.Tensor] + lora_logits_mask: Optional[torch.Tensor] @classmethod def empty(cls): @@ -268,7 +269,8 @@ def empty(cls): lora_requests=set(), multi_modal_input=None, slot_mapping=[], - lora_mask=None) + lora_mask=None, + lora_logits_mask=None) class PrepareDecodeMetadata(NamedTuple): @@ -280,19 +282,19 @@ class PrepareDecodeMetadata(NamedTuple): lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] lora_mask: Optional[torch.Tensor] + lora_logits_mask: Optional[torch.Tensor] @classmethod def empty(cls): - return PrepareDecodeMetadata( - input_tokens=[], - input_positions=[], - attn_metadata=None, - lora_index_mapping=[], - lora_prompt_mapping=[], - lora_requests=set(), - slot_mapping=[], - lora_mask=None, - ) + return PrepareDecodeMetadata(input_tokens=[], + input_positions=[], + attn_metadata=None, + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + slot_mapping=[], + lora_mask=None, + lora_logits_mask=None) # How batches are constructed. 
@@ -328,6 +330,7 @@ class ModelInputForHPU(ModelRunnerInputBase): batch_size_padded: Optional[int] = None virtual_engine: int = 0 lora_mask: Optional[torch.Tensor] = None + lora_logits_mask: Optional[torch.Tensor] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -340,6 +343,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, "lora_mask": self.lora_mask, + "lora_logits_mask": self.lora_logits_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -374,6 +378,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, "lora_mask": self.lora_mask, + "lora_logits_mask": self.lora_logits_mask, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -747,6 +752,7 @@ def _prepare_prompt( self.block_size) lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None counter = 0 if self.lora_config: lora_mask = torch.zeros(len(seq_group_metadata_list) * @@ -754,9 +760,17 @@ def _prepare_prompt( (self.lora_config.max_loras + 1) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + lora_logits_mask = torch.zeros(len(seq_group_metadata_list), + (self.lora_config.max_loras + 1) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(max_prompt_len, self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + logit_ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id @@ -768,6 +782,7 @@ def _prepare_prompt( start_col = (lora_id - 1) * self.lora_config.max_lora_rank end_col = start_col + self.lora_config.max_lora_rank lora_mask[start_row:end_row, start_col:end_col] = ones + lora_logits_mask[counter, start_col:end_col] = logit_ones counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) @@ -778,6 +793,7 @@ def _prepare_prompt( if lora_mask is not None: lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_logits_mask.to('hpu') input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, @@ -845,6 +861,7 @@ def _prepare_prompt( multi_modal_input=multi_modal_input, slot_mapping=slot_mapping, lora_mask=lora_mask, + lora_logits_mask=lora_logits_mask, ) def _prepare_decode( @@ -863,6 +880,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None counter = 0 if self.lora_config: @@ -917,6 +935,7 @@ def _prepare_decode( if lora_mask is not None: lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_mask input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device) @@ -963,6 +982,7 @@ def _prepare_decode( lora_requests=lora_requests, slot_mapping=slot_mapping, lora_mask=lora_mask, + lora_logits_mask=lora_logits_mask, ) def prepare_input_tensors( @@ -1018,6 +1038,7 @@ def prepare_input_tensors( multi_modal_input, slot_mapping, lora_mask, + lora_logits_mask, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1028,6 +1049,7 @@ def prepare_input_tensors( decode_lora_requests, decode_slot_mapping, decode_lora_mask, + decode_lora_logits_mask, ) = 
self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1055,6 +1077,7 @@ def prepare_input_tensors( lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests lora_mask = decode_lora_mask + lora_logits_mask = decode_lora_logits_mask # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1124,7 +1147,9 @@ def prepare_input_tensors( multi_modal_kwargs=multi_modal_input, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded, - lora_mask=lora_mask), sampling_metadata + lora_mask=lora_mask, + lora_logits_mask=lora_logits_mask), \ + sampling_metadata def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: @@ -1198,6 +1223,7 @@ def profile_run(self) -> None: True, kv_caches, is_profile_run=True) + return def warmup_scenario(self, batch_size, @@ -1694,7 +1720,10 @@ def execute_model( module.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) - LoraMask.setLoraMask(None) + lora_logits_mask: torch.Tensor = model_input.lora_logits_mask + LoraMask.setLoraMask( + lora_logits_mask.index_select( + 0, sampling_metadata.selected_token_indices)) # Compute the logits. with self.profiler.record_event( From 691255b5e8b408d0746eb460c3f1152f819d9c76 Mon Sep 17 00:00:00 2001 From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:30:54 +0200 Subject: [PATCH 158/819] Enable llama-405b - w/a for memory allocation error (#184) Work around for allocation error while loading llama-405b. --- vllm/model_executor/models/llama.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 676a51ce67f96..d659d0a3f1127 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -517,6 +517,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + if current_platform.is_hpu(): + torch.hpu.synchronize() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From a4e1d5273bdf5b36eb03cbaee763a54282470e59 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 4 Sep 2024 13:45:30 +0200 Subject: [PATCH 159/819] [bugfix] handle large bucket minimums correctly (#235) This bugfix addresses incorrect lower boundary handling for bucketing Previous behavior: ``` INFO 09-03 19:36:28 habana_model_runner.py:564] Prompt bucket config (min, step, max_warmup) bs:[64, 32, 64], seq:[768, 128, 768] INFO 09-03 19:36:28 habana_model_runner.py:577] Generated 12 prompt buckets: [(32, 128), (32, 256), (32, 384), (32, 512), (32, 640), (32, 768), (64, 128), (64, 256), (64, 384), (64, 512), (64, 640), (64, 768)] INFO 09-03 19:36:28 habana_model_runner.py:582] Omitted 0 prompt buckets due to exceeded token budget (max_num_batched_tokens=131072) INFO 09-03 19:36:28 habana_model_runner.py:590] Decode bucket config (min, step, max_warmup) bs:[64, 128, 64], seq:[768, 128, 1024] INFO 09-03 19:36:28 habana_model_runner.py:601] Generated 8 decode buckets: [(64, 128), (64, 256), (64, 384), (64, 512), (64, 640), (64, 768), (64, 896), (64, 1024)] INFO 09-03 19:36:28 habana_model_runner.py:606] Omitted 0 decode buckets due to exceeded token budget (max_num_batched_tokens=131072) ``` Min seq len dimension is set to 768, but buckets with seq_len=128-768 are present Current behavior: ``` INFO 09-03 19:45:42 habana_model_runner.py:563] Prompt bucket config (min, step, max_warmup) bs:[64, 32, 64], seq:[768, 128, 768] INFO 09-03 19:45:42 habana_model_runner.py:576] Generated 1 prompt buckets: [(64, 768)] INFO 09-03 19:45:42 habana_model_runner.py:581] Omitted 0 prompt buckets due to exceeded token budget (max_num_batched_tokens=131072) INFO 09-03 19:45:42 habana_model_runner.py:589] Decode bucket config (min, step, max_warmup) bs:[64, 128, 64], seq:[768, 128, 1024] INFO 09-03 19:45:42 habana_model_runner.py:600] Generated 3 decode buckets: [(64, 768), (64, 896), (64, 1024)] INFO 09-03 19:45:42 habana_model_runner.py:605] Omitted 0 decode buckets due to exceeded token budget (max_num_batched_tokens=131072) ``` No bucket with seq_len < 768 is captured --- vllm/worker/habana_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dec1b65858eb4..d80861babea45 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -91,7 +91,8 @@ def warmup_range(config: Tuple[int, int, int]): ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ ramp_up_acc) stable = range(bstep, bmax + 1, bstep) - return list(ramp_up_tw) + list(stable) + buckets = list(ramp_up_tw) + list(stable) + return list(filter(lambda bucket: bucket >= bmin, buckets)) def warmup_buckets(bs_bucket_config, seq_bucket_config, From 8046d81cf279828be7b4d9a0b2a242e592748302 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Wed, 4 Sep 2024 02:17:11 +0000 Subject: [PATCH 160/819] fix guided_decode HPU failing issue Signed-off-by: Chendi.Xue --- .../guided_decoding/outlines_logits_processors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 1c8f6cccb3e9a..5382f0f655264 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -61,7 +61,7 @@ def __call__(self, input_ids: List[int], -math.inf, 
device=scores.device) mask[allowed_tokens] = 0 - scores.add_(mask) + scores = scores.add(mask) return scores From 7cd226c0110a4fcbc01f85df73dd334994e1d767 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 5 Sep 2024 11:30:01 +0200 Subject: [PATCH 161/819] Remove token budget from decode buckets (#241) This PR prevents max_num_batched_tokens from limiting decode buckets, as decode buckets should be limited by number of blocks, not by max_num_batched_tokens. --- vllm/worker/habana_model_runner.py | 66 ++++++++++++++---------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d80861babea45..92df83bd968d2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -95,8 +95,9 @@ def warmup_range(config: Tuple[int, int, int]): return list(filter(lambda bucket: bucket >= bmin, buckets)) -def warmup_buckets(bs_bucket_config, seq_bucket_config, - max_num_batched_tokens): +def warmup_buckets(bs_bucket_config, + seq_bucket_config, + max_num_batched_tokens=None): buckets = list( itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config))) @@ -107,28 +108,32 @@ def warmup_buckets(bs_bucket_config, seq_bucket_config, f"seq:{seq_bucket_config}") raise ValueError(msg) - # Remove buckets exceeding batch token budget - filtered_buckets = list( - filter(lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, - buckets)) - - if len(filtered_buckets) == 0: - # legacy case - we can handle this if we ignore max_num_batched_tokens - min_bucket_bs, min_bucket_seq = min(buckets, - key=lambda b: (b[0] * b[1])) - min_reqd_budget = min_bucket_bs * min_bucket_seq - msg = ( - "The current bucketing configuration " - f"(min, step, max_warmup): " - f"bs:{bs_bucket_config}, " - f"seq:{seq_bucket_config} cannot be used with specified " - f"max_num_batched_tokens ({max_num_batched_tokens}), as the " - f"smallest bucket ({min_reqd_budget}) would exceed token budget. " - "Please increase max_num_batched_tokens or decrease bucket minimum " - "Ignoring max_num_batched_tokens at risk of out-of-memory errors.") - logger.error(msg) - return list(sorted(buckets, key=lambda b: - (b[0] * b[1], b[1], b[0]))), [] + filtered_buckets = buckets + if max_num_batched_tokens is not None: + # Remove buckets exceeding batch token budget + filtered_buckets = list( + filter( + lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, + buckets)) + + if len(filtered_buckets) == 0: + # we can handle this if we ignore max_num_batched_tokens + min_bucket_bs, min_bucket_seq = min(buckets, + key=lambda b: (b[0] * b[1])) + min_reqd_budget = min_bucket_bs * min_bucket_seq + msg = ( + "The current bucketing configuration " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config} cannot be used with specified " + f"max_num_batched_tokens ({max_num_batched_tokens}), as the " + f"smallest bucket ({min_reqd_budget}) would exceed token " + "budget. 
Please increase max_num_batched_tokens or decrease " + "bucket minimum Ignoring max_num_batched_tokens at risk of " + "out-of-memory errors.") + logger.error(msg) + return list( + sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))), [] captured_buckets = list( sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -589,9 +594,8 @@ def _setup_buckets(self) -> None: f"bs:{self.decode_bs_bucket_cfg}, " f"seq:{self.decode_seq_bucket_cfg}") logger.info(msg) - self.decode_buckets, decode_omitted_buckets = warmup_buckets( - self.decode_bs_bucket_cfg, self.decode_seq_bucket_cfg, - self.max_num_batched_tokens) + self.decode_buckets, _ = warmup_buckets(self.decode_bs_bucket_cfg, + self.decode_seq_bucket_cfg) if self.lora_config: self.decode_buckets[:] = [ bucket for bucket in self.decode_buckets @@ -601,14 +605,6 @@ def _setup_buckets(self) -> None: f"{list(sorted(self.decode_buckets))}") logger.info(msg) - msg = (f"Omitted {len(decode_omitted_buckets)} " - "decode buckets due to exceeded token budget " - f"(max_num_batched_tokens={self.max_num_batched_tokens})") - logger.info(msg) - - msg = f"Omitted decode buckets: {list(sorted(decode_omitted_buckets))}" - logger.debug(msg) - def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], From d0eb7d7087dea5bac4f918a1fc545733d6f72f27 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 5 Sep 2024 11:30:40 +0200 Subject: [PATCH 162/819] [habana_main bugfix] Fix min bucket boundary calculation (#239) Ports https://github.com/HabanaAI/vllm-fork/pull/97 to habana_main --- vllm/worker/habana_model_runner.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 92df83bd968d2..dbd538e45027c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -142,8 +142,8 @@ def warmup_buckets(bs_bucket_config, return captured_buckets, omitted_buckets -def next_pow2(value: int): - res = 1 +def next_pow2(value: int, base: int): + res = base while value > 1: value = (value + 1) // 2 res *= 2 @@ -155,12 +155,10 @@ def round_up(value: int, k: int): def find_bucket(value: int, config: Tuple[int, int, int]): - bmin, bstep, bmax = config - if value < bstep: - result = min(next_pow2(value), bstep) - else: - result = round_up(value, bstep) - return result + bmin, bstep, _ = config + next_step = round_up(value, bstep) + next_pow = next_pow2(value, bmin) + return max(bmin, min(next_step, next_pow)) def subtuple(obj: object, From d2e2854ed3a99681aed60c177aa36fb7e9945fe8 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Fri, 6 Sep 2024 01:30:43 +0000 Subject: [PATCH 163/819] fix rotary embedding --- vllm/hpu/rotary_embed.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py index 30a88d68a24af..1857253f47f1b 100644 --- a/vllm/hpu/rotary_embed.py +++ b/vllm/hpu/rotary_embed.py @@ -100,6 +100,11 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, self.head_size)) key = key.reshape((key.shape[0], key.shape[1], key.shape[2] // self.head_size, self.head_size)) + query_rot = query[..., :self.dim] + key_rot = key[..., :self.dim] + if self.dim < self.head_size: + query_pass = query[..., self.dim:] + key_pass = key[..., self.dim:] if len(positions[0]) == 1: cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) @@ -107,8 +112,11 @@ def forward(self, positions: torch.Tensor, query: torch.Tensor, else: cos 
= cos[positions].unsqueeze(2) sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query, cos, sin, - 0), FusedRoPE.apply(key, cos, sin, 0) + query, key = FusedRoPE.apply(query_rot, cos, sin, + 0), FusedRoPE.apply(key_rot, cos, sin, 0) + if self.dim < self.head_size: + query = torch.cat((query, query_pass), dim=-1) + key = torch.cat((key, key_pass), dim=-1) return query.reshape( (query.shape[0], query.shape[1], query.shape[2] * query.shape[3])), key.reshape( From 97bd0fdc079b20027f69f5db2494451bdce2b10d Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 3 Sep 2024 10:15:17 +0300 Subject: [PATCH 164/819] =?UTF-8?q?Avoiding=20torch.index=5Fselect=20for?= =?UTF-8?q?=20embedding=20LoRA=E2=80=93B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm/hpu/ops.py | 60 +++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 746e87dad4aea..bacb755b39393 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -215,22 +215,24 @@ def dispatch_bgmv_linear( ): """ `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices - stacked into single tensors, assuming same rank. HPU handles no-LoRA - requests using zero valued A and B tensors. These zero valued tensors are - appended at the end of `wa_t_all` and `wb_t_all` during initialization. - We reshape w_a_t_all to [hidden_dim, num_layers * lora_rank] - and w_b_t_all to [num_layers * lora_rank, hidden_dim]. We also - have a loraMask of shape [batch_size, num_layers * lora_rank] + stacked at dimension 0 into single tensors, assuming same rank. `wa` is the + reshaped and transposed version of `wa_t_all` of shape + (h_in, max_loras * lora_rank) and `wb` is the transposed and reshaped + version of `wb_t_all` of shape (max_loras * lora_rank, h_out). + + Matmul input `x` with `wa`. Multiply `x` with a mask to zero-out inputs of + inactive LoRA indices. Matmul masked output with `wb` and scale it to get + the final output. """ assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' mask = LoraMask.getLoraMask() + wa = wa_t_all[:, 0, :, :] wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wa_shape = wa.shape - wb_shape = wb.shape - wa = wa.reshape(wa_shape[0] * wa_shape[1], wa_shape[2]).transpose(0, 1) - wb = wb.reshape(wb_shape[0] * wb_shape[1], wb_shape[2]) + wa = wa.reshape(wa.shape[0] * wa.shape[1], wa.shape[2]).transpose(0, 1) + wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) + out = x @ wa assert (out.shape == mask.shape) out = out * mask @@ -241,34 +243,28 @@ def dispatch_bgmv_linear( def dispatch_bgmv_embedding( y: torch.Tensor, x: torch.Tensor, - wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, indices: torch.LongTensor, layer_idx: int, scale: float, ): """ - `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor - assuming same rank. HPU handles no-LoRA requests using zero valued A - tensor. This zero valued tensor is appended at the end of `wa_t_all` during - initialization. For custom BGMV, the corresponding wa for each batch is - created based on the lora_index of the sample. - - For example: - `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank, - hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles - no-LoRA case. The wa tensor for a batch of size batch_Size will have a - shape of (batch_size, num_layers, lora_rank, hidden_dim) - - - This method avoids for-loop as well as graph breaks. 
+ `wb_t_all` contains all LoRA-B weight matrices stacked at dimension 0 into + a single tensor, assuming same rank. `wb` is the transposed and reshaped + version of `wb_t_all` of shape (num_loras * lora_rank, embedding_dim). + + Output of LoRA-A embedding (tensor x) is repeated max_loras times to match + the shape of `wb`. Multiply `x` with a mask to zero-out inputs of inactive + LoRA indices. Matmul masked output with `wb` and scale it to get the final + output. """ + assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wa_t_all.size(0) - # Wrap-around for negative indices - indices = indices % max_loras - wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2) + max_loras = wb_t_all.size(0) - x = x.unsqueeze(1) - out = x @ wa - out = out.squeeze(1) + x = x.repeat(1, max_loras) + x = x * LoraMask.getLoraMask() + wb = wb_t_all[:, 0, :, :].transpose(1, 2) + wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) + out = x @ wb y += out * scale From ededdaf38bb7a141c9db03a5df060c72eca68b51 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 4 Sep 2024 15:29:24 +0300 Subject: [PATCH 165/819] Remove special handling of no-LoRA case --- vllm/lora/models.py | 20 +++----------------- vllm/worker/habana_model_runner.py | 6 +++--- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 30d2fd9502977..e8d39591cb17a 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -24,7 +24,7 @@ from vllm.lora.utils import (from_layer, from_layer_logits_processor, parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA -from vllm.utils import get_device, is_hpu, is_pin_memory_available +from vllm.utils import get_device, is_pin_memory_available logger = init_logger(__name__) @@ -465,25 +465,11 @@ def __init__( @property def capacity(self) -> int: - if is_hpu(): - # HPU handles no LoRA requests using zero valued A and B tensors. - # These zero valued tensors are appended at the end of A and B, - # making total number of loras to be lora_config.max_cpu_loras + 1. - # This demands the total number of max_cpu_loras to be - # lora_config.max_cpu_loras + 1 - return self.lora_config.max_cpu_loras + 1 - else: - return self.lora_config.max_cpu_loras + return self.lora_config.max_cpu_loras @property def lora_slots(self) -> int: - if is_hpu(): - # HPU handles no LoRA requests using zero valued A and B tensors. - # These zero valued tensors are appended at the end of A and B, - # making total number of loras to be lora_config.max_cpu_loras + 1. 
- return self.lora_config.max_loras + 1 - else: - return self.lora_config.max_loras + return self.lora_config.max_loras @property def adapter_slots(self) -> int: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a9a3f35d3934b..bf708c9ab01d7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -752,11 +752,11 @@ def _prepare_prompt( if self.lora_config: lora_mask = torch.zeros(len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras + 1) * + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) lora_logits_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) @@ -880,7 +880,7 @@ def _prepare_decode( if self.lora_config: lora_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras + 1) * + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) ones = torch.ones(1, From b507cc4a33c79e241072c6ebf8ec9cf2189ee90a Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 4 Sep 2024 17:50:08 +0300 Subject: [PATCH 166/819] Update test --- tests/lora/test_multilora_hpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index edca64fd5a2ae..64eda037ff059 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -96,7 +96,7 @@ def _test_llama_multilora(sql_lora_files, tp_size): enable_lora=True, max_loras=2, max_lora_rank=8, - max_num_seqs=16, + max_num_seqs=256, dtype='float32', tensor_parallel_size=tp_size) engine = LLMEngine.from_engine_args(engine_args) From 016f34351260796dd35ab3c1a06719945ab04067 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Fri, 6 Sep 2024 08:53:40 +0300 Subject: [PATCH 167/819] Fix formatting --- vllm/worker/habana_model_runner.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index bf708c9ab01d7..4be178c6fb168 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -750,11 +750,10 @@ def _prepare_prompt( lora_logits_mask: torch.Tensor = None counter = 0 if self.lora_config: - lora_mask = torch.zeros(len(seq_group_metadata_list) * - max_prompt_len, - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_mask = torch.zeros( + len(seq_group_metadata_list) * max_prompt_len, + (self.lora_config.max_loras) * self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) lora_logits_mask = torch.zeros(len(seq_group_metadata_list), (self.lora_config.max_loras) * self.lora_config.max_lora_rank, From d9fa7cfccd6e858916dce70dac24a1ae339097fa Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Fri, 6 Sep 2024 15:00:06 +0200 Subject: [PATCH 168/819] Dispersed dummy slots (#243) Use all possible slot values for dummy blocks to avoid caching issues. 
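
For context, here is a minimal standalone sketch of the slot-dispersion idea applied in the hunk below; the block size and slot-mapping values are made up for illustration:

```
import itertools

# Sketch only: padding entries in slot_mapping no longer all point at slot 0
# (_PAD_SLOT_ID); they cycle through every slot of the reserved padding block,
# which is the workaround for the caching issues mentioned above.
_PAD_SLOT_ID = 0
block_size = 4  # example value

dummy_slots = itertools.cycle(range(_PAD_SLOT_ID, _PAD_SLOT_ID + block_size))
slot_mapping = [[17, _PAD_SLOT_ID, _PAD_SLOT_ID],
                [42, _PAD_SLOT_ID, _PAD_SLOT_ID]]  # example decode mapping
slot_mapping = [[s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl]
                for sl in slot_mapping]
print(slot_mapping)  # [[17, 0, 1], [42, 2, 3]]
```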
--- vllm/worker/habana_model_runner.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a9a3f35d3934b..166ad760d27ca 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -48,7 +48,11 @@ logger = init_logger(__name__) +# These values are assumed to be zero in several places. +# Use caution when updating them! _PAD_SLOT_ID = 0 +_PAD_BLOCK_ID = 0 + LORA_WARMUP_RANK = 8 _TYPE_CACHE = {} @@ -937,6 +941,13 @@ def _prepare_decode( input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device) + + dummy_slots = itertools.cycle( + range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) + slot_mapping = [[ + s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl + ] for sl in slot_mapping] + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) @@ -1193,7 +1204,7 @@ def create_dummy_seq_group_metadata(self, else: input_len = seq_len - 1 output_len = 1 - block_tables = {group_id: [0] * num_blocks} + block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} prompt_token_ids = [0] * input_len output_token_ids = [1] * output_len seq_data = SequenceData(prompt_token_ids) From 7488c584ddb36a81a900614d434445e1d66dbcf0 Mon Sep 17 00:00:00 2001 From: Marceli Fylcek Date: Fri, 6 Sep 2024 15:16:33 +0200 Subject: [PATCH 169/819] Use PT_COMPILE_ONLY_MODE during warmup (#227) With PT_COMPILE_ONLY_MODE flag, graphs can be compiled without performing synLaunch. The flag has been added to the warmup phase to decrease its execution time. --- vllm/worker/habana_model_runner.py | 125 +++++++++++++++-------------- 1 file changed, 66 insertions(+), 59 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 166ad760d27ca..9dc02fba0213a 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -15,6 +15,7 @@ Optional, Set, Tuple, Type, TypeVar, Union) import habana_frameworks.torch as htorch +import habana_frameworks.torch.internal.bridge_config as bc import torch from vllm.attention import AttentionMetadata, get_attn_backend @@ -1402,67 +1403,73 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.profiler.start('internal', 'warmup') start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() - self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) - self.warmup_all_buckets(self.decode_buckets, False, kv_caches) - - if not self.enforce_eager and htorch.utils.internal.is_lazy(): - assert self.mem_margin is not None, \ - ("HabanaWorker.determine_num_available_blocks needs " - "to be called before warming up the model.") - free_mem = HabanaMemoryProfiler.current_free_device_memory() - graph_free_mem = free_mem - self.mem_margin - graph_free_mem = align_workers(graph_free_mem, - torch.distributed.ReduceOp.MIN) - prompt_graph_mem_ratio = float( - os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) - prompt_available_memory = prompt_graph_mem_ratio * graph_free_mem - decode_available_memory = graph_free_mem - prompt_available_memory - msg = (f"Using {format_bytes(graph_free_mem)}" - f"/{format_bytes(free_mem)} " - "of free device memory for HPUGraphs, " - f"{format_bytes(prompt_available_memory)} for prompt and " - f"{format_bytes(decode_available_memory)} for decode " - f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") - logger.info(msg) - prompt_strategy = 
os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', - 'min_tokens') - decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', - 'max_bs') - mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ - self.warmup_graphs( - prompt_strategy, self.prompt_buckets, True, kv_caches, - prompt_available_memory) - mem_post_decode, decode_batch_seq, decode_captured_all = \ - self.warmup_graphs( - decode_strategy, self.decode_buckets, False, kv_caches, - decode_available_memory) - - # Not all prompt buckets were captured, but all decode buckets were - # captured and we have some free graph-allocated space left. - # Let's try to use it for capturing more prompt buckets. - if mem_post_decode + mem_post_prompt < graph_free_mem \ - and not prompt_captured_all \ - and decode_captured_all: - mem_post_prompt, _, prompt_captured_all = self.warmup_graphs( + + with bc.env_setting("PT_COMPILE_ONLY_MODE", True): + self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) + self.warmup_all_buckets(self.decode_buckets, False, kv_caches) + + if not self.enforce_eager and htorch.utils.internal.is_lazy(): + assert self.mem_margin is not None, \ + ("HabanaWorker.determine_num_available_blocks needs " + "to be called before warming up the model.") + free_mem = HabanaMemoryProfiler.current_free_device_memory() + graph_free_mem = free_mem - self.mem_margin + graph_free_mem = align_workers(graph_free_mem, + torch.distributed.ReduceOp.MIN) + prompt_graph_mem_ratio = float( + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + prompt_available_memory = (prompt_graph_mem_ratio * + graph_free_mem) + decode_available_memory = (graph_free_mem - + prompt_available_memory) + msg = ( + f"Using {format_bytes(graph_free_mem)}" + f"/{format_bytes(free_mem)} " + "of free device memory for HPUGraphs, " + f"{format_bytes(prompt_available_memory)} for prompt and " + f"{format_bytes(decode_available_memory)} for decode " + f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") + logger.info(msg) + prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', + 'min_tokens') + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', + 'max_bs') + mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ + self.warmup_graphs( prompt_strategy, self.prompt_buckets, True, kv_caches, - graph_free_mem - mem_post_prompt - mem_post_decode, - mem_post_prompt, prompt_batch_seq) - - # Not all decode buckets were captured, but all prompt buckets were - # captured and we have some free graph-allocated space left. - # Let's try to use it for capturing more decode buckets. - if mem_post_decode + mem_post_prompt < graph_free_mem \ - and not decode_captured_all \ - and prompt_captured_all: - mem_post_decode, _, _ = self.warmup_graphs( + prompt_available_memory) + mem_post_decode, decode_batch_seq, decode_captured_all = \ + self.warmup_graphs( decode_strategy, self.decode_buckets, False, kv_caches, - graph_free_mem - mem_post_prompt - mem_post_decode, - mem_post_decode, decode_batch_seq) - - self.log_graph_warmup_summary(self.prompt_buckets, True, - mem_post_prompt) - self.log_graph_warmup_summary(self.decode_buckets, False, - mem_post_decode) + decode_available_memory) + + # Not all prompt buckets were captured, but all decode buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more prompt buckets. 
+ if (mem_post_decode + mem_post_prompt < graph_free_mem + and not prompt_captured_all and decode_captured_all): + mem_post_prompt, _, prompt_captured_all = ( + self.warmup_graphs( + prompt_strategy, self.prompt_buckets, True, + kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_prompt, prompt_batch_seq)) + + # Not all decode buckets were captured, but all prompt buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more decode buckets. + if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not decode_captured_all \ + and prompt_captured_all: + mem_post_decode, _, _ = self.warmup_graphs( + decode_strategy, self.decode_buckets, False, kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_decode, decode_batch_seq) + + self.log_graph_warmup_summary(self.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary(self.decode_buckets, False, + mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() From 17447ede71a79e020c174c34f8c993cebc616952 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 6 Sep 2024 15:18:36 +0200 Subject: [PATCH 170/819] Do not pass warmup_mode to execute_model_kwargs (#229) This fixes a very silly issue where mismatching values of `warmup_mode` flag could cause graph recompilations and eventually memory leaks. --- vllm/worker/habana_model_runner.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 9dc02fba0213a..f9fa2e8af5ec4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1704,10 +1704,7 @@ def execute_model( if multi_modal_input is not None: execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({ - "bypass_hpu_graphs": not use_graphs, - "warmup_mode": warmup_mode - }) + execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) htorch.core.mark_step() if self.is_driver_worker: From b50aa14998c7a5fc499daadd7af19dfd94b12d18 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 6 Sep 2024 16:14:11 +0200 Subject: [PATCH 171/819] Add error handling for PT_COMPILE_ONLY_MODE (#251) This PR fixes crashes observed on older Synapse builds introduced with https://github.com/HabanaAI/vllm-fork/pull/227. Setting PT_COMPILE_ONLY_MODE is not supported in current or older public Synapse builds, but we should not crash because of it, rather we should advise user to use the latest build. Previous behavior: ``` ... 
INFO 09-06 17:08:37 habana_executor.py:85] # HPU blocks: 10761, # CPU blocks: 910 INFO 09-06 17:08:37 habana_worker.py:201] Initializing cache engine took 47.29 GiB of device memory (54.34 GiB/94.62 GiB used) and -159.6 MiB of host memory (414.9 GiB/1007 GiB used) [rank0]: Traceback (most recent call last): [rank0]: File "/software/users/kzawora/vllm-utils/vllm_hpu_simple_test.py", line 9, in [rank0]: llm = LLM(model="facebook/opt-125m") [rank0]: File "/software/users/kzawora/vllm-fork/vllm/entrypoints/llm.py", line 155, in __init__ [rank0]: self.llm_engine = LLMEngine.from_engine_args( [rank0]: File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 456, in from_engine_args [rank0]: engine = cls( [rank0]: File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 266, in __init__ [rank0]: self._initialize_kv_caches() [rank0]: File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 378, in _initialize_kv_caches [rank0]: self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) [rank0]: File "/software/users/kzawora/vllm-fork/vllm/executor/habana_executor.py", line 89, in initialize_cache [rank0]: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) [rank0]: File "/software/users/kzawora/vllm-fork/vllm/worker/habana_worker.py", line 202, in initialize_cache [rank0]: self._warm_up_model() [rank0]: File "/software/users/kzawora/vllm-fork/vllm/worker/habana_worker.py", line 220, in _warm_up_model [rank0]: self.model_runner.warmup_model(self.hpu_cache[0]) [rank0]: File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context [rank0]: return func(*args, **kwargs) [rank0]: File "/software/users/kzawora/vllm-fork/vllm/worker/habana_model_runner.py", line 1412, in warmup_model [rank0]: with compile_only_mode_context(): [rank0]: File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__ [rank0]: return next(self.gen) [rank0]: File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/internal/bridge_config.py", line 20, in env_setting [rank0]: get_func = globals()['get_' + var.lower()] [rank0]: KeyError: 'get_pt_compile_only_mode' inc shutdown inc shutdown inc shutdown inc shutdown ``` Current behavior: ``` ... INFO 09-06 17:06:42 habana_executor.py:85] # HPU blocks: 10761, # CPU blocks: 910 INFO 09-06 17:06:43 habana_worker.py:201] Initializing cache engine took 47.29 GiB of device memory (54.34 GiB/94.62 GiB used) and -143.7 MiB of host memory (415 GiB/1007 GiB used) WARNING 09-06 17:06:43 habana_model_runner.py:1419] Cannot use PT_COMPILE_ONLY_MODE. Warmup time will be negatively impacted. Please update Gaudi Software Suite. INFO 09-06 17:06:43 habana_model_runner.py:1336] [Warmup][Prompt][1/23] batch_size:2 seq_len:1024 free_mem:40.28 GiB ... 
``` --- vllm/worker/habana_model_runner.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f9fa2e8af5ec4..b62ea1c8afbfe 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -3,7 +3,9 @@ ############################################################################### import collections +import contextlib import dataclasses +import functools import gc import itertools import math @@ -1404,7 +1406,21 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() - with bc.env_setting("PT_COMPILE_ONLY_MODE", True): + compile_only_mode_context = functools.partial(bc.env_setting, + "PT_COMPILE_ONLY_MODE", + True) + can_use_compile_only_mode = True + try: + with compile_only_mode_context(): + pass + logger.debug("Using PT_COMPILE_ONLY_MODE.") + except KeyError: + can_use_compile_only_mode = False + logger.warning('Cannot use PT_COMPILE_ONLY_MODE. ' + 'Warmup time will be negatively impacted. ' + 'Please update Gaudi Software Suite.') + with compile_only_mode_context( + ) if can_use_compile_only_mode else contextlib.nullcontext(): self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) self.warmup_all_buckets(self.decode_buckets, False, kv_caches) From 00f13331b0c0e65dc9004668b10131b8ca31c933 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:20:26 +0530 Subject: [PATCH 172/819] Hardcode fastapi version due to pydantic error (#255) Fixes serving mode issue; due to error in fastapi --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index 3b8d473c1fe7a..7c12fc591f8f7 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -8,7 +8,7 @@ tqdm py-cpuinfo transformers >= 4.43.2 # Required for Chameleon and Llama 3.1 hotfox. tokenizers >= 0.19.1 # Required for Llama 3. -fastapi +fastapi == 0.112.2 # Hardcoding this to workaround issue with new fastapi. aiohttp openai uvicorn[standard] From 73af823681d008554076e39d7a5f406f422745d1 Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Mon, 9 Sep 2024 03:56:04 -0700 Subject: [PATCH 173/819] Eliminate graph breaks for torch.compile mode (#202) Eliminate two graph breaks for torch.compile mode: 1. [__graph_breaks] torch._dynamo.exc.Unsupported: builtin: eq [, ] False 2. [__graph_breaks] torch._dynamo.exc.Unsupported: Tensor.item ---
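
For reference, a minimal sketch of the second fix (the Tensor.item break), mirroring the cache_ops.py hunk further below; the slot counts are example values:

```
import math

import torch

# Before: the pass count came from a tensor op chain ending in .item(),
# which forces a graph break under torch.compile.
num_slots_requested, num_slots_available = 300, 128  # example values
num_passes_before = (torch.tensor(num_slots_requested) /
                     num_slots_available).ceil().int().item()

# After: plain Python arithmetic gives the same result with no graph break.
num_passes_after = math.ceil(num_slots_requested / num_slots_available)

assert num_passes_before == num_passes_after == 3
```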
--------- Signed-off-by: yuwenzho --- vllm/hpu/cache_ops.py | 8 ++++---- vllm/model_executor/models/gpt_bigcode.py | 6 ++++-- vllm/model_executor/models/llama.py | 6 ++++-- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py index 98f109accea06..9042924f68b3d 100644 --- a/vllm/hpu/cache_ops.py +++ b/vllm/hpu/cache_ops.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. ############################################################################### +import math + import habana_frameworks.torch as htorch import torch @@ -30,8 +32,7 @@ def reshape_and_cache(key, # lots of padding, or are doing warmup. # This loop is a workaround for this issue. Please remove it # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() + num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) for i in range(num_kv_cache_passes): start_idx = i * num_slots_available end_idx = (i + 1) * num_slots_available @@ -58,8 +59,7 @@ def prepare_to_cache(cache, slot_mapping): # lots of padding, or are doing warmup. # This loop is a workaround for this issue. Please remove it # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = torch.div(num_slots_requested, - num_slots_available).ceil().int().item() + num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) return num_kv_cache_passes, num_slots_available, indices, offsets diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 3ae3c8c8f712c..5d4387dbb9f48 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -44,6 +44,8 @@ from .interfaces import SupportsLoRA +is_hpu = current_platform.is_hpu() + class GPTBigCodeAttention(nn.Module): @@ -225,13 +227,13 @@ def forward( position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds - if current_platform.is_hpu(): + if is_hpu: import habana_frameworks.torch as htorch htorch.core.mark_step() for i in range(len(self.h)): layer = self.h[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) - if current_platform.is_hpu(): + if is_hpu: htorch.core.mark_step() hidden_states = self.ln_f(hidden_states) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d659d0a3f1127..51716b12513d8 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -55,6 +55,8 @@ from .interfaces import SupportsLoRA from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +is_hpu = current_platform.is_hpu() + class LlamaMLP(nn.Module): @@ -318,7 +320,7 @@ def forward( hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] - if current_platform.is_hpu(): + if is_hpu: import habana_frameworks.torch as htorch htorch.core.mark_step() for i in range(self.start_layer, self.end_layer): @@ -330,7 +332,7 @@ def forward( attn_metadata, residual, ) - if current_platform.is_hpu(): + if is_hpu: htorch.core.mark_step() if not get_pp_group().is_last_rank: From 5cf8441311b341e60d6538c442656e48ab38d230 Mon Sep 17 00:00:00 2001 From: Dominika Olszewska Date: Tue, 10 Sep 2024 12:16:54 +0200 Subject: [PATCH 174/819] Port flat PA from habana_next to habana_main (#169) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE 
SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
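
For illustration, a short worked example of the decode block-bucket upper bound documented in the README change below; the engine settings here are assumed example values, not values set by this PR:

```
# Assumed example engine settings (illustration only):
max_num_seqs = 256
max_model_len = 2048
block_size = 128

# Documented default: max(128, (max_num_seqs * max_model_len) / block_size)
decode_block_bucket_max = max(128, (max_num_seqs * max_model_len) // block_size)
print(decode_block_bucket_max)  # 4096
```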
PR Checklist (Click to Expand)

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.

PR Title and Classification

Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:

  • [Bugfix] for bug fixes.
  • [CI/Build] for build or continuous integration improvements.
  • [Doc] for documentation fixes and improvements.
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • [Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.

Note: If the PR spans more than one category, please include all relevant prefixes.

Code Quality

The PR need to meet the following code quality standards:

  • We adhere to Google Python style guide and Google C++ style guide.
  • Pass all linter checks. Please use format.sh to format your code.
  • The code need to be well-documented to ensure future contributors can easily understand the code.
  • Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.
  • Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.

Notes for Large Changes

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag the PR with rfc-required and might not review it.

What to Expect for the Reviews

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient, and to make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

  • After the PR is submitted, it will be assigned to a reviewer. Every reviewer will pick up PRs based on their expertise and availability.
  • After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.

Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!

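Since the description above was left as the PR template, here is a rough summary of what "flat PA" means in this port: instead of iterating over a padded (batch_size, max_blocks_per_seq) block table, decode flattens every sequence's KV-cache blocks into a single block_list and uses a one-hot block_mapping matrix to route queries to blocks and to sum the per-block partial results back per sequence (see the vllm/hpu/ops.py diff below). The sketch that follows is illustrative only: it uses plain PyTorch with made-up shapes, and the helper names mirror the diff rather than any public API; the real kernels run on HPU with fused matmul/softmax ops.

```python
# Illustrative only: plain PyTorch with made-up shapes. The real implementation
# (vllm/hpu/ops.py below) runs on HPU and threads fused matmul/softmax ops
# through these helpers.
import torch


def batch2block(tensor, block_mapping):
    # Route a per-sequence tensor (batch_size, ...) to per-block rows
    # (num_blocks, ...): each block receives its owning sequence's data.
    shape = tuple(tensor.shape)
    return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:])


def block2batch(tensor, block_mapping):
    # Sum per-block partial results (num_blocks, ...) back into per-sequence
    # results (batch_size, ...).
    shape = tuple(tensor.shape)
    return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:])


batch_size, num_blocks, heads, head_dim = 2, 5, 4, 8
# Hypothetical flat layout: sequence 0 owns blocks [0, 1, 2], sequence 1 owns [3, 4].
block_owner = torch.tensor([0, 0, 0, 1, 1])
block_mapping = torch.nn.functional.one_hot(block_owner,
                                            num_classes=batch_size).float()

query = torch.randn(batch_size, heads, head_dim)
per_block_query = batch2block(query, block_mapping)       # (num_blocks, heads, head_dim)
per_block_out = torch.randn(num_blocks, heads, head_dim)  # stand-in for per-block attention
per_seq_out = block2batch(per_block_out, block_mapping)   # (batch_size, heads, head_dim)
print(per_block_query.shape, per_seq_out.shape)
```

This routing is also why block_softmax in the diff normalizes through block2batch and batch2block: each block only sees its own slice of the keys, so the softmax denominator has to be accumulated across every block belonging to the same sequence and then broadcast back to the blocks before dividing.
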
--------- Co-authored-by: Michal Adamczyk Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com> Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Co-authored-by: Jan Kaniecki --- README_GAUDI.md | 22 +- .../getting_started/gaudi-installation.rst | 14 +- vllm/attention/backends/habana_attn.py | 136 ++----- vllm/attention/ops/habana_paged_attn.py | 51 +-- vllm/hpu/ops.py | 114 +++--- vllm/hpu/utils.py | 7 +- vllm/worker/habana_model_runner.py | 365 +++++++++++------- 7 files changed, 330 insertions(+), 379 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 91bcbe49405eb..5109f7ddf9927 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -455,12 +455,12 @@ Environment variables - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism - `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS` or `SEQ` + - `{dim}` is either `BS`, `SEQ` or `BLOCK` - `{param}` is either `MIN`, `STEP` or `MAX` - Default values: - Prompt: - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): @@ -468,20 +468,20 @@ Environment variables - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): - `1024` + `max_model_len` - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `min(max_num_seqs, 32)` - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): - `128` + `min(max_num_seqs, 32)` - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_SEQ_BUCKET_MIN`): - `block_size` - - sequence length step - (`VLLM_DECODE_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_SEQ_BUCKET_MAX`): - `2048` + - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): + `128` + - block size step + (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `128` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): + `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index b3234d10b3115..ed3beabb2c8aa 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -335,19 +335,19 @@ Environment variables - Prompt: - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``32`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``1024`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``128`` + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): 
``min(max_num_seqs, 32)`` - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``block_size`` - - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``block_size`` - - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``2048`` + - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``128`` + - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``128`` + - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 2259630fa10b7..20b0f2bc7630b 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -58,58 +58,14 @@ def copy_blocks( @dataclass -class HabanaAttentionMetadata(AttentionMetadata, HabanaPagedAttentionMetadata): - """Metadata for HabanaAttentionbackend. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ +class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): + """Metadata for HabanaAttentionbackend.""" # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. + attn_bias: Optional[torch.Tensor] seq_lens_tensor: Optional[torch.Tensor] - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Maximum query length in the batch. - max_query_len: Optional[int] - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - subquery_start_loc: Optional[torch.Tensor] - # FIXME: It is for flash attn. - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - def __post_init__(self): - # Set during the execution of the first attention op. - # It is a list because it is needed to set per prompt - # when alibi slopes is used. It is because of the limitation - # from xformer API. - # will not appear in the __repr__ and __init__ - self.attn_bias: Optional[torch.Tensor] = None - class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): """ @@ -229,60 +185,48 @@ def forward( if attn_metadata.is_prompt: # Prompt run. 
- if kv_cache is None or attn_metadata.block_tables.numel() == 0: - if not self.prefill_usefusedsdpa: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ + if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and \ - self.position_bias is not None: - attn_bias.add_(self.position_bias[:, :, - -attn_bias.size(2):, - -attn_bias.size(3):]) - else: - attn_bias = None - - query_shape = (batch_size, seq_len, self.num_heads, - self.head_size) - kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, - self.head_size) - out = ops.prompt_attention( - query.view(query_shape), - key.view(kv_shape), - value.view(kv_shape), - attn_bias=attn_bias, - p=0.0, - scale=self.scale, - matmul_qk_op=self.matmul_qk, - softmax_op=self.softmax, - matmul_av_op=self.matmul_av, - valid_seq_lengths=attn_metadata.seq_lens_tensor, - ) - output = out.reshape(batch_size, seq_len, hidden_size) + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None and \ + self.position_bias is not None: + attn_bias.add_(self.position_bias[:, :, + -attn_bias.size(2):, + -attn_bias.size(3):]) else: - # prefix-enabled attention - output = HabanaPagedAttention.forward_prefix( - query, - key, - value, - key_cache, - value_cache, - attn_metadata.block_tables, - attn_metadata.subquery_start_loc, - attn_metadata.seq_lens_tensor, - attn_metadata.context_lens_tensor, - attn_metadata.max_query_len, - self.alibi_slopes, - ) + attn_bias = None + + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + ) + output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. output = HabanaPagedAttention.forward_decode( - query, key_cache, value_cache, attn_metadata.block_tables, - attn_metadata.seq_lens_tensor, self.kv_cache_dtype, - self.num_kv_heads, self.scale, self.position_bias, k_scale, - v_scale, self.matmul_qk, self.softmax, self.matmul_av, - self.k_cache, self.v_cache) + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + block_mapping=attn_metadata.block_mapping, + block_bias=attn_metadata.attn_bias, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) # Reshape the output tensor. return output.view(batch_size, seq_len, hidden_size) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 9602886299c47..cab8d7abe95fd 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -16,16 +16,9 @@ @dataclass class HabanaPagedAttentionMetadata: """Metadata for PagedAttention.""" - # (batch_size,). The length of sequences (entire tokens seen so far) per - # sequence. - seq_lens_tensor: Optional[torch.Tensor] - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. 
Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] + block_list: Optional[torch.Tensor] + block_mapping: Optional[torch.Tensor] + block_usage: Optional[torch.Tensor] class HabanaPagedAttention: @@ -63,42 +56,8 @@ def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, slot_mapping, kv_cache_dtype, is_prompt) @staticmethod - def forward_decode( - query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - seq_lens: torch.Tensor, - kv_cache_dtype: str, - num_kv_heads: int, - scale: float, - alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, - matmul_qk_op, - softmax_op, - matmul_av_op, - k_cache_cls, - v_cache_cls, - ) -> torch.Tensor: - block_size = value_cache.shape[1] - return ops.paged_attention_v1( - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - alibi_slopes, - kv_cache_dtype, - matmul_qk_op, - softmax_op, - matmul_av_op, - k_cache_cls, - v_cache_cls, - ) + def forward_decode(**kwargs) -> torch.Tensor: + return ops.flat_pa(**kwargs) @staticmethod def forward_prefix( diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index bacb755b39393..b2705429906c4 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD license found in the # LICENSE file in the root directory of this source tree. ############################################################################### -import os from typing import Optional import habana_frameworks.torch as htorch @@ -29,72 +28,57 @@ logger.warning("Could not import HPU FusedSDPA kernel. " "vLLM will use native implementation.") -PA_SPLIT_VALUE = (os.environ.get('PA_SPLIT_VALUE', '1') == '1') - - -def fetch_from_cache(cache, blocks, permutations): - return [ - cache.index_select(0, blocks[:, i]).permute(permutations) - for i in range(blocks.size(1)) - ] - - -def paged_attention_v1(query, - key_cache, - value_cache, - head_mapping, - scale, - block_tables, - context_lens, - block_size, - alibi_slopes=None, - kv_cache_dtype=None, - matmul_qk_op=torch.matmul, - softmax_op=torch.softmax, - matmul_av_op=torch.matmul, - k_cache_cls=None, - v_cache_cls=None) -> None: - seq_len = block_tables.size(1) - batch_size, query_heads, _ = query.shape - _, _, kv_heads, _ = key_cache.shape - min_inf = torch.finfo(query.dtype).min - mask = (torch.arange(0, - seq_len * block_size, - dtype=torch.int32, - device=key_cache.device).view(1, -1).expand( - batch_size, -1).ge(context_lens.view(-1, 1)).view( - batch_size, 1, 1, -1)) - query.mul_(scale) - query = query.unsqueeze(-2) - fetch_keys = fetch_from_cache if k_cache_cls is None else \ - k_cache_cls.fetch_from_cache - keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1)) - if query_heads != kv_heads: + +def batch2block(tensor, block_mapping): + shape = tuple(tensor.shape) + return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) + + +def block2batch(tensor, block_mapping): + shape = tuple(tensor.shape) + return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) + + +def block_softmax(batch_size, attn, block_mapping): + attn.sub_(10.0) + attn = attn.exp_() + sums = attn.sum(dim=-1).unsqueeze(-1) + sums = block2batch(sums, block_mapping) + sums = batch2block(sums, block_mapping) + sums.add_(1.0e-12) + attn.div_(sums) + return attn + + +def flat_pa(query, key_cache, value_cache, 
block_list, block_mapping, + block_bias, scale, matmul_qk_op, matmul_av_op, keys_fetch_func, + values_fetch_func): + batch_size = query.size(0) + q_heads = query.size(1) + kv_heads = key_cache.size(2) + + query = batch2block(scale * query, block_mapping).unsqueeze(-2) + key = keys_fetch_func(key_cache, block_list).transpose(1, 2) + value = values_fetch_func(value_cache, block_list).transpose(1, 2) + block_bias = block_bias.view(key.size(0), 1, 1, -1) + + if kv_heads != q_heads: + block_bias = block_bias.unsqueeze(1) query = query.unflatten(1, (kv_heads, -1)) - keys = [k.unflatten(1, (kv_heads, 1)) for k in keys] - mask = mask.unsqueeze(2) - - attn_weights = torch.cat([matmul_qk_op(query, k) for k in keys], dim=-1) - if alibi_slopes is not None: - attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):, - -attn_weights.size(3):]) - attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1) - - fetch_values = fetch_from_cache if v_cache_cls is None else \ - v_cache_cls.fetch_from_cache - values = fetch_values(value_cache, block_tables, (0, 2, 1, 3)) - if PA_SPLIT_VALUE: - attn_weights = attn_weights.split(block_size, dim=-1) + key = key.unflatten(1, (kv_heads, 1)) + value = value.unflatten(1, (kv_heads, 1)) + key = key.transpose(3, 4) else: - values = [torch.cat(values, dim=-2)] - attn_weights = [attn_weights] - if query_heads != kv_heads: - values = [v.unflatten(1, (kv_heads, 1)) for v in values] - attn_weights = [matmul_av_op(a, v) for a, v in zip(attn_weights, values)] - if query_heads != kv_heads: - attn_weights = [a.flatten(1, 2) for a in attn_weights] - attn_weights = sum(attn_weights) - return attn_weights.squeeze(-2) + key = key.transpose(2, 3) + + attn = matmul_qk_op(query, key) + block_bias + attn = block_softmax(batch_size, attn, block_mapping) + attn = matmul_av_op(attn, value) + attn = block2batch(attn, block_mapping) + attn = attn.squeeze(-2) + if kv_heads != q_heads: + attn = attn.flatten(1, 2) + return attn def silu_and_mul(x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py index 3d9c7cb1c4c22..13204b83d5742 100644 --- a/vllm/hpu/utils.py +++ b/vllm/hpu/utils.py @@ -57,8 +57,5 @@ def forward(self, input, cache, num_kv_cache_passes, num_slots_available, block_offset) return cache - def fetch_from_cache(self, cache, blocks, permutations): - return [ - cache.index_select(0, blocks[:, i]).permute(permutations) - for i in range(blocks.size(1)) - ] + def fetch_from_cache(self, cache, blocks): + return cache.index_select(0, blocks) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a4ade587db089..a6bd5e5f68745 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -51,29 +51,47 @@ logger = init_logger(__name__) +_TYPE_CACHE = {} # These values are assumed to be zero in several places. # Use caution when updating them! 
_PAD_SLOT_ID = 0 _PAD_BLOCK_ID = 0 LORA_WARMUP_RANK = 8 -_TYPE_CACHE = {} + + +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): + if obj is None: + return None + if to_override is None: + to_override = {} + fields = set(to_copy) | set(to_override.keys()) + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, + ' '.join(fields)) + return _TYPE_CACHE[typename](**values) def read_bucket_settings(phase: str, dim: str, **defaults): """Read bucketing configuration from env variables. phase is either 'prompt' or 'decode' - dim is either 'bs' or 'block' + dim is either 'bs', 'seq' or 'block' param is either 'min', 'step' or 'max' example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 """ params = ['min', 'step', 'max'] + env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params] + default_values = [defaults[p] for p in params] values = [ - int( - os.environ.get(f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper(), - defaults[p])) for p in params + int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values) ] + for e, v, d in zip(env_vars, values, defaults): + logger.info('%s=%s (default:%s)', e, v, d) return values @@ -103,9 +121,9 @@ def warmup_range(config: Tuple[int, int, int]): return list(filter(lambda bucket: bucket >= bmin, buckets)) -def warmup_buckets(bs_bucket_config, - seq_bucket_config, - max_num_batched_tokens=None): +def generate_prompt_buckets(bs_bucket_config, + seq_bucket_config, + max_num_batched_tokens=None): buckets = list( itertools.product(warmup_range(bs_bucket_config), warmup_range(seq_bucket_config))) @@ -150,6 +168,19 @@ def warmup_buckets(bs_bucket_config, return captured_buckets, omitted_buckets +def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, + max_blocks): + buckets = [] + for bs in warmup_range(bs_bucket_config): + for blocks in warmup_range(blocks_bucket_config): + if blocks < bs: + continue + if blocks > max_blocks: + break + buckets.append((bs, blocks)) + return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + + def next_pow2(value: int, base: int): res = base while value > 1: @@ -169,22 +200,6 @@ def find_bucket(value: int, config: Tuple[int, int, int]): return max(bmin, min(next_step, next_pow)) -def subtuple(obj: object, - typename: str, - to_copy: List[str], - to_override: Optional[Dict[str, object]] = None): - if to_override is None: - to_override = {} - if obj is None: - return None - fields = set(to_copy) | set(to_override.keys()) - values = {f: to_override.get(f, getattr(obj, f)) for f in fields} - if typename not in _TYPE_CACHE: - _TYPE_CACHE[typename] = collections.namedtuple(typename, - ' '.join(fields)) - return _TYPE_CACHE[typename](**values) - - def align_workers(value, op): group = get_world_group().cpu_group world_size = torch.distributed.get_world_size() @@ -195,13 +210,19 @@ def align_workers(value, op): return value_t.item() +def pad_list(list, k, v): + target_len = round_up(len(list), k) + padding = target_len - len(list) + return list + [v] * padding + + class HpuModelAdapter(): - def __init__(self, model, enforce_eager): + def __init__(self, model, block_size, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] - + self.block_size = block_size if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, 
backend='hpu_backend', @@ -225,22 +246,45 @@ def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, mask = causal_mask.logical_or(len_mask) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - #FIXME: Restore sliding window support - #if self.sliding_window is not None: attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) return attn_metadata + def _set_block_mapping(self, metadata, batch_size, device, dtype): + mask = torch.arange(0, + self.block_size, + device=device, + dtype=torch.int32).unsqueeze(0) + mask = mask >= metadata.block_usage.unsqueeze(-1) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) + block_mapping = torch.nn.functional.one_hot( + metadata.block_mapping.to(torch.long), + num_classes=batch_size).to(dtype) + metadata = metadata._replace(block_mapping=block_mapping, + attn_bias=attn_bias) + return metadata + + def _update_metadata(self, attn_metadata, batch_size, seq_len, device, + dtype): + if attn_metadata.is_prompt: + meta = attn_metadata + attn_metadata = self._set_attn_bias(meta, batch_size, seq_len, + device, dtype) + else: + meta = attn_metadata + attn_metadata = self._set_block_mapping(meta, batch_size, device, + dtype) + return attn_metadata + def forward(self, *args, **kwargs): kwargs = kwargs.copy() selected_token_indices = kwargs.pop('selected_token_indices') if 'warmup_mode' in kwargs: kwargs.pop('warmup_mode') input_ids = kwargs['input_ids'] - kwargs['attn_metadata'] = self._set_attn_bias(kwargs['attn_metadata'], - input_ids.size(0), - input_ids.size(1), - input_ids.device, - torch.bfloat16) + kwargs['attn_metadata'] = self._update_metadata( + kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), + input_ids.device, torch.bfloat16) LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -536,7 +580,9 @@ def load_model(self) -> None: # RuntimeErrors. 
This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph( - self.model, enforce_eager=self.enforce_eager) + self.model, + self.block_size, + enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -553,73 +599,48 @@ def _is_valid_bucket(self, bucket): return bucket[0] * bucket[1] <= self.max_num_batched_tokens def _setup_buckets(self) -> None: + align_bs = lambda x: min(self.max_num_seqs, x) max_bucket_cfg = 64 if self.lora_config and \ max_bucket_cfg > self.max_num_batched_tokens // self.block_size: max_bucket_cfg = self.max_num_batched_tokens // self.block_size - self.prompt_bs_bucket_cfg = read_bucket_settings('prompt', - 'bs', - min=1, - step=32, - max=min( - self.max_num_seqs, - max_bucket_cfg)) + blocks_step = 128 + #FIXME: The default values should be max_model_len + max_prompt_seq = 1024 + max_decode_seq = 2048 + self.prompt_bs_bucket_cfg = read_bucket_settings( + 'prompt', + 'bs', + min=1, + step=align_bs(32), + max=align_bs(max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', - min=1, - step=128, + min=align_bs(32), + step=align_bs(32), max=self.max_num_seqs) self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', 'seq', min=self.block_size, step=self.block_size, - max=1024) - self.decode_seq_bucket_cfg = read_bucket_settings('decode', - 'seq', - min=self.block_size, - step=self.block_size, - max=2048) + max=max_prompt_seq) + self.decode_block_bucket_cfg = read_bucket_settings( + 'decode', + 'block', + min=blocks_step, + step=blocks_step, + max=max(blocks_step, + self.max_num_seqs * max_decode_seq // self.block_size)) self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " f"bs:{self.prompt_bs_bucket_cfg}, " f"seq:{self.prompt_seq_bucket_cfg}") logger.info(msg) - self.prompt_buckets, prompt_omitted_buckets = warmup_buckets( - self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, - self.max_num_batched_tokens) - - if self.lora_config: - self.prompt_buckets[:] = [ - bucket for bucket in self.prompt_buckets - if self._is_valid_bucket(bucket) - ] - - msg = (f"Generated {len(self.prompt_buckets)} " - f"prompt buckets: {list(sorted(self.prompt_buckets))}") - logger.info(msg) - - msg = (f"Omitted {len(prompt_omitted_buckets)} " - "prompt buckets due to exceeded token budget " - f"(max_num_batched_tokens={self.max_num_batched_tokens})") - logger.info(msg) - - msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" - logger.debug(msg) msg = ("Decode bucket config (min, step, max_warmup) " f"bs:{self.decode_bs_bucket_cfg}, " - f"seq:{self.decode_seq_bucket_cfg}") - logger.info(msg) - self.decode_buckets, _ = warmup_buckets(self.decode_bs_bucket_cfg, - self.decode_seq_bucket_cfg) - if self.lora_config: - self.decode_buckets[:] = [ - bucket for bucket in self.decode_buckets - if self._is_valid_bucket(bucket) - ] - msg = (f"Generated {len(self.decode_buckets)} decode buckets: " - f"{list(sorted(self.decode_buckets))}") + f"block:{self.decode_block_bucket_cfg}") logger.info(msg) def _prepare_prompt( @@ -735,10 +756,6 @@ def _prepare_prompt( real_num_seqs = len(query_lens) assert max_query_len > 0 - context_lens_tensor = torch.tensor(context_lens, - dtype=torch.int, - device=self.device) - if multi_modal_input_list: assert self.multimodal_config, ( "Multi-modal inputs are only supported by " @@ -748,7 +765,6 @@ def _prepare_prompt( else: multi_modal_input = None - max_prompt_block_table_len = 
max(len(t) for t in prefix_block_tables) max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) @@ -814,37 +830,17 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - block_tables = make_tensor_with_pad(prefix_block_tables, - max_len=max_prompt_block_table_len, - pad=0, - dtype=torch.int, - device=self.device) - - # Query length can be shorter than key (i.e., prompt) when prefill - # is chunked or prefix cached. - query_lens_tensor = torch.tensor(query_lens, - dtype=torch.long, - device=self.device) - subquery_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.long, device=self.device) - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=self.device) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - seq_lens=seq_lens, + block_list=None, + block_mapping=None, + block_usage=None, + attn_bias=None, seq_lens_tensor=seq_lens_tensor, - max_query_len=max_query_len, - subquery_start_loc=subquery_start_loc, - seq_start_loc=seq_start_loc, - context_lens_tensor=context_lens_tensor, - block_tables=block_tables, - use_cuda_graph=False, num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, num_decode_tokens=0, @@ -950,32 +946,50 @@ def _prepare_decode( s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl ] for sl in slot_mapping] + num_decode_tokens = sum(seq_lens) + + blocks_used = [len(bt) for bt in block_tables] + block_list = list(itertools.chain(*block_tables)) + block_mapping_nested: List[List[int]] = [ + [i] * b_u for i, b_u in enumerate(blocks_used) + ] + block_mapping: List[int] = list( + itertools.chain.from_iterable(block_mapping_nested)) + + last_block = [ + sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) + ] + block_usage = [[self.block_size] * (b_u - 1) + [lb] + for b_u, lb in zip(blocks_used, last_block)] + block_usage = list(itertools.chain(*block_usage)) + + block_bucket_size = find_bucket(len(block_list), + self.decode_block_bucket_cfg) + block_list = pad_list(block_list, block_bucket_size, _PAD_SLOT_ID) + block_mapping = pad_list(block_mapping, block_bucket_size, 0) + block_usage = pad_list(block_usage, block_bucket_size, 0) + + block_list = torch.tensor(block_list, + dtype=torch.int, + device=self.device) + block_mapping = torch.tensor(block_mapping, + dtype=torch.int, + device=self.device) + block_usage = torch.tensor(block_usage, + dtype=torch.bfloat16, + device=self.device) + slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) - seq_lens_tensor = torch.tensor(seq_lens, - dtype=torch.int, - device=self.device) - num_decode_tokens = sum(seq_lens) - max_block_table_len = max( - len(block_table) for block_table in block_tables) - block_tables = make_tensor_with_pad( - block_tables, - max_len=max_block_table_len, - pad=0, - dtype=torch.int, - device=self.device, - ) + attn_metadata = self.attn_backend.make_metadata( is_prompt=False, - seq_lens=None, - seq_lens_tensor=seq_lens_tensor, - max_query_len=None, - subquery_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, - block_tables=block_tables, - use_cuda_graph=False, + block_list=block_list, + block_mapping=block_mapping, + block_usage=block_usage, + attn_bias=None, + seq_lens_tensor=None, num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, @@ -1163,7 +1177,7 @@ def _seq_len(self, attn_metadata): if attn_metadata.num_prefills != 0: 
return attn_metadata.slot_mapping.size(1) else: - return attn_metadata.block_tables.size(1) * self.block_size + return attn_metadata.block_list.numel() def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # NOTE(kzawora): To anyone working on this in the future: @@ -1187,8 +1201,8 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # input_hash(123) != input_hash(321) # input_hash("abc") != input_hash("cba") attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ - 'block_tables', 'seq_lens_tensor', 'attn_bias', 'slot_mapping', - 'is_prompt' + 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', + 'block_usage', 'slot_mapping', 'is_prompt' ]) return attention_metadata @@ -1222,9 +1236,8 @@ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers max_batch_size = self.prompt_bs_bucket_cfg[-1] - max_seq_len = self.prompt_seq_bucket_cfg[-1] - if self.lora_config: - max_seq_len = self.max_num_batched_tokens // max_batch_size + max_seq_len = min(self.prompt_seq_bucket_cfg[-1], + self.max_num_batched_tokens // max_batch_size) self.warmup_scenario(max_batch_size, max_seq_len, @@ -1277,21 +1290,34 @@ def warmup_scenario(self, [0] * batch_size * seq_len, ) self.set_active_loras(set(), lora_mapping) - seqs = [ - self.create_dummy_seq_group_metadata( - i, - seq_len, - is_prompt, - lora_request=dummy_lora_requests_per_seq[i] - if dummy_lora_requests_per_seq else None) - for i in range(batch_size) - ] + if is_prompt: + seqs = [ + self.create_dummy_seq_group_metadata( + i, + seq_len, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) + for i in range(batch_size) + ] + else: + # FIXME: seq_len is actually number of blocks + blocks = [seq_len // batch_size for _ in range(batch_size)] + blocks[0] += seq_len % batch_size + seqs = [ + self.create_dummy_seq_group_metadata( + i, + b * self.block_size - 1, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) + for i, b in enumerate(blocks) + ] torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches, warmup_mode=True) + self.execute_model(inputs, kv_caches, warmup_mode=False) torch.hpu.synchronize() - self.profiler.end() gc.collect() def remove_all_loras(self): @@ -1328,9 +1354,12 @@ def list_loras(self) -> Set[int]: def log_warmup(self, phase, i, max_i, batch_size, seq_len): free_mem = format_bytes( HabanaMemoryProfiler.current_free_device_memory()) + dim = "num_blocks" + if phase == "Prompt": + dim = "seq_len" msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " f"batch_size:{batch_size} " - f"seq_len:{seq_len} " + f"{dim}:{seq_len} " f"free_mem:{free_mem}") logger.info(msg) @@ -1390,6 +1419,8 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' graphed = list(c[:2] for c in self.graphed_buckets if c[2] == is_prompt) + if num_candidates == 0: + num_candidates = 1 msg = (f'{phase} captured:{len(graphed)} ' f'({100 * len(graphed) / num_candidates:.1f}%) ' f'used_mem:{format_bytes(total_mem)} ' @@ -1402,6 +1433,42 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') + max_blocks = kv_caches[0][0].size(0) + + self.prompt_buckets, prompt_omitted_buckets = generate_prompt_buckets( + self.prompt_bs_bucket_cfg, 
self.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) + if self.lora_config: + self.prompt_buckets[:] = [ + bucket for bucket in self.prompt_buckets + if self._is_valid_bucket(bucket) + ] + + msg = ( + f"Generated {len(self.prompt_buckets)} " + f"prompt buckets [bs, seq]: {list(sorted(self.prompt_buckets))}") + logger.info(msg) + + msg = (f"Omitted {len(prompt_omitted_buckets)} " + "prompt buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" + logger.debug(msg) + + self.decode_buckets = generate_decode_buckets( + self.decode_bs_bucket_cfg, self.decode_block_bucket_cfg, + max_blocks) + if self.lora_config: + self.decode_buckets[:] = [ + bucket for bucket in self.decode_buckets + if self._is_valid_bucket(bucket) + ] + logger.info("Generated %d decode buckets [bs, total_blocks]: %s", + len(self.decode_buckets), + list(sorted(self.decode_buckets))) + start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() From e2c8b5ae2efd0e10aa3273ede60e263796e0a615 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 15:13:43 +0300 Subject: [PATCH 175/819] format.sh --- vllm/engine/arg_utils.py | 2 +- vllm/lora/layers.py | 6 +++--- vllm/lora/models.py | 21 +++++++++++---------- vllm/model_executor/model_loader/loader.py | 3 ++- vllm/platforms/__init__.py | 2 +- vllm/worker/habana_model_runner.py | 19 ++++++++++--------- 6 files changed, 28 insertions(+), 25 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 36a9c919e8e0e..1a997b01a43c6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -988,7 +988,7 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config = {} self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - + load_device = device_config.device if self.weights_load_device is None else \ self.weights_load_device load_config = self.create_load_config(load_device) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index d2ef97e50fcbd..13a6813346f8b 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -349,9 +349,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: else: # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - add_input=True) + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 30f29ae6963e9..f21b45657c993 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -431,18 +431,19 @@ def __init__( dtype=torch.long, device=get_device()) self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.sampler_indices_padded = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + dtype=torch.long, + device=get_device()) + self.sampler_indices_padded = torch.empty( + self.max_num_batched_tokens, + dtype=torch.long, + device=get_device()) self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + self.max_num_batched_tokens, + dtype=torch.long, + device=get_device()) self.long_lora_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + dtype=torch.long, + device=get_device()) 
else: self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 37d392872b0e3..0cb373441f869 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -355,7 +355,8 @@ def load_model(self, *, model_config: ModelConfig, with set_default_torch_dtype(model_config.dtype): with torch.device(self.load_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, cache_config, scheduler_config) + lora_config, cache_config, + scheduler_config) logger.info("Loading weights on %s ...", self.load_config.device) model.load_weights( self._get_weights_iterator(model_config.model, diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 458fc85237aa2..983399af550a9 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -47,7 +47,7 @@ from importlib import util is_hpu = util.find_spec('habana_frameworks') is not None except Exception: - pass + pass if is_tpu: # people might install pytorch built with cuda but run on tpu diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 448dd9d876690..6f4d449880ef5 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -527,14 +527,13 @@ def load_model(self) -> None: htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: - self.model = get_model( - model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + cache_config=self.cache_config) msg = ("Pre-loading model weights on " f"{next(self.model.parameters()).device} " f"took {m_getmodel.get_summary_string()}") @@ -1224,7 +1223,7 @@ def create_dummy_seq_group_metadata(self, block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} prompt_token_ids = [0] * input_len output_token_ids = [1] * output_len - prompt_token_ids_array = array('l', [1,3,5,7,9]) # noqa: F821 + prompt_token_ids_array = array('l', [1, 3, 5, 7, 9]) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids return SequenceGroupMetadata(request_id=str(group_id), @@ -1738,6 +1737,7 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): seen = cfg in self.seen_configs self.seen_configs.add(cfg) if not seen and not warmup_mode: + import pdb; pdb.set_trace() phase = 'prompt' if is_prompt else 'decode' logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) @@ -1776,6 +1776,7 @@ def execute_model( batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + import pdb; pdb.set_trace() self._check_config(batch_size, seq_len, is_prompt, warmup_mode) execute_model_kwargs = { "input_ids": input_tokens, From 41941953da5d747627bb42f8bf5541b984153c1d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 15:56:50 +0300 Subject: [PATCH 176/819] i did not drink my afternoon coffee and made an 
oopsie --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6f4d449880ef5..d8f87b8845821 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1223,7 +1223,7 @@ def create_dummy_seq_group_metadata(self, block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} prompt_token_ids = [0] * input_len output_token_ids = [1] * output_len - prompt_token_ids_array = array('l', [1, 3, 5, 7, 9]) # noqa: F821 + prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 seq_data = SequenceData(prompt_token_ids_array) seq_data.output_token_ids = output_token_ids return SequenceGroupMetadata(request_id=str(group_id), From 4052bdb728ba3bbddca82af1a71574c8db706179 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 15:04:34 +0200 Subject: [PATCH 177/819] Add disable_tensor_cache=True to HPUGraph capture (#252) RuntimeErrors are not observed anymore on habana_main when disable_tensor_cache is used. This PR enables disable_tensor_cache. --- vllm/worker/habana_model_runner.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a6bd5e5f68745..dfc2ee152076f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -576,8 +576,6 @@ def load_model(self) -> None: htcore.mark_step() torch.hpu.synchronize() - # FIXME: Running with disable_tensor_cache=True causes - # RuntimeErrors. This needs to be debugged with HabanaMemoryProfiler() as m_wrap: self.model = _maybe_wrap_in_hpu_graph( self.model, @@ -1576,10 +1574,9 @@ def mem_margin(self, value): def _maybe_wrap_in_hpu_graph(*args, **kwargs): - return htorch.hpu.wrap_in_hpu_graph(HpuModelAdapter( - *args, ** - kwargs)) if htorch.utils.internal.is_lazy() else HpuModelAdapter( - *args, **kwargs) + return htorch.hpu.wrap_in_hpu_graph( + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) class HabanaProfilerCounterHelper(): From c9bf9081ee51c793e16bfcda17f5ef68db369b68 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 16:41:25 +0300 Subject: [PATCH 178/819] do not build core ext on hpu --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ec5bc24a1b834..c0f5ec984a408 100644 --- a/setup.py +++ b/setup.py @@ -301,7 +301,8 @@ def _build_custom_ops() -> bool: def _build_core_ext() -> bool: - return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu()) + return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu() + or _is_hpu()) def get_hipcc_rocm_version(): From 69df1e7e3f6b580945ce0d0cab88233829dae205 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Tue, 10 Sep 2024 15:43:20 +0200 Subject: [PATCH 179/819] Fix dispersed slots (#261) On habana_main the slots are calculated by adding an offset to the block which breaks the check for _PAD_SLOT_ID. Reworked it so that in case of _PAD_BLOCK_ID we're automatically inserting the right value. 
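To make the reworked logic easier to follow, below is a minimal sketch of the slot computation after this change. The constants and names mirror the habana_model_runner.py diff below, and the values are arbitrary, so treat it as an illustration rather than the exact code.

```python
# Minimal illustration of the reworked slot computation; names follow the diff
# below (_PAD_SLOT_ID, _PAD_BLOCK_ID, dummy_slots), values are arbitrary.
import itertools

_PAD_SLOT_ID = 0
_PAD_BLOCK_ID = 0
block_size = 128

# Padded sequences cycle through a small reserved slot range up front, instead
# of being patched afterwards, so the old `slot != _PAD_SLOT_ID` check is no
# longer needed.
dummy_slots = itertools.cycle(range(_PAD_SLOT_ID, _PAD_SLOT_ID + block_size))


def slot_for(block_number: int, position: int) -> int:
    if block_number == _PAD_BLOCK_ID:
        return next(dummy_slots)
    return block_number * block_size + position % block_size


print(slot_for(_PAD_BLOCK_ID, 0))  # dummy slot for a padded block
print(slot_for(3, 130))            # 3 * 128 + 2 = 386
```
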
--- vllm/worker/habana_model_runner.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dfc2ee152076f..8d6c386a9975e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -887,6 +887,9 @@ def _prepare_decode( self.lora_config.max_lora_rank, dtype=self.lora_config.lora_dtype) + dummy_slots = itertools.cycle( + range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) + for seq_group_metadata in seq_group_metadata_list: assert not seq_group_metadata.is_prompt assert seq_group_metadata.token_chunk_size == 1 @@ -916,8 +919,11 @@ def _prepare_decode( block_table = seq_group_metadata.block_tables[seq_id] block_number = block_table[position // self.block_size] - block_offset = position % self.block_size - slot = block_number * self.block_size + block_offset + if block_number == _PAD_BLOCK_ID: + slot = next(dummy_slots) + else: + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset slot_mapping.append([slot]) lora_index_mapping.append(lora_id) lora_prompt_mapping.append(lora_id) @@ -938,12 +944,6 @@ def _prepare_decode( dtype=torch.long, device=self.device) - dummy_slots = itertools.cycle( - range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) - slot_mapping = [[ - s if s != _PAD_SLOT_ID else next(dummy_slots) for s in sl - ] for sl in slot_mapping] - num_decode_tokens = sum(seq_lens) blocks_used = [len(bt) for bt in block_tables] From 53f96b784980b60ca12418b39c4785210931fb09 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Tue, 10 Sep 2024 15:53:11 +0200 Subject: [PATCH 180/819] Skip compilation warnings during warmup phase (#262) --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 8d6c386a9975e..b6218f3cc4cfb 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1314,7 +1314,7 @@ def warmup_scenario(self, torch.hpu.synchronize() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches, warmup_mode=False) + self.execute_model(inputs, kv_caches, warmup_mode=True) torch.hpu.synchronize() gc.collect() From d436d387e1641c146974573332621dbed9266e8b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 17:21:23 +0300 Subject: [PATCH 181/819] fix tensor parallelism --- vllm/executor/ray_habana_executor.py | 12 ++++++------ vllm/worker/habana_model_runner.py | 24 +++++++++++++----------- vllm/worker/habana_worker.py | 20 ++++++++++++-------- 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 17e3414a96b57..d69a85a816636 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -9,11 +9,11 @@ DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest, SamplerOutput -from vllm.utils import (_run_task_with_lock, - error_on_invalid_device_count_status, - get_distributed_init_method, get_ip, get_open_port, - get_vllm_instance_id, make_async) +from vllm.sequence import ExecuteModelRequest +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.utils import (_run_task_with_lock, get_distributed_init_method, + get_ip, 
get_open_port, get_vllm_instance_id, + make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -188,7 +188,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", distributed_init_method = get_distributed_init_method( driver_ip, get_open_port()) - error_on_invalid_device_count_status() + # error_on_invalid_device_count_status() # Initialize the actual workers inside worker wrapper. init_worker_all_kwargs = [ diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d8f87b8845821..c7315eb804283 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -23,16 +23,18 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) + ModelConfig, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group from vllm.hpu.ops import LoraMask as LoraMask +from vllm.inputs.registry import InputRegistry from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.multimodal.registry import MultiModalRegistry from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) @@ -468,20 +470,26 @@ def __init__( parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, device_config: DeviceConfig, - load_config: LoadConfig, cache_config: CacheConfig, + load_config: LoadConfig, lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - multimodal_config: Optional[MultiModalConfig] = None, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, + return_hidden_states: bool = False, + observability_config: Optional[ObservabilityConfig] = None, ): self.model_config = model_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config self.lora_config = lora_config self.load_config = load_config - self.cache_config = cache_config self.is_driver_worker = is_driver_worker + self.prompt_adapter_config = prompt_adapter_config + self.return_hidden_states = return_hidden_states + self.observability_config = observability_config self.profiler = Profiler() self.sliding_window = (model_config.get_sliding_window() @@ -499,7 +507,6 @@ def __init__( self.pin_memory = is_pin_memory_available() self.kv_cache_dtype = kv_cache_dtype - self.multimodal_config = multimodal_config self.attn_backend = get_attn_backend( self.model_config.get_num_attention_heads(self.parallel_config), @@ -757,9 +764,6 @@ def _prepare_prompt( assert max_query_len > 0 if multi_modal_input_list: - assert self.multimodal_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") multi_modal_input = torch.cat(multi_modal_input_list, dim=0).to(self.device) else: @@ -1737,7 +1741,6 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): seen = cfg in self.seen_configs self.seen_configs.add(cfg) if not seen and not warmup_mode: - import pdb; pdb.set_trace() phase = 'prompt' if is_prompt 
else 'decode' logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) @@ -1776,7 +1779,6 @@ def execute_model( batch_size = input_tokens.size(0) seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - import pdb; pdb.set_trace() self._check_config(batch_size, seq_len, is_prompt, warmup_mode) execute_model_kwargs = { "input_ids": input_tokens, diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..407c618a9d597 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -4,15 +4,15 @@ import gc import os -from typing import List, Optional, Set, Tuple +from typing import List, Optional, Set, Tuple, Type import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, + ModelConfig, MultiModalConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) @@ -24,6 +24,7 @@ from vllm.utils import HabanaMemoryProfiler, format_bytes from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.model_runner_base import ModelRunnerBase from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) @@ -49,13 +50,15 @@ def __init__( rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, speculative_config: Optional[SpeculativeConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, + model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + observability_config: Optional[ObservabilityConfig] = None, ) -> None: self.model_config = model_config self.parallel_config = parallel_config + self.parallel_config.rank = rank self.scheduler_config = scheduler_config self.device_config = device_config self.cache_config = cache_config @@ -64,6 +67,7 @@ def __init__( self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.load_config = load_config + self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -72,19 +76,19 @@ def __init__( # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.multimodal_config = multimodal_config self.model_runner: HabanaModelRunner = HabanaModelRunner( model_config, parallel_config, scheduler_config, device_config, - cache_config=cache_config, + cache_config, load_config=load_config, lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, - multimodal_config=self.multimodal_config, - is_driver_worker=is_driver_worker) + is_driver_worker=is_driver_worker, + prompt_adapter_config=prompt_adapter_config, + observability_config=observability_config) # Uninitialized cache engine. Will be initialized by # initialize_cache. 
self.cache_engine: List[CacheEngine] From 61b6fbb15d7871ee95dbb6bdea6021458b8550f9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 10 Sep 2024 17:25:38 +0300 Subject: [PATCH 182/819] add missing functions --- vllm/lora/layers.py | 4 ++-- vllm/lora/models.py | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 13a6813346f8b..9e4a0098dc44e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -93,7 +93,7 @@ def _apply_lora( x = x.view(-1, x.shape[-1]) output = output.view(-1, output.shape[-1]) indices = indices.view(-1) - if is_hpu(): + if current_platform.is_hpu(): dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) else: @@ -314,7 +314,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # NOTE(vgoel): These asserts can be skipped when upstreaming. # Can be removed from vllm-fork also once lora functionality # on Gaudi stabilizes. - if is_hpu(): + if current_platform.is_hpu(): emb_len = embedding_len x_shape = x.shape ind_shape = self.embeddings_indices[1].shape diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f21b45657c993..e3abf0fc96196 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Tuple, Type +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union import safetensors.torch import torch @@ -94,9 +94,10 @@ def convert_mapping( embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None + device = "hpu" if current_platform.is_hpu() else "cuda" if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=get_device(), + device=device, dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -121,9 +122,9 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) + indices = torch.tensor(indices_list, dtype=torch.long, device=device) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device=get_device(), + device=device, dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -134,10 +135,10 @@ def convert_mapping( sampler_indices = prompt_mapping_tensor sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = (torch.arange( - 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) - + (sampler_indices_padded * - len(sampler_indices_padded))) + sampler_indices_padded = ( + torch.arange( + 0, len(sampler_indices_padded), device=device, dtype=torch.long) + + (sampler_indices_padded * len(sampler_indices_padded))) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: From 2091161b4a2e3acaa531d1a1a3c0cba65bb50b21 Mon Sep 17 00:00:00 2001 From: Agata Dobrzyniewicz <160237065+adobrzyniewicz-habana@users.noreply.github.com> Date: Wed, 11 Sep 2024 10:15:09 +0200 Subject: [PATCH 183/819] Port PT Profiler to habana_main (#256) Porting PT Profiler from: https://github.com/HabanaAI/vllm-fork/commit/81a23a708195faef6167919890cefa225a721907 and https://github.com/HabanaAI/vllm-fork/commit/e805b885d32a749d9409f13b6446895d13e8b885 --- 
vllm/worker/habana_model_runner.py | 46 ++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index b6218f3cc4cfb..2360e39fcba28 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -210,6 +210,26 @@ def align_workers(value, op): return value_t.item() +def setup_profiler(): + schedule = torch.profiler.schedule(wait=0, warmup=2, active=1, repeat=1) + DEVICE = 'hpu' + activities = [torch.profiler.ProfilerActivity.CPU] + activities.extend([torch.profiler.ProfilerActivity.HPU] if DEVICE == + 'hpu' else []) + #from habana_frameworks.torch.activity_profiler import DebugActivity + #debug_activities=[DebugActivity.BRIDGE_FUNCTION_CALLS] + + profiler = torch.profiler.profile( + schedule=schedule, + activities=activities, + #debug_activities=debug_activities, + on_trace_ready=torch.profiler.tensorboard_trace_handler('.', + use_gzip=True), + record_shapes=False, + with_stack=True) + return profiler + + def pad_list(list, k, v): target_len = round_up(len(list), k) padding = target_len - len(list) @@ -1237,11 +1257,7 @@ def profile_run(self) -> None: max_seq_len = min(self.prompt_seq_bucket_cfg[-1], self.max_num_batched_tokens // max_batch_size) - self.warmup_scenario(max_batch_size, - max_seq_len, - True, - kv_caches, - is_profile_run=True) + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) return def warmup_scenario(self, @@ -1281,7 +1297,7 @@ def warmup_scenario(self, for idx in range(max_num_seqs) ] self.profiler.start('internal', scenario_name) - times = 3 if use_graphs else 1 + times = 3 if use_graphs or is_profile_run else 1 if self.lora_config and not is_profile_run: lora_mapping = LoRAMapping( [0] * batch_size * seq_len, @@ -1312,10 +1328,19 @@ def warmup_scenario(self, for i, b in enumerate(blocks) ] torch.hpu.synchronize() + profiler = None + if is_profile_run and self.is_driver_worker: + profiler = setup_profiler() + profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) self.execute_model(inputs, kv_caches, warmup_mode=True) torch.hpu.synchronize() + if profiler: + profiler.step() + if profiler: + profiler.stop() + self.profiler.end() gc.collect() def remove_all_loras(self): @@ -1427,6 +1452,15 @@ def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): @torch.inference_mode() def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + if profile := os.environ.get('VLLM_PT_PROFILE', None): + phase, bs, seq_len, graph = profile.split('_') + is_prompt = phase == 'prompt' + graphs = graph == 't' + if graphs: + self.graphed_buckets.add((int(bs), int(seq_len), is_prompt)) + self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, + True) + raise AssertionError("Finished profiling") if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': logger.info("Skipping warmup...") return From 68e0f57b83995f8eae67986a8f12b77e24bb2789 Mon Sep 17 00:00:00 2001 From: Krzysztof Wisniewski Date: Fri, 6 Sep 2024 14:20:30 +0300 Subject: [PATCH 184/819] Reduce frequency of garbage collector --- vllm/worker/habana_model_runner.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2360e39fcba28..fdf1e9c444406 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -539,6 +539,21 @@ def __init__( self._mem_margin: Optional[int] = None self._setup_buckets() + + # Read 
https://docs.python.org/3/library/gc.html#gc.set_threshold + # for comprehensive description of gc generations. + # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority) + # to set particular generation threshold or use simpler + # VLLM_GC_THR_MULTIPLIER to multiply default values. + default_gc_thrs = list(gc.get_threshold()) + requested_gc_thrs = [None] * len(default_gc_thrs) + for i in range(len(default_gc_thrs)): + requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) + if requested_gc_thrs == default_gc_thrs: + gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2)) + requested_gc_thrs = [t * gc_thr_multiplier for t in default_gc_thrs] + gc.set_threshold(*requested_gc_thrs) + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc': From b776d5e8fa287018e7e373e6588f2d15176e0d72 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 11 Sep 2024 12:49:20 +0300 Subject: [PATCH 185/819] Fix LoRA test by handling mask creation inside the test --- tests/lora/test_lora_hpu.py | 93 +++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index ddbab66e166b3..01b6472745e1c 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,6 +1,7 @@ import pytest import torch +from vllm.hpu.ops import LoraMask from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager @@ -19,7 +20,19 @@ torch.float16: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -MAX_LORAS = 8 + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask @pytest.mark.parametrize("m", TENSOR_SIZES) @@ -39,32 +52,40 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: input = torch.rand(k, n, device="hpu", dtype=dtype) expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - lora_a_stack = torch.zeros(MAX_LORAS + 1, + lora_a_stack = torch.zeros(8, 1, lora.lora_a.shape[1], lora.lora_a.shape[0], device="hpu", dtype=dtype) - lora_b_stack = torch.zeros(MAX_LORAS + 1, + lora_b_stack = torch.zeros(8, 1, lora.lora_b.shape[1], lora.lora_b.shape[0], device="hpu", dtype=dtype) - for i in range(MAX_LORAS): + for i in range(lora_a_stack.shape[0]): lora_a_stack[i][0] = lora.lora_a.T lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), - output) + indices = torch.randint(0, + lora_a_stack.shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora(input, lora_a_stack, lora_b_stack, indices, output) + rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="hpu"), output) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + 
LoraMask.setLoraMask(mask) + + _apply_lora(input, lora_a_stack, lora_b_stack, indices, output) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -99,7 +120,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_a.shape[1], lora_1.lora_a.shape[0], @@ -107,31 +128,38 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_b.shape[1], lora_1.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_1.lora_a.T lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T lora_a_stacks[1][i][0] = lora_2.lora_a.T lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (m // 2, m // 2)) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, + output, (m // 2, m // 2)) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, output, (m // 2, m // 2)) assert torch.allclose(torch.zeros_like(output), output) @@ -166,14 +194,14 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_a.shape[1], lora_q.lora_a.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_a.shape[1], lora_k.lora_a.shape[0], @@ -181,21 +209,21 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_b.shape[1], lora_q.lora_b.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_b.shape[1], lora_k.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_q.lora_a.T lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T lora_a_stacks[1][i][0] = lora_k.lora_a.T @@ -204,17 +232,24 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (qkv[0], qkv[1], qkv[2])) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, + output, 
(qkv[0], qkv[1], qkv[2])) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, indices, output, (qkv[0], qkv[1], qkv[2])) assert torch.allclose(torch.zeros_like(output), output) From f858d4359657db1ea01f39e8a8b39ec68076d6a6 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:57:03 +0530 Subject: [PATCH 186/819] Attn MetaData dtype should be same as model dtype (#271) Attn MetaData was hard coded to bfloat16, leading to a runtime error for float32 model instantiation. --- vllm/worker/habana_model_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2360e39fcba28..55f205915ea8c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -238,11 +238,12 @@ def pad_list(list, k, v): class HpuModelAdapter(): - def __init__(self, model, block_size, enforce_eager): + def __init__(self, model, block_size, dtype, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] self.block_size = block_size + self.dtype = dtype if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -304,7 +305,7 @@ def forward(self, *args, **kwargs): input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), - input_ids.device, torch.bfloat16) + input_ids.device, self.dtype) LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -600,6 +601,7 @@ def load_model(self) -> None: self.model = _maybe_wrap_in_hpu_graph( self.model, self.block_size, + dtype=self.model_config.dtype, enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) From acf7d548ee0352c5482d0c424ddb4a0558007ef7 Mon Sep 17 00:00:00 2001 From: Dudi Lester <160421192+dudilester@users.noreply.github.com> Date: Thu, 12 Sep 2024 11:42:31 +0300 Subject: [PATCH 187/819] Support Mixtral quantization using INC (#267) --- vllm/hpu/ops.py | 88 ++++++++++++------- vllm/model_executor/layers/fused_moe/layer.py | 42 ++++++--- .../model_executor/layers/quantization/inc.py | 6 +- vllm/model_executor/model_loader/utils.py | 2 +- 4 files changed, 96 insertions(+), 42 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index b2705429906c4..3d76c36f2648b 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -86,36 +86,6 @@ def silu_and_mul(x: torch.Tensor) -> torch.Tensor: return F.silu(x[..., :d]) * x[..., d:] -def static_fused_moe(hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - num_experts = w1.shape[0] - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = 
torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, num_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - - htorch.core.mark_step() - - for expert_idx in range(num_experts): - w_output = torch.matmul(hidden_states, w1[expert_idx].transpose(0, 1)) - w_output = silu_and_mul(w_output) - w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - final_hidden_states += w_output * padded_weights[expert_idx] - - return final_hidden_states.view(-1, D) - - #TODO: remove after fusedsdpa fix for query_head != kv_head def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor: """ @@ -252,3 +222,61 @@ def dispatch_bgmv_embedding( wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) out = x @ wb y += out * scale + + +class MoeMatmul(torch.nn.Module): + + def __init__(self): + super().__init__() + + def set_weight(self, w): + self.weight = w + + def calc(self, state, expert_id, w): + self.weight = w[expert_id].transpose(0, 1) + return self.forward(state) + + def forward(self, state): + return torch.matmul(state, self.weight) + + +class StaticFusedMOE(torch.nn.Module): + + def __init__(self, num_total_experts): + super().__init__() + self.w13_list = torch.nn.ModuleList( + [MoeMatmul() for _ in range(num_total_experts)]) + self.w2_list = torch.nn.ModuleList( + [MoeMatmul() for _ in range(num_total_experts)]) + self.num_total_experts = num_total_experts + + def forward(self, hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, self.num_total_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = padded_weights.reshape(-1, B, self.num_total_experts) + padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) + htorch.core.mark_step() + + for expert_idx in range(self.num_total_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = self.w13_list[expert_idx].calc(current_state_static, + expert_idx, w1) + w_output = silu_and_mul(w_output) + w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + + return final_hidden_states.view(-1, D) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b49bf40d4746e..cf0d5f98f1b01 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -13,9 +13,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.utils import is_hpu -if is_hpu(): - from vllm.hpu.ops import static_fused_moe - logger = init_logger(__name__) @@ -78,7 +75,8 @@ def apply( ) -> torch.Tensor: return self.forward(x, layer.w13_weight, layer.w2_weight, router_logits, top_k, renormalize, - 
use_grouped_topk, num_expert_group, topk_group) + use_grouped_topk, num_expert_group, topk_group, + layer) def forward_cuda( self, @@ -91,6 +89,7 @@ def forward_cuda( use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int], + layer: Optional[torch.nn.Module], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe return fused_moe(x, @@ -104,15 +103,25 @@ def forward_cuda( num_expert_group=num_expert_group, topk_group=topk_group) - def forward_hpu(self, x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, - router_logits: torch.Tensor, top_k: int, renormalize: bool, - use_grouped_topk: bool, num_expert_group: Optional[int], - topk_group: Optional[int]): + def forward_hpu( + self, + x: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + top_k: int, + renormalize: bool, + use_grouped_topk: bool, + num_expert_group: Optional[int], + topk_group: Optional[int], + layer: Optional[torch.nn.Module], + ): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' - return static_fused_moe(x, w1, w2, router_logits, top_k) + if layer is not None: + return layer.hpu_static_fused_moe(x, w1, w2, router_logits, top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( @@ -129,6 +138,7 @@ def forward_tpu( use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int], + layer: Optional[torch.nn.Module], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe assert not use_grouped_topk @@ -140,7 +150,7 @@ def forward_tpu( class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. - This layer contains both MergedColumnParallel weights (gate_up_proj / + This layer contains both MergedColumnParallel weights (gate_up_proj / w13) and RowParallelLinear weights (down_proj/ w2). Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We @@ -191,6 +201,9 @@ def __init__( assert num_expert_group is not None and topk_group is not None self.num_expert_group = num_expert_group self.topk_group = topk_group + if is_hpu(): + from vllm.hpu.ops import StaticFusedMOE + self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -245,13 +258,22 @@ def weight_loader(self, param: torch.nn.Parameter, if shard_id == 0: param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + if is_hpu(): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + param_data[expert_id]) # w3, up_proj case: Load into second shard of w13. elif shard_id == 2: param_data[expert_id, shard_size:2 * shard_size, :] = loaded_weight[shard, :] + if is_hpu(): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + param_data[expert_id]) # w2, down_proj case: Load into only shard of w2. 
elif shard_id == 1: param_data[expert_id, :, :] = loaded_weight[:, shard] + if is_hpu(): + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + param_data[expert_id]) else: raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py index f6718ec2ac9e7..ec0141b61f58f 100644 --- a/vllm/model_executor/layers/quantization/inc.py +++ b/vllm/model_executor/layers/quantization/inc.py @@ -5,6 +5,8 @@ from torch.nn.parameter import Parameter from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -52,6 +54,8 @@ def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["INCLinearMethod"]: if isinstance(layer, LinearBase): return INCLinearMethod(self) + elif isinstance(layer, FusedMoE): + return UnquantizedFusedMoEMethod() return None def get_scaled_act_names(self) -> List[str]: @@ -78,7 +82,7 @@ class INCLinearMethod(LinearMethodBase): 1. Only support per-tensor quantization due to torch._scaled_mm support. 2. Only support float8_e4m3fn data type due to the limitation of torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) - + Args: quant_config: The quantization config. """ diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index f7e0f56c1a46e..a8b0a7b07ed8e 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -24,7 +24,7 @@ def get_model_architecture( # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. if (model_config.quantization is not None - and model_config.quantization != "fp8" + and model_config.quantization not in ["fp8", "inc"] and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] From 6a734f4d2b14040b3bbcd8cb9843fac9dfc8318b Mon Sep 17 00:00:00 2001 From: Ilia Taraban Date: Thu, 12 Sep 2024 11:51:05 +0200 Subject: [PATCH 188/819] Fixed ALiBi (#254) Fixed ALiB and [MPT-7B](https://www.databricks.com/blog/mpt-7b) model. Accuracy results comparing to CPU(collected using [EleutherAI](https://github.com/EleutherAI/lm-evaluation-harness)) | Tasks | CPU | HPU | | -------------- | ------ | ------ | | arc_challenge | 0.4224 | 0.4189 | | arc_easy | 0.6974 | 0.6999 | | hellaswag | 0.7603 | 0.7626 | | lambada_openai | 0.7306 | 0.7326 | | mmlu | 0.293 | 0.2925 | | winogrande | 0.6851 | 0.6811 | --- vllm/attention/backends/habana_attn.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 20b0f2bc7630b..56b71a431aca7 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -108,17 +108,10 @@ def __init__( self.v_cache = VLLMKVCache() self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window - self.position_bias = None self.alibi_slopes = alibi_slopes if alibi_slopes is not None: - # FIXME(kzawora): Need a general method to set max_seq_len on - # per-model basis. 
alibi_slopes_tensor = torch.tensor(alibi_slopes, dtype=torch.bfloat16) - self.position_bias = _make_alibi_bias(alibi_slopes_tensor, - num_kv_heads, - alibi_slopes_tensor.dtype, - max_seq_len) self.alibi_slopes = alibi_slopes_tensor assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads @@ -190,11 +183,13 @@ def forward( assert attn_metadata.attn_bias is not None, \ 'attn_bias must be set before calling model.forward!' attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None and \ - self.position_bias is not None: - attn_bias.add_(self.position_bias[:, :, - -attn_bias.size(2):, - -attn_bias.size(3):]) + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias(self.alibi_slopes, + self.num_kv_heads, + attn_bias.dtype, + attn_bias.shape[-1]) + attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) else: attn_bias = None From 543bb6dcd0626394e671dd82c1fbb0d15f9f3341 Mon Sep 17 00:00:00 2001 From: Dominika Olszewska Date: Thu, 12 Sep 2024 12:48:09 +0200 Subject: [PATCH 189/819] Update gaudi-installation.rst (#279) Fixing ENV variables' names after flat-PA merge --- docs/source/getting_started/gaudi-installation.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index ed3beabb2c8aa..27bc0186675f1 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -329,7 +329,7 @@ Environment variables - ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism - ``{phase}`` is either ``PROMPT`` or ``DECODE`` - - ``{dim}`` is either ``BS`` or ``SEQ`` + - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` - Default values: @@ -345,9 +345,9 @@ Environment variables - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)`` - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``128`` - - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``128`` - - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``128`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``128`` + - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: From c2c1e0f1137569722adda547c8b34c1889cd3230 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 14:41:49 +0300 Subject: [PATCH 190/819] Move setting gc threshold to separate function --- vllm/worker/habana_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index fdf1e9c444406..1d94bbc9dd8ab 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -538,8 +538,9 @@ def __init__( self.seen_configs: set = set() self._mem_margin: Optional[int] = None self._setup_buckets() - + self._set_gc_threshold() + def _set_gc_threshold(self) -> None: # Read 
https://docs.python.org/3/library/gc.html#gc.set_threshold # for comprehensive description of gc generations. # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority) From 6b3503c2f16e5d8bdadbcdd84b3e1ddeeb1bce13 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 15:00:24 +0300 Subject: [PATCH 191/819] Fix mypy issues --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1d94bbc9dd8ab..4949fd7aba7ad 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -547,7 +547,7 @@ def _set_gc_threshold(self) -> None: # to set particular generation threshold or use simpler # VLLM_GC_THR_MULTIPLIER to multiply default values. default_gc_thrs = list(gc.get_threshold()) - requested_gc_thrs = [None] * len(default_gc_thrs) + requested_gc_thrs = [0] * len(default_gc_thrs) for i in range(len(default_gc_thrs)): requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) if requested_gc_thrs == default_gc_thrs: From 8535d53b309397f194d0f5c85cab69130b1cd083 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 15:03:26 +0300 Subject: [PATCH 192/819] Fix line too long --- vllm/worker/habana_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4949fd7aba7ad..577ba80e6185c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -549,7 +549,8 @@ def _set_gc_threshold(self) -> None: default_gc_thrs = list(gc.get_threshold()) requested_gc_thrs = [0] * len(default_gc_thrs) for i in range(len(default_gc_thrs)): - requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) + requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', + default_gc_thrs[i])) if requested_gc_thrs == default_gc_thrs: gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2)) requested_gc_thrs = [t * gc_thr_multiplier for t in default_gc_thrs] From 27b618a3e889d28731cc909919b12b1c97b36244 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Thu, 12 Sep 2024 15:08:22 +0300 Subject: [PATCH 193/819] Format files --- vllm/worker/habana_model_runner.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 577ba80e6185c..e61a76fa3dadf 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -539,7 +539,7 @@ def __init__( self._mem_margin: Optional[int] = None self._setup_buckets() self._set_gc_threshold() - + def _set_gc_threshold(self) -> None: # Read https://docs.python.org/3/library/gc.html#gc.set_threshold # for comprehensive description of gc generations. 
@@ -549,11 +549,14 @@ def _set_gc_threshold(self) -> None: default_gc_thrs = list(gc.get_threshold()) requested_gc_thrs = [0] * len(default_gc_thrs) for i in range(len(default_gc_thrs)): - requested_gc_thrs[i] = int(os.environ.get(f'VLLM_GC_THR_GEN{i}', - default_gc_thrs[i])) + requested_gc_thrs[i] = int( + os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) if requested_gc_thrs == default_gc_thrs: - gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', 2)) - requested_gc_thrs = [t * gc_thr_multiplier for t in default_gc_thrs] + gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', + 2)) + requested_gc_thrs = [ + t * gc_thr_multiplier for t in default_gc_thrs + ] gc.set_threshold(*requested_gc_thrs) def load_model(self) -> None: From 35a4a984a79dc421320a2e520005e48ed884571d Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Thu, 12 Sep 2024 15:53:33 +0200 Subject: [PATCH 194/819] Remove hardcoded value from softmax in flat_pa (#280) This PR removes the hardcoded value used to normalize softmax in flat_pa . Current approach is to use the global maximum as it is very easy to compute, but it has the drawback that other samples in a batch might slightly affect numerical stability. This is a first step to eliminated some of the INF/NaN issues we see in certain configurations and by no means this is a complete solutions. This needs to be revised in the future. --- vllm/hpu/ops.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index 3d76c36f2648b..939d195a12b08 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -40,7 +40,18 @@ def block2batch(tensor, block_mapping): def block_softmax(batch_size, attn, block_mapping): - attn.sub_(10.0) + # We're using global maximum to decrease the exponent as + # it's fast to compute and performs reasonably well. + # This is by no means a final solution and needs to + # be properly addressed in the future. 
+ # + # Additionally there's a bug where 'max' is not parallelized + # across TPC cores, so we need to split the tensor manually + # instead of simply doing attn_max = attn.max() + + tail_dims = tuple(range(1, attn.dim())) + attn_max = attn.amax(tail_dims).amax() + attn.sub_(attn_max) attn = attn.exp_() sums = attn.sum(dim=-1).unsqueeze(-1) sums = block2batch(sums, block_mapping) From 046cb25a4a549f985105152cb3dec2c25279252e Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 12 Sep 2024 15:23:51 +0000 Subject: [PATCH 195/819] Fix yapf detected format issue Signed-off-by: Chendi.Xue --- vllm/model_executor/models/dbrx.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index e3a45b26d909b..71362299a9fcf 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -82,17 +82,15 @@ def __init__( self.router = DbrxRouter(config, self.params_dtype) self.ws = nn.Parameter( - torch.empty( - self.num_total_experts, - 2 * self.intermediate_size, - self.d_model, - dtype=self.params_dtype)) + torch.empty(self.num_total_experts, + 2 * self.intermediate_size, + self.d_model, + dtype=self.params_dtype)) self.w2s = nn.Parameter( - torch.empty( - self.num_total_experts, - self.d_model, - self.intermediate_size, - dtype=self.params_dtype)) + torch.empty(self.num_total_experts, + self.d_model, + self.intermediate_size, + dtype=self.params_dtype)) set_weight_attrs( self.ws, From aa4c59cf7047c5250f8d9f6dea988d3c48bb508e Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Thu, 12 Sep 2024 15:14:39 +0000 Subject: [PATCH 196/819] some update to vision model Signed-off-by: Chendi.Xue --- vllm/worker/habana_model_runner.py | 47 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index dec1b65858eb4..e690f37dd820e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -39,6 +39,8 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, + MultiModalInputs) from .profiler import Profiler @@ -250,7 +252,7 @@ class PreparePromptMetadata(NamedTuple): lora_index_mapping: List[List[int]] lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] - multi_modal_input: Optional[torch.Tensor] + multi_modal_kwargs: Dict[str, BatchedTensors] slot_mapping: List[List[int]] @classmethod @@ -264,7 +266,7 @@ def empty(cls): lora_index_mapping=[], lora_prompt_mapping=[], lora_requests=set(), - multi_modal_input=None, + multi_modal_kwargs=None, slot_mapping=[], ) @@ -452,6 +454,10 @@ def __init__( self._mem_margin: Optional[int] = None self._setup_buckets() + # Multi-modal data support + self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + .create_input_mapper(self.model_config) + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc': @@ -623,7 +629,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_modal_input_list: List[torch.Tensor] = [] + multi_modal_inputs_list: List[MultiModalInputs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -681,9 +687,10 @@ def _prepare_prompt( # is always the first token in the sequence. 
input_positions.append(list(range(context_len, seq_len))) - if seq_group_metadata.multi_modal_data: - multi_modal_input_list.append( - seq_group_metadata.multi_modal_data.data) + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + mm_kwargs = self.multi_modal_input_mapper(mm_data) + multi_modal_inputs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -725,15 +732,6 @@ def _prepare_prompt( dtype=torch.int, device=self.device) - if multi_modal_input_list: - assert self.multimodal_config, ( - "Multi-modal inputs are only supported by " - "vision language models.") - multi_modal_input = torch.cat(multi_modal_input_list, - dim=0).to(self.device) - else: - multi_modal_input = None - max_prompt_block_table_len = max(len(t) for t in prefix_block_tables) max_prompt_len = max( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), @@ -806,6 +804,9 @@ def _prepare_prompt( num_decode_tokens=0, slot_mapping=slot_mapping, ) + multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list, + device=self.device) + return PreparePromptMetadata( input_tokens=input_tokens, input_positions=input_positions, @@ -815,7 +816,7 @@ def _prepare_prompt( lora_index_mapping=lora_index_mapping, lora_prompt_mapping=lora_prompt_mapping, lora_requests=lora_requests, - multi_modal_input=multi_modal_input, + multi_modal_kwargs=multi_modal_kwargs, slot_mapping=slot_mapping, ) @@ -930,7 +931,7 @@ def prepare_input_tensors( input_positions = None lora_mapping = None lora_requests = None - multi_modal_input = None + multi_modal_kwargs = None batch_type = None seq_lens = None query_lens = None @@ -969,7 +970,7 @@ def prepare_input_tensors( lora_index_mapping, lora_prompt_mapping, lora_requests, - multi_modal_input, + multi_modal_kwargs, slot_mapping, ) = self._prepare_prompt(prefill_reqs) ( @@ -1047,7 +1048,7 @@ def prepare_input_tensors( "selected_token_indices": sampling_metadata.selected_token_indices, "lora_requests": lora_requests, "lora_mapping": lora_mapping, - "multi_modal_input": multi_modal_input, + "multi_modal_kwargs": multi_modal_kwargs, "num_prefill_tokens": num_prefill_tokens, "num_decode_tokens": num_decode_tokens, "slot_mapping": slot_mapping, @@ -1073,7 +1074,7 @@ def prepare_input_tensors( attn_metadata=attn_metadata, lora_requests=lora_requests, lora_mapping=lora_mapping, - multi_modal_kwargs=multi_modal_input, + multi_modal_kwargs=multi_modal_kwargs, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded), sampling_metadata @@ -1592,7 +1593,6 @@ def execute_model( input_positions = model_input.input_positions attn_metadata = model_input.attn_metadata sampling_metadata = model_input.sampling_metadata - multi_modal_input = model_input.multi_modal_kwargs real_batch_size = model_input.real_batch_size batch_size_padded = model_input.batch_size_padded assert input_tokens is not None @@ -1610,10 +1610,9 @@ def execute_model( "positions": input_positions, "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors + "intermediate_tensors": intermediate_tensors, + **(model_input.multi_modal_kwargs or {}), } - if multi_modal_input is not None: - execute_model_kwargs.update(multi_modal_input) if htorch.utils.internal.is_lazy(): execute_model_kwargs.update({ "bypass_hpu_graphs": not use_graphs, From 1a35da26ce0810a7e11d1ec8a572b7b6729a8937 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 13 Sep 2024 16:36:29 +0000 Subject: [PATCH 
197/819] fix ruff detected format error Signed-off-by: Chendi.Xue --- vllm/worker/habana_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index ff20316f89de2..2abc4b6fdf38c 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -32,6 +32,8 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, + MultiModalInputs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData, SequenceGroupMetadata) @@ -43,8 +45,6 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) -from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors, - MultiModalInputs) from .profiler import Profiler From 3b710a6139dac38f18ec01753248d0b434a4e5ac Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Fri, 13 Sep 2024 16:44:37 +0000 Subject: [PATCH 198/819] fix mypy format error Signed-off-by: Chendi.Xue --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 2abc4b6fdf38c..c32ee9f92e694 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -330,7 +330,7 @@ class PreparePromptMetadata(NamedTuple): lora_index_mapping: List[List[int]] lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] - multi_modal_kwargs: Dict[str, BatchedTensors] + multi_modal_kwargs: Optional[Dict[str, BatchedTensors]] slot_mapping: List[List[int]] lora_mask: Optional[torch.Tensor] lora_logits_mask: Optional[torch.Tensor] From 5abe4d7ba2c30713b0e56829b84cfaee202ee09a Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Mon, 16 Sep 2024 15:39:47 +0300 Subject: [PATCH 199/819] Move ALiBi to supported features in README_GAUDI.md --- README_GAUDI.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 5109f7ddf9927..644829210125c 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -81,13 +81,13 @@ Supported Features - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) Unsupported Features ==================== - Beam search - LoRA adapters -- Attention with Linear Biases (ALiBi) - Quantization (AWQ, FP8 E5M2, FP8 E4M3) - Prefill chunking (mixed-batch inferencing) From 1a712d5be7127fb8b4b1e9a8d09d62dd6a38a874 Mon Sep 17 00:00:00 2001 From: kwisniewski98 Date: Tue, 17 Sep 2024 12:35:35 +0300 Subject: [PATCH 200/819] Move ALiBi to supported features in gaudi-installation.rst --- docs/source/getting_started/gaudi-installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 27bc0186675f1..328f9e723ec71 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -76,13 +76,13 @@ Supported Features - Tensor parallelism support for multi-card inference - Inference with `HPU Graphs `__ for accelerating low-batch latency and 
throughput +- Attention with Linear Biases (ALiBi) Unsupported Features ==================== - Beam search - LoRA adapters -- Attention with Linear Biases (ALiBi) - Quantization (AWQ, FP8 E5M2, FP8 E4M3) - Prefill chunking (mixed-batch inferencing) From a9de5ba2385d5c332a2610a055465234905ff334 Mon Sep 17 00:00:00 2001 From: Jakub Maksymczuk Date: Tue, 17 Sep 2024 12:31:26 +0200 Subject: [PATCH 201/819] Add fake HPU mode to Habana components with dummy habana_frameworks module. (#250) Co-authored-by: Konrad Zawora --- .github/workflows/cpu-test.yml | 34 ++++++++++++ examples/offline_inference_fakehpu.py | 38 +++++++++++++ vllm/__init__.py | 4 ++ vllm/executor/ray_habana_executor.py | 12 +++-- vllm/executor/ray_utils.py | 5 +- vllm/model_executor/model_loader/loader.py | 7 ++- vllm/model_executor/models/opt.py | 1 - vllm/utils.py | 63 ++++++++++++++++++++++ vllm/worker/cache_engine.py | 4 +- vllm/worker/habana_model_runner.py | 11 ++-- vllm/worker/habana_worker.py | 18 +++++-- 11 files changed, 177 insertions(+), 20 deletions(-) create mode 100644 .github/workflows/cpu-test.yml create mode 100644 examples/offline_inference_fakehpu.py diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml new file mode 100644 index 0000000000000..89a702f9751d9 --- /dev/null +++ b/.github/workflows/cpu-test.yml @@ -0,0 +1,34 @@ +name: cpu-test + +on: + # Trigger the workflow on push or pull request, + # but only for the habana_main branch + push: + branches: + - habana_main + pull_request: + branches: + - habana_main + + +jobs: + cputest: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.10"] + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r requirements-hpu.txt + VLLM_TARGET_DEVICE=hpu python setup.py develop + - name: cpu-test + run: | + VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py new file mode 100644 index 0000000000000..972d84b60b318 --- /dev/null +++ b/examples/offline_inference_fakehpu.py @@ -0,0 +1,38 @@ +import os + +from vllm import LLM, SamplingParams + +if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0': + from vllm.utils import migrate_to_cpu + migrate_to_cpu() + +# Sample prompts. +prompts = [ + "Berlin is the capital city of ", + "Louvre is located in the city of ", + "Barack Obama was the 44th president of ", + "Warsaw is the capital city of ", + "Gniezno is a city in ", + "San Francisco is located in the state of ", + "Llanfairpwllgwyngyll is located in country of ", +] +ref_answers = [ + "Germany", "Paris", "United States", "Poland", "Poland", "California", + "Wales" +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False) + +# Create an LLM. +llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. 
+for output, answer in zip(outputs, ref_answers): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert answer in generated_text, ( + f"The generated text does not contain the correct answer: {answer}") +print('PASSED') diff --git a/vllm/__init__.py b/vllm/__init__.py index 0895c571d1d89..29fc02ae3e96a 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -1,4 +1,8 @@ """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" +from vllm.utils import is_fake_hpu, migrate_to_cpu + +if is_fake_hpu(): + migrate_to_cpu() from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 17e3414a96b57..2a8e2df37f031 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -13,7 +13,7 @@ from vllm.utils import (_run_task_with_lock, error_on_invalid_device_count_status, get_distributed_init_method, get_ip, get_open_port, - get_vllm_instance_id, make_async) + get_vllm_instance_id, is_fake_hpu, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -87,18 +87,20 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", driver_ip = get_ip() worker_wrapper_kwargs = self._get_worker_wrapper_args() for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("HPU", 0): + resource_name = "HPU" if not is_fake_hpu() else "CPU" + if not bundle.get(resource_name, 0): continue scheduling_strategy = PlacementGroupSchedulingStrategy( placement_group=placement_group, placement_group_capture_child_tasks=True, placement_group_bundle_index=bundle_id, ) - + resources = {'HPU': num_gpus} if not is_fake_hpu() else {} + num_cpus = 0 if not is_fake_hpu() else num_gpus worker = ray.remote( - num_cpus=0, + num_cpus=num_cpus, num_gpus=0, - resources={'HPU': num_gpus}, + resources=resources, scheduling_strategy=scheduling_strategy, **ray_remote_kwargs, )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 507dc04f48123..8f5bc30a9599c 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -3,7 +3,8 @@ from vllm.config import ParallelConfig from vllm.logger import init_logger from vllm.sequence import ExecuteModelRequest -from vllm.utils import get_ip, is_hip, is_hpu, is_tpu, is_xpu +from vllm.utils import (get_ip, hpu_device_string, is_hip, is_hpu, is_tpu, + is_xpu) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -97,7 +98,7 @@ def initialize_ray_cluster( if is_tpu(): device_str = "TPU" elif is_hpu(): - device_str = "HPU" + device_str = hpu_device_string() # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 06048d97088e1..c49ccc96c7080 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -37,7 +37,7 @@ supports_vision) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_hpu, is_tpu +from vllm.utils import is_fake_hpu, is_hpu, is_tpu logger = init_logger(__name__) @@ -277,7 +277,10 @@ def 
load_model(self, *, model_config: ModelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): - with torch.device(self.load_config.device): + _device = torch.device( + device_config.device) if is_fake_hpu() else torch.device( + self.load_config.device) + with _device: model = _initialize_model(model_config, self.load_config, lora_config, multimodal_config, cache_config, scheduler_config) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index a05090cd46648..3f842ea757d2f 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -254,7 +254,6 @@ def forward( if self.project_in is not None: inputs_embeds, _ = self.project_in(inputs_embeds) hidden_states = inputs_embeds + pos_embeds - for i in range(len(self.layers)): layer = self.layers[i] hidden_states = layer(hidden_states, kv_caches[i], attn_metadata) diff --git a/vllm/utils.py b/vllm/utils.py index fa6e132dd3522..04782cf13fce5 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -208,10 +208,41 @@ def is_neuron() -> bool: @lru_cache(maxsize=None) def is_hpu() -> bool: + return _is_habana_frameworks_installed() or _is_built_for_hpu() + + +@lru_cache(maxsize=None) +def is_fake_hpu() -> bool: + return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' + + +@lru_cache(maxsize=None) +def hpu_device_string(): + device_string = 'hpu' if not is_fake_hpu() else 'cpu' + return device_string + + +@lru_cache(maxsize=None) +def hpu_backend_string(): + backend_string = 'hccl' if not is_fake_hpu() else 'gloo' + return backend_string + + +@lru_cache(maxsize=None) +def _is_habana_frameworks_installed() -> bool: from importlib import util return util.find_spec('habana_frameworks') is not None +@lru_cache(maxsize=None) +def _is_built_for_hpu() -> bool: + from importlib.metadata import PackageNotFoundError, version + try: + return "gaudi" in version("vllm") + except PackageNotFoundError: + return False + + @lru_cache(maxsize=None) def is_tpu() -> bool: try: @@ -624,18 +655,24 @@ def __init__(self, device=None): @staticmethod def current_device_memory_usage() -> float: + if is_fake_hpu(): + return 0 # Return the device memory usage in bytes. free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory - free_hpu_memory @staticmethod def current_free_device_memory() -> float: + if is_fake_hpu(): + return 0 # Return the device memory usage in bytes. free_hpu_memory, _ = torch.hpu.mem_get_info() return free_hpu_memory @staticmethod def total_device_memory() -> float: + if is_fake_hpu(): + return 0 # Return the device memory usage in bytes. 
_, total_hpu_memory = torch.hpu.mem_get_info() return total_hpu_memory @@ -1088,3 +1125,29 @@ async def _run_task_with_lock(task: Callable, lock: asyncio.Lock, *args, """Utility function to run async task in a lock""" async with lock: return await task(*args, **kwargs) + + +def migrate_to_cpu(): + import importlib + from unittest.mock import MagicMock + + torch.hpu = MagicMock(name="torch.hpu") + + # Adding dummy submodules to habana_frameworks.torch for cpu-test, + # functions from dummy modules will do nothing by default + spec = importlib.util.spec_from_loader('habana_frameworks', loader=None) + sys.modules['habana_frameworks'] = MagicMock() + sys.modules['habana_frameworks'].__spec__ = spec + + builtin_import = __builtins__['__import__'] # type: ignore + + def import_wrapper(name, *args, **kwargs): + if 'habana_frameworks' in name: + sys.modules[name] = MagicMock() + return builtin_import(name, *args, **kwargs) + + __builtins__['__import__'] = import_wrapper + + # In case you want to mock a function to actually do something + import habana_frameworks.torch as htorch + htorch.utils.internal.is_lazy.return_value = False diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index ec0b8c2369210..f678d44f71dd3 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu, is_pin_memory_available) logger = init_logger(__name__) @@ -78,7 +78,7 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_attention_layers): - if device == 'hpu': + if device == 'hpu' or is_fake_hpu(): key_cache = torch.zeros(kv_cache_shape, dtype=self.dtype, device=device) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6c157fd43fffd..171ae0510d6c6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -37,7 +37,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData, SequenceGroupMetadata) -from vllm.utils import (HabanaMemoryProfiler, format_bytes, +from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, @@ -246,7 +246,8 @@ def __init__(self, model, block_size, dtype, enforce_eager): '0').lower() in ['1', 'true'] self.block_size = block_size self.dtype = dtype - if not htorch.utils.internal.is_lazy() and not enforce_eager: + if not is_fake_hpu() and not htorch.utils.internal.is_lazy( + ) and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', dynamic=False) @@ -509,7 +510,9 @@ def __init__( if model_config is not None else None) self.device_config = (device_config if device_config is not None else DeviceConfig()) - + if is_fake_hpu(): + device_config.device = torch.device('cpu') + device_config.device_type = 'cpu' self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs @@ -618,7 +621,7 @@ def load_model(self) -> None: mark_only_scales_as_const=True) 
logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) - else: + elif not is_fake_hpu(): self.model = self.model.to("hpu") htcore.mark_step() torch.hpu.synchronize() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..b4f6e53c1745a 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -21,7 +21,8 @@ from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import HabanaMemoryProfiler, format_bytes +from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string, + hpu_device_string, is_fake_hpu) from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput @@ -105,6 +106,8 @@ def init_device(self) -> None: if self.device_config.device.type == "hpu": self.device = torch.device("hpu") torch.hpu.set_device(self.device) + elif self.device_config.device_type == "cpu": + self.device = torch.device("cpu") else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -138,6 +141,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. + if is_fake_hpu(): + cache_block_size = self.get_cache_block_size_bytes() + fake_hpu_cache_alloc = 4 * 2**30 # take 4 GiB flat on fake hpu + return fake_hpu_cache_alloc // cache_block_size, 0 with HabanaMemoryProfiler() as m: self.model_runner.profile_run() torch.hpu.synchronize() @@ -335,11 +342,12 @@ def init_worker_distributed_environment( local_rank: int = -1, ) -> None: """Initialize the distributed environment.""" + backend = hpu_backend_string() init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank, - backend='hccl') + backend=backend) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) @@ -356,15 +364,17 @@ def init_worker_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: + backend = hpu_backend_string() torch.distributed.init_process_group( - backend="hccl", + backend=backend, world_size=parallel_config.world_size, rank=rank, init_method=distributed_init_method, ) # A small all_reduce for warmup & checking conformance. 
- dummy_tensor_hpu = torch.ones(1).to('hpu') + device = hpu_device_string() + dummy_tensor_hpu = torch.ones(1).to(device) torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, From d39298c1289a7dcc4d95b08bcd7ad90e9fbf12e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kuligowski?= Date: Tue, 17 Sep 2024 15:46:51 +0200 Subject: [PATCH 202/819] Update documentation on support of fp8 (#288) Update documentation on support of fp8 --- README_GAUDI.md | 3 ++- docs/source/getting_started/gaudi-installation.rst | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 5109f7ddf9927..2ae9d5f2cc6e4 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -81,6 +81,7 @@ Supported Features - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput +- INC quantization Unsupported Features ==================== @@ -88,7 +89,7 @@ Unsupported Features - Beam search - LoRA adapters - Attention with Linear Biases (ALiBi) -- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- AWQ quantization - Prefill chunking (mixed-batch inferencing) Supported Configurations diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 27bc0186675f1..2d810380af59b 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -76,6 +76,7 @@ Supported Features - Tensor parallelism support for multi-card inference - Inference with `HPU Graphs `__ for accelerating low-batch latency and throughput +- INC quantization Unsupported Features ==================== @@ -83,7 +84,7 @@ Unsupported Features - Beam search - LoRA adapters - Attention with Linear Biases (ALiBi) -- Quantization (AWQ, FP8 E5M2, FP8 E4M3) +- AWQ quantization - Prefill chunking (mixed-batch inferencing) Supported Configurations From ed19acd8a0065410b9172d1fa31b92e348100bf9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 17 Sep 2024 17:06:07 +0300 Subject: [PATCH 203/819] Reduce default value of VLLM_GRAPH_RESERVED_MEM to 0.1 --- README_GAUDI.md | 6 +++--- docs/source/getting_started/gaudi-installation.rst | 4 ++-- vllm/worker/habana_worker.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 5109f7ddf9927..9e289658fd5c2 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -315,9 +315,9 @@ mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. With its default value -(`VLLM_GRAPH_RESERVED_MEM=0.4`), 40% of usable memory will be reserved +(`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as \"usable graph memory\"), and -the remaining 60% will be utilized for KV cache. Environment variable +the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. 
By default (`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory @@ -445,7 +445,7 @@ Environment variables - `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default - `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for - HPUGraph capture, `0.4` by default + HPUGraph capture, `0.1` by default - `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.5` by default - `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 27bc0186675f1..5af81210c4159 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -243,7 +243,7 @@ Before KV cache gets allocated, model weights are loaded onto the device, and a Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. -With its default value (``VLLM_GRAPH_RESERVED_MEM=0.4``), 40% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 60% will be utilized for KV cache. +With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. 
@@ -322,7 +322,7 @@ Environment variables **Performance tuning knobs:** - ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default -- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.4`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default - ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default - ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default - ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 9d083915041fe..291a7fc0d4489 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -150,7 +150,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: cache_block_size = self.get_cache_block_size_bytes() graph_reserved_mem = (float( - os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.4')) + os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1')) if not self.model_config.enforce_eager else 0) graph_headroom = 1 - graph_reserved_mem available_hpu_memory = free_hpu_memory * \ From 6a96d9bd9180437b04133d3c023daaa174d8d516 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Tue, 17 Sep 2024 16:14:18 +0200 Subject: [PATCH 204/819] Removed vllm.hpu directory and changed relevant imports (#291) Moved files from vllm/hpu to another public repo: https://github.com/HabanaAI/vllm-hpu-extension It can be installed with pip install git+https://github.com/HabanaAI/vllm-hpu-extension.git --- .github/workflows/mypy.yaml | 1 - format.sh | 1 - requirements-hpu.txt | 1 + tests/lora/test_lora_hpu.py | 2 +- vllm/attention/backends/habana_attn.py | 6 +- vllm/attention/ops/habana_paged_attn.py | 3 +- vllm/hpu/__init__.py | 6 - vllm/hpu/cache_ops.py | 107 ------- vllm/hpu/ops.py | 293 ------------------ vllm/hpu/rotary_embed.py | 123 -------- vllm/hpu/utils.py | 61 ---- vllm/lora/layers.py | 3 +- vllm/model_executor/layers/fused_moe/layer.py | 2 +- vllm/model_executor/layers/layernorm.py | 2 +- .../model_executor/layers/rotary_embedding.py | 2 +- vllm/worker/habana_model_runner.py | 2 +- 16 files changed, 12 insertions(+), 603 deletions(-) delete mode 100644 vllm/hpu/__init__.py delete mode 100644 vllm/hpu/cache_ops.py delete mode 100644 vllm/hpu/ops.py delete mode 100644 vllm/hpu/rotary_embed.py delete mode 100644 vllm/hpu/utils.py diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index c2674b914f485..9858d00cfb5c1 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -50,6 +50,5 @@ jobs: mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml - mypy vllm/hpu --config-file pyproject.toml diff --git a/format.sh b/format.sh index fbfc27a68bb3d..5ad6d6f2938bb 100755 --- a/format.sh +++ b/format.sh @@ -113,7 +113,6 @@ mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/transformers_utils --config-file pyproject.toml mypy vllm/usage --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml -mypy vllm/hpu --config-file pyproject.toml # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/requirements-hpu.txt 
b/requirements-hpu.txt index e0f03c8464c7b..d451200aa1144 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,3 +6,4 @@ ray == 2.32.0 triton pandas tabulate +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1 diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 01b6472745e1c..c8ecaef1a6316 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,7 +1,7 @@ import pytest import torch +from vllm_hpu_extension.ops import LoraMask -from vllm.hpu.ops import LoraMask from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice from .utils import DummyLoRAManager diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 56b71a431aca7..b7b8072de3fe5 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -7,14 +7,14 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch +import vllm_hpu_extension.ops as ops +from vllm_hpu_extension import cache_ops +from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache -import vllm.hpu.ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, HabanaPagedAttentionMetadata) -from vllm.hpu import cache_ops -from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index cab8d7abe95fd..49a3e3f774d58 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -6,8 +6,7 @@ from typing import Dict, List, Optional, Tuple import torch - -from vllm.hpu import cache_ops, ops +from vllm_hpu_extension import cache_ops, ops # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py deleted file mode 100644 index b8e4d3aac98a7..0000000000000 --- a/vllm/hpu/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py deleted file mode 100644 index 9042924f68b3d..0000000000000 --- a/vllm/hpu/cache_ops.py +++ /dev/null @@ -1,107 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
-############################################################################### - -import math - -import habana_frameworks.torch as htorch -import torch - - -def reshape_and_cache(key, - value, - key_cache, - value_cache, - slot_mapping, - dtype, - is_prompt=False): - num_blocks = key_cache.size(0) - block_size = key_cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. - # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. - num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - key_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - key[start_idx:end_idx]) - value_cache.index_put_( - (indices[start_idx:end_idx], offsets[start_idx:end_idx]), - value[start_idx:end_idx]) - - -def prepare_to_cache(cache, slot_mapping): - num_blocks = cache.size(0) - block_size = cache.size(1) - slot_mapping = slot_mapping.flatten() - indices = torch.div(slot_mapping, block_size, rounding_mode="floor") - offsets = torch.fmod(slot_mapping, block_size) - num_slots_requested = slot_mapping.size(0) - num_slots_available = num_blocks * block_size - # NOTE(kzawora): HPU PT bridge crashes with - # RuntimeError: Invalid inputs for scatter_nd_onnx - # on index_put when num_slots_requested > num_slots_available. - # This case might occur when we have little kv cache blocks and - # lots of padding, or are doing warmup. - # This loop is a workaround for this issue. Please remove it - # once key_cache.index_put_(indices, offsets), key) works. 
- num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available) - - return num_kv_cache_passes, num_slots_available, indices, offsets - - -def insert_or_update_cache(input, cache, num_kv_cache_passes, - num_slots_available, block_indices, block_offsets): - for i in range(num_kv_cache_passes): - start_idx = i * num_slots_available - end_idx = (i + 1) * num_slots_available - cache.index_put_((block_indices[start_idx:end_idx], - block_offsets[start_idx:end_idx]), - input[start_idx:end_idx]) - - -def swap_blocks(src, dst, block_mapping): - index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device) - index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device) - for src_idx, dst_idx in block_mapping.items(): - index_src[0] = src_idx - index_dst[0] = dst_idx - dst.index_put_([index_dst], src.index_select(0, index_src)) - if dst.device.type == 'hpu': - htorch.core.mark_step() - torch.hpu.synchronize() - - -def copy_blocks(key_caches, value_caches, block_mapping): - index_src = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - index_dst = torch.zeros((1, ), - dtype=torch.int32, - device=key_caches[0].device) - for src, dsts in block_mapping.items(): - index_src[0] = src - for dst in dsts: - index_dst[0] = dst - for key_cache in key_caches: - key_cache.index_copy_(0, index_dst, - key_cache.index_select(0, index_src)) - for value_cache in value_caches: - value_cache.index_copy_(0, index_dst, - value_cache.index_select(0, index_src)) - if key_caches[0].device.type == 'hpu': - htorch.core.mark_step() diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py deleted file mode 100644 index 939d195a12b08..0000000000000 --- a/vllm/hpu/ops.py +++ /dev/null @@ -1,293 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### -from typing import Optional - -import habana_frameworks.torch as htorch -import torch -import torch.nn.functional as F - -from vllm.logger import init_logger - -logger = init_logger(__name__) -HPUFusedRMSNorm = None -try: - from habana_frameworks.torch.hpex.normalization import FusedRMSNorm - HPUFusedRMSNorm = FusedRMSNorm -except ImportError: - logger.warning("Could not import HPU FusedRMSNorm kernel. " - "vLLM will use forward_native implementation of RMSNorm.") -HPUFusedSDPA = None -try: - from habana_frameworks.torch.hpex.kernels import FusedSDPA - HPUFusedSDPA = FusedSDPA -except ImportError: - logger.warning("Could not import HPU FusedSDPA kernel. " - "vLLM will use native implementation.") - - -def batch2block(tensor, block_mapping): - shape = tuple(tensor.shape) - return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) - - -def block2batch(tensor, block_mapping): - shape = tuple(tensor.shape) - return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:]) - - -def block_softmax(batch_size, attn, block_mapping): - # We're using global maximum to decrease the exponent as - # it's fast to compute and performs reasonably well. - # This is by no means a final solution and needs to - # be properly addressed in the future. 
- # - # Additionally there's a bug where 'max' is not parallelized - # across TPC cores, so we need to split the tensor manually - # instead of simply doing attn_max = attn.max() - - tail_dims = tuple(range(1, attn.dim())) - attn_max = attn.amax(tail_dims).amax() - attn.sub_(attn_max) - attn = attn.exp_() - sums = attn.sum(dim=-1).unsqueeze(-1) - sums = block2batch(sums, block_mapping) - sums = batch2block(sums, block_mapping) - sums.add_(1.0e-12) - attn.div_(sums) - return attn - - -def flat_pa(query, key_cache, value_cache, block_list, block_mapping, - block_bias, scale, matmul_qk_op, matmul_av_op, keys_fetch_func, - values_fetch_func): - batch_size = query.size(0) - q_heads = query.size(1) - kv_heads = key_cache.size(2) - - query = batch2block(scale * query, block_mapping).unsqueeze(-2) - key = keys_fetch_func(key_cache, block_list).transpose(1, 2) - value = values_fetch_func(value_cache, block_list).transpose(1, 2) - block_bias = block_bias.view(key.size(0), 1, 1, -1) - - if kv_heads != q_heads: - block_bias = block_bias.unsqueeze(1) - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - key = key.transpose(3, 4) - else: - key = key.transpose(2, 3) - - attn = matmul_qk_op(query, key) + block_bias - attn = block_softmax(batch_size, attn, block_mapping) - attn = matmul_av_op(attn, value) - attn = block2batch(attn, block_mapping) - attn = attn.squeeze(-2) - if kv_heads != q_heads: - attn = attn.flatten(1, 2) - return attn - - -def silu_and_mul(x: torch.Tensor) -> torch.Tensor: - d = x.shape[-1] // 2 - return F.silu(x[..., :d]) * x[..., d:] - - -#TODO: remove after fusedsdpa fix for query_head != kv_head -def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor: - """ - This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
- The kv go from (batch, num_key_value_heads, seqlen, head_dim) to - (batch, num_attention_heads, seqlen, head_dim) - """ - batch, num_key_value_heads, slen, head_dim = kv.shape - if n_rep == 1: - return kv - kv = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, - head_dim) - return kv.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) - - -def prompt_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_bias: Optional[torch.Tensor] = None, - p: float = 0.0, - scale: Optional[float] = None, - matmul_qk_op=torch.matmul, - softmax_op=torch.softmax, - matmul_av_op=torch.matmul, - valid_seq_lengths: Optional[torch.Tensor] = None, -) -> torch.Tensor: - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - query_heads = query.size(1) - kv_heads = key.size(1) - if attn_bias is not None or HPUFusedSDPA is None: - if query_heads != kv_heads: - query = query.unflatten(1, (kv_heads, -1)) - key = key.unflatten(1, (kv_heads, 1)) - value = value.unflatten(1, (kv_heads, 1)) - if attn_bias is not None: - attn_bias = attn_bias.unsqueeze(2) - attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2)) - if attn_bias is not None: - attn_weights.add_(attn_bias) - attn_weights = softmax_op(attn_weights, dim=-1) - attn_weights = matmul_av_op(attn_weights, value) - if query_heads != kv_heads: - attn_weights = attn_weights.flatten(1, 2) - else: - #TODO: remove after fusedsdpa fix for query_heads != kv_heads - if query_heads != kv_heads: - key = repeat_kv(key, int(query_heads // kv_heads)) - value = repeat_kv(value, int(query_heads // kv_heads)) - softmax_mode = 'fast' - recompute_mode = True - attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True, - scale, softmax_mode, recompute_mode, - valid_seq_lengths, 'right') - attn_weights = attn_weights.transpose(1, 2) - return attn_weights - - -class LoraMask: - lora_mask = None - - @staticmethod - def setLoraMask(mask): - LoraMask.lora_mask = mask - - @staticmethod - def getLoraMask(): - return LoraMask.lora_mask - - -def dispatch_bgmv_linear( - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - indices: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices - stacked at dimension 0 into single tensors, assuming same rank. `wa` is the - reshaped and transposed version of `wa_t_all` of shape - (h_in, max_loras * lora_rank) and `wb` is the transposed and reshaped - version of `wb_t_all` of shape (max_loras * lora_rank, h_out). - - Matmul input `x` with `wa`. Multiply `x` with a mask to zero-out inputs of - inactive LoRA indices. Matmul masked output with `wb` and scale it to get - the final output. - """ - - assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - mask = LoraMask.getLoraMask() - - wa = wa_t_all[:, 0, :, :] - wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wa = wa.reshape(wa.shape[0] * wa.shape[1], wa.shape[2]).transpose(0, 1) - wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) - - out = x @ wa - assert (out.shape == mask.shape) - out = out * mask - out = out @ wb - y += out * scale - - -def dispatch_bgmv_embedding( - y: torch.Tensor, - x: torch.Tensor, - wb_t_all: torch.Tensor, - indices: torch.LongTensor, - layer_idx: int, - scale: float, -): - """ - `wb_t_all` contains all LoRA-B weight matrices stacked at dimension 0 into - a single tensor, assuming same rank. 
`wb` is the transposed and reshaped - version of `wb_t_all` of shape (num_loras * lora_rank, embedding_dim). - - Output of LoRA-A embedding (tensor x) is repeated max_loras times to match - the shape of `wb`. Multiply `x` with a mask to zero-out inputs of inactive - LoRA indices. Matmul masked output with `wb` and scale it to get the final - output. - """ - - assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}' - max_loras = wb_t_all.size(0) - - x = x.repeat(1, max_loras) - x = x * LoraMask.getLoraMask() - wb = wb_t_all[:, 0, :, :].transpose(1, 2) - wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2]) - out = x @ wb - y += out * scale - - -class MoeMatmul(torch.nn.Module): - - def __init__(self): - super().__init__() - - def set_weight(self, w): - self.weight = w - - def calc(self, state, expert_id, w): - self.weight = w[expert_id].transpose(0, 1) - return self.forward(state) - - def forward(self, state): - return torch.matmul(state, self.weight) - - -class StaticFusedMOE(torch.nn.Module): - - def __init__(self, num_total_experts): - super().__init__() - self.w13_list = torch.nn.ModuleList( - [MoeMatmul() for _ in range(num_total_experts)]) - self.w2_list = torch.nn.ModuleList( - [MoeMatmul() for _ in range(num_total_experts)]) - self.num_total_experts = num_total_experts - - def forward(self, hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, self.num_total_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = padded_weights.reshape(-1, B, self.num_total_experts) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - htorch.core.mark_step() - - for expert_idx in range(self.num_total_experts): - padded_weight = padded_weights[expert_idx] - current_state_static = hidden_states.reshape(-1, D) - w_output = self.w13_list[expert_idx].calc(current_state_static, - expert_idx, w1) - w_output = silu_and_mul(w_output) - w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2) - current_hidden_states_static = w_output * padded_weight - final_hidden_states += current_hidden_states_static - - return final_hidden_states.view(-1, D) diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py deleted file mode 100644 index 1857253f47f1b..0000000000000 --- a/vllm/hpu/rotary_embed.py +++ /dev/null @@ -1,123 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### - -import torch -import torch.nn as nn - -from vllm.logger import init_logger -from vllm.utils import is_hpu - -logger = init_logger(__name__) - -if is_hpu(): - try: - from habana_frameworks.torch.hpex.kernels import ( - RotaryPosEmbeddingHelperV1 as FusedRoPE) - except ImportError: - logger.warning("Could not import HPU FusedRoPE kernel. 
" - "vLLM will use forward_native implementation of RoPE.") - FusedRoPE = None -else: - FusedRoPE = None - - -class HpuRotaryEmbedding(nn.Module): - - def __init__(self, - head_size, - rotary_dim, - max_position_embeddings=2048, - base=10000, - is_neox_style=None, - device='hpu', - RoPEFallback=None): - super().__init__() - - self.head_size = head_size - self.dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - if FusedRoPE is None: - assert RoPEFallback is not None, ( - "HPU FusedRoPE kernel could not be imported, and " - "fallback RoPE implementation was not provided!") - self.fallback_impl = RoPEFallback(head_size, - rotary_dim, - max_position_embeddings, - base, - is_neox_style, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", - emb.cos().to(dtype), - persistent=False) - self.register_buffer("sin_cached", - emb.sin().to(dtype), - persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, - key: torch.Tensor): - if FusedRoPE is None: - return self.fallback_impl(positions, query, key) - if query.dim() == 2: - query = query.unsqueeze(0) - if key.dim() == 2: - key = key.unsqueeze(0) - if positions.dim() == 1: - positions = positions.unsqueeze(0) - seq_len = key.shape[-2] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=query.device, - dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to( - dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape( - (query.shape[0], query.shape[1], query.shape[2] // self.head_size, - self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], - key.shape[2] // self.head_size, self.head_size)) - query_rot = query[..., :self.dim] - key_rot = key[..., :self.dim] - if self.dim < self.head_size: - query_pass = query[..., self.dim:] - key_pass = key[..., self.dim:] - - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query_rot, cos, sin, - 0), FusedRoPE.apply(key_rot, cos, sin, 0) - if self.dim < self.head_size: - query = torch.cat((query, query_pass), dim=-1) - key = torch.cat((key, key_pass), dim=-1) - return query.reshape( - (query.shape[0], query.shape[1], - query.shape[2] * query.shape[3])), key.reshape( - (key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py deleted file mode 100644 index 13204b83d5742..0000000000000 --- a/vllm/hpu/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. -############################################################################### - -from functools import wraps - -import habana_frameworks.torch as htorch -import torch - -from vllm.hpu.cache_ops import insert_or_update_cache - - -def with_mark_steps(fn): - - @wraps(fn) - def wrapped(*args, **kwargs): - htorch.core.mark_step() - result = fn(*args, **kwargs) - del args - del kwargs - htorch.core.mark_step() - return result - - return wrapped - - -class Matmul(torch.nn.Module): - - def __init__(self): - super(Matmul, self).__init__() - - def forward(self, x, y): - return torch.matmul(x, y) - - -class Softmax(torch.nn.Module): - - def __init__(self): - super().__init__() - - def forward(self, x, dim=None, inv_head=None): - return torch.softmax(x, dim) - - -class VLLMKVCache(torch.nn.Module): - - def __init__(self): - super(VLLMKVCache, self).__init__() - - def forward(self, input, cache, num_kv_cache_passes, num_slots_available, - block_indices, block_offset): - insert_or_update_cache(input, cache, num_kv_cache_passes, - num_slots_available, block_indices, - block_offset) - return cache - - def fetch_from_cache(self, cache, blocks): - return cache.index_select(0, blocks) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index aa01e9fb77af2..59b7432b6e6eb 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -30,7 +30,8 @@ from vllm.utils import is_hpu if is_hpu(): - from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear + from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, + dispatch_bgmv_linear) if TYPE_CHECKING: pass diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index cf0d5f98f1b01..bda8a0622ef31 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -202,7 +202,7 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group if is_hpu(): - from vllm.hpu.ops import StaticFusedMOE + from vllm_hpu_extension.ops import StaticFusedMOE self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) if quant_config is None: diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index c12668c14887d..9ef532e61a7c0 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -75,7 +75,7 @@ def forward_hpu( x: torch.Tensor, residual: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - from vllm.hpu.ops import HPUFusedRMSNorm + from vllm_hpu_extension.ops import HPUFusedRMSNorm if HPUFusedRMSNorm is None: return self.forward_native(x, residual) if residual is not None: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 4e3c840bede60..2581e3a74dc72 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -31,7 +31,7 @@ from vllm.utils import is_hpu, is_tpu if is_hpu(): - from vllm.hpu.rotary_embed import HpuRotaryEmbedding + from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding def _rotate_neox(x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 171ae0510d6c6..d1d4e783dfe80 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py 
@@ -19,13 +19,13 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +from vllm_hpu_extension.ops import LoraMask as LoraMask from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group -from vllm.hpu.ops import LoraMask as LoraMask from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest From 18d633972d43444cdf9130edb9f960aa34f7fb8f Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Tue, 17 Sep 2024 21:43:05 +0000 Subject: [PATCH 205/819] fix minor logging issue --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d1d4e783dfe80..d465d883898cd 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -92,7 +92,7 @@ def read_bucket_settings(phase: str, dim: str, **defaults): values = [ int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values) ] - for e, v, d in zip(env_vars, values, defaults): + for e, v, d in zip(env_vars, values, default_values): logger.info('%s=%s (default:%s)', e, v, d) return values From b62fba85ac03326e9f466d8d37e91ae1b14a6511 Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Wed, 18 Sep 2024 12:09:13 +0200 Subject: [PATCH 206/819] Fix blocks number calculation for Flat PA (#269) Fix blocks number calculation for Flat PA via adding empty table_block (https://github.com/HabanaAI/vllm-fork/issues/158) --- vllm/worker/habana_model_runner.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d465d883898cd..73156ad6aea5b 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -173,11 +173,16 @@ def generate_prompt_buckets(bs_bucket_config, def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, max_blocks): buckets = [] - for bs in warmup_range(bs_bucket_config): - for blocks in warmup_range(blocks_bucket_config): + bs_buckets = warmup_range(bs_bucket_config) + block_buckets = warmup_range(blocks_bucket_config) + bmin, bstep, bmax = blocks_bucket_config + last_bucket = max_blocks if (max_blocks // bstep + == 0) else (max_blocks // bstep + 1) * bstep + for bs in bs_buckets: + for blocks in block_buckets: if blocks < bs: continue - if blocks > max_blocks: + if blocks > last_bucket: break buckets.append((bs, blocks)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -964,10 +969,12 @@ def _prepare_decode( seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] - block_number = block_table[position // self.block_size] - if block_number == _PAD_BLOCK_ID: + if len(block_table) == 0: + block_number = _PAD_BLOCK_ID + block_table = [] slot = next(dummy_slots) else: + block_number = block_table[position // self.block_size] block_offset = position % self.block_size slot = block_number * self.block_size + block_offset slot_mapping.append([slot]) @@ -992,7 +999,7 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) - blocks_used = [len(bt) for bt in block_tables] + blocks_used = [len(bt) for bt in block_tables if bt] block_list = list(itertools.chain(*block_tables)) 
block_mapping_nested: List[List[int]] = [ [i] * b_u for i, b_u in enumerate(blocks_used) @@ -1080,8 +1087,9 @@ def prepare_input_tensors( batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend(seq_group_metadata_list[0] - for _ in range(batch_size_padding)) + seq_group_metadata_list.extend( + self.create_dummy_seq_group_metadata(0, 0, is_prompt) + for _ in range(batch_size_padding)) prefill_reqs = [] decode_reqs = [] From cd7b1c15a3e1a07bf38a9f29acaafc437024be4b Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Fri, 20 Sep 2024 09:13:06 +0200 Subject: [PATCH 207/819] Remove dummy seq group data creation from loop (#301) Remove dummy seq metadata from loop for Flat PA fix --- vllm/worker/habana_model_runner.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 73156ad6aea5b..0d5df1f312ec9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1087,9 +1087,11 @@ def prepare_input_tensors( batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() - seq_group_metadata_list.extend( - self.create_dummy_seq_group_metadata(0, 0, is_prompt) - for _ in range(batch_size_padding)) + if batch_size_padding > 0: + dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( + 0, 0, is_prompt) + seq_group_metadata_list.extend(dummy_seq_group_metadata + for _ in range(batch_size_padding)) prefill_reqs = [] decode_reqs = [] From 12d7033e768677e78b62b051f2dbe2ab8b994c77 Mon Sep 17 00:00:00 2001 From: Bob Zhu <41610754+czhu15@users.noreply.github.com> Date: Fri, 20 Sep 2024 15:31:57 +0800 Subject: [PATCH 208/819] optimize qwen2 model on Gaudi (#233) Add extra mark_step() on each decode layer to optimize the performance on Gaudi. 
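
A minimal sketch of the pattern this patch applies (toy Linear layers stand in for the
real Qwen2 decoder layers; it assumes a Gaudi environment with habana_frameworks
installed and lazy-mode execution):

    import torch
    import torch.nn as nn
    import habana_frameworks.torch as htorch  # assumed: Gaudi/HPU runtime available

    layers = nn.ModuleList([nn.Linear(64, 64) for _ in range(4)]).to('hpu')
    hidden_states = torch.randn(1, 64).to('hpu')

    htorch.core.mark_step()          # flush the lazily accumulated graph before the layer loop
    for layer in layers:
        hidden_states = layer(hidden_states)
        htorch.core.mark_step()      # trigger execution after each decode layer

In lazy mode, mark_step() triggers execution of the ops accumulated so far, so the
per-layer call breaks the work into smaller graphs, which is the optimization this
change relies on for Gaudi.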
Signed-off-by: Bob Zhu --- vllm/model_executor/models/qwen2.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 3deb3d8840cc4..1e4f62fcce7d6 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -47,6 +47,7 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, SamplerOutput +from vllm.utils import is_hpu from .interfaces import SupportsLoRA @@ -260,6 +261,9 @@ def forward( else: hidden_states = self.embed_tokens(input_ids) residual = None + if is_hpu(): + import habana_frameworks.torch as htorch + htorch.core.mark_step() for i in range(len(self.layers)): layer = self.layers[i] hidden_states, residual = layer( @@ -269,6 +273,9 @@ def forward( attn_metadata, residual, ) + if is_hpu(): + htorch.core.mark_step() + hidden_states, _ = self.norm(hidden_states, residual) return hidden_states From bc39baa482dcfefeae6289e80cea63b4adc9beeb Mon Sep 17 00:00:00 2001 From: hlin99 <73271530+hlin99@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:19:05 +0800 Subject: [PATCH 209/819] fix bug: device_str in initialize_ray_cluster requires uppercase string (#297) fix bug: device_str in initialize_ray_cluster requires uppercase string w/o the bug fix, multi HPUs will encounter "ValueError: The number of required hpus exceeds the total number of available hpus in the placement group" error, as the device_str is not expected as uppercase, then available hpus always returns 0. --- vllm/executor/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 8f5bc30a9599c..ea81c313f2da9 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -98,7 +98,7 @@ def initialize_ray_cluster( if is_tpu(): device_str = "TPU" elif is_hpu(): - device_str = hpu_device_string() + device_str = hpu_device_string().upper() # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: From b2653ab884da92a67da8c66b612a4dd33ac9efb2 Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Fri, 20 Sep 2024 14:38:36 +0530 Subject: [PATCH 210/819] Fix Lora Rebase (#290) Fixes Lora Related issues in vllm Rebase --- tests/lora/test_lora_hpu.py | 108 +++++++++++++++++++--------- vllm/hpu/ops.py | 2 - vllm/hpu/punica_hpu.py | 77 ++++++++++++++++++++ vllm/lora/layers.py | 109 ++--------------------------- vllm/lora/models.py | 25 ++----- vllm/lora/punica.py | 9 +-- vllm/utils.py | 5 ++ vllm/worker/habana_model_runner.py | 10 +-- 8 files changed, 181 insertions(+), 164 deletions(-) create mode 100644 vllm/hpu/punica_hpu.py diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index ddbab66e166b3..57bc19b2170db 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,7 +1,8 @@ import pytest import torch +from vllm.hpu.ops import LoraMask -from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice +from vllm.hpu.punica_hpu import GaudiPunicaWrapper from .utils import DummyLoRAManager @@ -19,7 +20,19 @@ torch.float16: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -MAX_LORAS = 8 + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * 
max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask @pytest.mark.parametrize("m", TENSOR_SIZES) @@ -39,32 +52,41 @@ def test_apply_lora(m, n, k, rank, dtype) -> None: input = torch.rand(k, n, device="hpu", dtype=dtype) expected = input @ lora.lora_a @ lora.lora_b * lora.scaling - lora_a_stack = torch.zeros(MAX_LORAS + 1, + lora_a_stack = torch.zeros(8, 1, lora.lora_a.shape[1], lora.lora_a.shape[0], device="hpu", dtype=dtype) - lora_b_stack = torch.zeros(MAX_LORAS + 1, + lora_b_stack = torch.zeros(8, 1, lora.lora_b.shape[1], lora.lora_b.shape[0], device="hpu", dtype=dtype) - for i in range(MAX_LORAS): + for i in range(lora_a_stack.shape[0]): lora_a_stack[i][0] = lora.lora_a.T lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), - output) + indices = torch.randint(0, + lora_a_stack.shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") + + punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0) + rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora(input, lora_a_stack, lora_b_stack, - torch.full((len(input), ), -1, device="hpu"), output) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -99,7 +121,7 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_a.shape[1], lora_1.lora_a.shape[0], @@ -107,32 +129,40 @@ def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_1.lora_b.shape[1], lora_1.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_1.lora_a.T lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T lora_a_stacks[1][i][0] = lora_2.lora_a.T lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T output = torch.zeros(k, m, device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (m // 2, m // 2)) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, (m // 2, m // 2)) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), - output, 
(m // 2, m // 2)) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, (m // 2, m // 2)) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() @@ -166,14 +196,14 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dim=1) lora_a_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_a.shape[1], lora_q.lora_a.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_a.shape[1], lora_k.lora_a.shape[0], @@ -181,21 +211,21 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: dtype=dtype) for i in range(2) ] lora_b_stacks = [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_q.lora_b.shape[1], lora_q.lora_b.shape[0], device="hpu", dtype=dtype) ] + [ - torch.zeros(MAX_LORAS + 1, + torch.zeros(8, 1, lora_k.lora_b.shape[1], lora_k.lora_b.shape[0], device="hpu", dtype=dtype) for i in range(2) ] - for i in range(MAX_LORAS): + for i in range(lora_a_stacks[0].shape[0]): lora_a_stacks[0][i][0] = lora_q.lora_a.T lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T lora_a_stacks[1][i][0] = lora_k.lora_a.T @@ -204,18 +234,30 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype) - _apply_lora_packed_nslice( - input, lora_a_stacks, lora_b_stacks, - torch.randint(0, MAX_LORAS, (len(input), ), device="hpu"), output, - (qkv[0], qkv[1], qkv[2])) + indices = torch.randint(0, + lora_a_stacks[0].shape[0], (len(input), ), + device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") + punica_wrapper.add_lora_packed_nslice(output, input, + lora_a_stacks, + lora_b_stacks, + 1.0, (qkv[0], qkv[1], qkv[2])) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) output[:] = 0 - _apply_lora_packed_nslice(input, lora_a_stacks, lora_b_stacks, - torch.full((len(input), ), -1, device="hpu"), - output, (qkv[0], qkv[1], qkv[2])) + indices = torch.full((len(input), ), -1, device="hpu") + mask = createLoraMask(indices, k, 1, 8, rank, dtype) + LoraMask.setLoraMask(mask) + + punica_wrapper.add_lora_packed_nslice(output, input, + lora_a_stacks, + lora_b_stacks, + 1.0, (qkv[0], qkv[1], qkv[2])) assert torch.allclose(torch.zeros_like(output), output) - manager.reset_lora() + manager.reset_lora() \ No newline at end of file diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index b2705429906c4..aaf863aff0cad 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -193,7 +193,6 @@ def dispatch_bgmv_linear( x: torch.Tensor, wa_t_all: torch.Tensor, wb_t_all: torch.Tensor, - indices: torch.LongTensor, layer_idx: int, scale: float, ): @@ -228,7 +227,6 @@ def dispatch_bgmv_embedding( y: torch.Tensor, x: torch.Tensor, wb_t_all: torch.Tensor, - indices: torch.LongTensor, layer_idx: int, scale: float, ): diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py new file mode 100644 index 0000000000000..aed015ac4ae06 --- /dev/null +++ b/vllm/hpu/punica_hpu.py @@ -0,0 +1,77 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +# +# This source code is licensed under the BSD license found in the +# LICENSE file in the root directory of this source tree. +############################################################################### + +from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union + +import torch +from vllm.lora.punica import PunicaWrapper +from vllm.hpu.ops import dispatch_bgmv_linear, dispatch_bgmv_embedding + +class GaudiPunicaWrapper(PunicaWrapper): + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: str): + super().__init__(max_num_batched_tokens, max_batches, device) + + def add_lora(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale: float, + y_offset: Optional[int] = None, + y_slice_size: Optional[int] = None, + *, + buffer: Optional[torch.Tensor] = None) -> None: + y_org = y + x = x.view(-1, x.shape[-1]) + y = y.view(-1, y.shape[-1]) + dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) + y = y.view_as(y_org) + + def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, + torch.Tensor, + torch.Tensor], + scale: float, + output_slices: Tuple[int, ...]) -> None: + y_org = y + x = x.view(-1, x.shape[-1]) + y = y.view(-1, y.shape[-1]) + offset_left = 0 + + for slice_idx in range(len(output_slices)): + dispatch_bgmv_linear( + y[:, offset_left:offset_left + output_slices[slice_idx]], + x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) + offset_left += output_slices[slice_idx] + y = y.view_as(y_org) + + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + wa_t_all: torch.Tensor, + wb_t_all: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None) -> None: + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) + y = y.view_as(y_org) + + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool = True, + ): + dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) \ No newline at end of file diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 9e4a0098dc44e..e6be20edc8ce6 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -29,8 +29,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear if TYPE_CHECKING: pass @@ -67,87 +65,6 @@ def dec(*args, **kwargs): return dec -def _apply_lora( - x: torch.Tensor, - lora_a_stacked: torch.Tensor, - lora_b_stacked: torch.Tensor, - indices: torch.Tensor, - output: torch.Tensor, -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. 
- - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: (num_loras, lora_rank, hidden_dim) - lora_b_stacked: (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - if current_platform.is_hpu(): - dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked, - indices, 0, 1.0) - else: - add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0) - return output.view_as(org_output) - - -def _apply_lora_packed_nslice( - x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], -): - """Applies lora to each input. - - This method applies all loras to each input. It uses the - indices vector to determine which lora yields the - correct output. An index of -1 means no lora should be - applied. This method adds the final lora results to the - output. - - This method is used for layers that are composed of multiple sublayers - (slices) packed together. - - Input shapes: - x: (batch_size, hidden_dim) - lora_a_stacked: 3 element tuple of (num_loras, lora_rank, hidden_dim) - lora_b_stacked: 3 element tuple of (num_loras, output_dim, lora_rank) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - x = x.view(-1, x.shape[-1]) - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - offset_left = 0 - for slice_idx in range(len(output_slices)): - if is_hpu(): - dispatch_bgmv_linear( - output[:, offset_left:offset_left + output_slices[slice_idx]], - x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], - indices, 0, 1.0) - else: - add_lora_slice(output, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], indices, 0, 1.0, - offset_left, output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - return output.view_as(org_output) - - @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -309,22 +226,7 @@ def set_lora( def forward(self, x: torch.Tensor) -> torch.Tensor: added_tokens_mask = x > self.base_layer.org_vocab_size - 1 embeddings_indices = None - if current_platform.is_hpu(): - embedding_len = self.indices_len[3] - # NOTE(vgoel): These asserts can be skipped when upstreaming. - # Can be removed from vllm-fork also once lora functionality - # on Gaudi stabilizes. 
- if current_platform.is_hpu(): - emb_len = embedding_len - x_shape = x.shape - ind_shape = self.embeddings_indices[1].shape - assert embedding_len == x.shape[0] * x.shape[1], \ - f"Extra Info: {emb_len}, {x_shape}, {ind_shape}" - assert embedding_len <= self.embeddings_indices[1].shape[0], \ - f"Extra Info: {emb_len}, {x.shape}, {ind_shape}" - indices = self.embeddings_indices[1][:embedding_len].view_as(x) - else: - embeddings_indices = self.punica_wrapper.embeddings_indices + embeddings_indices = self.punica_wrapper.embeddings_indices indices = embeddings_indices[1].view_as(x) full_lora_a_embeddings = F.embedding( x + indices, @@ -342,12 +244,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings = full_lora_a_embeddings.view( full_lora_a_embeddings.shape[0] * full_lora_a_embeddings.shape[1], -1) + # Embedding layer only need expand op if current_platform.is_hpu(): - dispatch_bgmv_embedding(full_output, full_lora_a_embeddings, - self.lora_b_stacked, - self.indices[:self.indices_len[0]], 0, 1.0) + self.punica_wrapper.add_lora_embedding(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) else: - # Embedding layer only need expand op self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, self.lora_b_stacked, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index e3abf0fc96196..c29660eb3bda5 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,9 +26,12 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.model_executor.models.utils import PPMissingLayer -from vllm.utils import is_pin_memory_available +from vllm.utils import is_pin_memory_available, get_device from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm.hpu.punica_hpu import GaudiPunicaWrapper + logger = init_logger(__name__) _GLOBAL_LORA_ID = 0 @@ -428,23 +431,9 @@ def __init__( self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None if current_platform.is_hpu(): - self.base_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.sampler_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.sampler_indices_padded = torch.empty( - self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.embeddings_indices = torch.empty(2, - self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) - self.long_lora_indices = torch.empty(self.max_num_batched_tokens, - dtype=torch.long, - device=get_device()) + self.punica_wrapper = GaudiPunicaWrapper(max_num_batched_tokens, + max_batches=self.max_num_seqs, + device="hpu") else: self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 6d5c834299961..d9c074b6144a1 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -10,6 +10,7 @@ import torch from vllm.triton_utils import HAS_TRITON +from vllm.utils import get_device if HAS_TRITON: from vllm.lora.ops.bgmv_expand import bgmv_expand @@ -104,7 +105,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=get_device(), dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -131,9 +132,9 @@ def convert_mapping( if long_lora_context: assert 
long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=get_device()) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", + device=get_device(), dtype=torch.long) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, @@ -145,7 +146,7 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + ( sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None diff --git a/vllm/utils.py b/vllm/utils.py index 6409a2de7b142..ed565d3244541 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -741,6 +741,11 @@ def is_hpu() -> bool: from importlib import util return util.find_spec('habana_frameworks') is not None +def get_device() -> str: + if is_hpu(): + return "hpu" + return "cuda" + class HabanaMemoryProfiler: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index aefe3508fecb5..5336ad3ed4da9 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -242,11 +242,12 @@ def pad_list(list, k, v): class HpuModelAdapter(): - def __init__(self, model, block_size, enforce_eager): + def __init__(self, model, block_size, dtype, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', '0').lower() in ['1', 'true'] self.block_size = block_size + self.dtype = dtype if not htorch.utils.internal.is_lazy() and not enforce_eager: self.model = torch.compile(self.model, backend='hpu_backend', @@ -308,7 +309,7 @@ def forward(self, *args, **kwargs): input_ids = kwargs['input_ids'] kwargs['attn_metadata'] = self._update_metadata( kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), - input_ids.device, torch.bfloat16) + input_ids.device, self.dtype) LoraMask.setLoraMask(kwargs.pop('lora_mask')) hidden_states = self.model(*args, **kwargs) hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) @@ -607,6 +608,7 @@ def load_model(self) -> None: self.model = _maybe_wrap_in_hpu_graph( self.model, self.block_size, + dtype=self.model_config.dtype, enforce_eager=self.enforce_eager) msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" logger.info(msg) @@ -1844,8 +1846,8 @@ def execute_model( modules = unwrap_model(self.model.model) for module in modules: if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, len(module.indices_len)): - module.indices_len[ + for i in range(0, len(module.punica_wrapper.indices_len)): + module.punica_wrapper.indices_len[ i] = sampling_metadata.selected_token_indices.numel( ) lora_logits_mask: torch.Tensor = model_input.lora_logits_mask From 9f8b8e72e9fcb6b7a8cce40e147bbfef57d05883 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 13:26:20 +0300 Subject: [PATCH 211/819] add missing files --- vllm/engine/arg_utils.py | 1 + vllm/platforms/interface.py | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 837332da7efd7..f8b544c6bde4d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -35,6 +35,7 @@ "openvino", "tpu", "xpu", + 
"hpu", ] diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 31bf5268c1f19..fea4358953745 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -42,13 +42,10 @@ def is_rocm(self) -> bool: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU -<<<<<<< HEAD def is_hpu(self) -> bool: return self._enum == PlatformEnum.HPU @staticmethod - def get_device_capability(device_id: int = 0) -> Tuple[int, int]: -======= def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU @@ -89,7 +86,6 @@ def has_device_capability( @classmethod def get_device_name(cls, device_id: int = 0) -> str: ->>>>>>> upstream/main raise NotImplementedError @classmethod From 346139dd6ca1db44e6a7b8f649306fbf8800a5a7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:05:35 +0300 Subject: [PATCH 212/819] format.sh --- format.sh | 29 +++++++++------------- tests/samplers/test_sampler.py | 1 + vllm/hpu/punica_hpu.py | 8 +++--- vllm/lora/layers.py | 2 +- vllm/lora/models.py | 7 +++--- vllm/lora/punica.py | 5 ++-- vllm/model_executor/model_loader/loader.py | 1 + vllm/platforms/hpu.py | 7 +++--- vllm/platforms/interface.py | 1 - vllm/utils.py | 6 +---- vllm/worker/habana_model_runner.py | 2 +- vllm/worker/model_runner_base.py | 2 +- 12 files changed, 34 insertions(+), 37 deletions(-) diff --git a/format.sh b/format.sh index adaed1a51c343..6563d89b192ea 100755 --- a/format.sh +++ b/format.sh @@ -96,23 +96,18 @@ echo 'vLLM yapf: Done' # Run mypy echo 'vLLM mypy:' -mypy tests --config-file pyproject.toml -mypy vllm/*.py --config-file pyproject.toml -mypy vllm/attention --config-file pyproject.toml -mypy vllm/core --config-file pyproject.toml -mypy vllm/distributed --config-file pyproject.toml -mypy vllm/engine --config-file pyproject.toml -mypy vllm/entrypoints --config-file pyproject.toml -mypy vllm/executor --config-file pyproject.toml -mypy vllm/logging --config-file pyproject.toml -mypy vllm/lora --config-file pyproject.toml -mypy vllm/model_executor --config-file pyproject.toml -mypy vllm/multimodal --config-file pyproject.toml -mypy vllm/prompt_adapter --config-file pyproject.toml -mypy vllm/spec_decode --config-file pyproject.toml -mypy vllm/transformers_utils --config-file pyproject.toml -mypy vllm/usage --config-file pyproject.toml -mypy vllm/worker --config-file pyproject.toml +mypy --follow-imports skip # Note that this is less strict than CI +mypy tests --follow-imports skip +mypy vllm/attention --follow-imports skip +mypy vllm/distributed --follow-imports skip +mypy vllm/engine --follow-imports skip +mypy vllm/executor --follow-imports skip +mypy vllm/lora --follow-imports skip +mypy vllm/model_executor --follow-imports skip +mypy vllm/prompt_adapter --follow-imports skip +mypy vllm/spec_decode --follow-imports skip +mypy vllm/worker --follow-imports skip +echo 'vLLM mypy: Done' # If git diff returns a file that is in the skip list, the file may be checked anyway: diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 3cb46dbc213d9..65bd7b09acdc3 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -782,6 +782,7 @@ def test_sampler_include_gpu_probs_tensor(device: str): assert sampler_output.logprobs is not None assert sampler_output.sampled_token_ids is not None + @pytest.mark.parametrize("device", CUDA_DEVICES) def test_topk_topk_scalar(): obj1 = ApplyToppTopkScalar(2) diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py index 3c37558831bb5..8f732a98c3e29 100644 --- 
a/vllm/hpu/punica_hpu.py +++ b/vllm/hpu/punica_hpu.py @@ -11,7 +11,9 @@ from vllm.lora.punica import PunicaWrapper from vllm_hpu_extension.ops import dispatch_bgmv_linear, dispatch_bgmv_embedding + class GaudiPunicaWrapper(PunicaWrapper): + def __init__(self, max_num_batched_tokens: int, max_batches: int, device: str): super().__init__(max_num_batched_tokens, max_batches, device) @@ -48,8 +50,8 @@ def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, for slice_idx in range(len(output_slices)): dispatch_bgmv_linear( - y[:, offset_left:offset_left + output_slices[slice_idx]], - x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) + y[:, offset_left:offset_left + output_slices[slice_idx]], x, + lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) offset_left += output_slices[slice_idx] y = y.view_as(y_org) @@ -74,4 +76,4 @@ def add_lora_embedding( w_t_all: torch.Tensor, add_input: bool = True, ): - dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) \ No newline at end of file + dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b6e7e6783a328..461f4d435d67d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -253,7 +253,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: self.punica_wrapper.add_lora_embedding(full_output, full_lora_a_embeddings, self.lora_b_stacked, - add_input=True) + add_input=True) else: self.punica_wrapper.add_expand(full_output, full_lora_a_embeddings, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index c29660eb3bda5..6d6fd05c55e93 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -431,9 +431,10 @@ def __init__( self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None if current_platform.is_hpu(): - self.punica_wrapper = GaudiPunicaWrapper(max_num_batched_tokens, - max_batches=self.max_num_seqs, - device="hpu") + self.punica_wrapper = GaudiPunicaWrapper( + max_num_batched_tokens, + max_batches=self.max_num_seqs, + device="hpu") else: self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index d9c074b6144a1..5a2f02ee91456 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -146,8 +146,9 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device=get_device(), dtype=torch.long) + ( - sampler_indices_padded * len(sampler_indices_padded)) + 0, len(sampler_indices_padded), device=get_device(), + dtype=torch.long) + (sampler_indices_padded * + len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None if long_lora_context: diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 2d51ed6e50bf2..d8f0f68f1c02e 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -87,6 +87,7 @@ def device_loading_context(module: torch.nn.Module, p.data = p.data.to(original_device) # New parameters or parameters already on target device are untouched + from vllm.utils import is_fake_hpu logger = init_logger(__name__) diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 45f2b95e704d6..ceb3934f29342 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,15 +1,16 @@ -from typing import Tuple +from typing import Optional, 
Tuple import torch -from .interface import Platform, PlatformEnum +from .interface import DeviceCapability, Platform, PlatformEnum class HpuPlatform(Platform): _enum = PlatformEnum.HPU @staticmethod - def get_device_capability(device_id: int = 0) -> Tuple[int, int]: + def get_device_capability( + device_id: int = 0) -> Optional[DeviceCapability]: raise RuntimeError("HPU does not have device capability.") @staticmethod diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index fea4358953745..3b00a9fd98da5 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -45,7 +45,6 @@ def is_tpu(self) -> bool: def is_hpu(self) -> bool: return self._enum == PlatformEnum.HPU - @staticmethod def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU diff --git a/vllm/utils.py b/vllm/utils.py index 80bbed7cac1a5..377c933b466a6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -796,11 +796,6 @@ def format_bytes(size): return f'{size:.4g} {power_labels[n]+"B"}' -@lru_cache(maxsize=None) -def is_hpu() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - def get_device() -> str: if is_hpu(): return "hpu" @@ -1425,6 +1420,7 @@ def dec(self, num=1): def value(self): return self._value + def migrate_to_cpu(): import importlib from unittest.mock import MagicMock diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e8bf5dfb34628..d92b34c92ea29 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1096,7 +1096,7 @@ def prepare_input_tensors( if batch_size_padding > 0: dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( 0, 0, is_prompt) - seq_group_metadata_list.extend(dummy_seq_group_metadata + seq_group_metadata_list.extend(seq_group_metadata_list[0] for _ in range(batch_size_padding)) prefill_reqs = [] diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 9013e9d251cb6..89613c91ac543 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -4,7 +4,7 @@ from datetime import datetime from functools import wraps from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, - Optional, Type, TypeVar, Union, get_args, get_origin) + Optional, Type, TypeVar, Union, get_args, get_origin) import torch from torch import is_tensor From 6d4544343c0f6121750931ca1da79954c33f6524 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:19:29 +0300 Subject: [PATCH 213/819] more format.sh --- vllm/engine/arg_utils.py | 6 +- vllm/executor/habana_executor.py | 2 +- vllm/executor/ray_habana_executor.py | 4 +- vllm/executor/ray_utils.py | 2 +- vllm/hpu/punica_hpu.py | 6 +- vllm/lora/layers.py | 6 +- vllm/lora/models.py | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 58 ------------------- vllm/model_executor/layers/sampler.py | 2 +- vllm/model_executor/model_loader/loader.py | 4 +- vllm/model_executor/models/gpt_bigcode.py | 2 +- vllm/model_executor/models/mixtral.py | 2 +- vllm/model_executor/models/qwen2.py | 3 +- vllm/model_executor/sampling_metadata.py | 1 - vllm/platforms/hpu.py | 2 +- vllm/worker/habana_model_runner.py | 12 ++-- vllm/worker/habana_worker.py | 4 +- 17 files changed, 28 insertions(+), 90 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index f8b544c6bde4d..2b1667023a1fa 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -16,9 +16,9 @@ from vllm.executor.executor_base import ExecutorBase from 
vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -1023,8 +1023,8 @@ def create_engine_config(self) -> EngineConfig: self.model_loader_extra_config[ "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path - load_device = device_config.device if self.weights_load_device is None else \ - self.weights_load_device + load_device = device_config.device if self.weights_load_device is \ + None else self.weights_load_device load_config = self.create_load_config(load_device) prompt_adapter_config = PromptAdapterConfig( diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 6b362e6f1e326..6e92da0245836 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -9,9 +9,9 @@ from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, get_ip, get_open_port, make_async) from vllm.worker.worker_base import WorkerWrapperBase diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index f0822283296dd..66b6e76e92004 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -9,11 +9,11 @@ DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.logger import init_logger -from vllm.sequence import ExecuteModelRequest from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, get_ip, get_open_port, get_vllm_instance_id, - make_async, is_fake_hpu) + is_fake_hpu, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 34b002514c27a..8971f5aac626e 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, is_hip, is_xpu, hpu_device_string +from vllm.utils import get_ip, hpu_device_string, is_hip, is_xpu from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py index 8f732a98c3e29..9b7261564e629 100644 --- a/vllm/hpu/punica_hpu.py +++ b/vllm/hpu/punica_hpu.py @@ -5,11 +5,13 @@ # LICENSE file in the root directory of this source tree. 
############################################################################### -from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union +from typing import Optional, Tuple import torch +from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, + dispatch_bgmv_linear) + from vllm.lora.punica import PunicaWrapper -from vllm_hpu_extension.ops import dispatch_bgmv_linear, dispatch_bgmv_embedding class GaudiPunicaWrapper(PunicaWrapper): diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 461f4d435d67d..cc55d4afc7d6f 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -17,6 +17,7 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide +from vllm.hpu.punica_hpu import GaudiPunicaWrapper from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -30,10 +31,6 @@ VocabParallelEmbedding) from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, - dispatch_bgmv_linear) - if TYPE_CHECKING: pass @@ -250,6 +247,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1) # Embedding layer only need expand op if current_platform.is_hpu(): + assert isinstance(self.punica_wrapper, GaudiPunicaWrapper) self.punica_wrapper.add_lora_embedding(full_output, full_lora_a_embeddings, self.lora_b_stacked, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 6d6fd05c55e93..4ec7a6815a755 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -26,8 +26,8 @@ parse_fine_tuned_lora_name, replace_submodule) from vllm.model_executor.models.interfaces import SupportsLoRA from vllm.model_executor.models.utils import PPMissingLayer -from vllm.utils import is_pin_memory_available, get_device from vllm.platforms import current_platform +from vllm.utils import is_pin_memory_available if current_platform.is_hpu(): from vllm.hpu.punica_hpu import GaudiPunicaWrapper diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9e4e7233c1eba..179a8609a17f4 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -579,61 +579,3 @@ def _load_fp8_scale(self, param: torch.nn.Parameter, else: raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") - - def forward(self, hidden_states: torch.Tensor, - router_logits: torch.Tensor): - assert self.quant_method is not None - - # Matrix multiply. 
- final_hidden_states = self.quant_method.apply( - self, - x=hidden_states, - router_logits=router_logits, - top_k=self.top_k, - renormalize=self.renormalize, - use_grouped_topk=self.use_grouped_topk, - num_expert_group=self.num_expert_group, - topk_group=self.topk_group) - - if self.reduce_results and self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) - - return final_hidden_states - - @classmethod - def make_expert_params_mapping( - cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str, - ckpt_up_proj_name: str, - num_experts: int) -> List[Tuple[str, str, int, int]]: - - gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name] - gate_down_up = [ - ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name - ] - - return [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ("experts.w13_scale" - if weight_name in gate_up else "experts.w2_scale", - f"experts.{expert_id}.{weight_name}.weight_scale", expert_id, - shard_id) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weights for the experts - # (param_name, weight_name, expert_id, shard_id) - ("experts.w13_weight" - if weight_name in gate_up else "experts.w2_weight", - f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id) - for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] + [ - # These are the weight scales for the experts - # (param_name, weight_name, expert_id, shard_id) - ("experts.a13_scale" - if weight_name in gate_up else "experts.a2_scale", - f"experts.{expert_id}.{weight_name}.input_scale", expert_id, - shard_id) for expert_id in range(num_experts) - for shard_id, weight_name in enumerate(gate_down_up) - ] diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 9dda63f9768c6..6da6199a01962 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -1,9 +1,9 @@ """A layer that samples the next tokens from the model's outputs.""" import itertools +import math import warnings from dataclasses import dataclass from importlib.util import find_spec -import math from math import inf from typing import Dict, List, Optional, Tuple, Union diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d8f0f68f1c02e..b03e6aca48c0e 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -44,7 +44,7 @@ supports_multimodal) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_pin_memory_available +from vllm.utils import is_fake_hpu, is_pin_memory_available @contextmanager @@ -88,8 +88,6 @@ def device_loading_context(module: torch.nn.Module, # New parameters or parameters already on target device are untouched -from vllm.utils import is_fake_hpu - logger = init_logger(__name__) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index f01cace91d2ab..a8567f32958be 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -39,8 +39,8 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors from vllm.platforms import 
current_platform +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index a5ef454ee80e0..7a075162d579f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -45,8 +45,8 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors from vllm.platforms import current_platform +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 649caba5d9424..9801b218ddb83 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,8 +46,9 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors from vllm.platform import current_platform +from vllm.sequence import IntermediateTensors + from .interfaces import SupportsLoRA from .utils import is_pp_missing_parameter, make_layers diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 829e3fd6d8eb5..97d36d31f2b11 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -9,7 +9,6 @@ SequenceGroupMetadata) from vllm.utils import (PyObjectCache, async_tensor_h2d, is_pin_memory_available, make_tensor_with_pad) -from vllm.platforms import current_platform _SAMPLING_EPS = 1e-5 diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index ceb3934f29342..feddce69ac5b4 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,4 +1,4 @@ -from typing import Optional, Tuple +from typing import Optional import torch diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d92b34c92ea29..04674e505b01f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -2,7 +2,6 @@ # Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company ############################################################################### -from array import array import collections import contextlib import dataclasses @@ -13,6 +12,7 @@ import operator import os import time +from array import array from enum import IntEnum from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Type, TypeVar, Union) @@ -24,23 +24,21 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.distributed.parallel_state import get_world_group -from vllm.inputs.registry import InputRegistry from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model -from vllm.multimodal.registry import MultiModalRegistry from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.model_executor.layers.sampler import SamplerOutput from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( @@ -1096,7 +1094,7 @@ def prepare_input_tensors( if batch_size_padding > 0: dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( 0, 0, is_prompt) - seq_group_metadata_list.extend(seq_group_metadata_list[0] + seq_group_metadata_list.extend(dummy_seq_group_metadata for _ in range(batch_size_padding)) prefill_reqs = [] diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 89c796068bac4..f2678c5e405dc 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,8 +11,8 @@ import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) From 3a0ff3b8dfeebc0b891a3e87ffd9fdf72268782e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:23:03 +0300 Subject: [PATCH 214/819] gha update --- .github/workflows/mypy.yaml | 30 +++++++++++------------------- .github/workflows/yapf.yml | 1 + 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 42c141237fb15..6ebe512c5dbf6 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -32,23 +32,15 @@ jobs: pip install types-setuptools - name: Mypy run: | - mypy tests --config-file pyproject.toml - mypy vllm/*.py --config-file pyproject.toml - mypy vllm/attention --config-file pyproject.toml - mypy vllm/core --config-file pyproject.toml - mypy vllm/distributed --config-file pyproject.toml - mypy vllm/engine --config-file pyproject.toml - mypy 
vllm/entrypoints --config-file pyproject.toml - mypy vllm/executor --config-file pyproject.toml - mypy vllm/inputs --config-file pyproject.toml - mypy vllm/logging --config-file pyproject.toml - mypy vllm/lora --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml - mypy vllm/multimodal --config-file pyproject.toml - mypy vllm/platforms --config-file pyproject.toml - mypy vllm/spec_decode --config-file pyproject.toml - mypy vllm/transformers_utils --config-file pyproject.toml - mypy vllm/usage --config-file pyproject.toml - mypy vllm/worker --config-file pyproject.toml - + mypy + mypy tests --follow-imports skip + mypy vllm/attention --follow-imports skip + mypy vllm/distributed --follow-imports skip + mypy vllm/engine --follow-imports skip + mypy vllm/executor --follow-imports skip + mypy vllm/lora --follow-imports skip + mypy vllm/model_executor --follow-imports skip + mypy vllm/prompt_adapter --follow-imports skip + mypy vllm/spec_decode --follow-imports skip + mypy vllm/worker --follow-imports skip diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 448c52a3b49dc..b1002578610d4 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -9,6 +9,7 @@ on: pull_request: branches: - habana_main + jobs: yapf: runs-on: ubuntu-latest From 6502b9150d695eda96deec497b54a6751965ac7a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:47:35 +0300 Subject: [PATCH 215/819] Separate LoRA algorithms --- requirements-hpu.txt | 2 +- vllm/hpu/punica_hpu.py | 81 ------------------------------------------ vllm/lora/layers.py | 4 ++- vllm/lora/models.py | 2 +- 4 files changed, 5 insertions(+), 84 deletions(-) delete mode 100644 vllm/hpu/punica_hpu.py diff --git a/requirements-hpu.txt b/requirements-hpu.txt index d451200aa1144..56caa4ba03862 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,4 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b diff --git a/vllm/hpu/punica_hpu.py b/vllm/hpu/punica_hpu.py deleted file mode 100644 index 9b7261564e629..0000000000000 --- a/vllm/hpu/punica_hpu.py +++ /dev/null @@ -1,81 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -# -# This source code is licensed under the BSD license found in the -# LICENSE file in the root directory of this source tree. 
-############################################################################### - -from typing import Optional, Tuple - -import torch -from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, - dispatch_bgmv_linear) - -from vllm.lora.punica import PunicaWrapper - - -class GaudiPunicaWrapper(PunicaWrapper): - - def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: str): - super().__init__(max_num_batched_tokens, max_batches, device) - - def add_lora(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - scale: float, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None) -> None: - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) - y = y.view_as(y_org) - - def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - scale: float, - output_slices: Tuple[int, ...]) -> None: - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - - for slice_idx in range(len(output_slices)): - dispatch_bgmv_linear( - y[:, offset_left:offset_left + output_slices[slice_idx]], x, - lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0) - offset_left += output_slices[slice_idx] - y = y.view_as(y_org) - - def add_lora_logits(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - scale, - *, - buffer: Optional[torch.Tensor] = None) -> None: - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0) - y = y.view_as(y_org) - - def add_lora_embedding( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - add_input: bool = True, - ): - dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index cc55d4afc7d6f..b3758ad883d56 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -17,7 +17,6 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.hpu.punica_hpu import GaudiPunicaWrapper from vllm.lora.punica import PunicaWrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, @@ -31,6 +30,9 @@ VocabParallelEmbedding) from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper + if TYPE_CHECKING: pass diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 4ec7a6815a755..546a4c402aedc 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -30,7 +30,7 @@ from vllm.utils import is_pin_memory_available if current_platform.is_hpu(): - from vllm.hpu.punica_hpu import GaudiPunicaWrapper + from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper logger = init_logger(__name__) From 7057da5f76e465e4735490787b543cf3d2b5ad3b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 16:54:18 +0300 Subject: [PATCH 216/819] yapf is being a headache --- tests/lora/test_lora_hpu.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 1e0e728ae7240..06b687282391b 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -254,10 +254,9 @@ def 
test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: mask = createLoraMask(indices, k, 1, 8, rank, dtype) LoraMask.setLoraMask(mask) - punica_wrapper.add_lora_packed_nslice(output, input, - lora_a_stacks, - lora_b_stacks, - 1.0, (qkv[0], qkv[1], qkv[2])) + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, + (qkv[0], qkv[1], qkv[2])) assert torch.allclose(torch.zeros_like(output), output) manager.reset_lora() \ No newline at end of file From 43df76205de572c7c1cd86321ceb72f1f9759633 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 17:00:33 +0300 Subject: [PATCH 217/819] oh come on now --- tests/lora/test_lora_hpu.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index 06b687282391b..a59cfe875ef9c 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -241,10 +241,9 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: LoraMask.setLoraMask(mask) punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu") - punica_wrapper.add_lora_packed_nslice(output, input, - lora_a_stacks, - lora_b_stacks, - 1.0, (qkv[0], qkv[1], qkv[2])) + qkvs = (qkv[0], qkv[1], qkv[2]) + punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, + lora_b_stacks, 1.0, qkvs) rtol, atol = TOLERANCES[dtype] assert torch.allclose(expected, output, rtol=rtol, atol=atol) @@ -253,10 +252,9 @@ def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None: indices = torch.full((len(input), ), -1, device="hpu") mask = createLoraMask(indices, k, 1, 8, rank, dtype) LoraMask.setLoraMask(mask) - + qkvs = (qkv[0], qkv[1], qkv[2]) punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks, - lora_b_stacks, 1.0, - (qkv[0], qkv[1], qkv[2])) + lora_b_stacks, 1.0, qkvs) assert torch.allclose(torch.zeros_like(output), output) - manager.reset_lora() \ No newline at end of file + manager.reset_lora() From 3134b8a0f40753534c211da2bbc4cca0413904fe Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 20 Sep 2024 17:18:32 +0300 Subject: [PATCH 218/819] fix fakehpu mode --- vllm/platforms/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index b717e256e43f5..e3b7dd3bb216e 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -44,8 +44,11 @@ is_hpu = False try: + import os from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None + is_hpu = util.find_spec('habana_frameworks') is not None or os.environ.get( + 'VLLM_USE_FAKE_HPU', '0') != '0' + except Exception: pass From f92ffc15d2268149beb90bbb1e19f539d77928ad Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 23 Sep 2024 09:27:18 +0200 Subject: [PATCH 219/819] Fix calculating slots for warmup (#310) Recent changes broke slot sparsity for warmup slots. This commit restores the functionality. 
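For illustration, a minimal Python sketch of the slot-selection rule this change restores; the _PAD_BLOCK_ID, dummy_slots and block_size names are taken from the diff below, while select_slot itself is only an illustrative stand-in, not the actual model-runner code:

    import itertools

    _PAD_BLOCK_ID = 0  # block 0 is reserved for padding

    def select_slot(block_table, position, block_size, dummy_slots):
        # Sequences without a real block table map to the padding block and
        # draw a sparse slot from the dummy iterator; real sequences map to
        # block_number * block_size + offset within their block.
        if len(block_table) == 0:
            block_number = _PAD_BLOCK_ID
        else:
            block_number = block_table[position // block_size]
        if block_number == _PAD_BLOCK_ID:
            return next(dummy_slots)
        return block_number * block_size + position % block_size

    # A padded sequence draws a dummy slot instead of always landing on slot 0:
    print(select_slot([], 0, 128, dummy_slots=itertools.cycle(range(128))))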
--- vllm/worker/habana_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 0d5df1f312ec9..1d8566e0edff4 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -971,10 +971,11 @@ def _prepare_decode( block_table = seq_group_metadata.block_tables[seq_id] if len(block_table) == 0: block_number = _PAD_BLOCK_ID - block_table = [] - slot = next(dummy_slots) else: block_number = block_table[position // self.block_size] + if block_number == _PAD_BLOCK_ID: + slot = next(dummy_slots) + else: block_offset = position % self.block_size slot = block_number * self.block_size + block_offset slot_mapping.append([slot]) From 63fae5106748b9ec86bec436ce275d878c5dfe02 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Mon, 23 Sep 2024 09:27:33 +0200 Subject: [PATCH 220/819] Removed padding block from a list of available blocks in allocators (#313) Block 0 is used for padding. This PR removes the padding block from a list of available blocks in block allocators v1 and v2 --- vllm/core/block/cpu_gpu_block_allocator.py | 8 ++++++-- vllm/core/block_manager_v1.py | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 5287cd9c1bfb3..6ade639fc3ade 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,7 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device +from vllm.utils import Device, is_hpu class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -52,7 +52,11 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. """ - block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) + # For HPU, block id 0 is used only for padding + reserved_blocks = 1 if is_hpu() else 0 + block_ids = list( + range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) + num_gpu_blocks -= reserved_blocks gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index e29eba375f4dd..2a3cbe2e642cb 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -13,7 +13,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device +from vllm.utils import Device, is_hpu logger = init_logger(__name__) @@ -171,7 +171,9 @@ def __init__( # Initialize the free blocks. self.free_blocks: BlockTable = [] - for i in range(num_blocks): + # For HPU, block id 0 is used only for padding + reserved_blocks = 1 if is_hpu() else 0 + for i in range(reserved_blocks, num_blocks): block = PhysicalTokenBlock(device=device, block_number=i, block_size=block_size, From aa507d41ffc1900ba3420b6574b14ebc8d63c031 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 23 Sep 2024 12:48:02 +0200 Subject: [PATCH 221/819] Fix seq_len for padding sequences (#318) Before the fix we used seq_len=0 for padding samples. 
This was later translated to an empty attention_mask (since we don't have any tokens that we should include in calculations) and in turn caused NaNs in prompt attention (0 divided by 0). Those NaNs later got propagated to the kv-cache, causing issues in flat_pa. --- vllm/worker/habana_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1d8566e0edff4..f7a3c8569e229 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1268,6 +1268,7 @@ def create_dummy_seq_group_metadata(self, lora_request=None): sampling_params = SamplingParams(temperature=0) num_blocks = math.ceil(seq_len / self.block_size) + seq_len = max(seq_len, 1) if is_prompt: input_len = seq_len output_len = 0 From a844837032a30ee6f0b76ace9d1937975d20cf6c Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 23 Sep 2024 11:24:46 +0300 Subject: [PATCH 222/819] Fix lora specific conditions in profile-run --- vllm/worker/habana_model_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 1d8566e0edff4..7efe71ea6ba19 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1294,7 +1294,8 @@ def profile_run(self) -> None: max_seq_len = min(self.prompt_seq_bucket_cfg[-1], self.max_num_batched_tokens // max_batch_size) - self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches) + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, + False, True) return def warmup_scenario(self, @@ -1302,7 +1303,8 @@ def warmup_scenario(self, seq_len, is_prompt, kv_caches, - is_profile_run=False) -> None: + is_pt_profiler_run=False, + is_lora_profile_run=False) -> None: use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) scenario_name = ("warmup_" f"{'prompt' if is_prompt else 'decode'}_" @@ -1316,7 +1318,7 @@ def warmup_scenario(self, # passed in, which contains a lora from the lora warmup path. dummy_lora_requests: List[LoRARequest] = [] dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config and is_profile_run: + if self.lora_config and is_lora_profile_run: assert self.lora_manager is not None with self.lora_manager.dummy_lora_cache(): for idx in range(self.lora_config.max_loras): @@ -1334,8 +1336,8 @@ def warmup_scenario(self, for idx in range(max_num_seqs) ] self.profiler.start('internal', scenario_name) - times = 3 if use_graphs or is_profile_run else 1 - if self.lora_config and not is_profile_run: + times = 3 if use_graphs or is_pt_profiler_run else 1 + if self.lora_config and not is_lora_profile_run: lora_mapping = LoRAMapping( [0] * batch_size * seq_len, [0] * batch_size * seq_len, @@ -1366,7 +1368,7 @@ def warmup_scenario(self, ] torch.hpu.synchronize() profiler = None - if is_profile_run and self.is_driver_worker: + if is_pt_profiler_run and self.is_driver_worker: profiler = setup_profiler() profiler.start() for _ in range(times): From 9bb65b727b37030ed0d2ae489a7d46b8ab72d217 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 23 Sep 2024 15:04:17 +0200 Subject: [PATCH 223/819] Run with HPU graphs even when warmup was skipped (#320) Before this PR we relied on stored information about which configurations should have HPU graphs enabled. Unfortunately, that set was computed during warmup, so if warmup was skipped we didn't have that information. This PR allows all buckets to run with HPU graphs enabled when warmup is skipped.
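A rough sketch of the resulting gating logic; the enforce_eager, skip_warmup and graphed_buckets names follow the diff below, but the class here is a simplified stand-in for the model runner, not its real implementation:

    import os

    class HpuGraphGate:

        def __init__(self, enforce_eager: bool, graphed_buckets: set):
            self.enforce_eager = enforce_eager
            # Normally populated during warmup; stays empty if warmup is skipped.
            self.graphed_buckets = graphed_buckets
            self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP',
                                              'false').lower() == 'true'

        def use_graphs(self, batch_size: int, seq_len: int,
                       is_prompt: bool) -> bool:
            if self.enforce_eager:
                return False
            if self.skip_warmup:
                # No warmup means no recorded bucket set, so assume HPU graphs
                # can be used for every bucket.
                return True
            return (batch_size, seq_len, is_prompt) in self.graphed_buckets

    # With VLLM_SKIP_WARMUP=true, use_graphs() returns True for any bucket
    # unless eager mode is enforced.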
--- vllm/worker/habana_model_runner.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 57cab468bef3a..f7f9990c1370f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -574,6 +574,9 @@ def _set_gc_threshold(self) -> None: self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ .create_input_mapper(self.model_config) + self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', + 'false').lower() == 'true' + def load_model(self) -> None: import habana_frameworks.torch.core as htcore if self.model_config.quantization == 'inc': @@ -647,6 +650,8 @@ def load_model(self) -> None: def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False + if self.skip_warmup: + return True return (batch_size, seq_len, is_prompt) in self.graphed_buckets def _is_valid_bucket(self, bucket): @@ -1501,7 +1506,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, True) raise AssertionError("Finished profiling") - if os.environ.get('VLLM_SKIP_WARMUP', 'false').lower() == 'true': + if self.skip_warmup: logger.info("Skipping warmup...") return self.profiler.start('internal', 'warmup') From 2a499c7bb9bd458f01597d2de0a4e512fba6b8ab Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 16:41:22 +0300 Subject: [PATCH 224/819] mixtral api fixes --- vllm/model_executor/layers/fused_moe/layer.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 179a8609a17f4..421fbe7187dfa 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -120,24 +120,23 @@ def forward_cuda( inplace=True) def forward_hpu( - self, - x: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool, - num_expert_group: Optional[int], - topk_group: Optional[int], - layer: Optional[torch.nn.Module], + self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None ): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' if layer is not None: - return layer.hpu_static_fused_moe(x, w1, w2, router_logits, top_k) + return layer.hpu_static_fused_moe(x, layer.w13_weight, layer.w2_weight, router_logits, top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( From 93727344e3e8218575cbe72c7b035c3585382539 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 16:44:04 +0300 Subject: [PATCH 225/819] revert debug prints --- vllm/worker/habana_worker.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8ba47373fb47b..f2678c5e405dc 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -345,7 +345,6 @@ def init_worker_distributed_environment( distributed_init_method: Optional[str] = None, local_rank: int = -1, ) -> None: - print("Initializing 
TP...") """Initialize the distributed environment.""" backend = hpu_backend_string() init_distributed_environment(parallel_config.world_size, @@ -353,15 +352,12 @@ def init_worker_distributed_environment( distributed_init_method, local_rank, backend=backend) - print(f"init_distributed_environment with backend {backend} and distributed_init_method {distributed_init_method} done!") ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - print("ensure_model_parallel_initialized done!") if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() - print(f"torch.distributed is already initialized, torch_world_size: {torch_world_size}") if torch_world_size != parallel_config.world_size: raise RuntimeError( "torch.distributed is already initialized but the torch world " @@ -372,7 +368,6 @@ def init_worker_distributed_environment( "distributed_init_method must be set if torch.distributed " "is not already initialized") else: - print(f"torch.distributed is not initialized, initializing world_size: {parallel_config.world_size}") backend = hpu_backend_string() torch.distributed.init_process_group( backend=backend, @@ -380,24 +375,14 @@ def init_worker_distributed_environment( rank=rank, init_method=distributed_init_method, ) - print(f"torch.distributed initialized!") # A small all_reduce for warmup & checking conformance. device = hpu_device_string() dummy_tensor_hpu = torch.ones(1).to(device) - torch.hpu.synchronize() - print(f"testing allreduce...") - htorch.core.mark_step() - print(f"testing allreduce...") torch.distributed.all_reduce(dummy_tensor_hpu) - htorch.core.mark_step() - torch.hpu.synchronize() - print(f"allreduce done, checking result...") assert dummy_tensor_hpu.item() == parallel_config.world_size - print(f"allreduce works fine!!") ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, parallel_config.pipeline_parallel_size) - print("TP initialized successfully!!") def raise_if_cache_size_invalid(num_gpu_blocks, block_size, From c15ddd22fc965e85738faf5e0d377a9b56770898 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 16:48:03 +0300 Subject: [PATCH 226/819] format.sh --- vllm/executor/ray_habana_executor.py | 31 +++++++------------ vllm/model_executor/layers/fused_moe/layer.py | 26 ++++++++-------- 2 files changed, 25 insertions(+), 32 deletions(-) diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py index 887cd0f1029ce..645bceb1af446 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_habana_executor.py @@ -2,7 +2,8 @@ import os from collections import defaultdict from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Type) import msgspec @@ -15,8 +16,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, get_vllm_instance_id, is_fake_hpu, - make_async) + get_ip, get_open_port, get_vllm_instance_id, + is_fake_hpu, make_async) from vllm.worker.worker_base import WorkerBase if ray is not None: @@ -78,12 +79,16 @@ def shutdown(self) -> None: self.forward_dag = None def _get_worker_module_and_class( - self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: # noqa: F821 + self + ) -> Tuple[str, str, 
Optional[Callable[[], + Type[WorkerBase]]]]: # noqa: F821 worker_class_fn = None if self.scheduler_config.is_multi_step: - raise NotImplementedError("Multi-step execution is not implemented for HPU") + raise NotImplementedError( + "Multi-step execution is not implemented for HPU") elif self.speculative_config: - raise NotImplementedError("Speculative decoding is not implemented for HPU") + raise NotImplementedError( + "Speculative decoding is not implemented for HPU") else: worker_module_name = "vllm.worker.habana_worker" worker_class_name = "HabanaWorker" @@ -100,10 +105,6 @@ def _get_worker_wrapper_args(self) -> Dict[str, Any]: trust_remote_code=self.model_config.trust_remote_code, ) - # child class could overwrite this to return actual env vars. - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): # Otherwise, the ray workers are allocated with a full GPU. @@ -120,10 +121,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", # the TP group of workers for a PP rank. self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) # Create the workers. @@ -443,8 +440,6 @@ def _compiled_ray_dag(self, enable_asyncio: bool): from ray.dag import InputNode, MultiOutputNode from ray.experimental.channel.torch_tensor_type import TorchTensorType - logger.info("VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL = %s", - envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL) with InputNode() as input_data: # Example DAG: PP=2, TP=4 # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 @@ -469,9 +464,7 @@ def _compiled_ray_dag(self, enable_asyncio: bool): # Specify how intermediate tensors should be passed # between pp stages, no need to specify for the last # pp stage. 
- transport = "nccl" \ - if envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL \ - else "auto" + transport = "auto" outputs = [ output.with_type_hint( TorchTensorType(transport=transport)) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 421fbe7187dfa..da374de26a991 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -119,24 +119,24 @@ def forward_cuda( topk_ids=topk_ids, inplace=True) - def forward_hpu( - self, - layer: torch.nn.Module, - x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, - router_logits: torch.Tensor, - renormalize: bool, - topk_group: Optional[int] = None, - num_expert_group: Optional[int] = None, - custom_routing_function: Optional[Callable] = None - ): + def forward_hpu(self, + layer: torch.nn.Module, + x: torch.Tensor, + use_grouped_topk: bool, + top_k: int, + router_logits: torch.Tensor, + renormalize: bool, + topk_group: Optional[int] = None, + num_expert_group: Optional[int] = None, + custom_routing_function: Optional[Callable] = None): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' if layer is not None: - return layer.hpu_static_fused_moe(x, layer.w13_weight, layer.w2_weight, router_logits, top_k) + return layer.hpu_static_fused_moe(x, layer.w13_weight, + layer.w2_weight, router_logits, + top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( From 3bb593a137c4c2c05890451d735629925c6909ac Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:02:09 +0300 Subject: [PATCH 227/819] use ray for hpu distributed inference --- vllm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index f019736d1dc45..b8ec23e030ac9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -855,6 +855,13 @@ def __init__( raise ValueError( "TPU backend only supports Ray for distributed inference.") + if current_platform.is_hpu() and self.world_size > 1: + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + if self.distributed_executor_backend != "ray": + raise ValueError( + "HPU backend only supports Ray for distributed inference.") + if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. From c64dc8359b717b9ceed84ee9feb65f1fb739ab60 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 23 Sep 2024 17:37:59 +0200 Subject: [PATCH 228/819] Move profilers to vllm-hpu-extension (#323) Continuation of https://github.com/HabanaAI/vllm-hpu-extension/pull/4 I've also removed is_tpu, as it got mistakenly restored in the rebase. It's not in the upstream. 
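After this move, callers import the profiling helpers from the extension package rather than from vllm.utils. A minimal usage sketch, assuming vllm-hpu-extension (pinned in requirements-hpu.txt below) keeps the same interface as the removed vllm.utils implementation and that it runs on a Gaudi host:

    from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

    # Measure device and host memory consumed by a block of code, similar to
    # how the model runner wraps model loading and HPU graph capture.
    with HabanaMemoryProfiler() as m:
        ...  # e.g. load or wrap the model here

    print(f"Loading took {format_bytes(m.consumed_device_memory)} "
          f"of device memory ({m.get_summary_string()})")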
--- requirements-hpu.txt | 2 +- vllm/executor/habana_executor.py | 6 +- vllm/utils.py | 104 ------------------------ vllm/worker/habana_model_runner.py | 10 +-- vllm/worker/habana_worker.py | 4 +- vllm/worker/profiler.py | 126 ----------------------------- 6 files changed, 12 insertions(+), 240 deletions(-) delete mode 100644 vllm/worker/profiler.py diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 56caa4ba03862..1ab81898b5f7e 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,4 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bdd4f2b +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25 diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 6e92da0245836..44226fc898218 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -6,14 +6,16 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple +from vllm_hpu_extension.profiler import HabanaMemoryProfiler + from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (HabanaMemoryProfiler, get_distributed_init_method, - get_ip, get_open_port, make_async) +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) diff --git a/vllm/utils.py b/vllm/utils.py index f7e7a64619b1f..e5cef9b4419c0 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -374,15 +374,6 @@ def _is_built_for_hpu() -> bool: return False -@lru_cache(maxsize=None) -def is_tpu() -> bool: - try: - import libtpu - except ImportError: - libtpu = None - return libtpu is not None - - @lru_cache(maxsize=None) def is_xpu() -> bool: from importlib.metadata import PackageNotFoundError, version @@ -785,107 +776,12 @@ def print_warning_once(msg: str) -> None: logger.warning(msg) -# Adapted from https://stackoverflow.com/a/49361727 -def format_bytes(size): - # 2**10 = 1024 - power = 2**10 - n = 0 - power_labels = {0: '', 1: 'Ki', 2: 'Mi', 3: 'Gi', 4: 'Ti'} - while abs(size) > power: - size /= power - n += 1 - return f'{size:.4g} {power_labels[n]+"B"}' - - def get_device() -> str: if is_hpu(): return "hpu" return "cuda" -class HabanaMemoryProfiler: - - def __init__(self, device=None): - self.device = device - - @staticmethod - def current_device_memory_usage() -> float: - if is_fake_hpu(): - return 0 - # Return the device memory usage in bytes. - free_hpu_memory, total_hpu_memory = torch.hpu.mem_get_info() - return total_hpu_memory - free_hpu_memory - - @staticmethod - def current_free_device_memory() -> float: - if is_fake_hpu(): - return 0 - # Return the device memory usage in bytes. - free_hpu_memory, _ = torch.hpu.mem_get_info() - return free_hpu_memory - - @staticmethod - def total_device_memory() -> float: - if is_fake_hpu(): - return 0 - # Return the device memory usage in bytes. - _, total_hpu_memory = torch.hpu.mem_get_info() - return total_hpu_memory - - @staticmethod - def current_host_memory_usage() -> float: - # Return the host memory usage in bytes. 
- return HabanaMemoryProfiler.total_host_memory( - ) - HabanaMemoryProfiler.current_free_host_memory() - - @staticmethod - def current_free_host_memory() -> float: - # Return the host memory usage in bytes. - return psutil.virtual_memory().available - - @staticmethod - def total_host_memory() -> float: - # Return the host memory usage in bytes. - return psutil.virtual_memory().total - - def get_summary_string(self): - if getattr(self, 'final_device_memory', None) is None or getattr( - self, 'final_host_memory', None) is None: - raise RuntimeError( - "HabanaMemoryProfiler.get_summary_string() can only be called " - "after closing context manager") - return ( - f"{format_bytes(self.consumed_device_memory)} of device memory " - f"({format_bytes(self.final_device_memory)}/" - f"{format_bytes(HabanaMemoryProfiler.total_device_memory())} used)" - f" and {format_bytes(self.consumed_host_memory)} of host memory " - f"({format_bytes(self.final_host_memory)}/" - f"{format_bytes(HabanaMemoryProfiler.total_host_memory())} used)") - - def __enter__(self): - # Force garbage collection - gc.collect() - self.initial_device_memory = \ - HabanaMemoryProfiler.current_device_memory_usage() - self.initial_host_memory = \ - HabanaMemoryProfiler.current_host_memory_usage() - # This allows us to call methods of the context manager if needed - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - # Force garbage collection - gc.collect() - self.final_device_memory = \ - HabanaMemoryProfiler.current_device_memory_usage( - ) - self.final_host_memory = HabanaMemoryProfiler.current_host_memory_usage( - ) - self.consumed_device_memory = \ - self.final_device_memory - self.initial_device_memory - self.consumed_host_memory = \ - self.final_host_memory - self.initial_host_memory - - @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6cd80eb15107e..c99500ef1296b 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -21,6 +21,8 @@ import habana_frameworks.torch.internal.bridge_config as bc import torch from vllm_hpu_extension.ops import LoraMask as LoraMask +from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, + HabanaMemoryProfiler, format_bytes) from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, @@ -39,8 +41,8 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.utils import (HabanaMemoryProfiler, format_bytes, is_fake_hpu, - is_pin_memory_available, make_tensor_with_pad) +from vllm.utils import (is_fake_hpu, is_pin_memory_available, + make_tensor_with_pad) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, _add_attn_metadata_broadcastable_dict, @@ -48,8 +50,6 @@ _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) -from .profiler import Profiler - if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -517,7 +517,7 @@ def __init__( self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states self.observability_config = observability_config - self.profiler = Profiler() + self.profiler = HabanaHighLevelProfiler() self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) diff --git a/vllm/worker/habana_worker.py 
b/vllm/worker/habana_worker.py index f2678c5e405dc..8cdbba02fbb33 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -9,6 +9,7 @@ import habana_frameworks.torch as htorch # noqa:F401 import torch import torch.distributed +from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, @@ -21,8 +22,7 @@ from vllm.model_executor import set_random_seed from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (HabanaMemoryProfiler, format_bytes, hpu_backend_string, - hpu_device_string, is_fake_hpu) +from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine from vllm.worker.habana_model_runner import HabanaModelRunner from vllm.worker.model_runner_base import ModelRunnerBase diff --git a/vllm/worker/profiler.py b/vllm/worker/profiler.py deleted file mode 100644 index 48348de41f520..0000000000000 --- a/vllm/worker/profiler.py +++ /dev/null @@ -1,126 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company -############################################################################### - -import json -import os -import queue -import threading -import time -from contextlib import contextmanager -from typing import Any, List - -from vllm.logger import init_logger -from vllm.utils import get_vllm_instance_id - -logger = init_logger(__name__) - - -class FileWriter(threading.Thread): - - def __init__(self, filename, event_queue): - super().__init__() - self.filename = filename - self.event_queue = event_queue - self.daemon = True - self.timer_event = threading.Event() - - def _drain_event_queue(self): - content = '' - while True: - try: - element = self.event_queue.get_nowait() - content += element - except queue.Empty: - break - return content - - def run(self): - # don't check the queue too often - while not self.timer_event.wait(1): - # Block and wait for the next item in the queue - content = self.event_queue.get() - # Collect any other items in the queue - content += self._drain_event_queue() - - with open(self.filename, 'a') as outfile: - outfile.write(content) - - -class Profiler: - profiling_trace_events: queue.Queue = queue.Queue() - event_tid = {'counter': 1, 'external': 2, 'internal': 3} - vllm_instance_id = get_vllm_instance_id() - filename = f'server_events_{vllm_instance_id}.json' - event_cache: List[Any] = [] - - def __init__(self): - self.enabled = os.getenv('VLLM_PROFILER_ENABLED', - 'false').lower() == 'true' and int( - os.getenv('RANK', '0')) == 0 - msg = f'Profiler enabled for: {self.vllm_instance_id}' - logger.info(msg) - if self.enabled: - # initialize the trace file (JSON Array Format) - with open(self.filename, 'w') as outfile: - outfile.write('[') - file_writer = FileWriter(self.filename, - self.profiling_trace_events) - file_writer.start() - - def _dump_with_sep(self, entry): - entry = json.dumps(entry) + ',' - self.profiling_trace_events.put(entry) - - def get_timestamp_us(self): - return time.time() * 1000000.0 - - def record_counter(self, ts, counter): - if self.enabled: - self._dump_with_sep({ - 'pid': 1, - 'tid': self.event_tid['counter'], - 'ph': 'C', - 'name': 'utils', - 'ts': ts, - 'args': counter - }) - - def start(self, type, name, args=None): - if self.enabled: - ts = 
self.get_timestamp_us() - if args is not None and 'counter' in args: - self.record_counter(ts, args['counter']) - del args['counter'] - event = { - 'pid': 1, - 'tid': self.event_tid[type], - 'ph': 'X', - 'name': name, - 'ts': ts, - 'dur': None, - 'args': args - } - self.event_cache.append(event) - - def end(self): - if self.enabled: - ts = self.get_timestamp_us() - if not self.event_cache: - logger.warning( - 'Profiler: end() call does not have matching start() call. ' - 'Disabling profiler.') - self.enabled = False - return - event = self.event_cache.pop() - event['dur'] = ts - event['ts'] - self._dump_with_sep(event) - - @contextmanager - def record_event(self, type, name, args=None): - if self.enabled: - self.start(type, name, args) - yield - self.end() - else: - yield From c9683205eda7a9d850e373be9fc495d7bc39e6ba Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 10:32:42 +0200 Subject: [PATCH 229/819] Restore upstream requirements-build.txt (#324) At some point, someone added whitespaces to each entry in requirements-build.txt. Upstream does not contain it. Easy fix. --- requirements-build.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements-build.txt b/requirements-build.txt index 6ec80356fdbca..3f08f5d67b6da 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,8 +1,8 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=49.4.0 -torch==2.4.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=49.4.0 +torch==2.4.0 +wheel +jinja2 From 58d5cde67662fd48bbc8d0f229991f7e79eeffb7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 10:32:53 +0200 Subject: [PATCH 230/819] Remove reminder_comment.yml workflow (#325) This workflow never worked properly in the fork. This PR removes it. --- .github/workflows/reminder_comment.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/reminder_comment.yml diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 99827756d2066..0000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: PR Reminder Comment Bot -on: - pull_request_target: - types: [opened] - -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@v6 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. 
\n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀' - }) - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From cf4c3e5d117a44472137fc08f04b90b64eba7bab Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 10:33:03 +0200 Subject: [PATCH 231/819] Don't throw "Failed to import from vllm._C" warning on HPU (#326) --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 678700055c992..a41b0e40b11d6 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -11,7 +11,7 @@ logger = init_logger(__name__) -if not current_platform.is_tpu(): +if not current_platform.is_tpu() and not current_platform.is_hpu(): try: import vllm._C except ImportError as e: From 41217cfacfd949912fbe0eda6196d3e44236433e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 11:05:25 +0200 Subject: [PATCH 232/819] Fix doc build warnings (#330) This PR fixes all the little warnings gaudi-installation.rst introduces during documentation build ("WARNING: Title underline too short." etc.) --- docs/source/getting_started/gaudi-installation.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 4c094eaec842a..8c4905e2a488a 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -129,10 +129,10 @@ Gaudi2 devices. Configurations that are not listed may or may not work. with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling Performance Tuning -================ +================== Execution modes ------------- +--------------- Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. @@ -161,7 +161,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected Bucketing mechanism ------------- +------------------- Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. @@ -234,7 +234,7 @@ This example uses the same buckets as in *Bucketing mechanism* section. Each out Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. 
It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. HPU Graph capture ------------- +----------------- `HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. @@ -298,7 +298,7 @@ Each described step is logged by vLLM server, as follows (negative values corres Recommended vLLM Parameters ------------- +--------------------------- - We recommend running inference on Gaudi 2 with ``block_size`` of 128 for BF16 data type. Using default values (16, 32) might lead to @@ -310,7 +310,7 @@ Recommended vLLM Parameters If you encounter out-of-memory issues, see troubleshooting section. Environment variables ------------- +--------------------- **Diagnostic and profiling knobs:** From 4eb9809a4d80aab8168cea2cc9906558abd486c3 Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Tue, 24 Sep 2024 17:08:40 +0800 Subject: [PATCH 233/819] fix qwen2 model issue (#329) FILL IN THE PR DESCRIPTION HERE typo: `platform` -> `platforms` FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
--- vllm/model_executor/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1e6fb4ad1ca7c..7fd90b2e8b282 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -46,7 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platform import current_platform +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA From 9be37a356ada10dfea80e58613f710b19877c487 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 11:46:00 +0200 Subject: [PATCH 234/819] Remove vllm.utils.is_hpu() (#331) vllm.utils.is_hpu() was redundant for some time now and has always been problematic particularly for torch.compile mode. Now, we're fully switching to current_platform.is_hpu(). --- requirements-hpu.txt | 2 +- vllm/core/block/cpu_gpu_block_allocator.py | 5 +++-- vllm/core/block_manager_v1.py | 5 +++-- vllm/utils.py | 24 ++-------------------- 4 files changed, 9 insertions(+), 27 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 1ab81898b5f7e..c7376a7c504f9 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,4 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index dd99dd94e4ad0..422c1f4bd8f8b 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,7 +4,8 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.utils import Device, is_hpu +from vllm.platforms import current_platform +from vllm.utils import Device class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): @@ -53,7 +54,7 @@ def create( before CPU block IDs. """ # For HPU, block id 0 is used only for padding - reserved_blocks = 1 if is_hpu() else 0 + reserved_blocks = 1 if current_platform.is_hpu() else 0 block_ids = list( range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) num_gpu_blocks -= reserved_blocks diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 8e7335a4016e9..b1160e8d2f163 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -13,8 +13,9 @@ from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.sequence import Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device, is_hpu +from vllm.utils import Device logger = init_logger(__name__) @@ -185,7 +186,7 @@ def __init__( # Initialize the free blocks. 
self.free_blocks: List[PhysicalTokenBlock] = [] # For HPU, block id 0 is used only for padding - reserved_blocks = 1 if is_hpu() else 0 + reserved_blocks = 1 if current_platform.is_hpu() else 0 for i in range(reserved_blocks, num_blocks): block = PhysicalTokenBlock(device=device, block_number=i, diff --git a/vllm/utils.py b/vllm/utils.py index e5cef9b4419c0..ca36ad8cd9592 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -337,11 +337,6 @@ def is_neuron() -> bool: return transformers_neuronx is not None -@lru_cache(maxsize=None) -def is_hpu() -> bool: - return _is_habana_frameworks_installed() or _is_built_for_hpu() - - @lru_cache(maxsize=None) def is_fake_hpu() -> bool: return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' @@ -359,21 +354,6 @@ def hpu_backend_string(): return backend_string -@lru_cache(maxsize=None) -def _is_habana_frameworks_installed() -> bool: - from importlib import util - return util.find_spec('habana_frameworks') is not None - - -@lru_cache(maxsize=None) -def _is_built_for_hpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "gaudi" in version("vllm") - except PackageNotFoundError: - return False - - @lru_cache(maxsize=None) def is_xpu() -> bool: from importlib.metadata import PackageNotFoundError, version @@ -777,7 +757,7 @@ def print_warning_once(msg: str) -> None: def get_device() -> str: - if is_hpu(): + if current_platform.is_hpu(): return "hpu" return "cuda" @@ -797,7 +777,7 @@ def is_pin_memory_available() -> bool: elif is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False - elif is_hpu(): + elif current_platform.is_hpu(): print_warning_once("Pin memory is not supported on HPU.") return False elif is_cpu() or is_openvino(): From e16918d41e5e82930872dbb263594f1aeb3b73a4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 12:01:31 +0200 Subject: [PATCH 235/819] Remove logger from layernorm (#332) Upstream does not use logger in layernorm. Neither do we. No idea why it's there. --- vllm/model_executor/layers/layernorm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8f1c60db3ebd7..257e6b37c7890 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -4,11 +4,8 @@ import torch import torch.nn as nn -from vllm.logger import init_logger from vllm.model_executor.custom_op import CustomOp -logger = init_logger(__name__) - class RMSNorm(CustomOp): """Root mean square normalization. From 73f4b48905f75ebbed035ab5e7abde2b64e701c3 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 14:22:16 +0200 Subject: [PATCH 236/819] Fix INC FP8 inference after rebase (#333) This PR fixes the "RuntimeError: HPU does not have device capability." error introduced after rebase & fixes loading weights on CPU for quantization. 
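The loader part of the fix, restated as a simplified sketch (an illustration of the condition added below, not the literal device_loading_context code; per the message above, the intent is to keep weight loading on CPU working for quantized models):

```python
import torch

def maybe_move_to_target(p: torch.nn.Parameter,
                         target_device: torch.device) -> None:
    # Parameters staged on CPU are only moved eagerly when the target device
    # is not HPU; for HPU they are left in place at this point.
    if p.device.type == "cpu" and target_device.type != "hpu":
        p.data = p.data.to(target_device)
```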
--- vllm/model_executor/model_loader/loader.py | 2 +- vllm/platforms/hpu.py | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b03e6aca48c0e..b3274b6d95115 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -59,7 +59,7 @@ def device_loading_context(module: torch.nn.Module, # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): - if p.device.type == "cpu": + if p.device.type == "cpu" and target_device.type != 'hpu': original_device_states[name] = p.device p.data = p.data.to(target_device) # Parameters already on target device are not touched diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index feddce69ac5b4..170cfff94f90d 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,18 +1,11 @@ -from typing import Optional - import torch -from .interface import DeviceCapability, Platform, PlatformEnum +from .interface import Platform, PlatformEnum class HpuPlatform(Platform): _enum = PlatformEnum.HPU - @staticmethod - def get_device_capability( - device_id: int = 0) -> Optional[DeviceCapability]: - raise RuntimeError("HPU does not have device capability.") - @staticmethod def inference_mode(): return torch.no_grad() From 9111a8059b699344313f21a4314562d9405ec991 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 24 Sep 2024 18:07:01 +0200 Subject: [PATCH 237/819] Make weights_load_device not change EngineArgs.create_load_config() (#336) Some backends rely on calling EngineArgs.create_load_config() directly, for which we've altered the API. We don't need to alter it to enable weight load device functionality. This PR fixes it. --- vllm/config.py | 3 ++- vllm/engine/arg_utils.py | 9 ++++++--- vllm/model_executor/model_loader/loader.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b8ec23e030ac9..011563038e6bb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -751,7 +751,8 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. - device: Device on which weights are loaded. 
+ device: Device to which model weights will be loaded, default to + device_config.device """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ffe12d4cd5fb6..84529b267ce0b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -268,8 +268,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument("--weights-load-device", type=str, default=EngineArgs.weights_load_device, - choices=["cuda", "neuron", "hpu", "cpu"], - help='Device on which weights are loaded.') + choices=DEVICE_OPTIONS, + help=('Device to which model weights ' + 'will be loaded.')) parser.add_argument( '--config-format', default=EngineArgs.config_format, @@ -843,7 +844,9 @@ def create_model_config(self) -> ModelConfig: mm_processor_kwargs=self.mm_processor_kwargs, ) - def create_load_config(self, load_device) -> LoadConfig: + def create_load_config(self, load_device=None) -> LoadConfig: + if load_device is None: + load_device = DeviceConfig(device=self.device).device return LoadConfig( load_format=self.load_format, download_dir=self.download_dir, diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index b3274b6d95115..fcff39f790564 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -363,7 +363,7 @@ def load_model(self, *, model_config: ModelConfig, model = _initialize_model(model_config, self.load_config, lora_config, cache_config, scheduler_config) - logger.info("Loading weights on %s ...", self.load_config.device) + logger.info("Loading weights on %s...", self.load_config.device) model.load_weights( self._get_weights_iterator(model_config.model, model_config.revision, From 8c6dcae75c06144081c98cb07668be87d763eb47 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 25 Sep 2024 14:13:15 +0200 Subject: [PATCH 238/819] Refine INC shutdown code (#335) This PR removes debug printouts in INC shutdown method and covers the case where application exits before model is initialized properly. 
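The new shutdown guard, condensed from the diff below into one readable function (simplified sketch; the runner attributes are taken as given from the surrounding code):

```python
from contextlib import suppress

def shutdown_inc(runner) -> None:
    # Finalize INC calibration at most once, and only if the model actually
    # finished initializing before the process exits.
    can_finalize_inc = False
    with suppress(AttributeError):
        can_finalize_inc = (runner.model_config.quantization == 'inc'
                            and runner.model.model is not None
                            and runner.inc_initialized_successfully
                            and not getattr(runner, "_is_inc_finalized", False))
    if can_finalize_inc:
        from neural_compressor.torch.quantization import finalize_calibration
        finalize_calibration(runner.model.model)
        runner._is_inc_finalized = True
```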
--- vllm/executor/habana_executor.py | 3 --- vllm/worker/habana_model_runner.py | 16 +++++++++++----- vllm/worker/habana_worker.py | 3 --- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 44226fc898218..e4bd54f8849b3 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -195,9 +195,6 @@ def check_health(self) -> None: def shutdown(self) -> None: self.driver_worker.shutdown_inc() - def __del__(self): - self.shutdown() - class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c99500ef1296b..6940e7637dbb7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -550,6 +550,7 @@ def __init__( # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None + self.inc_initialized_successfully = False # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() @@ -632,6 +633,7 @@ def load_model(self) -> None: self.model = convert(self.model, config) htcore.hpu_initialize(self.model, mark_only_scales_as_const=True) + self.inc_initialized_successfully = True logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) elif not is_fake_hpu(): @@ -1938,14 +1940,18 @@ def execute_model( return [output] def shutdown_inc(self): - print('inc shutdown') - if (model_config := getattr(self, "model_config", None)) and \ - getattr(model_config, "quantization", None) == 'inc': - print('inc shutdown start') + can_finalize_inc = False + from contextlib import suppress + with suppress(AttributeError): + can_finalize_inc = (self.model_config.quantization == 'inc') and \ + (self.model.model is not None) and \ + self.inc_initialized_successfully and \ + not getattr(self, "_is_inc_finalized", False) + if can_finalize_inc: from neural_compressor.torch.quantization import ( finalize_calibration) finalize_calibration(self.model.model) - print('inc shutdown') + self._is_inc_finalized = True def __del__(self): self.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8cdbba02fbb33..2e4dfeac42c3e 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -320,9 +320,6 @@ def list_prompt_adapters(self) -> Set[int]: def shutdown_inc(self): self.model_runner.shutdown_inc() - def __del__(self): - self.shutdown_inc() - @property def max_model_len(self) -> int: return self.model_config.max_model_len From cef2f54b1d369195b5485161f9af941caa11d734 Mon Sep 17 00:00:00 2001 From: Zehao Huang Date: Wed, 25 Sep 2024 20:28:48 +0800 Subject: [PATCH 239/819] Setting enough cache_size_limit for torch.compile warmup (#238) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the issue that warmup sometimes doesn't work because the default cache_size_limit is only 8 . 
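The sizing rule from the diff below, pulled out as a standalone sketch (the bucket counts here are hypothetical placeholders):

```python
import torch
import torch._dynamo  # makes torch._dynamo.config available

# One compiled graph is needed per warmup bucket, plus one spare slot.
num_prompt_buckets, num_decode_buckets = 24, 48  # hypothetical values
cache_size_limit = num_prompt_buckets + num_decode_buckets + 1

torch._dynamo.config.cache_size_limit = max(
    cache_size_limit, torch._dynamo.config.cache_size_limit)
# Keep the default 8x ratio between the accumulated and per-code limits.
torch._dynamo.config.accumulated_cache_size_limit = max(
    cache_size_limit * 8, torch._dynamo.config.accumulated_cache_size_limit)
```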
--------- Signed-off-by: zehao-intel Co-authored-by: Andrzej KotƂowski --- vllm/worker/habana_model_runner.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 6940e7637dbb7..394bb5318d10e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1553,6 +1553,17 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: len(self.decode_buckets), list(sorted(self.decode_buckets))) + if not htorch.utils.internal.is_lazy() and not self.enforce_eager: + cache_size_limit = len(self.prompt_buckets) + len( + self.decode_buckets) + 1 + torch._dynamo.config.cache_size_limit = max( + cache_size_limit, torch._dynamo.config.cache_size_limit) + # Multiply by 8 to follow the original default ratio between + # the cache_size_limit and accumulated_cache_size_limit + torch._dynamo.config.accumulated_cache_size_limit = max( + cache_size_limit * 8, + torch._dynamo.config.accumulated_cache_size_limit) + start_mem = HabanaMemoryProfiler.current_device_memory_usage() start_time = time.perf_counter() From 45ee5863019e955305c3ea545f9a0ccec3ff8bee Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Wed, 25 Sep 2024 15:28:28 +0200 Subject: [PATCH 240/819] Change default values for decode bucket flags (#316) Change default values for decode bucket flags --- README_GAUDI.md | 12 ++++++------ docs/source/getting_started/gaudi-installation.rst | 12 ++++++------ vllm/worker/habana_model_runner.py | 11 +++++------ 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 0ef30d5f96e64..04e2ff22f96e5 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -321,7 +321,7 @@ for graph capture (later referred to as \"usable graph memory\"), and the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default -(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory +(`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable @@ -388,7 +388,7 @@ INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 G INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) +INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... 
INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB @@ -448,7 +448,7 @@ Environment variables - `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default - `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory - dedicated for prompt graphs, `0.5` by default + dedicated for prompt graphs, `0.3` by default - `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default - `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode @@ -472,15 +472,15 @@ Environment variables `max_model_len` - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `min(max_num_seqs, 32)` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): - `128` + `block_size` - block size step - (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `128` + (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 8c4905e2a488a..db1d8666e4800 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -245,7 +245,7 @@ Only after that, ``gpu_memory_utilization`` flag is utilized - at its default va Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. -Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints. +Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. .. note:: @@ -280,7 +280,7 @@ Each described step is logged by vLLM server, as follows (negative values corres INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... 
INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.5) + INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB @@ -324,7 +324,7 @@ Environment variables - ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default - ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default -- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default - ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default - ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default - ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism @@ -343,11 +343,11 @@ Environment variables - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` - Decode: - - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)`` + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` - - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``128`` - - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``128`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 394bb5318d10e..e80df4e7c8c16 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -670,7 +670,6 @@ def _setup_buckets(self) -> None: if self.lora_config and \ max_bucket_cfg > self.max_num_batched_tokens // self.block_size: max_bucket_cfg = self.max_num_batched_tokens // self.block_size - blocks_step = 128 #FIXME: The default values should be max_model_len max_prompt_seq = 1024 max_decode_seq = 2048 @@ -682,7 +681,7 @@ def _setup_buckets(self) -> None: max=align_bs(max_bucket_cfg)) self.decode_bs_bucket_cfg = read_bucket_settings('decode', 'bs', - min=align_bs(32), + min=1, step=align_bs(32), max=self.max_num_seqs) self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', @@ -693,9 +692,9 @@ def _setup_buckets(self) -> None: self.decode_block_bucket_cfg = read_bucket_settings( 'decode', 'block', - min=blocks_step, - step=blocks_step, - max=max(blocks_step, + min=self.block_size, + step=self.block_size, + max=max(self.block_size, self.max_num_seqs * max_decode_seq // self.block_size)) 
self.graphed_buckets: Set[Any] = set() @@ -1594,7 +1593,7 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: graph_free_mem = align_workers(graph_free_mem, torch.distributed.ReduceOp.MIN) prompt_graph_mem_ratio = float( - os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.5')) + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3')) prompt_available_memory = (prompt_graph_mem_ratio * graph_free_mem) decode_available_memory = (graph_free_mem - From 29fb5edd1df36aa4fa0ff95c7b2cbb711b8cb035 Mon Sep 17 00:00:00 2001 From: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com> Date: Wed, 25 Sep 2024 19:19:40 +0300 Subject: [PATCH 241/819] Support loading checkpoints quantized using Autofp8 (#286) Support loading https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127 Skip cuda checks Use scaled_fp8_quant instead of _scaled_mm Fix weights and weight_scale for guudi2 flot8_e4m3fn range. --------- Co-authored-by: Nir David Co-authored-by: Konrad Zawora --- requirements-hpu.txt | 3 +- .../layers/fused_moe/fused_moe.py | 4 ++ .../compressed_tensors/compressed_tensors.py | 9 +++-- .../schemes/compressed_tensors_w8a8_fp8.py | 4 +- .../model_executor/layers/quantization/fp8.py | 24 +++++++---- .../layers/quantization/utils/w8a8_utils.py | 40 +++++++++++++++---- vllm/worker/habana_model_runner.py | 3 +- 7 files changed, 64 insertions(+), 23 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index c7376a7c504f9..1af5460128fbb 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,5 @@ ray == 2.32.0 triton pandas tabulate -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab + +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 3e01112eaa14d..cf17f1e240e47 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -13,6 +13,10 @@ from vllm.logger import init_logger from vllm.platforms import current_platform +if current_platform.is_hpu(): + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + logger = init_logger(__name__) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index e536fae45c845..252ad864ced3e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -243,8 +243,10 @@ def _get_scheme_from_parts( # TODO @dsikka: clean-up conditions if is_activation_quantization_format(self.quant_format): if self._is_fp8_w8a8(weight_quant, input_quant): - is_fp8_w8a8_supported = self._check_scheme_supported( - CompressedTensorsW8A8Fp8.get_min_capability(), error=False) + is_fp8_w8a8_supported = current_platform.is_hpu() or \ + self._check_scheme_supported( + CompressedTensorsW8A8Fp8.get_min_capability(), + error=False) if is_fp8_w8a8_supported: return CompressedTensorsW8A8Fp8( strategy=weight_quant.strategy, @@ -314,7 +316,8 @@ def get_scheme( # Raise error if device does not support the scheme # (e.g. 
fp8 needs ada lovelace) - self._check_scheme_supported(scheme.get_min_capability()) + if not current_platform.is_hpu(): + self._check_scheme_supported(scheme.get_min_capability()) return scheme diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 5931ec36c97d5..29f3228c0dc5d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -13,6 +13,7 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) +from vllm.platforms import current_platform from vllm.utils import is_hip __all__ = ["CompressedTensorsW8A8Fp8"] @@ -23,7 +24,8 @@ class CompressedTensorsW8A8Fp8(CompressedTensorsScheme): def __init__(self, strategy: str, is_static_input_scheme: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme - self.cutlass_fp8_supported = cutlass_fp8_supported() + self.cutlass_fp8_supported = not current_platform.is_hpu() and \ + cutlass_fp8_supported() @classmethod def get_min_capability(cls) -> int: diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b5feb55db0e74..88915942220ca 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -28,6 +28,10 @@ from vllm.platforms import current_platform from vllm.utils import is_hip, print_warning_once +if current_platform.is_hpu(): + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + ACTIVATION_SCHEMES = ["static", "dynamic"] logger = init_logger(__name__) @@ -116,14 +120,18 @@ class Fp8LinearMethod(LinearMethodBase): def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config - self.cutlass_fp8_supported = cutlass_fp8_supported() - - # For GPUs that lack FP8 hardware support, we can leverage the Marlin - # kernel for fast weight-only FP8 quantization - self.use_marlin = (not current_platform.has_device_capability(89) - or envs.VLLM_TEST_FORCE_FP8_MARLIN) - # Disable marlin for rocm - if is_hip(): + if current_platform.is_cuda_alike(): + self.cutlass_fp8_supported = cutlass_fp8_supported() + + # For GPUs that lack FP8 hardware support, we can leverage the + # Marlin kernel for fast weight-only FP8 quantization + self.use_marlin = (not current_platform.has_device_capability(89) + or envs.VLLM_TEST_FORCE_FP8_MARLIN) + # Disable marlin for rocm + if is_hip(): + self.use_marlin = False + else: + self.cutlass_fp8_supported = False self.use_marlin = False def create_weights( diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index fb263d121fe55..048962721e26b 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -10,6 +10,11 @@ # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None +if current_platform.is_hpu(): + import habana_frameworks.torch.utils.experimental as htexp + from vllm_hpu_extension.ops import scaled_fp8_quant + ops.scaled_fp8_quant = scaled_fp8_quant + def cutlass_fp8_supported() -> bool: # cutlass is not supported on Rocm @@ -25,7 +30,15 @@ def cutlass_fp8_supported() -> bool: def per_tensor_dequantize( tensor: torch.Tensor, inv_scale: Union[float, torch.Tensor]) -> torch.Tensor: - fake_qweight = tensor.to(torch.float16) + dtype = torch.float16 + device = tensor.device + if current_platform.is_hpu(): + dtype = torch.bfloat16 + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + #dequant on cpu to avoid nan on gaudi2 + tensor = tensor.to('cpu') + + fake_qweight = tensor.to(dtype).to(device) dq_weight = fake_qweight * inv_scale return dq_weight @@ -58,7 +71,10 @@ def requantize_with_max_scale( logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: # Max scale to be used for requanitzation. max_w_scale = weight_scale.max() - + if current_platform.is_hpu() and htexp._get_device_type( + ) == htexp.synDeviceType.synDeviceGaudi2: + max_w_scale = max_w_scale * (torch.finfo(torch.float8_e4m3fn).max / + torch.finfo(torch.float8_e4m3fnuz).max) # QKV / MLP is fused in the on disk checkpoint if any of the # weight scales are still set to the default since we initialize # N weight scales for N shards but we only load 1 weight scale @@ -129,12 +145,20 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output = torch._scaled_mm(qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - bias=bias) + if current_platform.is_hpu(): + #hpu does not support torch._scaled_mm (SW-197036) + output = torch.ops.hpu.fp8_gemm_v2(qinput, False, weight, + False, None, input.dtype, + x_scale, weight_scale, None, + False) + else: + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + # A fix for discrepancy in scaled_mm which returns tuple # for torch < 2.5 and a single value in torch >= 2.5 if type(output) is tuple and len(output) == 2: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index e80df4e7c8c16..c43acdf04923b 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -587,8 +587,7 @@ def _set_gc_threshold(self) -> None: def load_model(self) -> None: import habana_frameworks.torch.core as htcore - if self.model_config.quantization == 'inc': - htcore.hpu_set_env() + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model(model_config=self.model_config, From 4c8a6c6092532d8df3f45831d2bfa2715a06507f Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Thu, 26 Sep 2024 20:26:16 +0800 Subject: [PATCH 242/819] Fix torch.compile issue of dispatch key set mismatch (#299) ### Issue: torch.compile recompiles after warmup because `tensor 'L['input_ids']' dispatch key set mismatch. expected DispatchKeySet(HPU, BackendSelect), actual DispatchKeySet(HPU, BackendSelect, ADInplaceOrView). 
` ### Detail: Run script with `TORCH_LOGS="guards"` and get different dispatch key set info: - warmup: ``` TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(HPU, BackendSelect), torch.int64, device=0, requires_grad=False, size=[2, 1], stride=[1, 1]) # masked_input = input_ # ome/zyuwen/workspace/vllm/habana_main_g3_v2/vllm/model_executor/layers/vocab_parallel_embedding.py:358 in forward ``` - after warmup: ``` TENSOR_MATCH: check_tensor(L['input_ids'], Tensor, DispatchKeySet(HPU, BackendSelect, ADInplaceOrView), torch.int64, device=0, requires_grad=False, size=[2, 1], stride=[1, 1]) # masked_input = input_ # ome/zyuwen/workspace/vllm/habana_main_g3_v2/vllm/model_executor/layers/vocab_parallel_embedding.py:358 in forward ``` ### Solution: The difference in dispatch key set is caused by the 'torch.inference_mode()' decoration, and here is a simple example: ```python import torch import habana_frameworks.torch as htorch @torch.inference_mode() def func(): x = torch.rand(3, 3).to("hpu") print(torch._C._dispatch_key_set(x)) func() # output: DispatchKeySet(HPU, AutocastHPU) ``` ```python import torch import habana_frameworks.torch as htorch def func(): x = torch.rand(3, 3).to("hpu") print(torch._C._dispatch_key_set(x)) func() # output: DispatchKeySet(HPU, ADInplaceOrView, AutogradHPU, AutocastHPU) ``` In vllm-fork, the warmup phase is decorated with `torch.inference_mode()` in [habana_model_runner.py#L1487-L1488](https://github.com/HabanaAI/vllm-fork/blob/b62fba85ac03326e9f466d8d37e91ae1b14a6511/vllm/worker/habana_model_runner.py#L1487-L1488), but the after-warmup phase is not. So in this PR I add the decorator to `prepare_input_tensors` function to keep the dispatch key set the same. ---
PR Checklist

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain code quality and improve the efficiency of the review process.

PR Title and Classification

Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following:

  • [Bugfix] for bug fixes.
  • [CI/Build] for build or continuous integration improvements.
  • [Doc] for documentation fixes and improvements.
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • [Frontend] for changes to the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.

Note: If the PR spans more than one category, please include all relevant prefixes.

Code Quality

The PR needs to meet the following code quality standards:

  • We adhere to the Google Python style guide and the Google C++ style guide.
  • Pass all linter checks. Please use format.sh to format your code.
  • The code needs to be well-documented so that future contributors can easily understand it.
  • Include sufficient tests to ensure that the project stays correct and robust. This includes both unit tests and integration tests.
  • Please add documentation to docs/source/ if the PR modifies user-facing behavior of vLLM. This helps vLLM users understand and utilize the new features or changes.

Notes for Large Changes

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and may not review the PR.

What to Expect for the Reviews

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

  • After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
  • After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.

Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!

Signed-off-by: yuwenzho --- vllm/worker/habana_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c43acdf04923b..f3bda39ec4822 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1790,6 +1790,7 @@ def make_model_input_from_broadcasted_tensor_dict( attn_backend=self.attn_backend, )) + @torch.inference_mode() def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], From 1c6bada23884043cdd2a5715bce405bf2bb000f0 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 26 Sep 2024 14:53:29 +0200 Subject: [PATCH 243/819] Chunk prefill cache writes, remove div_i32 from insert_or_update_cache (#289) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-implements following PRs for current habana_main: https://github.com/HabanaAI/vllm-fork/pull/102 (Removing div_i32 operations from each layer) https://github.com/HabanaAI/vllm-fork/pull/115 (removing scatter for reshape&cache in case of prompt) Accuracy (GSM8K on Llama3.1-8B-Instruct): | Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr| |---------------|------:|----------------|-----:|-----------|---|-----:|---|-----:| |gsm8k_cot_llama| 3|flexible-extract| 8|exact_match|↑ |0.8415|± |0.0101| | | |strict-match | 8|exact_match|↑ |0.8400|± |0.0101| I've benchmarked this change on Llama3.1-8B-Instruct and on average, +2.50% throughput gain (+558.14 tok/s, ~21594 tok/s -> ~22152 tok/s) can be observed across all prefill buckets on G2, with up to +4.40% (+956.79 tok/s, ~25031 -> ~25988 tok/s) throughput increase in compute-bound scenarios. --- requirements-hpu.txt | 3 +-- vllm/attention/backends/habana_attn.py | 17 +++++++++-------- vllm/attention/ops/habana_paged_attn.py | 2 ++ vllm/worker/habana_model_runner.py | 22 +++++++++++++++++++++- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 1af5460128fbb..33619dc4883d5 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,5 +6,4 @@ ray == 2.32.0 triton pandas tabulate - -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab \ No newline at end of file +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@940fdb7 diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py index 59a99b89c293f..dad33fefc51f3 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/habana_attn.py @@ -8,7 +8,6 @@ import torch import vllm_hpu_extension.ops as ops -from vllm_hpu_extension import cache_ops from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, @@ -166,6 +165,11 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) + block_indices = attn_metadata.block_indices + block_offsets = attn_metadata.block_offsets + if attn_metadata.is_prompt: + key = key.unflatten(0, (block_indices.size(0), -1)) + value = value.unflatten(0, (block_indices.size(0), -1)) if kv_cache is not None: key_cache, value_cache = HabanaPagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -173,13 +177,10 @@ def forward( # Reshape the input keys and values and store them in the cache. 
# If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. - num_kv_cache_passes, num_slots_available, indices, offsets = \ - cache_ops.prepare_to_cache(key_cache, - attn_metadata.slot_mapping) - key_cache = self.k_cache(key, key_cache, num_kv_cache_passes, - num_slots_available, indices, offsets) - value_cache = self.v_cache(value, value_cache, num_kv_cache_passes, - num_slots_available, indices, offsets) + key_cache = self.k_cache(key, key_cache, block_indices, + block_offsets) + value_cache = self.v_cache(value, value_cache, block_indices, + block_offsets) if attn_metadata.is_prompt: # Prompt run. diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py index 49a3e3f774d58..7f080e0727457 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/habana_paged_attn.py @@ -18,6 +18,8 @@ class HabanaPagedAttentionMetadata: block_list: Optional[torch.Tensor] block_mapping: Optional[torch.Tensor] block_usage: Optional[torch.Tensor] + block_indices: Optional[torch.Tensor] + block_offsets: Optional[torch.Tensor] class HabanaPagedAttention: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f3bda39ec4822..d3d2973688843 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -245,6 +245,17 @@ def pad_list(list, k, v): return list + [v] * padding +def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = indices.unflatten(0, (-1, block_size))[:, 0] + offsets = None + else: + offsets = torch.fmod(slot_mapping, block_size) + return indices, offsets + + class HpuModelAdapter(): def __init__(self, model, block_size, dtype, enforce_eager): @@ -890,11 +901,15 @@ def _prepare_prompt( dtype=torch.long, device=self.device) + block_indices, block_offsets = precompute_indices_and_offsets( + self.block_size, slot_mapping, True) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, block_list=None, block_mapping=None, block_usage=None, + block_indices=block_indices, + block_offsets=block_offsets, attn_bias=None, seq_lens_tensor=seq_lens_tensor, num_prefills=real_num_seqs, @@ -1044,11 +1059,15 @@ def _prepare_decode( dtype=torch.long, device=self.device) + block_indices, block_offsets = precompute_indices_and_offsets( + self.block_size, slot_mapping, False) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, block_list=block_list, block_mapping=block_mapping, block_usage=block_usage, + block_indices=block_indices, + block_offsets=block_offsets, attn_bias=None, seq_lens_tensor=None, num_prefills=0, @@ -1266,7 +1285,8 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # input_hash("abc") != input_hash("cba") attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', - 'block_usage', 'slot_mapping', 'is_prompt' + 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', + 'block_offsets' ]) return attention_metadata From 5ffcfa3e377c83331f0f062ef90a2ab2f6b40da4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 26 Sep 2024 17:39:49 +0200 Subject: [PATCH 244/819] Update cpu-test.yml --- .github/workflows/cpu-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml index 
89a702f9751d9..60af77749bb1f 100644 --- a/.github/workflows/cpu-test.yml +++ b/.github/workflows/cpu-test.yml @@ -27,6 +27,7 @@ jobs: run: | python -m pip install --upgrade pip pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + pip install -r requirements-build.txt pip install -r requirements-hpu.txt VLLM_TARGET_DEVICE=hpu python setup.py develop - name: cpu-test From c3577af3b52bd93b69dcc224f77179133bcdfc49 Mon Sep 17 00:00:00 2001 From: Vivek Goel Date: Fri, 27 Sep 2024 12:28:36 +0530 Subject: [PATCH 245/819] Fix runtime errors reported when using long input sequence lengths with LoRA (#339) This PR has following fixes, - Increase size of indices tensors used to maintain multi-lora state information from max_num_batched_tokens to 3*max_num_batched_tokens. This increase is done to provide buffer for padding done in batch & sequence dimensions. - Move logic to remove padding from lora_logits from execute_model() back to Class LogitsProcessorWithLoRA, this is done to fix race condition caused by updating multi-lora state information directly. FIX https://github.com/HabanaAI/vllm-fork/issues/237 --- vllm/lora/layers.py | 2 ++ vllm/lora/models.py | 2 +- vllm/worker/habana_model_runner.py | 20 ++++++-------------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index b3758ad883d56..06160367054e4 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1203,6 +1203,8 @@ def _get_logits( ).index_select(0, indices_padded).nan_to_num_(nan=float("-inf"), posinf=float("inf"), neginf=float("-inf"))) + if current_platform.is_hpu(): + lora_logits = lora_logits[:logits.shape[0], :] logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + lora_logits.shape[1], ] = lora_logits diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 546a4c402aedc..582170a2df627 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -432,7 +432,7 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None if current_platform.is_hpu(): self.punica_wrapper = GaudiPunicaWrapper( - max_num_batched_tokens, + 3 * max_num_batched_tokens, max_batches=self.max_num_seqs, device="hpu") else: diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index d3d2973688843..bfbe4085ddd3f 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1203,9 +1203,9 @@ def prepare_input_tensors( if self.lora_config: lora_mapping = LoRAMapping( - lora_index_mapping, - lora_prompt_mapping, - ) + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=(num_prefills > 0))) else: lora_mapping = None @@ -1370,9 +1370,9 @@ def warmup_scenario(self, times = 3 if use_graphs or is_pt_profiler_run else 1 if self.lora_config and not is_lora_profile_run: lora_mapping = LoRAMapping( - [0] * batch_size * seq_len, - [0] * batch_size * seq_len, - ) + **dict(index_mapping=[0] * batch_size * seq_len, + prompt_mapping=[0] * batch_size * seq_len, + is_prefill=is_prompt)) self.set_active_loras(set(), lora_mapping) if is_prompt: seqs = [ @@ -1915,14 +1915,6 @@ def execute_model( ) if self.lora_config: - from vllm.lora.layers import VocabParallelEmbeddingWithLoRA - modules = unwrap_model(self.model.model) - for module in modules: - if isinstance(module, VocabParallelEmbeddingWithLoRA): - for i in range(0, len(module.punica_wrapper.indices_len)): - module.punica_wrapper.indices_len[ - i] = sampling_metadata.selected_token_indices.numel( - ) 
lora_logits_mask: torch.Tensor = model_input.lora_logits_mask LoraMask.setLoraMask( lora_logits_mask.index_select( From ed85058387bdab264de44bee40f1f75ea847db72 Mon Sep 17 00:00:00 2001 From: Yu-Zhou Date: Fri, 27 Sep 2024 21:47:23 +0800 Subject: [PATCH 246/819] Enable Async output process for HPU (#342) FILL IN THE PR DESCRIPTION HERE This PR refer to [#7049](https://github.com/vllm-project/vllm/pull/7049) to implement Asynchronous Output Processor on HPU. It is open by default, to disable it, please pass the `--disable_async_output_proc` flag. From my local test on latest habana_main branch(commit 29fb5edd1df36aa4fa0ff95c7b2cbb711b8cb035), the throughput improves from 3847 TPS to 4011 TPS. **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---
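For reference, a minimal sketch of turning the feature off in offline mode; it assumes the standard vLLM `LLM` entry point forwards `disable_async_output_proc` to the engine arguments (the flag name comes from the description above), and the model name is only an example:

```python
from vllm import LLM, SamplingParams

# Async output processing is enabled by default on HPU after this change;
# set disable_async_output_proc=True to fall back to the synchronous path
# (mirrors the --disable_async_output_proc server flag mentioned above).
llm = LLM(model="meta-llama/Llama-2-7b-hf", disable_async_output_proc=True)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
```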
PR Checklist

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain code quality and improve the efficiency of the review process.

PR Title and Classification

Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following:

  • [Bugfix] for bug fixes.
  • [CI/Build] for build or continuous integration improvements.
  • [Doc] for documentation fixes and improvements.
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • [Frontend] for changes to the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.

Note: If the PR spans more than one category, please include all relevant prefixes.

Code Quality

The PR needs to meet the following code quality standards:

  • We adhere to the Google Python style guide and the Google C++ style guide.
  • Pass all linter checks. Please use format.sh to format your code.
  • The code needs to be well-documented so that future contributors can easily understand it.
  • Include sufficient tests to ensure that the project stays correct and robust. This includes both unit tests and integration tests.
  • Please add documentation to docs/source/ if the PR modifies user-facing behavior of vLLM. This helps vLLM users understand and utilize the new features or changes.

Adding or changing kernels

Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.

  • Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators and The Custom Operators Manual
  • Custom operations that return Tensors require meta-functions. Meta-functions should be implemented and registered in Python so that dynamic dimensions can be handled automatically. See the above documents for a description of meta-functions.
  • Use torch.library.opcheck() to test the function registration and meta-function for any registered ops (a minimal sketch follows this list). See tests/kernels for examples.
  • When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.
  • If a new custom type is needed, see the following document: Custom Class Support in PT2.
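The following is a minimal, self-contained sketch of that workflow, not taken from this repository: the op name (mylib::scale_rows) and its body are hypothetical, and it assumes a recent PyTorch release that provides the torch.library.custom_op decorator and torch.library.opcheck.

```python
import torch


# Hypothetical custom op: the schema is derived from the type annotations.
@torch.library.custom_op("mylib::scale_rows", mutates_args=())
def scale_rows(x: torch.Tensor, scale: float) -> torch.Tensor:
    return x * scale


@scale_rows.register_fake
def _(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Meta-function: only describes the output shape/dtype, so dynamic
    # dimensions can be traced without running the real kernel.
    return torch.empty_like(x)


# opcheck validates the schema, registration, and meta-function against
# sample inputs.
torch.library.opcheck(scale_rows, (torch.randn(4, 4), 2.0))
```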

Notes for Large Changes

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and may not review the PR.

What to Expect for the Reviews

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

  • After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
  • After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.

Thank You

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!

--- vllm/config.py | 5 +++-- vllm/worker/habana_model_runner.py | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index eef1c2bfb9df9..e732c84c54520 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -372,9 +372,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - if device_config.device_type not in ("cuda", "tpu"): + if device_config.device_type not in ("cuda", "tpu", "hpu"): logger.warning( - "Async output processing is only supported for CUDA or TPU. " + "Async output processing is only supported for CUDA, TPU " + "and HPU. " "Disabling it for other platforms.") self.use_async_output_proc = False return diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index bfbe4085ddd3f..f3f679dbd1878 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -428,6 +428,7 @@ class ModelInputForHPU(ModelRunnerInputBase): virtual_engine: int = 0 lora_mask: Optional[torch.Tensor] = None lora_logits_mask: Optional[torch.Tensor] = None + async_callback: Optional[Callable] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -1934,6 +1935,9 @@ def execute_model( if not self.is_driver_worker: return [] + if model_input.async_callback is not None: + model_input.async_callback() + # Sample the next token. with self.profiler.record_event( 'internal', ('sample_' From b611e209eff27383f3b25ec15f667c23008c837d Mon Sep 17 00:00:00 2001 From: Iryna Boiko Date: Mon, 30 Sep 2024 07:10:32 +0200 Subject: [PATCH 247/819] Port last_bucket change from v1.18.0 (#347) Port last_bucket change from v1.18.0 --- vllm/worker/habana_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index f3f679dbd1878..79133aaf8f0f2 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -178,8 +178,7 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, bs_buckets = warmup_range(bs_bucket_config) block_buckets = warmup_range(blocks_bucket_config) bmin, bstep, bmax = blocks_bucket_config - last_bucket = max_blocks if (max_blocks // bstep - == 0) else (max_blocks // bstep + 1) * bstep + last_bucket = round_up(max_blocks, bstep) for bs in bs_buckets: for blocks in block_buckets: if blocks < bs: From 3010f8cbd93d6696560f55d18ef6b074ad2535f4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 30 Sep 2024 11:41:21 +0200 Subject: [PATCH 248/819] Add setuptools_scm to requirements-hpu.txt (#349) This removes the crash during installation for dependency that's inside requirements-build.txt --- requirements-hpu.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 33619dc4883d5..62ff11eba81e2 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -6,4 +6,6 @@ ray == 2.32.0 triton pandas tabulate +setuptools>=61 +setuptools-scm>=8 vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@940fdb7 From 44d8173bb1d94bd20dbfbb75e52483c2296ed28e Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Thu, 19 Sep 2024 08:57:53 +0300 Subject: [PATCH 249/819] test_lora_manager fix --- tests/lora/test_lora_manager_hpu.py | 553 ++++++++++++++++++++++++++++ 1 file changed, 553 insertions(+) create mode 100644 tests/lora/test_lora_manager_hpu.py diff --git a/tests/lora/test_lora_manager_hpu.py 
b/tests/lora/test_lora_manager_hpu.py new file mode 100644 index 0000000000000..ef3bf5272d709 --- /dev/null +++ b/tests/lora/test_lora_manager_hpu.py @@ -0,0 +1,553 @@ +import os +from typing import Dict, List + +import pytest +import torch +from safetensors.torch import load_file +from torch import nn + +from vllm.config import LoRAConfig +from vllm.lora.layers import (ColumnParallelLinearWithLoRA, + MergedColumnParallelLinearWithLoRA, + RowParallelLinearWithLoRA) +from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights +from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, + LRUCacheLoRAModelManager) +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, + WorkerLoRAManager) +from vllm.model_executor.layers.linear import RowParallelLinear + +EMBEDDING_MODULES = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", +} + +EMBEDDING_PADDING_MODULES = ["lm_head"] + + +def test_from_lora_tensors(sql_lora_files): + tensors = load_file( + os.path.join(sql_lora_files, "adapter_model.safetensors")) + new_embeddings = load_file( + os.path.join(sql_lora_files, "new_embeddings.safetensors")) + lora_model = LoRAModel.from_lora_tensors( + 1, + 8, + 16, + tensors, + "hpu", + embeddings=new_embeddings, + embedding_modules=EMBEDDING_MODULES, + embedding_padding_modules=EMBEDDING_PADDING_MODULES) + for module_name, lora in lora_model.loras.items(): + assert lora.module_name == module_name + assert lora.rank == 8 + assert lora.lora_alpha == 16 + assert lora.lora_a is not None + assert lora.lora_b is not None + assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] + ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" + assert lora.lora_a.shape[1] == 8 + embeddings_module = next( + (k for k in EMBEDDING_MODULES if k in module_name), None) + if embeddings_module: + assert torch.equal( + lora.embeddings_tensor, + new_embeddings[EMBEDDING_MODULES[embeddings_module]].to( + device=lora.embeddings_tensor.device)) + else: + assert lora.embeddings_tensor is None + + +def create_lora(lora_id: int, model: nn.Module, + sub_modules: List[str]) -> LoRAModel: + loras: Dict[str, LoRALayerWeights] = {} + for name in sub_modules: + w = model.get_submodule(name).weight + loras[name] = LoRALayerWeights( + name, + 8, + 16, + torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([8, w.shape[0]], device="hpu"), + ) + return LoRAModel(lora_id, 8, loras) + + +def create_packed_lora( + lora_id: int, + model: nn.Module, + module_name, + replaced_module_names, + empty_replaced_module_name=None, +) -> LoRAModel: + w = model.get_submodule(module_name).weight + loras: Dict[str, LoRALayerWeights] = {} + for replaced_module_name in replaced_module_names: + if replaced_module_name == empty_replaced_module_name: + continue + loras[replaced_module_name] = LoRALayerWeights( + replaced_module_name, + 8, + 16, + torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([8, w.shape[0] // len(replaced_module_names)], + device="hpu"), + ) + return LoRAModel(lora_id, 8, loras) + + +def test_replace_submodules(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "layer1.dense2"] + model.packed_modules_mapping = {} + manager = LoRAModelManager( + model, 1, 1, 1, + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) + model = manager.model + + assert isinstance(model.get_submodule("dense1"), + ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("layer1.dense1"), + 
ColumnParallelLinearWithLoRA) + assert isinstance(model.get_submodule("dense2"), RowParallelLinear) + assert isinstance(model.get_submodule("layer1.dense2"), + RowParallelLinearWithLoRA) + + +def test_lora_model_manager(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + manager = LoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + assert all(x is None for x in manager.lora_index_to_id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 1 + assert not manager.add_adapter(model_lora1) + assert not manager.activate_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert not manager.add_adapter(model_lora2) + assert not manager.activate_adapter(2) + assert manager.add_adapter(model_lora3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + with pytest.raises(ValueError): + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert manager.remove_adapter(model_lora2.id) + assert manager.lora_index_to_id[1] is None + assert not manager.remove_adapter(model_lora2.id) + assert manager.remove_adapter(model_lora1.id) + assert not manager.remove_adapter(model_lora1.id) + assert manager.add_adapter(model_lora1) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] is None + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] is None + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + + +def test_lora_lru_cache_model_manager(dist_init, dummy_model): + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + manager = LRUCacheLoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + assert all(x is None for x in manager.lora_index_to_id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 1 + assert not manager.add_adapter(model_lora1) + assert not manager.activate_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert not manager.add_adapter(model_lora2) + assert not manager.activate_adapter(2) + assert manager.add_adapter(model_lora3) + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + assert manager.remove_adapter(model_lora2.id) + assert manager.lora_index_to_id[1] is None + assert not 
manager.remove_adapter(model_lora2.id) + assert manager.remove_adapter(model_lora1.id) + assert not manager.remove_adapter(model_lora1.id) + assert manager.add_adapter(model_lora1) + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.add_adapter(model_lora2) + assert manager.deactivate_adapter(3) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 3 + assert manager.activate_adapter(1) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.deactivate_adapter(2) + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 1 + assert manager.activate_adapter(3) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + assert manager.pin_adapter(3) + assert manager.pin_adapter(1) + with pytest.raises(RuntimeError): + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 1 + with pytest.raises(RuntimeError): + assert manager.activate_adapter(2) + + assert manager.deactivate_adapter(3) + assert manager.pin_adapter(2) + assert manager.lora_index_to_id[0] == 2 + assert manager.lora_index_to_id[1] == 1 + assert manager.remove_adapter(3) + with pytest.raises(ValueError): + assert manager.pin_adapter(3) + + +def test_lru_lora_model_manager(dist_init, dummy_model): + # This tests just the LRU cache functionality, everything else is + # tested in test_lora_model_manager + model = dummy_model + model.supported_lora_modules = ["dense1", "dense2", "lm_head"] + model.packed_modules_mapping = {} + model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) + model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) + model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) + model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) + manager = LRUCacheLoRAModelManager( + model, 2, 2, 2, + LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + + assert all(x is None for x in manager.lora_index_to_id) + + # Add up to capacity + assert manager.add_adapter(model_lora1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(1) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + # Add over capacity + assert manager.add_adapter(model_lora3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(3) + assert manager.activate_adapter(4) + + assert set(manager.list_adapters()) == {3, 4} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 4 + + # Add 3 again to move it to the top and then add 2 + # should return false since it's in already + assert not manager.add_adapter(model_lora3) + assert not manager.activate_adapter(3) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {3, 2} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 2 + + # Remove 
manually + assert manager.remove_adapter(3) + assert not manager.remove_adapter(3) + + assert set(manager.list_adapters()) == {2} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 2 + + assert manager.add_adapter(model_lora3) + assert manager.activate_adapter(3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(4) + + assert set(manager.list_adapters()) == {3, 4} + assert manager.lora_index_to_id[0] == 3 + assert manager.lora_index_to_id[1] == 4 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == set() + assert all(x is None for x in manager.lora_index_to_id) + + assert not manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == set() + assert all(x is None for x in manager.lora_index_to_id) + + # pinning + assert manager.add_adapter(model_lora3) + assert manager.activate_adapter(3) + assert manager.add_adapter(model_lora4) + assert manager.activate_adapter(4) + assert set(manager.list_adapters()) == {3, 4} + with pytest.raises(ValueError): + assert manager.pin_adapter(1) + assert manager.pin_adapter(3) + # Remove manually + assert manager.remove_adapter(3) + assert not manager.remove_adapter(3) + + assert set(manager.list_adapters()) == {4} + assert manager.lora_index_to_id[0] is None + assert manager.lora_index_to_id[1] == 4 + + assert manager.add_adapter(model_lora1) + assert manager.pin_adapter(1) + assert manager.add_adapter(model_lora2) + assert manager.activate_adapter(2) + + assert set(manager.list_adapters()) == {1, 2} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] == 2 + + assert manager.remove_oldest_adapter() + assert set(manager.list_adapters()) == {1} + assert manager.lora_index_to_id[0] == 1 + assert manager.lora_index_to_id[1] is None + + with pytest.raises(RuntimeError): + assert manager.remove_oldest_adapter() + + assert set(manager.list_adapters()) == {1} + + +def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, + sql_lora_files): + lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + worker_adapter_manager = LRUCacheWorkerLoRAManager( + 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + worker_adapter_manager.create_lora_manager( + llama_2_7b_model_extra_embeddings.model) + + mapping = LoRAMapping([], []) + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 3, 4} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 3 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + 
worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 4, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 6, 7, 8} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 7 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 8 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[3] == 6 + + # Over capacity + with pytest.raises(RuntimeError): + worker_adapter_manager.set_active_adapters([ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files) + ], mapping) + + +def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, + sql_lora_files): + # Should remove every LoRA not specified in the request. 
+ lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) + worker_adapter_manager = WorkerLoRAManager( + 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - + lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) + worker_adapter_manager.create_lora_manager( + llama_2_7b_model_extra_embeddings.model) + + mapping = LoRAMapping([], []) + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("3", 3, sql_lora_files), + LoRARequest("4", 4, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 3, 4} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 3 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 4 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("2", 2, sql_lora_files), + LoRARequest("5", 5, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1, 2, 5} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 2 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 5 + + worker_adapter_manager.set_active_adapters([ + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files), + LoRARequest("1", 1, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {1} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 1 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] is None + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] is None + + worker_adapter_manager.set_active_adapters([ + LoRARequest("6", 6, sql_lora_files), + LoRARequest("7", 7, sql_lora_files), + LoRARequest("8", 8, sql_lora_files) + ], mapping) + assert worker_adapter_manager.list_adapters() == {6, 7, 8} + assert worker_adapter_manager._adapter_manager.lora_index_to_id[0] == 8 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[1] == 6 + assert worker_adapter_manager._adapter_manager.lora_index_to_id[2] == 7 + + # Over capacity + with pytest.raises(RuntimeError): + worker_adapter_manager.set_active_adapters([ + LoRARequest("10", 10, sql_lora_files), + LoRARequest("11", 11, sql_lora_files), + LoRARequest("12", 12, sql_lora_files), + LoRARequest("13", 13, sql_lora_files), + LoRARequest("14", 14, sql_lora_files) + ], mapping) + + +def test_packed_loras(dist_init, dummy_model_gate_up): + model = dummy_model_gate_up + model.supported_lora_modules = ["gate_up_proj"] + model.packed_modules_mapping = { + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + model_lora = create_packed_lora( + 1, + model, + module_name="gate_up_proj", + replaced_module_names=["gate_proj", "up_proj"]) + model_lora1 = create_packed_lora( + 2, + model, + module_name="gate_up_proj", + replaced_module_names=["gate_proj", "up_proj"], + empty_replaced_module_name="gate_proj", + ) + + manager = LoRAModelManager( + model, 2, 2, 2, + 
LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + model = manager.model + + assert isinstance(model.get_submodule("gate_up_proj"), + MergedColumnParallelLinearWithLoRA) + assert manager.add_adapter(model_lora) + assert manager.add_adapter(model_lora1) + + packed_lora = model_lora.get_lora("gate_up_proj") + assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) + + assert torch.allclose(packed_lora.lora_a[0], + model_lora.get_lora("gate_proj").lora_a) + assert torch.allclose(packed_lora.lora_b[0], + model_lora.get_lora("gate_proj").lora_b) + assert torch.allclose(packed_lora.lora_a[1], + model_lora.get_lora("up_proj").lora_a) + assert torch.allclose(packed_lora.lora_b[1], + model_lora.get_lora("up_proj").lora_b) + + packed_lora1 = model_lora1.get_lora("gate_up_proj") + assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) + + assert packed_lora1.lora_a[0] is None + assert packed_lora1.lora_b[0] is None + assert torch.allclose(packed_lora1.lora_a[1], + model_lora1.get_lora("up_proj").lora_a) + assert torch.allclose(packed_lora1.lora_b[1], + model_lora1.get_lora("up_proj").lora_b) From 188bd3adaa27a35cf05608e4383037d0ad2cb7e2 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 23 Sep 2024 10:13:59 +0300 Subject: [PATCH 250/819] Added both hpu and gpu specific changes confest --- tests/lora/conftest.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index d3ebd15510284..099158798aa56 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -84,12 +84,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] + if is_hpu(): + backend_type = "hccl" + else: + backend_type = "nccl" init_distributed_environment( world_size=1, rank=0, distributed_init_method=f"file://{temp_file}", local_rank=0, - backend="nccl", + backend=backend_type, ) initialize_model_parallel(1, 1) yield @@ -259,8 +263,13 @@ def get_model_patched(*, model_config, device_config, **kwargs): device_config=device_config, **kwargs) - with patch("vllm.worker.model_runner.get_model", get_model_patched): - engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + if is_hpu(): + with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): + engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + else: + with patch("vllm.worker.model_runner.get_model", get_model_patched): + engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) + yield engine.llm_engine del engine cleanup() From f59495ad30dd838c5b2dbd83154e4182f7f1df16 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 30 Sep 2024 14:32:35 +0300 Subject: [PATCH 251/819] Added the changes to conftest to fix test_lora_manager --- tests/lora/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 099158798aa56..0b7e381075637 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -262,7 +262,6 @@ def get_model_patched(*, model_config, device_config, **kwargs): return get_model_old(model_config=model_config, device_config=device_config, **kwargs) - if is_hpu(): with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) From b0a9d02ea329e8135aa4ace1681f3751ed99e227 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 30 Sep 2024 14:27:12 +0300 Subject: 
[PATCH 252/819] Applied the format changes in conftest --- tests/lora/conftest.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 0b7e381075637..77fb0897f6113 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -84,10 +84,7 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - if is_hpu(): - backend_type = "hccl" - else: - backend_type = "nccl" + backend_type = "hccl" if is_hpu() else "nccl" init_distributed_environment( world_size=1, rank=0, @@ -262,8 +259,10 @@ def get_model_patched(*, model_config, device_config, **kwargs): return get_model_old(model_config=model_config, device_config=device_config, **kwargs) + if is_hpu(): - with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): + with patch("vllm.worker.habana_model_runner.get_model", + get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) else: with patch("vllm.worker.model_runner.get_model", get_model_patched): From 70f544c93baacf781fd37bc71c76d71d9fc3b2c8 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Tue, 1 Oct 2024 08:36:03 +0300 Subject: [PATCH 253/819] Resolved format issues in conftest --- tests/lora/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 77fb0897f6113..35224d508fab3 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -267,7 +267,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): else: with patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) - + yield engine.llm_engine del engine cleanup() From ec34f88ecb68af760aa1cd74f95a7b714e3c8039 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Tue, 1 Oct 2024 10:14:54 +0300 Subject: [PATCH 254/819] Added changes of HPU flags --- tests/lora/conftest.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 35224d508fab3..1c30f4147e8b5 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -24,6 +24,7 @@ from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader import get_model +from vllm.platforms import current_platform class ContextIDInfo(TypedDict): @@ -48,18 +49,13 @@ class ContextInfo(TypedDict): }] -def is_hpu(): - from importlib import util - return util.find_spec('habana_frameworks') is not None - - def cleanup(): destroy_model_parallel() destroy_distributed_environment() with contextlib.suppress(AssertionError): torch.distributed.destroy_process_group() gc.collect() - if not is_hpu(): + if not current_platform.is_hpu(): torch.cuda.empty_cache() ray.shutdown() @@ -84,7 +80,7 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): @pytest.fixture def dist_init(): temp_file = tempfile.mkstemp()[1] - backend_type = "hccl" if is_hpu() else "nccl" + backend_type = "hccl" if current_platform.is_hpu() else "nccl" init_distributed_environment( world_size=1, rank=0, @@ -260,7 +256,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): device_config=device_config, **kwargs) - if is_hpu(): + if current_platform.is_hpu(): with patch("vllm.worker.habana_model_runner.get_model", get_model_patched): engine = 
vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) From da03d8b8fa14fbc1cb276d19849a6c40b86a8b0e Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Thu, 3 Oct 2024 16:09:10 +0530 Subject: [PATCH 255/819] Lora Mask based on lora index (#348) Changes the filling of lora mask from lora_id to lora_index. This is needed to ensure that the mask does not fail in case lora id is greater than max_loras --- vllm/worker/habana_model_runner.py | 211 +++++++++++++++-------------- 1 file changed, 113 insertions(+), 98 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 79133aaf8f0f2..2d72be5690664 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -350,8 +350,7 @@ class PreparePromptMetadata(NamedTuple): lora_requests: Set[LoRARequest] multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]] slot_mapping: List[List[int]] - lora_mask: Optional[torch.Tensor] - lora_logits_mask: Optional[torch.Tensor] + lora_ids: List[int] @classmethod def empty(cls): @@ -365,8 +364,7 @@ def empty(cls): lora_requests=set(), multi_modal_kwargs=None, slot_mapping=[], - lora_mask=None, - lora_logits_mask=None) + lora_ids=[]) class PrepareDecodeMetadata(NamedTuple): @@ -377,8 +375,7 @@ class PrepareDecodeMetadata(NamedTuple): lora_prompt_mapping: List[List[int]] lora_requests: Set[LoRARequest] slot_mapping: List[List[int]] - lora_mask: Optional[torch.Tensor] - lora_logits_mask: Optional[torch.Tensor] + lora_ids: List[int] @classmethod def empty(cls): @@ -389,8 +386,7 @@ def empty(cls): lora_prompt_mapping=[], lora_requests=set(), slot_mapping=[], - lora_mask=None, - lora_logits_mask=None) + lora_ids=[]) # How batches are constructed. @@ -425,8 +421,7 @@ class ModelInputForHPU(ModelRunnerInputBase): real_batch_size: Optional[int] = None batch_size_padded: Optional[int] = None virtual_engine: int = 0 - lora_mask: Optional[torch.Tensor] = None - lora_logits_mask: Optional[torch.Tensor] = None + lora_ids: Optional[List[int]] = None async_callback: Optional[Callable] = None def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: @@ -439,8 +434,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "real_batch_size": self.real_batch_size, "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, - "lora_mask": self.lora_mask, - "lora_logits_mask": self.lora_logits_mask, + "lora_ids": self.lora_ids, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -474,8 +468,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "lora_requests": self.lora_requests, "lora_mapping": self.lora_mapping, "multi_modal_kwargs": self.multi_modal_kwargs, - "lora_mask": self.lora_mask, - "lora_logits_mask": self.lora_logits_mask, + "lora_ids": self.lora_ids, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict, @@ -836,38 +829,14 @@ def _prepare_prompt( find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), self.block_size) - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - counter = 0 - if self.lora_config: - lora_mask = torch.zeros( - len(seq_group_metadata_list) * max_prompt_len, - (self.lora_config.max_loras) * self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - lora_logits_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - 
dtype=self.lora_config.lora_dtype) - - ones = torch.ones(max_prompt_len, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - logit_ones = torch.ones(1, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_ids: List[int] = [] for seq_group_metadata, context_len in zip(seq_group_metadata_list, context_lens): lora_id = seq_group_metadata.lora_int_id + lora_ids.append(lora_id) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - start_row = counter * max_prompt_len - end_row = start_row + max_prompt_len - start_col = (lora_id - 1) * self.lora_config.max_lora_rank - end_col = start_col + self.lora_config.max_lora_rank - lora_mask[start_row:end_row, start_col:end_col] = ones - lora_logits_mask[counter, start_col:end_col] = logit_ones - counter = counter + 1 lora_index_mapping += [lora_id] * (max_prompt_len - context_len) lora_prompt_mapping.extend( @@ -875,10 +844,6 @@ def _prepare_prompt( (max_prompt_len - context_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) - if lora_mask is not None: - lora_mask = lora_mask.to('hpu') - lora_logits_mask = lora_logits_mask.to('hpu') - input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, @@ -919,20 +884,17 @@ def _prepare_prompt( ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) - return PreparePromptMetadata( - input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - seq_lens=seq_lens, - query_lens=query_lens, - lora_index_mapping=lora_index_mapping, - lora_prompt_mapping=lora_prompt_mapping, - lora_requests=lora_requests, - multi_modal_kwargs=multi_modal_kwargs, - slot_mapping=slot_mapping, - lora_mask=lora_mask, - lora_logits_mask=lora_logits_mask, - ) + return PreparePromptMetadata(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + multi_modal_kwargs=multi_modal_kwargs, + slot_mapping=slot_mapping, + lora_ids=lora_ids) def _prepare_decode( self, @@ -949,18 +911,7 @@ def _prepare_decode( if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - counter = 0 - - if self.lora_config: - lora_mask = torch.zeros(len(seq_group_metadata_list), - (self.lora_config.max_loras) * - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) - ones = torch.ones(1, - self.lora_config.max_lora_rank, - dtype=self.lora_config.lora_dtype) + lora_ids: List[int] = [] dummy_slots = itertools.cycle( range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) @@ -971,13 +922,10 @@ def _prepare_decode( seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id + lora_ids.append(lora_id) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - start_pos = (lora_id - 1) * self.lora_config.max_lora_rank - end_pos = start_pos + self.lora_config.max_lora_rank - lora_mask[counter, start_pos:end_pos] = ones - counter = counter + 1 for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] @@ -1012,9 +960,6 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - if lora_mask is not None: - lora_mask = lora_mask.to('hpu') - lora_logits_mask = lora_mask input_tokens = torch.tensor(input_tokens, dtype=torch.long, 
device=self.device) @@ -1075,17 +1020,14 @@ def _prepare_decode( num_decode_tokens=num_decode_tokens, slot_mapping=slot_mapping, ) - return PrepareDecodeMetadata( - input_tokens=input_tokens, - input_positions=input_positions, - attn_metadata=attn_metadata, - lora_index_mapping=lora_index_mapping, - lora_prompt_mapping=lora_prompt_mapping, - lora_requests=lora_requests, - slot_mapping=slot_mapping, - lora_mask=lora_mask, - lora_logits_mask=lora_logits_mask, - ) + return PrepareDecodeMetadata(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + slot_mapping=slot_mapping, + lora_ids=lora_ids) def prepare_input_tensors( self, @@ -1142,8 +1084,7 @@ def prepare_input_tensors( lora_requests, multi_modal_kwargs, slot_mapping, - lora_mask, - lora_logits_mask, + lora_ids, ) = self._prepare_prompt(prefill_reqs) ( decode_input_tokens, @@ -1153,8 +1094,7 @@ def prepare_input_tensors( decode_lora_prompt_mapping, decode_lora_requests, decode_slot_mapping, - decode_lora_mask, - decode_lora_logits_mask, + decode_lora_ids, ) = self._prepare_decode(decode_reqs) sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, @@ -1181,8 +1121,7 @@ def prepare_input_tensors( lora_index_mapping = decode_lora_index_mapping lora_prompt_mapping = decode_lora_prompt_mapping lora_requests = decode_lora_requests - lora_mask = decode_lora_mask - lora_logits_mask = decode_lora_logits_mask + lora_ids = decode_lora_ids # FIXME: We need to adjust selected_token_indices to accommodate # for padding @@ -1252,8 +1191,7 @@ def prepare_input_tensors( multi_modal_kwargs=multi_modal_kwargs, real_batch_size=real_batch_size, batch_size_padded=batch_size_padded, - lora_mask=lora_mask, - lora_logits_mask=lora_logits_mask), \ + lora_ids=lora_ids), \ sampling_metadata def _seq_len(self, attn_metadata): @@ -1853,6 +1791,76 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) + def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], + is_prompt: bool): + ''' + This is a helper function to create the mask for lora computations. + Lora Mask is needed to ensure we match the correct lora weights for the + for the request. 
+ For Prompt phase we have + lora_mask with shape (batch_size * seq_len, max_loras * max_rank) + lora_logits_mask with shape (batch_size, max_loras * max_rank) + For Decode phase we have both + lora_mask and lora_logits_mask with shape + (batch_size, max_loras * max_rank) + ''' + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + lora_index = 0 + + if self.lora_config: + if is_prompt: + lora_mask = torch.zeros( + input_tokens.shape[0] * input_tokens.shape[1], + (self.lora_config.max_loras) *\ + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + lora_logits_mask = torch.zeros( + input_tokens.shape[0], (self.lora_config.max_loras) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + + ones = torch.ones(input_tokens.shape[1], + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + logit_ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + + for i in range(len(lora_ids)): + if lora_ids[i] == 0: + continue + lora_index = self.lora_manager._adapter_manager.\ + lora_index_to_id.index(lora_ids[i]) + start_row = i * input_tokens.shape[1] + end_row = start_row + input_tokens.shape[1] + start_col = lora_index * self.lora_config.max_lora_rank + end_col = start_col + self.lora_config.max_lora_rank + lora_mask[start_row:end_row, start_col:end_col] = ones + lora_logits_mask[i, start_col:end_col] = logit_ones + lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_logits_mask.to('hpu') + else: + lora_mask = torch.zeros(input_tokens.shape[0], + (self.lora_config.max_loras) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + for i in range(len(lora_ids)): + if lora_ids[i] == 0: + continue + lora_index = self.lora_manager._adapter_manager.\ + lora_index_to_id.index(lora_ids[i]) + start_pos = lora_index * self.lora_config.max_lora_rank + end_pos = start_pos + self.lora_config.max_lora_rank + lora_mask[i, start_pos:end_pos] = ones + lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_mask + + return lora_mask, lora_logits_mask + @torch.inference_mode() def execute_model( self, @@ -1887,13 +1895,21 @@ def execute_model( seq_len = self._seq_len(attn_metadata) use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + if self.lora_config: + assert model_input.lora_ids is not None + lora_mask, lora_logits_mask = self.create_lora_mask( + input_tokens, model_input.lora_ids, attn_metadata.is_prompt) + execute_model_kwargs = { "input_ids": input_tokens, "positions": input_positions, "kv_caches": kv_caches, "attn_metadata": self.trim_attn_metadata(attn_metadata), "intermediate_tensors": intermediate_tensors, - "lora_mask": model_input.lora_mask, + "lora_mask": lora_mask, **(model_input.multi_modal_kwargs or {}), } if htorch.utils.internal.is_lazy(): @@ -1915,7 +1931,6 @@ def execute_model( ) if self.lora_config: - lora_logits_mask: torch.Tensor = model_input.lora_logits_mask LoraMask.setLoraMask( lora_logits_mask.index_select( 0, sampling_metadata.selected_token_indices)) From f848d27b24d307e872bfed7572659882b341efaa Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Thu, 3 Oct 2024 15:47:47 +0200 Subject: [PATCH 256/819] Add rope_scaling support for LLama3.1 (#356) Add support for rope scaling and FusedRoPE in LLama3.1 --- 
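Note: a minimal, self-contained sketch of the selection logic this patch adds to `get_rope` (illustrative only; the stub classes below merely stand in for vLLM's `Llama3RotaryEmbedding` and the `HpuLlama3RotaryEmbedding` shipped by vllm-hpu-extension, and the simplified signatures are assumptions, not the real argument lists shown in the diff):

```python
# Illustrative stand-ins only -- the real classes live in vLLM and
# vllm-hpu-extension and take the full RoPE argument list from the diff below.
class Llama3RotaryEmbedding:
    def __init__(self, **rope_args):
        self.rope_args = rope_args


class HpuLlama3RotaryEmbedding:
    # The HPU variant accepts the upstream class as a fallback (RoPEFallback),
    # mirroring the call added in rotary_embedding.py below.
    def __init__(self, RoPEFallback=None, **rope_args):
        self.fallback = RoPEFallback
        self.rope_args = rope_args


def select_llama3_rope(is_hpu: bool, **rope_args):
    """Simplified mirror of the new llama3 rope_scaling branch in get_rope()."""
    if is_hpu:
        return HpuLlama3RotaryEmbedding(RoPEFallback=Llama3RotaryEmbedding,
                                        **rope_args)
    return Llama3RotaryEmbedding(**rope_args)


print(type(select_llama3_rope(is_hpu=True)).__name__)   # HpuLlama3RotaryEmbedding
print(type(select_llama3_rope(is_hpu=False)).__name__)  # Llama3RotaryEmbedding
```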
requirements-hpu.txt | 2 +- .../model_executor/layers/rotary_embedding.py | 26 ++++++++++++++----- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 62ff11eba81e2..602a5060c29aa 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@940fdb7 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb56d3b diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 5b746ae928b16..30bcf954c99b5 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -31,7 +31,8 @@ from vllm.platforms import current_platform if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding + from vllm_hpu_extension.rotary_embed import (HpuLlama3RotaryEmbedding, + HpuRotaryEmbedding) def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -943,12 +944,23 @@ def get_rope( high_freq_factor = rope_scaling["high_freq_factor"] original_max_position = rope_scaling[ "original_max_position_embeddings"] - rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, - max_position, base, - is_neox_style, dtype, - scaling_factor, low_freq_factor, - high_freq_factor, - original_max_position) + if current_platform.is_hpu(): + rotary_emb = HpuLlama3RotaryEmbedding( + head_size, + rotary_dim, + max_position, + base, + is_neox_style, + scaling_factor, + low_freq_factor, + high_freq_factor, + original_max_position, + RoPEFallback=Llama3RotaryEmbedding) + else: + rotary_emb = Llama3RotaryEmbedding( + head_size, rotary_dim, max_position, base, is_neox_style, + dtype, scaling_factor, low_freq_factor, high_freq_factor, + original_max_position) elif scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, max_position, base, From d8ba780f233ebb3a66fad1dbd879d6ac29a116df Mon Sep 17 00:00:00 2001 From: Marcin Swiniarski Date: Fri, 4 Oct 2024 10:33:42 +0200 Subject: [PATCH 257/819] [Core] Support Torch profiler in Habana Worker (#357) This PR allows to profile execution on HPU through flag VLLM_TORCH_PROFILER_DIR. Similar as it is done for GPU. The profiling can be controlled: 1. Asynchronously by posting requests to the server: a) to start collecting profile: ` curl -X POST http://localhost:8080/start_profile ` b) to stop collecting profile: ` curl -X POST http://localhost:8080/stop_profile ` 2. In script, by instructing LLM object to start and stop profiling: ```python from vllm import LLM, SamplingParams llm = LLM(...) 
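# Note (illustrative): VLLM_TORCH_PROFILER_DIR must be set in the environment
# before the LLM above is constructed (see the habana_worker.py hunk below);
# the work to be traced, e.g. llm.generate(prompts, SamplingParams()), is
# placed between start_profile() and stop_profile():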
llm.start_profile() llm.stop_profile() ``` --- vllm/engine/async_llm_engine.py | 7 +++++-- vllm/engine/llm_engine.py | 7 +++++-- vllm/engine/multiprocessing/engine.py | 7 +++++-- vllm/executor/habana_executor.py | 6 ++++++ vllm/worker/habana_worker.py | 27 +++++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 6 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 9e6eecf992520..cb489084f48de 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -16,6 +16,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync +from vllm.executor.habana_executor import HabanaExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType from vllm.logger import init_logger @@ -1204,7 +1205,8 @@ def remove_logger(self, logger_name: str) -> None: async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 + if type(self.engine.model_executor) == GPUExecutorAsync or \ + type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") @@ -1212,7 +1214,8 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes - if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721 + if type(self.engine.model_executor) == GPUExecutorAsync or \ + type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5cb0161b73e2c..f41d074ad536c 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,6 +28,7 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor +from vllm.executor.habana_executor import HabanaExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -1794,7 +1795,8 @@ def check_health(self) -> None: def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: # noqa: E721 + if type(self.model_executor) == GPUExecutor or \ + type(self.model_executor) == HabanaExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1802,7 +1804,8 @@ def start_profile(self) -> None: def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) - if type(self.model_executor) == GPUExecutor: # noqa: E721 + if type(self.model_executor) == GPUExecutor or \ + type(self.model_executor) == HabanaExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index eecca82cd2f7d..49500099fbcaf 
100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,6 +23,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor +from vllm.executor.habana_executor import HabanaExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -364,13 +365,15 @@ def _alive(self): self._last_alive_time = time.time() def start_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor: + if type(self.engine.model_executor) is GPUExecutor or \ + type(self.engine.model_executor) is HabanaExecutor: self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: - if type(self.engine.model_executor) is GPUExecutor: + if type(self.engine.model_executor) is GPUExecutor or \ + type(self.engine.model_executor) is HabanaExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index e4bd54f8849b3..e6d0fbc0d431d 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -192,6 +192,12 @@ def check_health(self) -> None: # it's running. return + def start_profile(self) -> None: + self.driver_worker.start_profile() + + def stop_profile(self) -> None: + self.driver_worker.stop_profile() + def shutdown(self) -> None: self.driver_worker.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 2e4dfeac42c3e..7fc1e48b8c960 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -11,6 +11,7 @@ import torch.distributed from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes +import vllm.envs as envs from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, @@ -95,6 +96,32 @@ def __init__( self.cache_engine: List[CacheEngine] # Initialize gpu_cache as embedding models don't initialize kv_caches self.hpu_cache: Optional[List[List[torch.tensor]]] = None + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. 
Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.HPU, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None + + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() + + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() def _set_env_vars(self): local_rank = self.local_rank From 250487b567a889c8936acb119131b84fb242e423 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:35:25 +0300 Subject: [PATCH 258/819] [Refactor] Rename components *Habana* -> *HPU* --- vllm/engine/async_llm_engine.py | 16 ++++++++-------- vllm/engine/llm_engine.py | 14 +++++++------- vllm/engine/multiprocessing/engine.py | 6 +++--- .../{habana_executor.py => hpu_executor.py} | 8 ++++---- ...ay_habana_executor.py => ray_hpu_executor.py} | 8 ++++---- ...abana_model_runner.py => hpu_model_runner.py} | 8 ++++---- vllm/worker/{habana_worker.py => hpu_worker.py} | 6 +++--- vllm/worker/kzawora.code-workspace | 11 +++++++++++ 8 files changed, 44 insertions(+), 33 deletions(-) rename vllm/executor/{habana_executor.py => hpu_executor.py} (97%) rename vllm/executor/{ray_habana_executor.py => ray_hpu_executor.py} (99%) rename vllm/worker/{habana_model_runner.py => hpu_model_runner.py} (99%) rename vllm/worker/{habana_worker.py => hpu_worker.py} (99%) create mode 100644 vllm/worker/kzawora.code-workspace diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index cb489084f48de..a2a940148b87e 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -16,7 +16,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync -from vllm.executor.habana_executor import HabanaExecutorAsync +from vllm.executor.hpu_executor import HPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType from vllm.logger import init_logger @@ -620,12 +620,12 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import ( - RayHabanaExecutorAsync) - executor_class = RayHabanaExecutorAsync + from vllm.executor.ray_hpu_executor import ( + RayHPUExecutorAsync) + executor_class = RayHPUExecutorAsync else: - from vllm.executor.habana_executor import HabanaExecutorAsync - executor_class = HabanaExecutorAsync + from vllm.executor.hpu_executor import HPUExecutorAsync + executor_class = HPUExecutorAsync elif engine_config.device_config.device_type == "openvino": assert distributed_executor_backend is None, ( "Distributed execution is not supported with " @@ -1206,7 +1206,7 @@ async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") 
@@ -1215,7 +1215,7 @@ async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f41d074ad536c..3635443421e88 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,7 +28,7 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -533,11 +533,11 @@ def _get_executor_cls(cls, elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import RayHabanaExecutor - executor_class = RayHabanaExecutor + from vllm.executor.ray_hpu_executor import RayHPUExecutor + executor_class = RayHPUExecutor else: - from vllm.executor.habana_executor import HabanaExecutor - executor_class = HabanaExecutor + from vllm.executor.hpu_executor import HPUExecutor + executor_class = HPUExecutor elif engine_config.device_config.device_type == "openvino": from vllm.executor.openvino_executor import OpenVINOExecutor executor_class = OpenVINOExecutor @@ -1796,7 +1796,7 @@ def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1805,7 +1805,7 @@ def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 49500099fbcaf..3501f12c065cf 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,7 +23,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -366,14 +366,14 @@ def _alive(self): def start_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.start_profile() else: 
self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/hpu_executor.py similarity index 97% rename from vllm/executor/habana_executor.py rename to vllm/executor/hpu_executor.py index e6d0fbc0d431d..cc5609ebe5c8e 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/hpu_executor.py @@ -21,7 +21,7 @@ logger = init_logger(__name__) -class HabanaExecutor(ExecutorBase): +class HPUExecutor(ExecutorBase): uses_ray: bool = False @@ -57,8 +57,8 @@ def _create_worker(self, rank: int = 0, distributed_init_method: Optional[str] = None): wrapper = WorkerWrapperBase( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", + worker_module_name="vllm.worker.hpu_worker", + worker_class_name="HPUWorker", ) wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) @@ -202,7 +202,7 @@ def shutdown(self) -> None: self.driver_worker.shutdown_inc() -class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): +class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_hpu_executor.py similarity index 99% rename from vllm/executor/ray_habana_executor.py rename to vllm/executor/ray_hpu_executor.py index 645bceb1af446..343fa43b0eda1 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -29,7 +29,7 @@ logger = init_logger(__name__) -class RayHabanaExecutor(DistributedGPUExecutor): +class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True @@ -90,8 +90,8 @@ def _get_worker_module_and_class( raise NotImplementedError( "Speculative decoding is not implemented for HPU") else: - worker_module_name = "vllm.worker.habana_worker" - worker_class_name = "HabanaWorker" + worker_module_name = "vllm.worker.hpu_worker" + worker_class_name = "HPUWorker" return (worker_module_name, worker_class_name, worker_class_fn) def _get_worker_wrapper_args(self) -> Dict[str, Any]: @@ -479,7 +479,7 @@ def __del__(self): self.shutdown() -class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): +class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/hpu_model_runner.py similarity index 99% rename from vllm/worker/habana_model_runner.py rename to vllm/worker/hpu_model_runner.py index 2d72be5690664..2ee3832e6e076 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -489,7 +489,7 @@ def from_broadcasted_tensor_dict( return cls(**tensor_dict) -class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): +class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): """ Helper class for shared methods between GPU model runners. """ @@ -1730,8 +1730,8 @@ def unwrap_model(model): return modules -class HabanaModelRunner( - HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): +class HPUModelRunner( + HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. 
""" @@ -1872,7 +1872,7 @@ def execute_model( ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( - "num_steps > 1 is not supported in HabanaModelRunner") + "num_steps > 1 is not supported in HPUModelRunner") if self.lora_config: assert model_input.lora_requests is not None diff --git a/vllm/worker/habana_worker.py b/vllm/worker/hpu_worker.py similarity index 99% rename from vllm/worker/habana_worker.py rename to vllm/worker/hpu_worker.py index 7fc1e48b8c960..59a5adf65ebc1 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/hpu_worker.py @@ -25,14 +25,14 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine -from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) -class HabanaWorker(LocalOrDistributedWorkerBase): +class HPUWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. The worker is responsible for @@ -79,7 +79,7 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner: HabanaModelRunner = HabanaModelRunner( + self.model_runner: HPUModelRunner = HPUModelRunner( model_config, parallel_config, scheduler_config, diff --git a/vllm/worker/kzawora.code-workspace b/vllm/worker/kzawora.code-workspace new file mode 100644 index 0000000000000..d5ced898f0957 --- /dev/null +++ b/vllm/worker/kzawora.code-workspace @@ -0,0 +1,11 @@ +{ + "folders": [ + { + "path": "../../.." + }, + { + "path": "../.." + } + ], + "settings": {} +} \ No newline at end of file From eb095b3f4f98d4a64657da5bb4e4d3c825527d33 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:38:31 +0300 Subject: [PATCH 259/819] oopsie --- vllm/worker/kzawora.code-workspace | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 vllm/worker/kzawora.code-workspace diff --git a/vllm/worker/kzawora.code-workspace b/vllm/worker/kzawora.code-workspace deleted file mode 100644 index d5ced898f0957..0000000000000 --- a/vllm/worker/kzawora.code-workspace +++ /dev/null @@ -1,11 +0,0 @@ -{ - "folders": [ - { - "path": "../../.." - }, - { - "path": "../.." 
- } - ], - "settings": {} -} \ No newline at end of file From 65fa6f6bfa733c3cb64e090d9624e9afa335b1cf Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:42:14 +0300 Subject: [PATCH 260/819] format.sh --- vllm/engine/async_llm_engine.py | 3 +-- vllm/worker/hpu_model_runner.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index a2a940148b87e..3ba73b68580fb 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -620,8 +620,7 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_hpu_executor import ( - RayHPUExecutorAsync) + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync executor_class = RayHPUExecutorAsync else: from vllm.executor.hpu_executor import HPUExecutorAsync diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 2ee3832e6e076..b1b62e6bde7f6 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1730,8 +1730,7 @@ def unwrap_model(model): return modules -class HPUModelRunner( - HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): +class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. """ From 05763607d8da304a12e0f218d97ae26d2b169e36 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:44:52 +0300 Subject: [PATCH 261/819] make yapf happy --- vllm/engine/async_llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3ba73b68580fb..6f3b73dbeee20 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -620,7 +620,7 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync executor_class = RayHPUExecutorAsync else: from vllm.executor.hpu_executor import HPUExecutorAsync From b4e26d3af5293c38cea95233defc7c834fc2b3fd Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 19:16:59 +0300 Subject: [PATCH 262/819] fix sampler metadata generation --- vllm/model_executor/sampling_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ee02368bec8a8..84f35f75a0c32 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -284,7 +284,8 @@ def _prepare_seq_groups( else: # Decode prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None else 1 + query_len = query_lens[i] if query_lens is not None and len( + query_lens) > 0 else 1 sample_len = len(seq_ids) * query_len if do_sample else 0 if sampling_params.seed is not None and generators is not None: From cfe231d905fe9e3ecf779eaf62e5d177900a0e6e Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Fri, 4 Oct 2024 18:35:34 +0200 Subject: [PATCH 263/819] [Refactor] Rename components *Habana* -> *HPU* (#359) Refactoring Gaudi-specific components to use `hpu` name instead of `habana` (e.g. 
`habana_model_runner.py` -> `hpu_model_runner.py`, `habana_executor.py` -> `hpu_executor.py`, etc.), as suggested in the upstream PR. --- README_GAUDI.md | 78 +++++++++---------- .../getting_started/gaudi-installation.rst | 78 +++++++++---------- vllm/engine/async_llm_engine.py | 15 ++-- vllm/engine/llm_engine.py | 14 ++-- vllm/engine/multiprocessing/engine.py | 6 +- .../{habana_executor.py => hpu_executor.py} | 8 +- ...habana_executor.py => ray_hpu_executor.py} | 8 +- ...na_model_runner.py => hpu_model_runner.py} | 7 +- .../{habana_worker.py => hpu_worker.py} | 6 +- 9 files changed, 109 insertions(+), 111 deletions(-) rename vllm/executor/{habana_executor.py => hpu_executor.py} (97%) rename vllm/executor/{ray_habana_executor.py => ray_hpu_executor.py} (99%) rename vllm/worker/{habana_model_runner.py => hpu_model_runner.py} (99%) rename vllm/worker/{habana_worker.py => hpu_worker.py} (99%) diff --git a/README_GAUDI.md b/README_GAUDI.md index 04e2ff22f96e5..6ba3bb50d4a04 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -195,10 +195,10 @@ batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: ``` {.} -INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` `min` determines the lowest value of the bucket. 
`step` determines the @@ -267,17 +267,17 @@ graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: ``` {.} -INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB -INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB -INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB +INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB +INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB ... -INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB -INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB -INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB +INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB +INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB +INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB ... -INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB -INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB +INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` This example uses the same buckets as in *Bucketing mechanism* section. 
@@ -374,35 +374,35 @@ Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): ``` {.} -INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] -INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] -INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] -INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache -INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 -INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) -INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB +INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] +INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] +INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] +INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), 
(2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache +INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 +INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... -INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB -INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) -INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB +INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 4.755 GiB for prompt and 11.095 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) +INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... -INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB -INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB +INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB +INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB ... 
-INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB -INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB -INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB -INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB -INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB -INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] -INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] -INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory -INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB +INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB +INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] +INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] +INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory +INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB 
used) ``` Recommended vLLM Parameters diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index db1d8666e4800..5915de92802d9 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -173,10 +173,10 @@ Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max` .. code-block:: - INFO 08-01 21:37:59 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-01 21:37:59 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-01 21:37:59 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-01 21:37:59 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. @@ -216,17 +216,17 @@ Warmup is an optional, but highly recommended step occurring before vLLM server .. 
code-block:: - INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB - INFO 08-01 22:26:47 habana_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB - INFO 08-01 22:26:48 habana_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB ... - INFO 08-01 22:26:59 habana_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB - INFO 08-01 22:27:00 habana_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB - INFO 08-01 22:27:01 habana_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB ... - INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB - INFO 08-01 22:27:16 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. @@ -266,35 +266,35 @@ Each described step is logged by vLLM server, as follows (negative values corres .. 
code-block:: - INFO 08-02 17:37:44 habana_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] - INFO 08-02 17:37:44 habana_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] - INFO 08-02 17:37:44 habana_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] - INFO 08-02 17:37:44 habana_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:37:52 habana_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 habana_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:52 habana_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) - INFO 08-02 17:37:54 habana_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 habana_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache - INFO 08-02 17:37:54 habana_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 - INFO 08-02 17:37:54 habana_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) - INFO 08-02 17:37:54 habana_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 
384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB ... - INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB - INFO 08-02 17:38:22 habana_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) - INFO 08-02 17:38:22 habana_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB ... - INFO 08-02 17:38:26 habana_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB - INFO 08-02 17:38:27 habana_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB ... 
- INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB - INFO 08-02 17:38:41 habana_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB - INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB - INFO 08-02 17:38:42 habana_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB - INFO 08-02 17:38:43 habana_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB - INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] - INFO 08-02 17:38:43 habana_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] - INFO 08-02 17:38:43 habana_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory - INFO 08-02 17:38:43 habana_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory 
(475.4 GiB/1007 GiB used) Recommended vLLM Parameters diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index cb489084f48de..6f3b73dbeee20 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -16,7 +16,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync -from vllm.executor.habana_executor import HabanaExecutorAsync +from vllm.executor.hpu_executor import HPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType from vllm.logger import init_logger @@ -620,12 +620,11 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import ( - RayHabanaExecutorAsync) - executor_class = RayHabanaExecutorAsync + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync + executor_class = RayHPUExecutorAsync else: - from vllm.executor.habana_executor import HabanaExecutorAsync - executor_class = HabanaExecutorAsync + from vllm.executor.hpu_executor import HPUExecutorAsync + executor_class = HPUExecutorAsync elif engine_config.device_config.device_type == "openvino": assert distributed_executor_backend is None, ( "Distributed execution is not supported with " @@ -1206,7 +1205,7 @@ async def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") @@ -1215,7 +1214,7 @@ async def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes if type(self.engine.model_executor) == GPUExecutorAsync or \ - type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721 + type(self.engine.model_executor) == HPUExecutorAsync: # noqa: E721 self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f41d074ad536c..3635443421e88 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -28,7 +28,7 @@ from vllm.entrypoints.openai.logits_processors import get_logits_processors from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, InputRegistry, LLMInputs, PromptType) @@ -533,11 +533,11 @@ def _get_executor_cls(cls, elif engine_config.device_config.device_type == "hpu": if distributed_executor_backend == "ray": initialize_ray_cluster(engine_config.parallel_config) - from vllm.executor.ray_habana_executor import RayHabanaExecutor - executor_class = RayHabanaExecutor + from vllm.executor.ray_hpu_executor import RayHPUExecutor + executor_class = RayHPUExecutor else: - from vllm.executor.habana_executor import HabanaExecutor - executor_class = HabanaExecutor + from vllm.executor.hpu_executor import HPUExecutor 
+ executor_class = HPUExecutor elif engine_config.device_config.device_type == "openvino": from vllm.executor.openvino_executor import OpenVINOExecutor executor_class = OpenVINOExecutor @@ -1796,7 +1796,7 @@ def start_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.start_profile() else: self.model_executor._run_workers("start_profile") @@ -1805,7 +1805,7 @@ def stop_profile(self) -> None: # using type instead of isinstance to check to avoid capturing # inherited classes (MultiprocessingGPUExecutor) if type(self.model_executor) == GPUExecutor or \ - type(self.model_executor) == HabanaExecutor: # noqa: E721 + type(self.model_executor) == HPUExecutor: # noqa: E721 self.model_executor.stop_profile() else: self.model_executor._run_workers("stop_profile") diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 49500099fbcaf..3501f12c065cf 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -23,7 +23,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.executor.gpu_executor import GPUExecutor -from vllm.executor.habana_executor import HabanaExecutor +from vllm.executor.hpu_executor import HPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext @@ -366,14 +366,14 @@ def _alive(self): def start_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.start_profile() else: self.engine.model_executor._run_workers("start_profile") def stop_profile(self) -> None: if type(self.engine.model_executor) is GPUExecutor or \ - type(self.engine.model_executor) is HabanaExecutor: + type(self.engine.model_executor) is HPUExecutor: self.engine.model_executor.stop_profile() else: self.engine.model_executor._run_workers("stop_profile") diff --git a/vllm/executor/habana_executor.py b/vllm/executor/hpu_executor.py similarity index 97% rename from vllm/executor/habana_executor.py rename to vllm/executor/hpu_executor.py index e6d0fbc0d431d..cc5609ebe5c8e 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/hpu_executor.py @@ -21,7 +21,7 @@ logger = init_logger(__name__) -class HabanaExecutor(ExecutorBase): +class HPUExecutor(ExecutorBase): uses_ray: bool = False @@ -57,8 +57,8 @@ def _create_worker(self, rank: int = 0, distributed_init_method: Optional[str] = None): wrapper = WorkerWrapperBase( - worker_module_name="vllm.worker.habana_worker", - worker_class_name="HabanaWorker", + worker_module_name="vllm.worker.hpu_worker", + worker_class_name="HPUWorker", ) wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) @@ -202,7 +202,7 @@ def shutdown(self) -> None: self.driver_worker.shutdown_inc() -class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): +class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase): async def execute_model_async( self, diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_hpu_executor.py similarity index 99% rename from vllm/executor/ray_habana_executor.py rename to vllm/executor/ray_hpu_executor.py index 
645bceb1af446..343fa43b0eda1 100644 --- a/vllm/executor/ray_habana_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -29,7 +29,7 @@ logger = init_logger(__name__) -class RayHabanaExecutor(DistributedGPUExecutor): +class RayHPUExecutor(DistributedGPUExecutor): uses_ray: bool = True @@ -90,8 +90,8 @@ def _get_worker_module_and_class( raise NotImplementedError( "Speculative decoding is not implemented for HPU") else: - worker_module_name = "vllm.worker.habana_worker" - worker_class_name = "HabanaWorker" + worker_module_name = "vllm.worker.hpu_worker" + worker_class_name = "HPUWorker" return (worker_module_name, worker_class_name, worker_class_fn) def _get_worker_wrapper_args(self) -> Dict[str, Any]: @@ -479,7 +479,7 @@ def __del__(self): self.shutdown() -class RayHabanaExecutorAsync(RayHabanaExecutor, DistributedGPUExecutorAsync): +class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/hpu_model_runner.py similarity index 99% rename from vllm/worker/habana_model_runner.py rename to vllm/worker/hpu_model_runner.py index 2d72be5690664..b1b62e6bde7f6 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -489,7 +489,7 @@ def from_broadcasted_tensor_dict( return cls(**tensor_dict) -class HabanaModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): +class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): """ Helper class for shared methods between GPU model runners. """ @@ -1730,8 +1730,7 @@ def unwrap_model(model): return modules -class HabanaModelRunner( - HabanaModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): +class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): """ GPU model runner with sampling step. """ @@ -1872,7 +1871,7 @@ def execute_model( ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: if num_steps > 1: raise ValueError( - "num_steps > 1 is not supported in HabanaModelRunner") + "num_steps > 1 is not supported in HPUModelRunner") if self.lora_config: assert model_input.lora_requests is not None diff --git a/vllm/worker/habana_worker.py b/vllm/worker/hpu_worker.py similarity index 99% rename from vllm/worker/habana_worker.py rename to vllm/worker/hpu_worker.py index 7fc1e48b8c960..59a5adf65ebc1 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/hpu_worker.py @@ -25,14 +25,14 @@ from vllm.sequence import ExecuteModelRequest from vllm.utils import hpu_backend_string, hpu_device_string, is_fake_hpu from vllm.worker.cache_engine import CacheEngine -from vllm.worker.habana_model_runner import HabanaModelRunner +from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) -class HabanaWorker(LocalOrDistributedWorkerBase): +class HPUWorker(LocalOrDistributedWorkerBase): """A worker class that executes (a partition of) the model on a HPU. Each worker is associated with a single HPU. 
The worker is responsible for @@ -79,7 +79,7 @@ def __init__( from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner: HabanaModelRunner = HabanaModelRunner( + self.model_runner: HPUModelRunner = HPUModelRunner( model_config, parallel_config, scheduler_config, From 1f6de5df8ed22a5ffaf7558e83d8b04a86728f27 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 09:29:21 +0200 Subject: [PATCH 264/819] Lazily import HPU-dependent components (#363) --- vllm/executor/hpu_executor.py | 4 +--- vllm/model_executor/layers/rotary_embedding.py | 7 +++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py index cc5609ebe5c8e..34879bc4e7ef5 100644 --- a/vllm/executor/hpu_executor.py +++ b/vllm/executor/hpu_executor.py @@ -6,8 +6,6 @@ import os from typing import Any, Dict, List, Optional, Set, Tuple -from vllm_hpu_extension.profiler import HabanaMemoryProfiler - from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -86,7 +84,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: # remains to abstract away the device for non-GPU configurations. logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks) - + from vllm_hpu_extension.profiler import HabanaMemoryProfiler with HabanaMemoryProfiler() as cache_init_m: self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 30bcf954c99b5..85cd700c978ea 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -30,10 +30,6 @@ from vllm.model_executor.custom_op import CustomOp from vllm.platforms import current_platform -if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import (HpuLlama3RotaryEmbedding, - HpuRotaryEmbedding) - def _rotate_neox(x: torch.Tensor) -> torch.Tensor: x1 = x[..., :x.shape[-1] // 2] @@ -923,6 +919,7 @@ def get_rope( if rope_scaling is None: if current_platform.is_hpu(): + from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding rotary_emb = HpuRotaryEmbedding(head_size, rotary_dim, max_position, @@ -945,6 +942,8 @@ def get_rope( original_max_position = rope_scaling[ "original_max_position_embeddings"] if current_platform.is_hpu(): + from vllm_hpu_extension.rotary_embed import ( + HpuLlama3RotaryEmbedding) rotary_emb = HpuLlama3RotaryEmbedding( head_size, rotary_dim, From ad08dd4e6616206398907c14dee589ffa7081df4 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 09:29:37 +0200 Subject: [PATCH 265/819] [Refactor] Rename HabanaAttention -> HPUAttention (#362) I've missed the attention backend in https://github.com/HabanaAI/vllm-fork/pull/359 --- .../backends/{habana_attn.py => hpu_attn.py} | 37 +++++++++---------- ...habana_paged_attn.py => hpu_paged_attn.py} | 6 +-- vllm/attention/selector.py | 13 +++---- 3 files changed, 27 insertions(+), 29 deletions(-) rename vllm/attention/backends/{habana_attn.py => hpu_attn.py} (88%) rename vllm/attention/ops/{habana_paged_attn.py => hpu_paged_attn.py} (95%) diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/hpu_attn.py similarity index 88% rename from vllm/attention/backends/habana_attn.py rename to 
vllm/attention/backends/hpu_attn.py index dad33fefc51f3..17201fe6e1cd6 100644 --- a/vllm/attention/backends/habana_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -13,22 +13,22 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState -from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention, - HabanaPagedAttentionMetadata) +from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, + HPUPagedAttentionMetadata) from vllm.logger import init_logger logger = init_logger(__name__) -class HabanaAttentionBackend(AttentionBackend): +class HPUAttentionBackend(AttentionBackend): @staticmethod - def get_impl_cls() -> Type["HabanaAttentionImpl"]: - return HabanaAttentionImpl + def get_impl_cls() -> Type["HPUAttentionImpl"]: + return HPUAttentionImpl @staticmethod def get_metadata_cls() -> Type["AttentionMetadata"]: - return HabanaAttentionMetadata + return HPUAttentionMetadata @staticmethod def get_state_cls() -> Type["CommonAttentionState"]: @@ -41,8 +41,8 @@ def get_kv_cache_shape( num_kv_heads: int, head_size: int, ) -> Tuple[int, ...]: - return HabanaPagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) + return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) @staticmethod def swap_blocks( @@ -50,20 +50,19 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: Dict[int, int], ) -> None: - HabanaPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, - src_to_dst) + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: Dict[int, List[int]], ) -> None: - HabanaPagedAttention.copy_blocks(kv_caches, src_to_dists) + HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) @dataclass -class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): - """Metadata for HabanaAttentionbackend.""" +class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): + """Metadata for HPUAttentionbackend.""" # Currently, input sequences can only contain all prompts # or all decoding. True if all sequences are prompts. is_prompt: bool @@ -71,7 +70,7 @@ class HabanaAttentionMetadata(HabanaPagedAttentionMetadata, AttentionMetadata): seq_lens_tensor: Optional[torch.Tensor] -class HabanaAttentionImpl(AttentionImpl, torch.nn.Module): +class HPUAttentionImpl(AttentionImpl, torch.nn.Module): """ If the input tensors contain prompt tokens, the layout is as follows: |<--------------- num_prefill_tokens ----------------->| @@ -126,7 +125,7 @@ def __init__( assert alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' - suppored_head_sizes = HabanaPagedAttention.get_supported_head_sizes() + suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() if head_size not in suppored_head_sizes: raise ValueError( f"Head size {head_size} is not supported by PagedAttention. 
" @@ -138,7 +137,7 @@ def forward( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_metadata: HabanaAttentionMetadata, + attn_metadata: HPUAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, @@ -158,7 +157,7 @@ def forward( raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " "are not implemented for " - "HabanaAttentionImpl") + "HPUAttentionImpl") batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape @@ -171,7 +170,7 @@ def forward( key = key.unflatten(0, (block_indices.size(0), -1)) value = value.unflatten(0, (block_indices.size(0), -1)) if kv_cache is not None: - key_cache, value_cache = HabanaPagedAttention.split_kv_cache( + key_cache, value_cache = HPUPagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) # Reshape the input keys and values and store them in the cache. @@ -216,7 +215,7 @@ def forward( output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. - output = HabanaPagedAttention.forward_decode( + output = HPUPagedAttention.forward_decode( query=query, key_cache=key_cache, value_cache=value_cache, diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py similarity index 95% rename from vllm/attention/ops/habana_paged_attn.py rename to vllm/attention/ops/hpu_paged_attn.py index 7f080e0727457..7fbe26d83f320 100644 --- a/vllm/attention/ops/habana_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -13,7 +13,7 @@ @dataclass -class HabanaPagedAttentionMetadata: +class HPUPagedAttentionMetadata: """Metadata for PagedAttention.""" block_list: Optional[torch.Tensor] block_mapping: Optional[torch.Tensor] @@ -22,7 +22,7 @@ class HabanaPagedAttentionMetadata: block_offsets: Optional[torch.Tensor] -class HabanaPagedAttention: +class HPUPagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: @@ -76,7 +76,7 @@ def forward_prefix( sliding_window: Optional[int], ) -> torch.Tensor: raise NotImplementedError( - "forward_prefix is not implemented for HabanaPagedAttention") + "forward_prefix is not implemented for HPUPagedAttention") @staticmethod def swap_blocks( diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index c7a416a78519b..52d3dfa820752 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -22,7 +22,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() OPENVINO = enum.auto() FLASHINFER = enum.auto() - HABANA_ATTN = enum.auto() + HPU_ATTN = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() @@ -143,11 +143,10 @@ def get_attn_backend( logger.info("Using Flashinfer backend.") from vllm.attention.backends.flashinfer import FlashInferBackend return FlashInferBackend - elif backend == _Backend.HABANA_ATTN: - logger.info("Using HabanaAttention backend.") - from vllm.attention.backends.habana_attn import ( # noqa: F401 - HabanaAttentionBackend) - return HabanaAttentionBackend + elif backend == _Backend.HPU_ATTN: + logger.info("Using HPUAttention backend.") + from vllm.attention.backends.hpu_attn import HPUAttentionBackend + return HPUAttentionBackend elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend @@ -217,7 +216,7 @@ def which_attn_to_use( return _Backend.ROCM_FLASH if current_platform.is_hpu(): - return _Backend.HABANA_ATTN + return _Backend.HPU_ATTN # FlashAttn in NVIDIA GPUs. 
if selected_backend == _Backend.FLASH_ATTN: From e00750e2b24b433d157b514725a69ed4e0e58f70 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 09:30:12 +0200 Subject: [PATCH 266/819] Use BF16 on HPU by default (#361) We don't *officially* support FP16, and for the most part, we use BF16 wherever we can. This removes the need of specifying `--dtype bfloat16` - when `dtype` is not provided (is `auto`), and model default data type is `float16`, we cast it to `bfloat16` for HPU. --- vllm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/config.py b/vllm/config.py index 786ed1586a3ea..b3329f1c449ff 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1635,6 +1635,13 @@ def _get_and_verify_dtype( torch_dtype = torch.float16 else: torch_dtype = config_dtype + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of" + "using float16 by default. Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 else: if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: raise ValueError(f"Unknown dtype: {dtype}") From db5aed61529d04b0604a07728a3ce9eb95a2072d Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Mon, 7 Oct 2024 11:58:01 +0200 Subject: [PATCH 267/819] Set vllm-hpu-extension to 36c7f9c (#365) This includes: https://github.com/HabanaAI/vllm-hpu-extension/pull/8 (BlockSoftmax: fix guard value for fp16) --- requirements-hpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 602a5060c29aa..8f7f0339b02e3 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bb56d3b +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@36c7f9c From bb4c23e284f73ed00748a43819432574a96384d8 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 7 Oct 2024 16:54:47 +0300 Subject: [PATCH 268/819] format.sh --- vllm/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/utils.py b/vllm/utils.py index 0bfc9b0adca3c..bf8923e532334 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1424,6 +1424,7 @@ def import_wrapper(name, *args, **kwargs): import habana_frameworks.torch as htorch htorch.utils.internal.is_lazy.return_value = False + def get_beam_search_score( tokens: List[int], cumulative_logprob: float, From 563184ab2fc2ab34b5d4ea7b67d146d64538f5c4 Mon Sep 17 00:00:00 2001 From: Yan Tomsinsky <73292515+Yantom1@users.noreply.github.com> Date: Mon, 7 Oct 2024 17:03:36 +0300 Subject: [PATCH 269/819] Fix hpu_set_env call in load_model in vllm (#364) FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) **BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** ---

--- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 2 +- vllm/model_executor/model_loader/loader.py | 4 ++-- vllm/worker/hpu_model_runner.py | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index bf1aa6fbd5dca..0abb4e0f10546 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -137,7 +137,7 @@ def apply_fp8_linear( qinput, x_scale = ops.scaled_fp8_quant( input, input_scale, - num_token_padding=17, + batch_dim_padding=17, use_per_token_if_dynamic=use_per_token_if_dynamic) per_tensor_weights = (weight_scale.numel() == 1) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 876da67c02436..618800dee5fbe 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -98,8 +98,8 @@ def _get_quantization_config( """Get the quantization config.""" if model_config.quantization is not None: quant_config = get_quant_config(model_config, load_config) - capability_tuple = current_platform.get_device_capability() - + capability_tuple = current_platform.get_device_capability() \ + if current_platform.is_cuda_alike() else None if capability_tuple is not None: capability = capability_tuple.to_int() if capability < quant_config.get_min_capability(): diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b1b62e6bde7f6..d3fa9c287234c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -591,7 +591,9 @@ def _set_gc_threshold(self) -> None: def load_model(self) -> None: import habana_frameworks.torch.core as htcore - htcore.hpu_set_env() + if self.model_config.quantization == 'inc' or \ + self.model_config.quantization == 'fp8': + htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: self.model = get_model(model_config=self.model_config, From 0e46492dc834b71a82e5bbeb097abbc364717151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Kuligowski?= Date: Tue, 8 Oct 2024 10:15:08 +0200 Subject: [PATCH 270/819] Update offline_inference_fakehpu.py Beam search was removed from SamplingParams. In this example it was still being set to False, so this commit removes the argument. --- examples/offline_inference_fakehpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py index 972d84b60b318..248b5740fa35e 100644 --- a/examples/offline_inference_fakehpu.py +++ b/examples/offline_inference_fakehpu.py @@ -21,7 +21,7 @@ "Wales" ] # Create a sampling params object. -sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False) +sampling_params = SamplingParams(temperature=0, n=1) # Create an LLM. llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4) From 6028354b838dd1e0670925bd4b1757e728c7b9b9 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 8 Oct 2024 12:23:36 +0200 Subject: [PATCH 271/819] Timeout adjusted in MLLMEngine (#368) Currently, the multiprocessing LLMEngine uses a polling timeout fixed at 10000 ms. This may not be enough when running torch-compiled models that still have to compile at runtime (i.e., when a particular configuration/shape was not warmed up during the warmup phase). If torch compilation happens after warmup, 10000 ms is not sufficient.
So It would be good to have a way to modify fixed timeout. Changes disscussed here are replacing fixed timeout of 10000 ms with value as provided with VLLM_RPC_TIMEOUT . Please suggest if separate env var should be made. Co-authored-by: Jacek Czaja --- vllm/engine/multiprocessing/engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 3501f12c065cf..8446d23604195 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -33,7 +33,6 @@ logger = init_logger(__name__) -POLLING_TIMEOUT_MS = 10000 HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) @@ -207,7 +206,7 @@ def run_engine_loop(self): self._alive() if not self.engine.has_unfinished_requests(): # Poll until there is work to do. - while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: + while self.input_socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0: self._alive() self.engine.do_log_stats() logger.debug("Waiting for new requests in engine loop.") From 64369fdff907ae2e3b4194a8aa17d71ce943d25c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 8 Oct 2024 14:34:03 +0200 Subject: [PATCH 272/819] Add Jenkins test definitions (#369) --- .../configs/Meta-Llama-3-70B-Instruct.yaml | 12 ++ .../configs/Meta-Llama-3-8B-Instruct.yaml | 12 ++ .../configs/Meta-Llama-3.1-8B-Instruct.yaml | 15 +++ .../lm-eval-harness/configs/models-large.txt | 1 + .../lm-eval-harness/configs/models-small.txt | 2 + .../run-lm-eval-gsm-vllm-baseline.sh | 51 ++++++++ .jenkins/lm-eval-harness/run-tests.sh | 69 +++++++++++ .../test_lm_eval_correctness.py | 115 ++++++++++++++++++ .jenkins/requirements-test-hpu.txt | 2 + .jenkins/test_config.yaml | 24 ++++ 10 files changed, 303 insertions(+) create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml create mode 100644 .jenkins/lm-eval-harness/configs/models-large.txt create mode 100644 .jenkins/lm-eval-harness/configs/models-small.txt create mode 100644 .jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh create mode 100644 .jenkins/lm-eval-harness/run-tests.sh create mode 100644 .jenkins/lm-eval-harness/test_lm_eval_correctness.py create mode 100644 .jenkins/requirements-test-hpu.txt create mode 100644 .jenkins/test_config.yaml diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..38965c6197c55 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,12 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..9fe7d634b887b --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,12 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m 
meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml new file mode 100644 index 0000000000000..e2458a8ea4f1c --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,15 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.8317 + - name: "exact_match,flexible-extract" + value: 0.8355 +limit: null +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-large.txt b/.jenkins/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..ca2548d1234a8 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-large.txt @@ -0,0 +1 @@ +Meta-Llama-3-70B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-small.txt b/.jenkins/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..d8ae241e58ad3 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3.1-8B-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..65128d6b437e1 --- /dev/null +++ b/.jenkins/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.3 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096,dtype=bfloat16 \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..8c51606c4a2dd --- /dev/null +++ b/.jenkins/lm-eval-harness/run-tests.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:j:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true + export VLLM_SKIP_WARMUP=true + RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 Date: Tue, 8 Oct 2024 17:04:11 +0200 Subject: [PATCH 273/819] Make workaround for SW-204785 broader (#374) PT bridge bug in recent Synapse builds causes PyTest to return 0 unconditionally. Previous workaround fixed that issue if comparison failed, but left out a case in which vLLM (or anything else) actually crashes during the test execution. This patch broadens the workaround to catch any exceptions and add atexit callback when any test fails. --- .../test_lm_eval_correctness.py | 83 ++++++++++--------- 1 file changed, 43 insertions(+), 40 deletions(-) diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index be90872d8cf6d..fd4532196e36f 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -64,7 +64,7 @@ def report_performance(task, input_lens, output_lens, time): context_lens = [i + o for i, o in zip(input_lens, output_lens)] gen_tput = sum(output_lens) / time msg = ( - f'{task} | average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004 + f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004, E501 f'{task} | input_tokens | min: {min(input_lens)} | max: {max(input_lens)} | mean: {statistics.mean(input_lens):.2f} | stddev: {statistics.stdev(input_lens):.2f}\n' # noqa: E501 f'{task} | output_tokens | min: {min(output_lens)} | max: {max(output_lens)} | mean: {statistics.mean(output_lens):.2f} | stddev: {statistics.stdev(output_lens):.2f}\n' # noqa: E501 f'{task} | context_length | min: {min(context_lens)} | max: {max(context_lens)} | mean: {statistics.mean(context_lens):.2f} | stddev: {statistics.stdev(context_lens):.2f}' # noqa: E501 @@ -73,43 +73,46 @@ def report_performance(task, input_lens, output_lens, time): def test_lm_eval_correctness(): - eval_config = yaml.safe_load( - Path(TEST_DATA_FILE).read_text(encoding="utf-8")) - - # Launch eval requests. 
- start_time = time.perf_counter() - results = launch_lm_eval(eval_config) - total_time = time.perf_counter() - start_time - - tokenizer = vllm.transformers_utils.tokenizer.get_tokenizer( - eval_config['model_name']) - - # Confirm scores match ground truth. - for task in eval_config["tasks"]: - - samples = results['samples'][task["name"]] - tokenized_inputs = [ - tokenizer(x['arguments'][0][0])['input_ids'] for x in samples - ] - tokenized_inputs_lens = [len(x) for x in tokenized_inputs] - tokenized_outputs = [ - list( - itertools.chain.from_iterable( - tokenizer(list(itertools.chain.from_iterable( - x['resps'])))['input_ids'])) for x in samples - ] - tokenized_outputs_lens = [len(x) for x in tokenized_outputs] - report_performance(task['name'], tokenized_inputs_lens, - tokenized_outputs_lens, total_time) - - for metric in task["metrics"]: - ground_truth = metric["value"] - measured_value = results["results"][task["name"]][metric["name"]] - print(f'{task["name"]} | {metric["name"]}: ' - f'ground_truth={ground_truth} | measured={measured_value}') - try: + try: + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + start_time = time.perf_counter() + results = launch_lm_eval(eval_config) + total_time = time.perf_counter() - start_time + + tokenizer = vllm.transformers_utils.tokenizer.get_tokenizer( + eval_config['model_name']) + + # Confirm scores match ground truth. + for task in eval_config["tasks"]: + + samples = results['samples'][task["name"]] + tokenized_inputs = [ + tokenizer(x['arguments'][0][0])['input_ids'] for x in samples + ] + tokenized_inputs_lens = [len(x) for x in tokenized_inputs] + tokenized_outputs = [ + list( + itertools.chain.from_iterable( + tokenizer( + list(itertools.chain.from_iterable( + x['resps'])))['input_ids'])) for x in samples + ] + tokenized_outputs_lens = [len(x) for x in tokenized_outputs] + report_performance(task['name'], tokenized_inputs_lens, + tokenized_outputs_lens, total_time) + + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][ + metric["name"]] + print( + f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) - except AssertionError as exc: - # nasty workaround for HPU PT bridge bug (SW-204785) - atexit.register(fail_on_exit) - raise exc + except Exception as exc: + # nasty workaround for a nasty HPU PT bridge bug (SW-204785) + atexit.register(fail_on_exit) + raise exc From ca98daec894114a0db87d86d6148b72879893b90 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Wed, 9 Oct 2024 19:00:56 +0300 Subject: [PATCH 274/819] Fix LoRA tests by handling broken imports --- tests/lora/test_lora_hpu.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py index a59cfe875ef9c..2f4f7f31c0100 100644 --- a/tests/lora/test_lora_hpu.py +++ b/tests/lora/test_lora_hpu.py @@ -1,8 +1,7 @@ import pytest import torch from vllm_hpu_extension.ops import LoraMask - -from vllm.hpu.punica_hpu import GaudiPunicaWrapper +from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper from .utils import DummyLoRAManager From b70c1a5e9fbe29b37ae1b2a79ea953ccf613587d Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 10 Oct 2024 15:11:58 +0200 Subject: [PATCH 275/819] [CI] Report test name, add properties to JUnitXML (#377) --- .jenkins/lm-eval-harness/run-tests.sh | 2 +- 
.../test_lm_eval_correctness.py | 77 +++++++++++++++++-- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/.jenkins/lm-eval-harness/run-tests.sh b/.jenkins/lm-eval-harness/run-tests.sh index 8c51606c4a2dd..09d507d404ede 100644 --- a/.jenkins/lm-eval-harness/run-tests.sh +++ b/.jenkins/lm-eval-harness/run-tests.sh @@ -48,7 +48,7 @@ do LOG_DIR=$TEST_RESULTS_DIR LOG_FILENAME="$test_${MODEL_CONFIG}_${RANDOM_SUFFIX}.xml" LOG_PATH="${LOG_DIR}/${LOG_FILENAME}" - JUNIT_SUFFIX="--junitxml=${LOG_PATH}" + JUNIT_SUFFIX="-o junit_family=xunit1 --junitxml=${LOG_PATH}" fi pytest -s test_lm_eval_correctness.py $JUNIT_SUFFIX || LOCAL_SUCCESS=$? diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index fd4532196e36f..9c6d0ee48caf5 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -59,24 +59,81 @@ def launch_lm_eval(eval_config): return results -def report_performance(task, input_lens, output_lens, time): +def report_performance(task, input_lens, output_lens, time, record_property): assert len(input_lens) == len(output_lens) context_lens = [i + o for i, o in zip(input_lens, output_lens)] gen_tput = sum(output_lens) / time + all_lens = [input_lens, output_lens, context_lens] + min_input_tokens, min_output_tokens, min_context_tokens = [ + min(x) for x in all_lens + ] + max_input_tokens, max_output_tokens, max_context_tokens = [ + max(x) for x in all_lens + ] + mean_input_tokens, mean_output_tokens, mean_context_tokens = [ + statistics.mean(x) for x in all_lens + ] + stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = [ + statistics.stdev(x) for x in all_lens + ] msg = ( f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n' # noqa: G004, E501 - f'{task} | input_tokens | min: {min(input_lens)} | max: {max(input_lens)} | mean: {statistics.mean(input_lens):.2f} | stddev: {statistics.stdev(input_lens):.2f}\n' # noqa: E501 - f'{task} | output_tokens | min: {min(output_lens)} | max: {max(output_lens)} | mean: {statistics.mean(output_lens):.2f} | stddev: {statistics.stdev(output_lens):.2f}\n' # noqa: E501 - f'{task} | context_length | min: {min(context_lens)} | max: {max(context_lens)} | mean: {statistics.mean(context_lens):.2f} | stddev: {statistics.stdev(context_lens):.2f}' # noqa: E501 + f'{task} | input_tokens | min: {min_input_tokens} | max: {max_input_tokens} | mean: {mean_input_tokens:.2f} | stddev: {stddev_input_tokens:.2f}\n' # noqa: E501 + f'{task} | output_tokens | min: {min_output_tokens} | max: {max_output_tokens} | mean: {mean_output_tokens:.2f} | stddev: {stddev_output_tokens:.2f}\n' # noqa: E501 + f'{task} | context_length | min: {min_context_tokens} | max: {max_context_tokens} | mean: {mean_context_tokens:.2f} | stddev: {stddev_context_tokens:.2f}' # noqa: E501 ) + + # Log all of these stats to JUnitXML + record_property(f"{task}_gen_tput", gen_tput) + record_property(f"{task}_input_tokens_min", min_input_tokens) + record_property(f"{task}_input_tokens_max", max_input_tokens) + record_property(f"{task}_input_tokens_mean", mean_input_tokens) + record_property(f"{task}_input_tokens_stddev", stddev_input_tokens) + + record_property(f"{task}_output_tokens_min", min_output_tokens) + record_property(f"{task}_output_tokens_max", max_output_tokens) + record_property(f"{task}_output_tokens_mean", mean_output_tokens) + record_property(f"{task}_output_tokens_stddev", stddev_output_tokens) + + 
record_property(f"{task}_context_tokens_min", min_context_tokens) + record_property(f"{task}_context_tokens_max", max_context_tokens) + record_property(f"{task}_context_tokens_mean", mean_context_tokens) + record_property(f"{task}_context_tokens_stddev", stddev_context_tokens) + print(msg) -def test_lm_eval_correctness(): +def get_current_gaudi_platform(): + """ + Inspired by: https://github.com/HabanaAI/Model-References/blob/a87c21f14f13b70ffc77617b9e80d1ec989a3442/PyTorch/computer_vision/classification/torchvision/utils.py#L274 + """ + import habana_frameworks.torch.utils.experimental as htexp + + device_type = htexp._get_device_type() + + if device_type == htexp.synDeviceType.synDeviceGaudi: + return "Gaudi1" + elif device_type == htexp.synDeviceType.synDeviceGaudi2: + return "Gaudi2" + elif device_type == htexp.synDeviceType.synDeviceGaudi3: + return "Gaudi3" + else: + raise ValueError( + f"Unsupported device: the device type is {device_type}.") + + +def test_lm_eval_correctness(record_xml_attribute, record_property): try: eval_config = yaml.safe_load( Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + # Record JUnitXML test name + tasks_str = '_'.join([t['name'] for t in eval_config["tasks"]]) + platform = get_current_gaudi_platform() + testname = (f'test_{Path(TEST_DATA_FILE).stem}_{tasks_str}_{platform}_' + f'tp{TP_SIZE}') + record_xml_attribute("name", testname) + # Launch eval requests. start_time = time.perf_counter() results = launch_lm_eval(eval_config) @@ -102,7 +159,8 @@ def test_lm_eval_correctness(): ] tokenized_outputs_lens = [len(x) for x in tokenized_outputs] report_performance(task['name'], tokenized_inputs_lens, - tokenized_outputs_lens, total_time) + tokenized_outputs_lens, total_time, + record_property) for metric in task["metrics"]: ground_truth = metric["value"] @@ -111,6 +169,13 @@ def test_lm_eval_correctness(): print( f'{task["name"]} | {metric["name"]}: ' f'ground_truth={ground_truth} | measured={measured_value}') + + # Record ground truth and measured value to JUnitXML + record_property( + f"{task['name']}_{metric['name']}_ground_truth", + ground_truth) + record_property(f"{task['name']}_{metric['name']}_measured", + measured_value) assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) except Exception as exc: # nasty workaround for a nasty HPU PT bridge bug (SW-204785) From 49444bce5edacdfa9ba16f721a1bf29afa7d73c1 Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Fri, 11 Oct 2024 10:45:53 +0200 Subject: [PATCH 276/819] Disable performance counters if profiler is not enabled (#383) Currently, if `HabanaHighLevelProfiler` is not enabled, `HabanaProfilerCounterHelper` collects the statistics that will not be used later. This creates additional host overhead that can be removed. This change will only allow performance statistics to be collected when the profiler is enabled. 
Potential gain on `prepare_model_input`: - before image - after image --- vllm/worker/hpu_model_runner.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d3fa9c287234c..b50e9451ea09c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -521,7 +521,6 @@ def __init__( self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states self.observability_config = observability_config - self.profiler = HabanaHighLevelProfiler() self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None) @@ -557,6 +556,7 @@ def __init__( self.inc_initialized_successfully = False # Profiler stats + self.profiler = HabanaHighLevelProfiler() self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() self._mem_margin: Optional[int] = None @@ -1767,8 +1767,9 @@ def prepare_model_input( """ with self.profiler.record_event('internal', 'prepare_input_tensors'): assert seq_group_metadata_list is not None - self.profiler_counter_helper.capture_seq_group_metadata_stats( - seq_group_metadata_list=seq_group_metadata_list) + if self.profiler.enabled: + self.profiler_counter_helper.capture_seq_group_metadata_stats( + seq_group_metadata_list=seq_group_metadata_list) model_input, sampling_metadata = self.prepare_input_tensors( seq_group_metadata_list) assert model_input.attn_metadata is not None From d6bd37505c82f305124813a4e2be8b0b945833fd Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Fri, 11 Oct 2024 08:16:27 +0300 Subject: [PATCH 277/819] Remove constraints for bucket creation during warmup in LoRA --- vllm/worker/hpu_model_runner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d3fa9c287234c..7d6d0cd8af4f7 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -672,9 +672,6 @@ def _is_valid_bucket(self, bucket): def _setup_buckets(self) -> None: align_bs = lambda x: min(self.max_num_seqs, x) max_bucket_cfg = 64 - if self.lora_config and \ - max_bucket_cfg > self.max_num_batched_tokens // self.block_size: - max_bucket_cfg = self.max_num_batched_tokens // self.block_size #FIXME: The default values should be max_model_len max_prompt_seq = 1024 max_decode_seq = 2048 @@ -1480,11 +1477,6 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.prompt_buckets, prompt_omitted_buckets = generate_prompt_buckets( self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, self.max_num_batched_tokens) - if self.lora_config: - self.prompt_buckets[:] = [ - bucket for bucket in self.prompt_buckets - if self._is_valid_bucket(bucket) - ] msg = ( f"Generated {len(self.prompt_buckets)} " @@ -1502,11 +1494,6 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.decode_buckets = generate_decode_buckets( self.decode_bs_bucket_cfg, self.decode_block_bucket_cfg, max_blocks) - if self.lora_config: - self.decode_buckets[:] = [ - bucket for bucket in self.decode_buckets - if self._is_valid_bucket(bucket) - ] logger.info("Generated %d decode buckets [bs, total_blocks]: %s", len(self.decode_buckets), list(sorted(self.decode_buckets))) From d8f2aa70198a07836ee91df23aa6a234f700c955 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 14 Oct 2024 12:50:58 +0530 Subject: [PATCH 278/819] seed_everything function doesn't handle HPU (#384) This PR adds manual seed 
setting for HPU in the function `seed_everything`. Previously the torch.manual_seed was getting set to the given seed, which got removed in the following PR https://github.com/HabanaAI/vllm-fork/commit/6ffa3f314c59e42238f1c5f923ff2839e0af9698 --- vllm/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/utils.py b/vllm/utils.py index 2ff9668d9a463..6b325458e62a9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -413,6 +413,9 @@ def seed_everything(seed: int) -> None: if is_xpu(): torch.xpu.manual_seed_all(seed) + if current_platform.is_hpu(): + torch.hpu.manual_seed_all(seed) + def random_uuid() -> str: return str(uuid.uuid4().hex) From 03b407bdf2be755b29ccc72dfadb23ebad8b7b4c Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Mon, 14 Oct 2024 13:02:23 +0530 Subject: [PATCH 279/819] Fixed lora_manager tests with hpu_model_runner (#386) lora_manager tests have been fixed with the recent changes of hpu_model_runner from habana_model_runner --- tests/lora/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 65e38b2e4e6e4..d5ce1906c40c1 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -262,7 +262,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): **kwargs) if current_platform.is_hpu(): - with patch("vllm.worker.habana_model_runner.get_model", + with patch("vllm.worker.hpu_model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) else: From ebd42c4bc9314c9d3fa240fe19462bb3df4704ce Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 14 Oct 2024 14:27:30 +0200 Subject: [PATCH 280/819] Reformat README_GAUDI.md (#389) This PR removes the awkward line breaks in README_GAUDI.md and uses appropriate markdown formatting instead of RST. Rendered document should look the same. --- README_GAUDI.md | 560 ++++++++++++++---------------------------------- 1 file changed, 161 insertions(+), 399 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 483b6e6cda741..08458251a753d 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -1,217 +1,126 @@ -vLLM with IntelÂź GaudiÂź AI Accelerators -======================================= +# vLLM with IntelÂź GaudiÂź AI Accelerators -This README provides instructions on running vLLM with Intel Gaudi -devices. +This README provides instructions on running vLLM with Intel Gaudi devices. -Requirements and Installation -============================= +# Requirements and Installation -Please follow the instructions provided in the [Gaudi Installation -Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) -to set up the environment. To achieve the best performance, please -follow the methods outlined in the [Optimizing Training Platform -Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). 
-Requirements ------------- +## Requirements -- OS: Ubuntu 22.04 LTS -- Python: 3.10 -- Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.17.0 To verify that the Intel Gaudi software was correctly installed, run: -``` {.console} +```{.console} $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed $ pip list | grep neural # verify that neural-compressor is installed ``` -Refer to [Intel Gaudi Software Stack -Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) -for more details. +Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -Run Docker Image ----------------- +## Run Docker Image -It is highly recommended to use the latest Docker image from Intel Gaudi -vault. Refer to the [Intel Gaudi -documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) -for more details. +It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. Use the following commands to run a Docker image: -``` {.console} +```{.console} $ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest ``` -Build and Install vLLM ----------------------- +## Build and Install vLLM -Currently, the latest features and performance optimizations are -developed in Gaudi\'s [vLLM-fork](https://github.com/HabanaAI/vllm-fork) -and we periodically upstream them to vLLM main repo. To install latest -[HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the -following: +Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: -``` {.console} +```{.console} $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main $ pip install -e . 
``` -Supported Features -================== - -- [Offline batched - inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) -- Online inference via [OpenAI-Compatible - Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) -- HPU autodetection - no need to manually select device within vLLM -- Paged KV cache with algorithms enabled for Intel Gaudi accelerators -- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, - prefill attention, Root Mean Square Layer Normalization, Rotary - Positional Encoding -- Tensor parallelism support for multi-card inference -- Inference with [HPU - Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) - for accelerating low-batch latency and throughput -- Attention with Linear Biases (ALiBi) -- INC quantization - -Unsupported Features -==================== - -- Beam search -- LoRA adapters -- AWQ quantization -- Prefill chunking (mixed-batch inferencing) - -Supported Configurations -======================== - -The following configurations have been validated to be function with -Gaudi2 devices. Configurations that are not listed may or may not work. - -- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) - on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 - datatype with random or greedy sampling -- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) - with tensor parallelism on 8x HPU, BF16 datatype with random or - greedy sampling -- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) - with tensor parallelism on 8x HPU, BF16 datatype with random or - 
greedy sampling -- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) - on single HPU or with tensor parallelism on 2x HPU, BF16 datatype - with random or greedy sampling -- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) - with tensor parallelism on 2x HPU, BF16 datatype with random or - greedy sampling - -Performance Tuning -================ - -Execution modes ------------------------------ - -Currently in vLLM for HPU we support four execution modes, depending on -selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment -variable), and `--enforce-eager` flag. - -| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | -|--- |--- |--- | -| 0 | 0 | torch.compile | -| 0 | 1 | PyTorch eager mode | -| 1 | 0 | HPU Graphs | -| 1 | 1 | PyTorch lazy mode | - - -> [!WARNING] -> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly -> experimental and should be only used for validating functional -> correctness. Their performance will be improved in the next releases. -> For obtaining the best performance in 1.17.0, please use HPU Graphs, or -> PyTorch lazy mode. - -Bucketing mechanism ------------------------------ - -Intel Gaudi accelerators work best when operating on models with fixed -tensor shapes. [Intel Gaudi Graph -Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) -is responsible for generating optimized binary code that implements the -given model topology on Gaudi. In its default configuration, the -produced binary code may be heavily dependent on input and output tensor -shapes, and can require graph recompilation when encountering -differently shaped tensors within the same topology. While the resulting -binaries utilize Gaudi efficiently, the compilation itself may introduce -a noticeable overhead in end-to-end execution. In a dynamic inference -serving scenario, there is a need to minimize the number of graph -compilations and reduce the risk of graph compilation occurring during -server runtime. Currently it is achieved by \"bucketing\" model\'s -forward pass across two dimensions - `batch_size` and `sequence_length`. +# Supported Features -> [!NOTE] -> Bucketing allows us to reduce the number of required graphs -> significantly, but it does not handle any graph compilation and device -> code generation - this is done in warmup and HPUGraph capture phase. 
+- [Offline batched inference](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#offline-batched-inference) +- Online inference via [OpenAI-Compatible Server](https://github.com/HabanaAI/vllm-fork/blob/habana_main/docs/source/getting_started/quickstart.rst#openai-compatible-server) +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) +- INC quantization + +# Unsupported Features + +- Beam search +- LoRA adapters +- AWQ quantization +- Prefill chunking (mixed-batch inferencing) + +# Supported Configurations + +The following configurations have been validated to be function with Gaudi2 devices. Configurations that are not listed may or may not work. + +- [meta-llama/Llama-2-7b](https://huggingface.co/meta-llama/Llama-2-7b) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on single HPU or with tensor 
parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling + +# Performance Tuning + +## Execution modes + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. + +| `PT_HPU_LAZY_MODE` | `enforce_eager` | execution mode | +| ------------------ | --------------- | ------------------ | +| 0 | 0 | torch.compile | +| 0 | 1 | PyTorch eager mode | +| 1 | 0 | HPU Graphs | +| 1 | 1 | PyTorch lazy mode | + +> [!WARNING] +> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. + +## Bucketing mechanism + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`. + +> [!NOTE] +> Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. -Bucketing ranges are determined with 3 parameters - `min`, `step` and -`max`. They can be set separately for prompt and decode phase, and for -batch size and sequence length dimension. These parameters can be -observed in logs during vLLM startup: +Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. 
These parameters can be observed in logs during vLLM startup: -``` {.} +```{.} INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` -`min` determines the lowest value of the bucket. `step` determines the -interval between buckets, and `max` determines the upper bound of the -bucket. Furthermore, interval between `min` and `step` has special -handling - `min` gets multiplied by consecutive powers of two, until -`step` gets reached. We call this the ramp-up phase and it is used for -handling lower batch sizes with minimum wastage, while allowing larger -padding on larger batch sizes. +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. Example (with ramp-up) -``` {.} +```{.} min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -220,53 +129,28 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` {.} +```{.} min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) => buckets = ramp_up + stable => (128, 256, 384, 512) ``` -In the logged scenario, 24 buckets were generated for prompt (prefill) -runs, and 48 buckets for decode runs. Each bucket corresponds to a -separate optimized device binary for a given model with specified tensor -shapes. Whenever a batch of requests is processed, it is padded across -batch and sequence length dimension to the smallest possible bucket. - -> [!WARNING] -> If a request exceeds maximum bucket size in any dimension, it will be -> processed without padding, and its processing may require a graph -> compilation, potentially significantly increasing end-to-end latency. -> The boundaries of the buckets are user-configurable via environment -> variables, and upper bucket boundaries can be increased to avoid such -> scenario. 
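To make the ramp-up rule concrete, below is a minimal sketch of how such a bucket range could be generated and how a value is padded up to its bucket. It reproduces the two examples above and the `bs:[1, 32, 4]`, `seq:[128, 128, 1024]` prompt configuration from the log; the exact function names and edge-case handling inside vLLM may differ.

```python
from typing import List


def generate_bucket_range(min_value: int, step: int, max_value: int) -> List[int]:
    # Ramp-up phase: multiply `min` by consecutive powers of two until
    # `step` is reached, to serve small sizes with minimal padding waste.
    ramp_up = []
    value = min_value
    while value < step and value <= max_value:
        ramp_up.append(value)
        value *= 2
    # Stable phase: regular `step`-sized intervals up to and including `max`.
    stable = list(range(step, max_value + 1, step))
    return ramp_up + stable


def pad_to_bucket(value: int, buckets: List[int]) -> int:
    # A batch is padded up to the smallest bucket that can hold it.
    return min(b for b in buckets if b >= value)


assert generate_bucket_range(2, 32, 64) == [2, 4, 8, 16, 32, 64]
assert generate_bucket_range(128, 128, 512) == [128, 256, 384, 512]
# 3 sequences with a longest prompt of 412 tokens land in the (4, 512) bucket.
assert pad_to_bucket(3, generate_bucket_range(1, 32, 4)) == 4
assert pad_to_bucket(412, generate_bucket_range(128, 128, 1024)) == 512
```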
- -As an example, if a request of 3 sequences, with max sequence length of -412 comes in to an idle vLLM server, it will be padded executed as -`(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be -padded to 4 (closest batch\_size dimension higher than 3), and max -sequence length will be padded to 512 (closest sequence length dimension -higher than 412). After prefill stage, it will be executed as `(4, 512)` -decode bucket and will continue as that bucket until either batch -dimension changes (due to request being finished) - in which case it -will become a `(2, 512)` bucket, or context length increases above 512 -tokens, in which case it will become `(4, 640)` bucket. +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +> [!WARNING] +> If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. > [!NOTE] -> Bucketing is transparent to a client - padding in sequence length -> dimension is never returned to the client, and padding in batch -> dimension does not create new requests. +> Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. -Warmup ------- +## Warmup -Warmup is an optional, but highly recommended step occurring before vLLM -server starts listening. It executes a forward pass for each bucket with -dummy data. The goal is to pre-compile all graphs and not incur any -graph compilation overheads within bucket boundaries during server -runtime. Each warmup step is logged during vLLM startup: +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -``` {.} +```{.} INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -280,100 +164,30 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in *Bucketing mechanism* section. -Each output line corresponds to execution of a single bucket. When -bucket is executed for the first time, its graph is compiled and can be -reused later on, skipping further graph compilations. - -> [!TIP] -> Compiling all the buckets might take some time and can be turned off -> with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if -> you do that, you may face graph compilations once executing a given -> bucket for the first time. It is fine to disable warmup for development, -> but it\'s highly recommended to enable it in deployment. - -HPU Graph capture ------------------------------ - -[HPU -Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) -are currently the most performant execution method of vLLM on Intel -Gaudi. When HPU Graphs are enabled, execution graphs will be traced -(recorded) ahead of time (after performing warmup), to be later replayed -during inference, significantly reducing host overheads. Recording can -take large amounts of memory, which needs to be taken into account when -allocating KV cache. Enabling HPU Graphs will impact the number of -available KV cache blocks, but vLLM provides user-configurable variables -to control memory management. - -When HPU Graphs are being used, they share the common memory pool -(\"usable memory\") as KV cache, determined by `gpu_memory_utilization` -flag (`0.9` by default). Before KV cache gets allocated, model weights -are loaded onto the device, and a forward pass of the model is executed -on dummy data, to estimate memory usage. Only after that, -`gpu_memory_utilization` flag is utilized - at its default value, will -mark 90% of free device memory at that point as usable. Next, KV cache -gets allocated, model is warmed up, and HPU Graphs are captured. -Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of -memory reserved for HPU Graphs capture. With its default value -(`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved -for graph capture (later referred to as \"usable graph memory\"), and -the remaining 90% will be utilized for KV cache. Environment variable -`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory -reserved for prefill and decode graphs. By default -(`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory -constraints. Lower value corresponds to less usable graph memory -reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will -reserve 20% of usable graph memory for prefill graphs, and 80% of usable -graph memory for decode graphs. +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. 
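As a quick illustration of the `VLLM_SKIP_WARMUP` knob mentioned in the TIP, a hypothetical development-only offline run might look like the sketch below; the model name and prompt are illustrative, and skipping warmup is not recommended for deployment.

```python
import os

# Skip the per-bucket warmup pass; expect one-off graph compilations the
# first time each bucket is executed at runtime.
os.environ["VLLM_SKIP_WARMUP"] = "true"

from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
```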
-> [!NOTE] -> `gpu_memory_utilization` does not correspond to the absolute memory -> usage across HPU. It specifies the memory margin after loading the model -> and performing a profile run. If device has 100 GiB of total memory, and -> 50 GiB of free memory after loading model weights and executing -> profiling run, `gpu_memory_utilization` at its default value will mark -> 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total -> device memory. - -User can also configure the strategy for capturing HPU Graphs for prompt -and decode stages separately. Strategy affects the order of capturing -graphs. There are two strategies implemented: - `max_bs` - graph capture -queue will sorted in descending order by their batch sizes. Buckets with -equal batch sizes are sorted by sequence length in ascending order (e.g. -`(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, -`(1,256)`), default strategy for decode - `min_tokens` - graph capture -queue will be sorted in ascending order by the number of tokens each -graph processes (`batch_size*sequence_length`), default strategy for -prompt - -When there\'s large amount of requests pending, vLLM scheduler will -attempt to fill the maximum batch size for decode as soon as possible. -When a request is finished, decode batch size decreases. When that -happens, vLLM will attempt to schedule a prefill iteration for requests -in the waiting queue, to fill the decode batch size to its previous -state. This means that in a full load scenario, decode batch size is -often at its maximum, which makes large batch size HPU Graphs crucial to -capture, as reflected by `max_bs` strategy. On the other hand, prefills -will be executed most frequently with very low batch sizes (1-4), which -is reflected in `min_tokens` strategy. +> [!TIP] +> Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. -> [!NOTE] -> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by -> graphs for each stage (prefill and decode). vLLM will first attempt to -> use up entirety of usable prefill graph memory (usable graph memory \* -> `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it -> will attempt do the same for decode graphs and usable decode graph -> memory pool. If one stage is fully captured, and there is unused memory -> left within usable graph memory pool, vLLM will attempt further graph -> capture for the other stage, until no more HPU Graphs can be captured -> without exceeding reserved memory pool. The behavior on that mechanism -> can be observed in the example below. - -Each described step is logged by vLLM server, as follows (negative -values correspond to memory being released): - -``` {.} +## HPU Graph capture + +[HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. 
Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by `gpu_memory_utilization` flag (`0.9` by default). Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. Only after that, `gpu_memory_utilization` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of memory reserved for HPU Graphs capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints. Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +> [!NOTE] +> `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: - `max_bs` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. `(64, 128)`, `(64, 256)`, `(32, 128)`, `(32, 256)`, `(1, 128)`, `(1,256)`), default strategy for decode - `min_tokens` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (`batch_size*sequence_length`), default strategy for prompt + +When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy. + +> [!NOTE] +> `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). 
vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +```{.} INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -405,111 +219,59 @@ INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, alloca INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) ``` -Recommended vLLM Parameters ------------------------------ +## Recommended vLLM Parameters -- We recommend running inference on Gaudi 2 with `block_size` of 128 - for BF16 data type. Using default values (16, 32) might lead to - sub-optimal performance due to Matrix Multiplication Engine - under-utilization (see [Gaudi - Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). -- For max throughput on Llama 7B, we recommend running with batch size - of 128 or 256 and max context length of 2048 with HPU Graphs - enabled. If you encounter out-of-memory issues, see troubleshooting - section. +- We recommend running inference on Gaudi 2 with `block_size` of 128 for BF16 data type. Using default values (16, 32) might lead to sub-optimal performance due to Matrix Multiplication Engine under-utilization (see [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html)). +- For max throughput on Llama 7B, we recommend running with batch size of 128 or 256 and max context length of 2048 with HPU Graphs enabled. If you encounter out-of-memory issues, see troubleshooting section. -Environment variables ------------------------------ +## Environment variables **Diagnostic and profiling knobs:** -- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be - enabled. Resulting JSON traces can be viewed in - [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled - by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph - compilations per each vLLM engine step, only when there was any - - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. - Disabled by default. -- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph - compilations per each vLLM engine step, always, even if there were - none. Disabled by default. -- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks - per each vLLM engine step, only when there was any. Disabled by - default. 
-- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu - fallbacks per each vLLM engine step, always, even if there were - none. Disabled by default. +- `VLLM_PROFILER_ENABLED`: if `true`, high level profiler will be enabled. Resulting JSON traces can be viewed in [perfetto.habana.ai](https://perfetto.habana.ai/#!/viewer). Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION`: if `true`, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside `PT_HPU_METRICS_GC_DETAILS=1`. Disabled by default. +- `VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL`: if `true`, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS`: if `true`, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- `VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL`: if `true`, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. **Performance tuning knobs:** -- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by - default -- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for - HPUGraph capture, `0.1` by default -- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory - dedicated for prompt graphs, `0.3` by default -- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt - graph capture, `min_tokens` or `max_bs`, `min_tokens` by default -- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode - graph capture, `min_tokens` or `max_bs`, `max_bs` by default -- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment - variables configuring ranges of bucketing mechanism - - `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS`, `SEQ` or `BLOCK` - - `{param}` is either `MIN`, `STEP` or `MAX` - - Default values: - - Prompt: - - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): - `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): - `block_size` - - sequence length step - (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): - `max_model_len` - - - Decode: - - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): - `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): - `max_num_seqs` - - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): - `block_size` - - block size step - (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): - `max(128, (max_num_seqs*max_model_len)/block_size)` - -Additionally, there are HPU PyTorch Bridge environment variables -impacting vLLM execution: - -- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be - used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is - default -- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor - parallel inference with HPU Graphs - -Troubleshooting: Tweaking HPU Graphs -==================================== - -If you experience device out-of-memory issues or want to attempt -inference at higher batch sizes, try tweaking HPU Graphs by following -the below: - -- Tweak `gpu_memory_utilization` knob. 
It will decrease the allocation - of KV cache, leaving some headroom for capturing graphs with larger - batch size. By default `gpu_memory_utilization` is set to 0.9. It - attempts to allocate \~90% of HBM left for KV cache after short - profiling run. Note that decreasing reduces the number of KV cache - blocks you have available, and therefore reduces the effective - maximum number of tokens you can handle at a given time. -- If this method is not efficient, you can disable `HPUGraph` - completely. With HPU Graphs disabled, you are trading latency and - throughput at lower batches for potentially higher throughput on - higher batches. You can do that by adding `--enforce-eager` flag to - server (for online inference), or by passing `enforce_eager=True` - argument to LLM constructor (for offline inference). +- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by default +- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for HPUGraph capture, `0.1` by default +- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory dedicated for prompt graphs, `0.3` by default +- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt graph capture, `min_tokens` or `max_bs`, `min_tokens` by default +- `VLLM_GRAPH_DECODE_STRATEGY`: strategy determining order of decode graph capture, `min_tokens` or `max_bs`, `max_bs` by default +- `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism + - `{phase}` is either `PROMPT` or `DECODE` + - `{dim}` is either `BS`, `SEQ` or `BLOCK` + - `{param}` is either `MIN`, `STEP` or `MAX` + - Default values: + - Prompt: + + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + + - Decode: + + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default +- `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs + +# Troubleshooting: Tweaking HPU Graphs + +If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: + +- Tweak `gpu_memory_utilization` knob. It will decrease the allocation of KV cache, leaving some headroom for capturing graphs with larger batch size. By default `gpu_memory_utilization` is set to 0.9. It attempts to allocate ~90% of HBM left for KV cache after short profiling run. Note that decreasing reduces the number of KV cache blocks you have available, and therefore reduces the effective maximum number of tokens you can handle at a given time. 
+- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to server (for online inference), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). From 2d2bf7a03d3d7461c21029910370379644d22aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Kot=C5=82owski?= Date: Mon, 14 Oct 2024 14:32:12 +0200 Subject: [PATCH 281/819] [CI] Prepare separate Jenkins tests for torch compile mode (#388) --- .jenkins/test_config_t_compile.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .jenkins/test_config_t_compile.yaml diff --git a/.jenkins/test_config_t_compile.yaml b/.jenkins/test_config_t_compile.yaml new file mode 100644 index 0000000000000..58fcb45a7edfb --- /dev/null +++ b/.jenkins/test_config_t_compile.yaml @@ -0,0 +1,16 @@ +# test_config_t_compile.yaml +stages: + - name: test_gsm8k_small_models_tcompile + steps: + - name: gsm8k_small_g3_tp1_tc + flavor: g3 + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1 + - name: gsm8k_small_g3_tp2_tc + flavor: g3.s + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2 + - name: gsm8k_small_g2_tp1_tc + flavor: g2 + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 1 + - name: gsm8k_small_g2_tp2_tc + flavor: g2.s + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2 From 9df1d4abda18c7b0eec4c8e4edf9437fbb3f7ea0 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 14 Oct 2024 18:04:13 +0530 Subject: [PATCH 282/819] Remove workaround added to resolve multi-card stall issue (#387) This PR removes additional `multiprocessing.Process` object created as a workaround for resolving multi-card stall issue. 
--- tests/lora/test_llama_hpu.py | 18 +++--------------- tests/lora/test_multilora_hpu.py | 19 +++---------------- 2 files changed, 6 insertions(+), 31 deletions(-) diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py index dfd551f2ca043..5571d727ef8e2 100644 --- a/tests/lora/test_llama_hpu.py +++ b/tests/lora/test_llama_hpu.py @@ -1,4 +1,3 @@ -from multiprocessing import Process from typing import List from conftest import cleanup @@ -78,23 +77,12 @@ def _test_llama_lora(sql_lora_files, tp_size): def test_llama_lora_1x(sql_lora_files): - p = Process(target=_test_llama_lora, args=(sql_lora_files, 1)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_lora(sql_lora_files, 1) def test_llama_lora_2x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_lora, args=(sql_lora_files, 2)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_lora(sql_lora_files, 2) def test_llama_lora_4x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_lora, args=(sql_lora_files, 4)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_lora(sql_lora_files, 4) diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py index 64eda037ff059..d035761923dd6 100644 --- a/tests/lora/test_multilora_hpu.py +++ b/tests/lora/test_multilora_hpu.py @@ -1,4 +1,3 @@ -from multiprocessing import Process from typing import List, Optional, Tuple from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams @@ -107,24 +106,12 @@ def _test_llama_multilora(sql_lora_files, tp_size): def test_llama_multilora_1x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_multilora(sql_lora_files, 1) def test_llama_multilora_2x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_multilora(sql_lora_files, 2) def test_llama_multilora_4x(sql_lora_files): - # Work-around to resolve stalling issue in multi-card scenario - p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4)) - p.start() - p.join() - assert p.exitcode == 0 + _test_llama_multilora(sql_lora_files, 4) From 9777c9f8538b497d5f6cb986d4535db0185edf49 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 14 Oct 2024 14:46:11 +0200 Subject: [PATCH 283/819] Update SynapseAI version in README & Dockerfile (#390) --- Dockerfile.hpu | 4 +--- README_GAUDI.md | 8 ++++---- docs/source/getting_started/gaudi-installation.rst | 8 ++++---- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Dockerfile.hpu b/Dockerfile.hpu index ab714cdac4670..f481c8c6a57bf 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -1,4 +1,4 @@ -FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest COPY ./ /workspace/vllm @@ -13,6 +13,4 @@ RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install WORKDIR /workspace/ -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks - ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/README_GAUDI.md b/README_GAUDI.md index 
08458251a753d..555cd1738b909 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -11,7 +11,7 @@ Please follow the instructions provided in the [Gaudi Installation Guide](https: - OS: Ubuntu 22.04 LTS - Python: 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 +- Intel Gaudi software version 1.18.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -31,8 +31,8 @@ It is highly recommended to use the latest Docker image from Intel Gaudi vault. Use the following commands to run a Docker image: ```{.console} -$ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest +$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` ## Build and Install vLLM @@ -98,7 +98,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected | 1 | 1 | PyTorch lazy mode | > [!WARNING] -> In 1.17.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. +> In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. ## Bucketing mechanism diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index c9df862197f0a..111bab2494990 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -18,7 +18,7 @@ Requirements - OS: Ubuntu 22.04 LTS - Python: 3.10 - Intel Gaudi accelerator -- Intel Gaudi software version 1.17.0 +- Intel Gaudi software version 1.18.0 To verify that the Intel Gaudi software was correctly installed, run: @@ -45,8 +45,8 @@ Use the following commands to run a Docker image: .. code:: console - $ docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest + $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest Build and Install vLLM --------------------------- @@ -157,7 +157,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected - PyTorch lazy mode .. 
warning:: - In 1.17.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.17.0, please use HPU Graphs, or PyTorch lazy mode. + In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. Bucketing mechanism From 9ac52ab11186926530648385135b8a8f7eadfe7f Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 14 Oct 2024 17:01:02 +0300 Subject: [PATCH 284/819] fix attention backend selector: --- vllm/worker/hpu_model_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 99dc326612588..f81e4aa59b289 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -541,13 +541,12 @@ def __init__( self.kv_cache_dtype = kv_cache_dtype self.attn_backend = get_attn_backend( - self.model_config.get_num_attention_heads(self.parallel_config), self.model_config.get_head_size(), - self.model_config.get_num_kv_heads(self.parallel_config), self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, + self.model_config.is_attention_free, ) # Lazy initialization From 55dd07e949db7fb2839c4d91b175ea76985a3257 Mon Sep 17 00:00:00 2001 From: Dudi Lester <160421192+dudilester@users.noreply.github.com> Date: Tue, 15 Oct 2024 10:46:45 +0300 Subject: [PATCH 285/819] enable mixtral quantization using INC (#372) --- requirements-hpu.txt | 2 +- vllm/executor/ray_hpu_executor.py | 3 + vllm/model_executor/layers/fused_moe/layer.py | 97 +++++++------------ 3 files changed, 37 insertions(+), 65 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 8f7f0339b02e3..8495d63ce72fa 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@36c7f9c +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@7531cc6 diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 343fa43b0eda1..775c0a5d95899 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -78,6 +78,9 @@ def shutdown(self) -> None: ray.kill(worker) self.forward_dag = None + def finish_measurements(self): + self._run_workers("finish_measurements") + def _get_worker_module_and_class( self ) -> Tuple[str, str, Optional[Callable[[], diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 865f5c6aad1eb..457450cda2ce6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -14,6 +14,8 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +is_hpu = current_platform.is_hpu() + logger = init_logger(__name__) @@ -262,7 +264,7 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, expert_data: torch.Tensor, shard_id: str, loaded_weight: torch.tensor, - tp_rank: int): + tp_rank: int, expert_id: int): # Load grouped weight scales for group quantization # or model weights if shard_id == "w2": @@ -270,13 
+272,15 @@ def _load_model_weight_or_group_weight_scale(self, shard_dim: int, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) elif shard_id in ("w1", "w3"): self._load_w13(shard_id=shard_id, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, shard_dim: int, shard_id: str, @@ -292,9 +296,15 @@ def _load_per_channel_weight_scale(self, expert_data: torch.Tensor, expert_data=expert_data, tp_rank=tp_rank) - def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + def _load_w13(self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.tensor, + tp_rank: int, + expert_id: Optional[int] = None): + orig_exp_data = expert_data.view(expert_data.size()) # Index the loaded weight for tp sharding. # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim shard_size = expert_data.shape[shard_dim] // 2 @@ -310,8 +320,17 @@ def _load_w13(self, expert_data: torch.Tensor, shard_dim: int, expert_data = expert_data.narrow(shard_dim, shard_size, shard_size) expert_data.copy_(loaded_weight) - def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int): + if is_hpu: + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + orig_exp_data) + + def _load_w2(self, + expert_data: torch.Tensor, + shard_dim: int, + shard_id: str, + loaded_weight: torch.tensor, + tp_rank: int, + expert_id: Optional[int] = None): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim @@ -321,6 +340,9 @@ def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, shard_size) # w2, down_proj: Load into only logical weight of w2. expert_data.copy_(loaded_weight) + if is_hpu: + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + expert_data) def _load_single_value(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int): @@ -423,7 +445,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: self._load_per_tensor_weight_scale(shard_id=shard_id, param=param, @@ -449,7 +472,8 @@ def weight_loader(self, param: torch.nn.Parameter, shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, - tp_rank=tp_rank) + tp_rank=tp_rank, + expert_id=expert_id) return @staticmethod @@ -528,58 +552,3 @@ def make_expert_params_mapping( ("w3", ckpt_up_proj_name), ] ] - - def _load_fp8_scale(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor, weight_name: str, - shard_id: str, expert_id: int) -> None: - param_data = param.data - - # Input scales can be loaded directly and should be equal. - if "input_scale" in weight_name: - if param_data[expert_id] != 1 and (param_data[expert_id] - - loaded_weight).abs() > 1e-5: - raise ValueError( - "input_scales of w1 and w3 of a layer " - f"must be equal. But got {param_data[expert_id]} " - f"vs. 
{loaded_weight}") - param_data[expert_id] = loaded_weight - # Weight scales - elif "weight_scale" in weight_name: - # If we are in merged column case (gate_up_proj) - if shard_id in ("w1", "w3"): - # We have to keep the weight scales of w1 and w3 because - # we need to re-quantize w1/w3 weights after weight loading. - idx = 0 if shard_id == "w1" else 1 - param_data[expert_id][idx] = loaded_weight - # If we are in the row parallel case (down_proj) - else: - param_data[expert_id] = loaded_weight - # Weights - else: - tp_rank = get_tensor_model_parallel_rank() - shard_size = self.intermediate_size_per_partition - shard = slice(tp_rank * shard_size, (tp_rank + 1) * shard_size) - - # w1, gate_proj case: Load into first shard of w13. - if shard_id == 0: - param_data[expert_id, - 0:shard_size, :] = loaded_weight[shard, :] - if current_platform.is_hpu(): - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - param_data[expert_id]) - # w3, up_proj case: Load into second shard of w13. - elif shard_id == 2: - param_data[expert_id, shard_size:2 * - shard_size, :] = loaded_weight[shard, :] - if current_platform.is_hpu(): - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - param_data[expert_id]) - # w2, down_proj case: Load into only shard of w2. - elif shard_id == 1: - param_data[expert_id, :, :] = loaded_weight[:, shard] - if current_platform.is_hpu(): - self.hpu_static_fused_moe.w2_list[expert_id].set_weight( - param_data[expert_id]) - else: - raise ValueError( - f"Shard id must be in [0,1,2] but got {shard_id}") From 401f5ae3d339b1b0402b7f276905ef28d4ba0b21 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 15 Oct 2024 12:05:57 +0200 Subject: [PATCH 286/819] [CI] Temporarily increase test tolerances (#392) This PR raises the allowed relative tolerance in GSM8K to 0.06, and moves Llama-70B test to 4xG2 from 2xG2 until memory usage is investigated (success run: vLLM-CI-Pipeline/206) --- .jenkins/lm-eval-harness/test_lm_eval_correctness.py | 2 +- .jenkins/test_config.yaml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index 9c6d0ee48caf5..421a949ab72e5 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -19,7 +19,7 @@ import vllm -RTOL = 0.05 +RTOL = 0.06 TEST_DATA_FILE = os.environ.get( "LM_EVAL_TEST_DATA_FILE", ".jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml index 99ff97df8cd34..f90cdb354d4f5 100644 --- a/.jenkins/test_config.yaml +++ b/.jenkins/test_config.yaml @@ -19,6 +19,6 @@ stages: - name: gsm8k_large_g3_tp2 flavor: g3.s command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 2 - - name: gsm8k_large_g2_tp2 - flavor: g2.s - command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 2 + - name: gsm8k_large_g2_tp4 + flavor: g2.m + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4 From e598f3f125a50326e8f187ce59b096129aab40eb Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 15 Oct 2024 12:07:00 +0200 Subject: [PATCH 287/819] Add quickstart section to READMEs (#391) --- README_GAUDI.md | 21 ++++++++++++--- .../getting_started/gaudi-installation.rst | 27 ++++++++++++++++--- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/README_GAUDI.md b/README_GAUDI.md index 
555cd1738b909..b9c744bd9e23f 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -4,7 +4,7 @@ This README provides instructions on running vLLM with Intel Gaudi devices. # Requirements and Installation -Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +Please follow the instructions provided in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html) to set up the execution environment. To achieve the best performance, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). ## Requirements @@ -13,18 +13,31 @@ Please follow the instructions provided in the [Gaudi Installation Guide](https: - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 +## Quick start using Dockerfile +``` +$ docker build -f Dockerfile.hpu -t vllm-hpu-env . +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +``` + +> [!TIP] +> If you're facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered correctly. + + +## Build from source + +### Environment verification To verify that the Intel Gaudi software was correctly installed, run: ```{.console} $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core and habanalabs-thunk are installed +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed $ pip list | grep neural # verify that neural-compressor is installed ``` Refer to [Intel Gaudi Software Stack Verification](https://docs.habana.ai/en/latest/Installation_Guide/SW_Verification.html#platform-upgrade) for more details. -## Run Docker Image +### Run Docker Image It is highly recommended to use the latest Docker image from Intel Gaudi vault. Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Installation_Guide/Bare_Metal_Fresh_OS.html#pull-prebuilt-containers) for more details. 
@@ -35,7 +48,7 @@ $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` -## Build and Install vLLM +### Build and Install vLLM-fork Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst index 111bab2494990..c943625a65f29 100644 --- a/docs/source/getting_started/gaudi-installation.rst +++ b/docs/source/getting_started/gaudi-installation.rst @@ -8,8 +8,8 @@ Requirements and Installation Please follow the instructions provided in the `Gaudi Installation Guide `__ -to set up the environment. To achieve the best performance, please -follow the methods outlined in the `Optimizing Training Platform +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the `Optimizing Training Platform Guide `__. Requirements @@ -20,12 +20,31 @@ Requirements - Intel Gaudi accelerator - Intel Gaudi software version 1.18.0 + +Quick start using Dockerfile +============================ +.. code:: console + + $ docker build -f Dockerfile.hpu -t vllm-hpu-env . + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + + +.. 
tip:: + If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation Date: Wed, 16 Oct 2024 09:40:20 +0200 Subject: [PATCH 288/819] Softmax: add weighted-sum normalization (#378) Supporting PR for https://github.com/HabanaAI/vllm-hpu-extension/pull/10 --- requirements-hpu.txt | 2 +- vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/ops/hpu_paged_attn.py | 1 + vllm/worker/hpu_model_runner.py | 45 +++++++++++++++++++++------- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 8495d63ce72fa..1a583974be151 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@7531cc6 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 17201fe6e1cd6..a8f4b09b67274 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -222,6 +222,7 @@ def forward( block_list=attn_metadata.block_list, block_mapping=attn_metadata.block_mapping, block_bias=attn_metadata.attn_bias, + block_scales=attn_metadata.block_scales, scale=self.scale, matmul_qk_op=self.matmul_qk, matmul_av_op=self.matmul_av, diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 7fbe26d83f320..4c0fb2a628361 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -20,6 +20,7 @@ class HPUPagedAttentionMetadata: block_usage: Optional[torch.Tensor] block_indices: Optional[torch.Tensor] block_offsets: Optional[torch.Tensor] + block_scales: Optional[torch.Tensor] class HPUPagedAttention: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f81e4aa59b289..d8150a56844a2 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -298,9 +298,19 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype): mask = mask >= metadata.block_usage.unsqueeze(-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - block_mapping = torch.nn.functional.one_hot( - metadata.block_mapping.to(torch.long), - num_classes=batch_size).to(dtype) + if is_fake_hpu(): + # Unfortunately one_hot on CPU doesn't handle + # out of bounds classes. 
We need to mask those + # values manually + oob_values = metadata.block_mapping.lt(0) + block_mapping = metadata.block_mapping.masked_fill(oob_values, 0) + block_mapping = torch.nn.functional.one_hot(block_mapping, + num_classes=batch_size) + block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) + else: + block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, + num_classes=batch_size) + block_mapping = block_mapping.to(dtype) metadata = metadata._replace(block_mapping=block_mapping, attn_bias=attn_bias) return metadata @@ -873,6 +883,7 @@ def _prepare_prompt( block_usage=None, block_indices=block_indices, block_offsets=block_offsets, + block_scales=None, attn_bias=None, seq_lens_tensor=seq_lens_tensor, num_prefills=real_num_seqs, @@ -968,7 +979,15 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) blocks_used = [len(bt) for bt in block_tables if bt] - block_list = list(itertools.chain(*block_tables)) + block_list = [] + block_scales = [] + for i, bt in enumerate(block_tables): + block_list.extend(bt) + blocks_in_group = len(bt) + if blocks_in_group > 0: + scale = 1.0 / blocks_in_group + block_scales.extend([scale] * blocks_in_group) + block_mapping_nested: List[List[int]] = [ [i] * b_u for i, b_u in enumerate(blocks_used) ] @@ -984,18 +1003,19 @@ def _prepare_decode( block_bucket_size = find_bucket(len(block_list), self.decode_block_bucket_cfg) - block_list = pad_list(block_list, block_bucket_size, _PAD_SLOT_ID) - block_mapping = pad_list(block_mapping, block_bucket_size, 0) - block_usage = pad_list(block_usage, block_bucket_size, 0) + block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) + block_mapping = pad_list(block_mapping, block_bucket_size, -1) + block_usage = pad_list(block_usage, block_bucket_size, 1) + block_scales = pad_list(block_scales, block_bucket_size, 0.0) block_list = torch.tensor(block_list, dtype=torch.int, device=self.device) block_mapping = torch.tensor(block_mapping, - dtype=torch.int, + dtype=torch.long, device=self.device) block_usage = torch.tensor(block_usage, - dtype=torch.bfloat16, + dtype=self.model_config.dtype, device=self.device) slot_mapping = torch.tensor(slot_mapping, @@ -1004,6 +1024,10 @@ def _prepare_decode( block_indices, block_offsets = precompute_indices_and_offsets( self.block_size, slot_mapping, False) + block_scales = torch.tensor(block_scales, + dtype=self.model_config.dtype, + device=self.device) + attn_metadata = self.attn_backend.make_metadata( is_prompt=False, block_list=block_list, @@ -1011,6 +1035,7 @@ def _prepare_decode( block_usage=block_usage, block_indices=block_indices, block_offsets=block_offsets, + block_scales=block_scales, attn_bias=None, seq_lens_tensor=None, num_prefills=0, @@ -1222,7 +1247,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', - 'block_offsets' + 'block_offsets', 'block_scales' ]) return attention_metadata From a59fc7b481b1807f27de1165383b6e10476850d2 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 16 Oct 2024 19:30:58 +0200 Subject: [PATCH 289/819] Remove HPU changes from cache_engine.py (#400) We were asked on upstream PR to remove our changes from cache_engine.py. This PR does just that, and creates HPUCacheEngine inheriting from CacheEngine, just overriding _allocate_kv_cache method. 
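For readers skimming the series, a minimal sketch of the inheritance pattern this commit introduces (toy names and shapes, not the actual vLLM classes; the real code is in the diff below). Only the allocation step is overridden, so the rest of the cache engine's behavior is inherited unchanged:

```python
from typing import List, Tuple

import torch


class CacheEngineSketch:
    """Stand-in for the upstream CacheEngine (simplified)."""

    num_layers = 2
    kv_shape = (4, 16)  # toy sizes, not real vLLM cache shapes

    def _allocate_kv_cache(self, device: str) -> List[torch.Tensor]:
        # Upstream path: one fused, zeroed tensor per attention layer.
        return [
            torch.zeros((2, *self.kv_shape), device=device)
            for _ in range(self.num_layers)
        ]


class HPUCacheEngineSketch(CacheEngineSketch):
    """Overrides allocation only; swap/copy logic stays in the base class."""

    def _allocate_kv_cache(
            self, device: str) -> List[Tuple[torch.Tensor, torch.Tensor]]:
        # HPU path: an explicit (key_cache, value_cache) tuple per layer.
        return [(torch.zeros(self.kv_shape, device=device),
                 torch.zeros(self.kv_shape, device=device))
                for _ in range(self.num_layers)]
```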
--- vllm/worker/cache_engine.py | 30 +++++++++--------------------- vllm/worker/hpu_worker.py | 35 +++++++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py index 9618585c8acb0..090f95e6e892c 100644 --- a/vllm/worker/cache_engine.py +++ b/vllm/worker/cache_engine.py @@ -6,7 +6,7 @@ from vllm.attention import get_attn_backend from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig from vllm.logger import init_logger -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu, +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_pin_memory_available) logger = init_logger(__name__) @@ -75,26 +75,14 @@ def _allocate_kv_cache( pin_memory = is_pin_memory_available() if device == "cpu" else False kv_cache: List[torch.Tensor] = [] for _ in range(self.num_attention_layers): - if device == 'hpu' or is_fake_hpu(): - key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) - value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) - kv_layer = (key_cache, value_cache) - kv_cache.append(kv_layer) - else: - # null block in CpuGpuBlockAllocator requires at least that - # block to be zeroed-out. - # We zero-out everything for simplicity. - dtype = torch.uint8 if self.dtype == torch.float8_e4m3fn else \ - self.dtype - kv_cache.append( - torch.zeros(kv_cache_shape, - dtype=dtype, - pin_memory=pin_memory, - device=device)) + # null block in CpuGpuBlockAllocator requires at least that + # block to be zeroed-out. + # We zero-out everything for simplicity. + kv_cache.append( + torch.zeros(kv_cache_shape, + dtype=self.dtype, + pin_memory=pin_memory, + device=device)) return kv_cache def swap_in(self, src_to_dst: torch.Tensor) -> None: diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 59a5adf65ebc1..752388e0d632f 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -93,7 +93,7 @@ def __init__( observability_config=observability_config) # Uninitialized cache engine. Will be initialized by # initialize_cache. - self.cache_engine: List[CacheEngine] + self.cache_engine: List[HPUCacheEngine] # Initialize gpu_cache as embedding models don't initialize kv_caches self.hpu_cache: Optional[List[List[torch.tensor]]] = None # Torch profiler. Enabled and configured through env vars: @@ -242,8 +242,8 @@ def initialize_cache(self, num_gpu_blocks: int, def _init_cache_engine(self): assert self.cache_config.num_gpu_blocks is not None self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) + HPUCacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) for _ in range(self.parallel_config.pipeline_parallel_size) ] self.hpu_cache = [ @@ -358,9 +358,9 @@ def vocab_size(self) -> int: def get_cache_block_size_bytes(self) -> int: """Get the size of the KV cache block size in bytes. """ - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) + return HPUCacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) def init_worker_distributed_environment( @@ -423,3 +423,26 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, f"stored in KV cache ({max_seq_len}). 
Try increasing " "`gpu_memory_utilization` or decreasing `max_model_len` when " "initializing the engine.") + + +class HPUCacheEngine(CacheEngine): + + def _allocate_kv_cache( + self, + num_blocks: int, + device: str, + ) -> List[Tuple[torch.Tensor, torch.Tensor]]: + """Allocates KV cache on the specified device.""" + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size) + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + for _ in range(self.num_attention_layers): + key_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + value_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + kv_layer = (key_cache, value_cache) + kv_cache.append(kv_layer) + return kv_cache From 05bcdf5e169be9d746ff4c9d6163fff9f4b310b9 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 12:18:10 +0200 Subject: [PATCH 290/819] [bucketing overhaul 1/n] Add padding-aware scheduling and option to limit prefill batch size (#394) This PR adds following functionality that can be enabled via engine flags: - use_padding_aware_scheduling - vLLM scheduler will now calculate token cost considering padded prefill shape (similar to https://github.com/HabanaAI/vllm-fork/pull/109). - max_num_prefill_seqs - padding-aware scheduler will perform an additional check for prefill batch size and will effectively limit prefill batch size at maximum of `max_num_prefill_seqs`. If unset, max prefill batch size will be `max_num_seqs`. Both features are generic and do not require HPU, although they may be specialized for particular vendor's usage. Padding aware scheduling includes padding function selector which selects HPU padding function (considering currently used HPU buckets) if current device is HPU. Otherwise, it will take a product of batch_size x max_seq_len. --- vllm/config.py | 18 ++++- vllm/core/scheduler.py | 122 ++++++++++++++++++++++++++-- vllm/engine/arg_utils.py | 19 ++++- vllm/worker/hpu_model_runner.py | 137 ++++++++++++++++++++------------ 4 files changed, 238 insertions(+), 58 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5499b349bcfc8..67a4ec0761cc3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -940,6 +940,9 @@ class SchedulerConfig: a single iteration. max_num_seqs: Maximum number of sequences to be processed in a single iteration. + max_num_prefill_seqs: Maximum number of prefill sequences to be + processed in a single iteration. Used only with padding-aware + scheduling. max_model_len: Maximum length of a sequence (including prompt and generated text). use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not. @@ -963,11 +966,14 @@ class SchedulerConfig: when SPMD worker architecture is enabled. I.e., VLLM_USE_RAY_SPMD_WORKER=1 policy: The scheduling policy to use. "fcfs" (default) or "priority". + use_padding_aware_scheduling: If True, scheduler will consider padded + tokens in prefill. 
""" def __init__(self, max_num_batched_tokens: Optional[int], max_num_seqs: int, + max_num_prefill_seqs: Optional[int], max_model_len: int, use_v2_block_manager: bool = True, num_lookahead_slots: int = 0, @@ -979,7 +985,8 @@ def __init__(self, num_scheduler_steps: int = 1, multi_step_stream_outputs: bool = False, send_delta_data: bool = False, - policy: str = "fcfs") -> None: + policy: str = "fcfs", + use_padding_aware_scheduling=False) -> None: if max_num_batched_tokens is None: if enable_chunked_prefill: if num_scheduler_steps > 1: @@ -1018,6 +1025,7 @@ def __init__(self, self.max_num_batched_tokens) self.max_num_seqs = max_num_seqs + self.max_num_prefill_seqs = max_num_prefill_seqs self.max_model_len = max_model_len self.use_v2_block_manager = use_v2_block_manager self.num_lookahead_slots = num_lookahead_slots @@ -1029,6 +1037,7 @@ def __init__(self, self.multi_step_stream_outputs = multi_step_stream_outputs self.send_delta_data = send_delta_data self.policy = policy + self.use_padding_aware_scheduling = use_padding_aware_scheduling self._verify_args() def _verify_args(self) -> None: @@ -1059,6 +1068,13 @@ def _verify_args(self) -> None: "num_scheduler_steps " f"({self.num_scheduler_steps}) must be greater than or " "equal to 1.") + if self.max_num_prefill_seqs is not None \ + and not self.use_padding_aware_scheduling: + raise ValueError("max_num_prefill_seqs can be only " + "used with padding-aware-scheduling. ") + if self.use_padding_aware_scheduling and self.chunked_prefill_enabled: + raise ValueError("Padding-aware scheduling currently " + "does not work with chunked prefill ") if (not self.use_v2_block_manager \ and not envs.VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1): diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 1f0a121711db5..1c69c72933b79 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -11,6 +11,7 @@ from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata, SequenceGroupMetadataDelta, @@ -101,6 +102,94 @@ def num_curr_seqs(self): return self._num_curr_seqs +@dataclass +class PaddingAwareSchedulingBudget(SchedulingBudget): + max_num_prefill_seqs: Optional[int] = None + _prefill_request_ids_max_seq_lens: Dict[str, + int] = field(default_factory=dict) + _max_seq_len: int = 0 + _num_curr_prefill_seqs: int = 0 + + def _generic_padding_fn(self, batch_size, max_seq_len) -> int: + return batch_size * max_seq_len + + def _hpu_padding_fn(self, batch_size, max_seq_len): + from vllm.worker.hpu_model_runner import (HPUBucketingGlobalState, + find_bucket) + padded_bs = batch_size + padded_seq = max_seq_len + + hpu_bucketing_global_state = HPUBucketingGlobalState() + + bs_cfg = hpu_bucketing_global_state.prompt_bs_bucket_cfg + if bs_cfg is not None: + padded_bs = find_bucket(batch_size, bs_cfg) + else: + logger.warning( + "prompt_bs_bucket_cfg was not set! Using unpadded batch size.") + seq_cfg = hpu_bucketing_global_state.prompt_seq_bucket_cfg + if seq_cfg is not None: + padded_seq = find_bucket(max_seq_len, seq_cfg) + else: + logger.warning("prompt_seq_bucket_cfg was not set! 
" + "Using unpadded sequence length.") + return padded_bs * padded_seq + + def _padding_fn_selector(self): + if current_platform.is_hpu(): + return self._hpu_padding_fn + return self._generic_padding_fn + + def _maybe_update_max_seq_len(self, + new_seq_max_seq_len: Optional[int] = None): + if new_seq_max_seq_len is not None \ + and new_seq_max_seq_len > self._max_seq_len: + self._max_seq_len = new_seq_max_seq_len + return + self._max_seq_len = max( + self._prefill_request_ids_max_seq_lens.values()) + + def add_prefill_seqs(self, req_id, num_curr_prefill_seqs, max_seq_len): + self._prefill_request_ids_max_seq_lens[req_id] = max_seq_len + self._num_curr_prefill_seqs += num_curr_prefill_seqs + self._maybe_update_max_seq_len(max_seq_len) + + def subtract_prefill_seqs(self, req_id, num_curr_prefill_seqs): + if req_id in self._prefill_request_ids_max_seq_lens: + popped_seq_len = self._prefill_request_ids_max_seq_lens.pop(req_id) + self._num_curr_prefill_seqs -= num_curr_prefill_seqs + if popped_seq_len == self._max_seq_len: + self._maybe_update_max_seq_len() + + def can_schedule(self, + *args, + num_new_tokens: int, + num_new_seqs: int, + is_prefill: bool = False, + max_seq_len: int = 0): + can_parent_schedule = super().can_schedule( + *args, num_new_tokens=num_new_tokens, num_new_seqs=num_new_seqs) + if not can_parent_schedule or not is_prefill: + return can_parent_schedule + new_batch_size = self._num_curr_prefill_seqs + num_new_seqs + new_max_seq_len = max(max(self._max_seq_len, max_seq_len), 1) + padding_fn = self._padding_fn_selector() + num_new_padded_tokens = padding_fn(new_batch_size, new_max_seq_len) + result = num_new_padded_tokens <= self.token_budget + if self.max_num_prefill_seqs is not None and result: + result = self._num_curr_prefill_seqs + num_new_seqs \ + <= self.max_num_prefill_seqs + return result + + @property + def max_seq_len(self): + return self._max_seq_len + + @property + def num_curr_prefill_seqs(self): + return self._num_curr_prefill_seqs + + @dataclass class ScheduledSequenceGroup: # A sequence group that's scheduled. @@ -938,9 +1027,18 @@ def _schedule_prefills( continue num_new_seqs = seq_group.get_max_num_running_seqs() + max_prefill_seq_len = None + can_schedule_kwargs = { + 'num_new_tokens': num_new_tokens, + 'num_new_seqs': num_new_seqs + } + if self.scheduler_config.use_padding_aware_scheduling: + max_prefill_seq_len = max( + [seq.get_num_new_tokens() for seq in seq_group.get_seqs()]) + can_schedule_kwargs['is_prefill'] = True + can_schedule_kwargs['max_seq_len'] = max_prefill_seq_len if (num_new_tokens == 0 - or not budget.can_schedule(num_new_tokens=num_new_tokens, - num_new_seqs=num_new_seqs)): + or not budget.can_schedule(**can_schedule_kwargs)): break # Can schedule this request. @@ -971,6 +1069,10 @@ def _schedule_prefills( token_chunk_size=num_new_tokens)) budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens) budget.add_num_seqs(seq_group.request_id, num_new_seqs) + if self.scheduler_config.use_padding_aware_scheduling: + assert isinstance(budget, PaddingAwareSchedulingBudget) + budget.add_prefill_seqs(seq_group.request_id, num_new_seqs, + max_prefill_seq_len) # Queue requests that couldn't be scheduled. waiting_queue.extendleft(leftover_waiting_sequences) @@ -992,10 +1094,18 @@ def _schedule_default(self) -> SchedulerOutputs: be swapped or preempted. """ # Include running requests to the budget. 
- budget = SchedulingBudget( - token_budget=self.scheduler_config.max_num_batched_tokens, - max_num_seqs=self.scheduler_config.max_num_seqs, - ) + budget: SchedulingBudget + if self.scheduler_config.use_padding_aware_scheduling: + budget = PaddingAwareSchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + max_num_prefill_seqs=self.scheduler_config.max_num_prefill_seqs + ) + else: + budget = SchedulingBudget( + token_budget=self.scheduler_config.max_num_batched_tokens, + max_num_seqs=self.scheduler_config.max_num_seqs, + ) # Make sure we include num running seqs before scheduling prefill, # so that we don't schedule beyond max_num_seqs for prefill. for seq_group in self.running: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3c9f3d4fe4ab3..cdf1401816800 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -113,11 +113,13 @@ class EngineArgs: enable_prefix_caching: bool = False disable_sliding_window: bool = False use_v2_block_manager: bool = True + use_padding_aware_scheduling: bool = False swap_space: float = 4 # GiB cpu_offload_gb: float = 0 # GiB gpu_memory_utilization: float = 0.90 max_num_batched_tokens: Optional[int] = None max_num_seqs: int = 256 + max_num_prefill_seqs: Optional[int] = None max_logprobs: int = 20 # Default value for OpenAI Chat Completions API disable_log_stats: bool = False revision: Optional[str] = None @@ -391,6 +393,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: action='store_true', help='Use BlockSpaceMangerV2. By default this is set to True. ' 'Set to False to use BlockSpaceManagerV1') + parser.add_argument( + '--use-padding-aware-scheduling', + default=EngineArgs.use_padding_aware_scheduling, + action='store_true', + help=('Use padding-aware scheduling. If True, the scheduler ' + 'will consider padded tokens in prefill. ' + 'By default this is set to False. ')) parser.add_argument( '--num-lookahead-slots', type=int, @@ -445,6 +454,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_num_seqs, help='Maximum number of sequences per iteration.') + parser.add_argument( + '--max-num-prefill-seqs', + type=int, + default=EngineArgs.max_num_prefill_seqs, + help=('Maximum number of prefill sequences per ' + 'iteration. Can be used only with padding-aware ' + 'scheduling. 
Must be <= max_num_seqs.')) parser.add_argument( '--max-logprobs', type=int, @@ -1036,6 +1052,7 @@ def create_engine_config(self) -> EngineConfig: scheduler_config = SchedulerConfig( max_num_batched_tokens=self.max_num_batched_tokens, max_num_seqs=self.max_num_seqs, + max_num_prefill_seqs=self.max_num_prefill_seqs, max_model_len=model_config.max_model_len, use_v2_block_manager=self.use_v2_block_manager, num_lookahead_slots=num_lookahead_slots, @@ -1049,7 +1066,7 @@ def create_engine_config(self) -> EngineConfig: send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), policy=self.scheduling_policy, - ) + use_padding_aware_scheduling=self.use_padding_aware_scheduling) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index d8150a56844a2..785337478468f 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -13,6 +13,7 @@ import os import time from array import array +from dataclasses import dataclass, field from enum import IntEnum from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Set, Tuple, Type, TypeVar, Union) @@ -64,6 +65,26 @@ LORA_WARMUP_RANK = 8 +class Singleton(type): + _instances: Dict[type, object] = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, + cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +@dataclass +class HPUBucketingGlobalState(metaclass=Singleton): + prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_buckets: List[Tuple[int, int]] = field(init=False) + decode_buckets: List[Tuple[int, int]] = field(init=False) + + def subtuple(obj: object, typename: str, to_copy: List[str], @@ -542,6 +563,9 @@ def __init__( self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs + self.max_num_prefill_seqs = self.scheduler_config.max_num_prefill_seqs \ + if self.scheduler_config.max_num_prefill_seqs is not None \ + else self.max_num_seqs self.max_model_len = self.scheduler_config.max_model_len self.max_num_batched_tokens = \ self.scheduler_config.max_num_batched_tokens @@ -569,6 +593,7 @@ def __init__( self.profiler_counter_helper = HabanaProfilerCounterHelper() self.seen_configs: set = set() self._mem_margin: Optional[int] = None + self.bucketing_global_state = HPUBucketingGlobalState() self._setup_buckets() self._set_gc_threshold() @@ -680,27 +705,26 @@ def _is_valid_bucket(self, bucket): def _setup_buckets(self) -> None: align_bs = lambda x: min(self.max_num_seqs, x) - max_bucket_cfg = 64 #FIXME: The default values should be max_model_len max_prompt_seq = 1024 max_decode_seq = 2048 - self.prompt_bs_bucket_cfg = read_bucket_settings( + self.bucketing_global_state.prompt_bs_bucket_cfg = read_bucket_settings( 'prompt', 'bs', min=1, step=align_bs(32), - max=align_bs(max_bucket_cfg)) - self.decode_bs_bucket_cfg = read_bucket_settings('decode', - 'bs', - min=1, - step=align_bs(32), - max=self.max_num_seqs) - self.prompt_seq_bucket_cfg = read_bucket_settings('prompt', - 'seq', - min=self.block_size, - step=self.block_size, - max=max_prompt_seq) - self.decode_block_bucket_cfg = read_bucket_settings( + 
max=self.max_num_prefill_seqs) + self.bucketing_global_state.decode_bs_bucket_cfg = read_bucket_settings( + 'decode', 'bs', min=1, step=align_bs(32), max=self.max_num_seqs) + self.bucketing_global_state.prompt_seq_bucket_cfg = \ + read_bucket_settings( + 'prompt', + 'seq', + min=self.block_size, + step=self.block_size, + max=max_prompt_seq) + self.bucketing_global_state.decode_block_bucket_cfg = \ + read_bucket_settings( 'decode', 'block', min=self.block_size, @@ -710,13 +734,13 @@ def _setup_buckets(self) -> None: self.graphed_buckets: Set[Any] = set() msg = ("Prompt bucket config (min, step, max_warmup) " - f"bs:{self.prompt_bs_bucket_cfg}, " - f"seq:{self.prompt_seq_bucket_cfg}") + f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, " + f"seq:{self.bucketing_global_state.prompt_seq_bucket_cfg}") logger.info(msg) msg = ("Decode bucket config (min, step, max_warmup) " - f"bs:{self.decode_bs_bucket_cfg}, " - f"block:{self.decode_block_bucket_cfg}") + f"bs:{self.bucketing_global_state.decode_bs_bucket_cfg}, " + f"block:{self.bucketing_global_state.decode_block_bucket_cfg}") logger.info(msg) def _prepare_prompt( @@ -834,7 +858,8 @@ def _prepare_prompt( assert max_query_len > 0 max_prompt_len = max( - find_bucket(max(seq_lens), self.prompt_seq_bucket_cfg), + find_bucket(max(seq_lens), + self.bucketing_global_state.prompt_seq_bucket_cfg), self.block_size) lora_ids: List[int] = [] @@ -1001,8 +1026,9 @@ def _prepare_decode( for b_u, lb in zip(blocks_used, last_block)] block_usage = list(itertools.chain(*block_usage)) - block_bucket_size = find_bucket(len(block_list), - self.decode_block_bucket_cfg) + block_bucket_size = find_bucket( + len(block_list), + self.bucketing_global_state.decode_block_bucket_cfg) block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) block_mapping = pad_list(block_mapping, block_bucket_size, -1) block_usage = pad_list(block_usage, block_bucket_size, 1) @@ -1076,8 +1102,8 @@ def prepare_input_tensors( self.profiler.start('internal', base_event_name) real_batch_size = len(seq_group_metadata_list) - bucket_cfg = self.prompt_bs_bucket_cfg if is_prompt else \ - self.decode_bs_bucket_cfg + bucket_cfg = self.bucketing_global_state.prompt_bs_bucket_cfg \ + if is_prompt else self.bucketing_global_state.decode_bs_bucket_cfg batch_size_padded = find_bucket(real_batch_size, bucket_cfg) batch_size_padding = batch_size_padded - real_batch_size seq_group_metadata_list = seq_group_metadata_list.copy() @@ -1282,9 +1308,10 @@ def create_dummy_seq_group_metadata(self, def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - max_batch_size = self.prompt_bs_bucket_cfg[-1] - max_seq_len = min(self.prompt_seq_bucket_cfg[-1], - self.max_num_batched_tokens // max_batch_size) + max_batch_size = self.bucketing_global_state.prompt_bs_bucket_cfg[-1] + max_seq_len = min( + self.bucketing_global_state.prompt_seq_bucket_cfg[-1], + self.max_num_batched_tokens // max_batch_size) self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, False, True) @@ -1498,13 +1525,15 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: self.profiler.start('internal', 'warmup') max_blocks = kv_caches[0][0].size(0) - self.prompt_buckets, prompt_omitted_buckets = generate_prompt_buckets( - self.prompt_bs_bucket_cfg, self.prompt_seq_bucket_cfg, + self.bucketing_global_state.prompt_buckets, prompt_omitted_buckets = \ + generate_prompt_buckets( + self.bucketing_global_state.prompt_bs_bucket_cfg, + 
self.bucketing_global_state.prompt_seq_bucket_cfg, self.max_num_batched_tokens) - msg = ( - f"Generated {len(self.prompt_buckets)} " - f"prompt buckets [bs, seq]: {list(sorted(self.prompt_buckets))}") + msg = (f"Generated {len(self.bucketing_global_state.prompt_buckets)} " + f"prompt buckets [bs, seq]: \ + {list(sorted(self.bucketing_global_state.prompt_buckets))}") logger.info(msg) msg = (f"Omitted {len(prompt_omitted_buckets)} " @@ -1515,16 +1544,17 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" logger.debug(msg) - self.decode_buckets = generate_decode_buckets( - self.decode_bs_bucket_cfg, self.decode_block_bucket_cfg, - max_blocks) + self.bucketing_global_state.decode_buckets = generate_decode_buckets( + self.bucketing_global_state.decode_bs_bucket_cfg, + self.bucketing_global_state.decode_block_bucket_cfg, max_blocks) logger.info("Generated %d decode buckets [bs, total_blocks]: %s", - len(self.decode_buckets), - list(sorted(self.decode_buckets))) + len(self.bucketing_global_state.decode_buckets), + list(sorted(self.bucketing_global_state.decode_buckets))) if not htorch.utils.internal.is_lazy() and not self.enforce_eager: - cache_size_limit = len(self.prompt_buckets) + len( - self.decode_buckets) + 1 + cache_size_limit = len( + self.bucketing_global_state.prompt_buckets) + len( + self.bucketing_global_state.decode_buckets) + 1 torch._dynamo.config.cache_size_limit = max( cache_size_limit, torch._dynamo.config.cache_size_limit) # Multiply by 8 to follow the original default ratio between @@ -1551,8 +1581,10 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: 'Please update Gaudi Software Suite.') with compile_only_mode_context( ) if can_use_compile_only_mode else contextlib.nullcontext(): - self.warmup_all_buckets(self.prompt_buckets, True, kv_caches) - self.warmup_all_buckets(self.decode_buckets, False, kv_caches) + self.warmup_all_buckets(self.bucketing_global_state.prompt_buckets, + True, kv_caches) + self.warmup_all_buckets(self.bucketing_global_state.decode_buckets, + False, kv_caches) if not self.enforce_eager and htorch.utils.internal.is_lazy(): assert self.mem_margin is not None, \ @@ -1582,12 +1614,12 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: 'max_bs') mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ self.warmup_graphs( - prompt_strategy, self.prompt_buckets, True, kv_caches, - prompt_available_memory) + prompt_strategy, self.bucketing_global_state.prompt_buckets, + True, kv_caches, prompt_available_memory) mem_post_decode, decode_batch_seq, decode_captured_all = \ self.warmup_graphs( - decode_strategy, self.decode_buckets, False, kv_caches, - decode_available_memory) + decode_strategy, self.bucketing_global_state.decode_buckets, + False, kv_caches, decode_available_memory) # Not all prompt buckets were captured, but all decode buckets # were captured and we have some free graph-allocated space @@ -1596,7 +1628,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: and not prompt_captured_all and decode_captured_all): mem_post_prompt, _, prompt_captured_all = ( self.warmup_graphs( - prompt_strategy, self.prompt_buckets, True, + prompt_strategy, + self.bucketing_global_state.prompt_buckets, True, kv_caches, graph_free_mem - mem_post_prompt - mem_post_decode, mem_post_prompt, prompt_batch_seq)) @@ -1608,14 +1641,18 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: and not decode_captured_all \ and 
prompt_captured_all: mem_post_decode, _, _ = self.warmup_graphs( - decode_strategy, self.decode_buckets, False, kv_caches, + decode_strategy, + self.bucketing_global_state.decode_buckets, False, + kv_caches, graph_free_mem - mem_post_prompt - mem_post_decode, mem_post_decode, decode_batch_seq) - self.log_graph_warmup_summary(self.prompt_buckets, True, - mem_post_prompt) - self.log_graph_warmup_summary(self.decode_buckets, False, - mem_post_decode) + self.log_graph_warmup_summary( + self.bucketing_global_state.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary( + self.bucketing_global_state.decode_buckets, False, + mem_post_decode) end_time = time.perf_counter() end_mem = HabanaMemoryProfiler.current_device_memory_usage() From 9276ccca92ed8703648c03ddc713990c168d6e96 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Thu, 17 Oct 2024 15:23:49 +0200 Subject: [PATCH 291/819] Add WA for RuntimeError: "fill_cpu" not implemented for 'Float8_e4m3fn' (#402) --- vllm/worker/hpu_worker.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 752388e0d632f..8badc5f6bdb43 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -436,12 +436,14 @@ def _allocate_kv_cache( kv_cache_shape = self.attn_backend.get_kv_cache_shape( num_blocks, self.block_size, self.num_kv_heads, self.head_size) kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + dtype = self.dtype + if device != 'hpu' and not is_fake_hpu() \ + and self.dtype == torch.float8_e4m3fn: + dtype = torch.uint8 for _ in range(self.num_attention_layers): - key_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, - device=device) + key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device) value_cache = torch.zeros(kv_cache_shape, - dtype=self.dtype, + dtype=dtype, device=device) kv_layer = (key_cache, value_cache) kv_cache.append(kv_layer) From 07c98a5263967af6afd7eb58119c5c5504d9a9f2 Mon Sep 17 00:00:00 2001 From: Artur Fierka Date: Fri, 18 Oct 2024 09:13:23 +0200 Subject: [PATCH 292/819] Workaround for OOM during loading llama-405 (#396) Repeating missing code --- vllm/model_executor/models/llama.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 18ce8d7f7d164..a64edc94825f3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -3,6 +3,7 @@ # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# Copyright 2024 Habana Labs, Ltd. an Intel Company # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX # and OPT implementations in this library. It has been modified from its @@ -420,6 +421,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + if is_hpu: + torch.hpu.synchronize() # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should From acde882c25f64d150c4ef0c60d27e010f78c8fd5 Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Tue, 22 Oct 2024 10:24:52 +0200 Subject: [PATCH 293/819] Add HPU specific arguments to benchmark_throughput (#406) Modify `benchmark_throughput.py` to allow running with FP8 on HPU (KV cache dtype `fp8_inc`) and to use padding-aware scheduling. --- benchmarks/benchmark_throughput.py | 38 ++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index b7bc2a6402375..e1a359b871e71 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -90,6 +90,10 @@ def run_vllm( download_dir: Optional[str] = None, load_format: str = EngineArgs.load_format, disable_async_output_proc: bool = False, + weights_load_device: str = None, + use_padding_aware_scheduling: bool = False, + max_num_seqs: int = 256, + max_num_prefill_seqs: int = None, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -115,6 +119,10 @@ def run_vllm( num_scheduler_steps=num_scheduler_steps, use_v2_block_manager=use_v2_block_manager, disable_async_output_proc=disable_async_output_proc, + weights_load_device=weights_load_device, + use_padding_aware_scheduling=use_padding_aware_scheduling, + max_num_seqs=max_num_seqs, + max_num_prefill_seqs=max_num_prefill_seqs, ) # Add the requests to the engine. @@ -181,6 +189,10 @@ async def run_vllm_async( load_format: str = EngineArgs.load_format, disable_async_output_proc: bool = False, disable_frontend_multiprocessing: bool = False, + weights_load_device: str = None, + use_padding_aware_scheduling: bool = False, + max_num_seqs: int = 256, + max_num_prefill_seqs: int = None, ) -> float: from vllm import SamplingParams engine_args = AsyncEngineArgs( @@ -208,6 +220,9 @@ async def run_vllm_async( disable_async_output_proc=disable_async_output_proc, worker_use_ray=False, disable_log_requests=True, + weights_load_device=weights_load_device, + use_padding_aware_scheduling=use_padding_aware_scheduling, + max_num_prefill_seqs=max_num_prefill_seqs, ) async with build_async_engine_client_from_engine_args( @@ -342,7 +357,9 @@ def main(args: argparse.Namespace): args.max_num_batched_tokens, args.distributed_executor_backend, args.gpu_memory_utilization, args.num_scheduler_steps, args.use_v2_block_manager, args.download_dir, args.load_format, - args.disable_async_output_proc + args.disable_async_output_proc, args.weights_load_device, + args.use_padding_aware_scheduling, args.max_num_seqs, + args.max_num_prefill_seqs ] if args.async_engine: @@ -446,7 +463,7 @@ def main(args: argparse.Namespace): parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], default="auto", help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' @@ -540,6 +557,23 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser.add_argument("--weights-load-device", + type=str, + default=None, + choices=DEVICE_OPTIONS, + help='Device on which weights are loaded.') + parser.add_argument("--use-padding-aware-scheduling", + action='store_true', + default=False, + help="Enable padding-aware scheduling.") + parser.add_argument("--max-num-seqs", + type=int, + default=256, + help="Maximum number of requests for single decode.") + parser.add_argument("--max-num-prefill-seqs", + type=int, + default=None, + help="Maximum number of requests for single prefill.") args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model From 8c43ff1fb2c5e1c62dfb1771f6d7f5665958ff85 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 22 Oct 2024 10:25:43 +0200 Subject: [PATCH 294/819] Add forward_hpu to RotaryEmbedding, remove custom module (#404) This PR removes the usage of custom HPU RotaryEmbedding modules, and adds a forward_hpu method to existing RotaryEmbedding, for reusing multiple derived implementations without the need of adding them to HPU extension. Mark_steps should not be needed within the test, but for whatever reason, if they are not there, PT bridge crashes. To be investigated later on. It does not affect actual model execution in any way I could test/observe. --- tests/kernels/test_pos_encoding.py | 10 ++ .../model_executor/layers/rotary_embedding.py | 94 +++++++++++++------ 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index ba9d2d4389b21..6ca3a645c7771 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,6 +5,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.platforms import current_platform from vllm.utils import seed_everything from .allclose_default import get_default_atol, get_default_rtol @@ -20,6 +21,9 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] +if current_platform.is_hpu(): + import habana_frameworks.torch as htorch + CUDA_DEVICES = ['hpu'] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -65,6 +69,8 @@ def test_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key) # Compare the results. torch.testing.assert_close(out_query, @@ -120,6 +126,8 @@ def test_batched_rotary_embedding( # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, @@ -193,6 +201,8 @@ def test_batched_rotary_embedding_multi_lora( # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key, query_offsets) + if current_platform.is_hpu(): + htorch.core.mark_step() out_query, out_key = rope.forward(positions, query, key, query_offsets.flatten()) # Compare the results. 
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 85cd700c978ea..10626d53338e3 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,7 +28,6 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -195,6 +194,61 @@ def forward_xpu( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_hpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, apply_rotary_pos_emb) + positions = positions.flatten() + if offsets is not None: + positions = positions + offsets + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions).view( + num_tokens, 1, -1) + cos, sin = cos_sin.chunk(2, dim=-1) + # HPU RoPE kernel requires hidden dimension for cos and sin to be equal + # to query hidden dimension, so the original tensors need to be + # expanded + # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE + # and expansion of cos/sin tensors via concatenation + # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE + # and expansion of cos/sin tensors via repeat_interleave + rope_mode: RotaryPosEmbeddingMode + if self.is_neox_style: + rope_mode = RotaryPosEmbeddingMode.BLOCKWISE + cos = torch.cat((cos, cos), dim=-1) + sin = torch.cat((sin, sin), dim=-1) + else: + rope_mode = RotaryPosEmbeddingMode.PAIRWISE + sin = torch.repeat_interleave(sin, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + cos = torch.repeat_interleave(cos, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, + rope_mode) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" @@ -918,17 +972,8 @@ def get_rope( return _ROPE_DICT[key] if rope_scaling is None: - if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding - rotary_emb = HpuRotaryEmbedding(head_size, - rotary_dim, - max_position, - base, - is_neox_style, - RoPEFallback=RotaryEmbedding) - else: - rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, - base, is_neox_style, dtype) + rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base, + is_neox_style, dtype) else: scaling_type = rope_scaling[ "type"] if "type" in rope_scaling else rope_scaling["rope_type"] @@ -941,25 +986,12 @@ def get_rope( high_freq_factor = rope_scaling["high_freq_factor"] original_max_position = rope_scaling[ "original_max_position_embeddings"] - if current_platform.is_hpu(): - from vllm_hpu_extension.rotary_embed import ( - 
HpuLlama3RotaryEmbedding) - rotary_emb = HpuLlama3RotaryEmbedding( - head_size, - rotary_dim, - max_position, - base, - is_neox_style, - scaling_factor, - low_freq_factor, - high_freq_factor, - original_max_position, - RoPEFallback=Llama3RotaryEmbedding) - else: - rotary_emb = Llama3RotaryEmbedding( - head_size, rotary_dim, max_position, base, is_neox_style, - dtype, scaling_factor, low_freq_factor, high_freq_factor, - original_max_position) + rotary_emb = Llama3RotaryEmbedding(head_size, rotary_dim, + max_position, base, + is_neox_style, dtype, + scaling_factor, low_freq_factor, + high_freq_factor, + original_max_position) elif scaling_type == "linear": rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim, max_position, base, From aecd6677f07ab493caf2ca78174e4b40c65aa163 Mon Sep 17 00:00:00 2001 From: Kamil Kaczor Date: Tue, 22 Oct 2024 10:29:33 +0200 Subject: [PATCH 295/819] Remove if blocks smaller than bs in generate_decode_buckets (#412) With this check while running decode_block_bucket_min=128 and bs>128 it will skip buckets smaller than bs. Then during the run buckets that got skipped can be used by vllm and are being warmed-up which is causing perf drop & they are not run as hpu graphs. This change is removing said check. --- vllm/worker/hpu_model_runner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 785337478468f..888a9a9da942c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -202,8 +202,6 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, last_bucket = round_up(max_blocks, bstep) for bs in bs_buckets: for blocks in block_buckets: - if blocks < bs: - continue if blocks > last_bucket: break buckets.append((bs, blocks)) From 0cf52619196d7f8f277aafe4af48624291b42d12 Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Tue, 22 Oct 2024 10:43:03 +0200 Subject: [PATCH 296/819] Remove CPU sync before Sampler (#414) Currently before each Sampler call we have a CPU sync, which causes a host gap: image This PR is removing that sync, so the host gap is no longer visible: image NOTE: class `ApplyToppTopkScalar` still has some CPU syncs inside. It means that the biggest gain will be observed in the scenario without `top_p` or `top_k` parameters. I think it is worth to investigate if we can remove the syncs from this function too. 
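As a rough sketch of the host gap being removed (illustrative only; tensor names mirror the sampler code, sizes are placeholders), every `.item()` call copies a scalar back to the host and blocks until the device has produced it, while the replacement keeps 0-dim device tensors and later folds the branch into `torch.where`:

```python
import torch

device = "hpu"  # use "cpu" to run the sketch anywhere; the sync argument is the same
top_ps = torch.full((8, ), 0.9, device=device)
top_ks = torch.full((8, ), 50, device=device)

# Before: scalar values are pulled to the CPU, stalling the host before sampling.
top_p_scalar = top_ps[0].item()
top_k_scalar = top_ks[0].item()
scalar_p_and_k = (torch.all(top_ps == top_p_scalar)
                  and torch.all(top_ks == top_k_scalar)).item()

# After: everything stays on the device; the decision is consumed by
# torch.where instead of a Python if, so no host round-trip is needed.
top_p_t, top_k_t = top_ps[0], top_ks[0]
scalar_p_and_k_t = torch.logical_and(torch.all(top_ps == top_p_t),
                                     torch.all(top_ks == top_k_t))
```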
--- vllm/model_executor/layers/sampler.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 58c4940c12fb2..74c0416e4b379 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -200,13 +200,13 @@ def _init_sampling_tensors( self._do_penalties = do_penalties self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p - self._top_p_scalar = sampling_tensors.top_ps[0].item() - self._top_k_scalar = sampling_tensors.top_ks[0].item() + self._top_p_scalar = sampling_tensors.top_ps[0] + self._top_k_scalar = sampling_tensors.top_ks[0] scalar_p = torch.all(sampling_tensors.top_ps == self._top_p_scalar) scalar_k = torch.all(sampling_tensors.top_ks == self._top_k_scalar) - self._scalar_p_and_k = (scalar_p and scalar_k).item() - if self._scalar_p_and_k and self._do_top_p_top_k: - self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) + self._scalar_p_and_k = torch.logical_and(scalar_p, scalar_k) + + self._apply_top_k_top_p_opt = ApplyToppTopkScalar(5) def forward( self, @@ -266,13 +266,13 @@ def forward( logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - if self._scalar_p_and_k: - logits = self._apply_top_k_top_p_opt(logits, - self._top_p_scalar, - self._top_k_scalar) - else: - logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks) + # If we have a scalar p and k, we can use the optimized version. + logits = torch.where( + self._scalar_p_and_k, + self._apply_top_k_top_p_opt(logits, self._top_p_scalar, + self._top_k_scalar), + _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks)) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) From 3af4b6ce54ccdfc91516b335c5331045d78c99a2 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Tue, 22 Oct 2024 19:03:58 +0530 Subject: [PATCH 297/819] Remove redundant set_active_loras call during warmup (#413) CUDA uses `capture` for warmup runs and `execute_model` for actual runs. During each phase they call `set_active_loras` only once. HPU uses `execute_model` for both warmup and actual runs. Since `execute_model` already takes care of `set_active_loras` internally, the redundant call can be removed. This special handling is redundant and incorrect, as it causes out-of-bound slicing in decode phase reported in https://github.com/HabanaAI/vllm-fork/issues/405. This PR removes special handling of `set_active_loras` function call from warmup runs and resolves the issue in https://github.com/HabanaAI/vllm-fork/issues/405. 
--- vllm/worker/hpu_model_runner.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 888a9a9da942c..f2875194e93a0 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1354,12 +1354,6 @@ def warmup_scenario(self, ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs or is_pt_profiler_run else 1 - if self.lora_config and not is_lora_profile_run: - lora_mapping = LoRAMapping( - **dict(index_mapping=[0] * batch_size * seq_len, - prompt_mapping=[0] * batch_size * seq_len, - is_prefill=is_prompt)) - self.set_active_loras(set(), lora_mapping) if is_prompt: seqs = [ self.create_dummy_seq_group_metadata( From 892c09026833d85dd0fc408ee7d5e3c1b461394c Mon Sep 17 00:00:00 2001 From: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:03:53 +0530 Subject: [PATCH 298/819] Change profile Run batch based on max_seq_len (#415) Changes the profile_run batches based on the max sequence length. This avoids padding during prepare_prompt; thus avoiding breaking constraints based on batch_size * seq_len <= max_num_batch_tokens. Current logic for profile_run max_batch_size takes precedence. e.g. - > max_batch_size = 256, max_num_batch_tokens = 2048, block_size = 128, max_seq_len = 1024 with current logic max_seq_len is updated as 8; however in **prepare_prompt** seq_len is padded to 128, thus getting batch_size * seq_len as 256 * 128 > max_num_batch_tokens; thus violating the above mentioned constraint with the updated logic, we calculate max_batch_size as 2, this avoids the padding at **prepare_prompt**, thus keeping the constraints in place. Fixes: https://github.com/HabanaAI/vllm-fork/issues/405 --- vllm/worker/hpu_model_runner.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index f2875194e93a0..e8e76f6ab67ef 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -1306,10 +1306,8 @@ def create_dummy_seq_group_metadata(self, def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers - max_batch_size = self.bucketing_global_state.prompt_bs_bucket_cfg[-1] - max_seq_len = min( - self.bucketing_global_state.prompt_seq_bucket_cfg[-1], - self.max_num_batched_tokens // max_batch_size) + max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1] + max_batch_size = self.max_num_batched_tokens // max_seq_len self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, False, True) From 7f58ad1583a2d11a07705ba9d88bda54c8f19843 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Wed, 23 Oct 2024 15:04:08 +0200 Subject: [PATCH 299/819] Add support for various softmax normalization options (#420) Supporting PR for https://github.com/HabanaAI/vllm-hpu-extension/pull/14 --- requirements-hpu.txt | 2 +- vllm/attention/backends/hpu_attn.py | 1 + vllm/attention/ops/hpu_paged_attn.py | 1 + vllm/worker/hpu_model_runner.py | 9 ++++++++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 1a583974be151..7cefa4e631fa8 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 +vllm-hpu-extension @ 
git+https://github.com/HabanaAI/vllm-hpu-extension.git@c2801bb diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index a8f4b09b67274..f4674cedf01ce 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -223,6 +223,7 @@ def forward( block_mapping=attn_metadata.block_mapping, block_bias=attn_metadata.attn_bias, block_scales=attn_metadata.block_scales, + block_groups=attn_metadata.block_groups, scale=self.scale, matmul_qk_op=self.matmul_qk, matmul_av_op=self.matmul_av, diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 4c0fb2a628361..603d3959377c4 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -21,6 +21,7 @@ class HPUPagedAttentionMetadata: block_indices: Optional[torch.Tensor] block_offsets: Optional[torch.Tensor] block_scales: Optional[torch.Tensor] + block_groups: Optional[torch.Tensor] class HPUPagedAttention: diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e8e76f6ab67ef..382a0abb21240 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -907,6 +907,7 @@ def _prepare_prompt( block_indices=block_indices, block_offsets=block_offsets, block_scales=None, + block_groups=None, attn_bias=None, seq_lens_tensor=seq_lens_tensor, num_prefills=real_num_seqs, @@ -1028,6 +1029,8 @@ def _prepare_decode( len(block_list), self.bucketing_global_state.decode_block_bucket_cfg) block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) + block_groups = pad_list(block_mapping, block_bucket_size, + len(block_tables)) block_mapping = pad_list(block_mapping, block_bucket_size, -1) block_usage = pad_list(block_usage, block_bucket_size, 1) block_scales = pad_list(block_scales, block_bucket_size, 0.0) @@ -1038,6 +1041,9 @@ def _prepare_decode( block_mapping = torch.tensor(block_mapping, dtype=torch.long, device=self.device) + block_groups = torch.tensor(block_groups, + dtype=torch.long, + device=self.device) block_usage = torch.tensor(block_usage, dtype=self.model_config.dtype, device=self.device) @@ -1060,6 +1066,7 @@ def _prepare_decode( block_indices=block_indices, block_offsets=block_offsets, block_scales=block_scales, + block_groups=block_groups, attn_bias=None, seq_lens_tensor=None, num_prefills=0, @@ -1271,7 +1278,7 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', - 'block_offsets', 'block_scales' + 'block_offsets', 'block_scales', 'block_groups' ]) return attention_metadata From f603353e2057808f46c86395334ef507fd2bb351 Mon Sep 17 00:00:00 2001 From: Artur Fierka Date: Fri, 25 Oct 2024 08:46:30 +0200 Subject: [PATCH 300/819] Update README_GAUDI about fp8 calibration procedure (#423) --- README_GAUDI.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README_GAUDI.md b/README_GAUDI.md index b9c744bd9e23f..6dd7837116d52 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -282,6 +282,10 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used, `1` is default - `PT_HPU_ENABLE_LAZY_COLLECTIVES`: required to be `true` for tensor parallel inference with HPU Graphs +# Quantization and FP8 model 
calibration process + +The FP8 model calibration procedure has been described as a part of [vllm-hpu-extention](https://github.com/HabanaAI/vllm-hpu-extension/tree/main/calibration/README.md) package. + # Troubleshooting: Tweaking HPU Graphs If you experience device out-of-memory issues or want to attempt inference at higher batch sizes, try tweaking HPU Graphs by following the below: From a5136ec1fd78c2fb640cd89a48b479472bd5666a Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Fri, 25 Oct 2024 09:58:38 +0200 Subject: [PATCH 301/819] Set vllm-hpu-extension to 341a77f (#428) --- requirements-hpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 7cefa4e631fa8..20f4dc74a3955 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@c2801bb +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f From a926d140256334d8185091d20f5b80b435e0afcd Mon Sep 17 00:00:00 2001 From: Roman Zhukov Date: Fri, 25 Oct 2024 12:41:46 +0100 Subject: [PATCH 302/819] Create scorecard.yml Adding calculation of OpenSSF Scorecard. Note: badge (visible at repo main page) will be disabled for now. --- .github/workflows/scorecard.yml | 73 +++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 .github/workflows/scorecard.yml diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 0000000000000..c610f06360d1f --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + - cron: '20 13 * * 0' + push: + branches: [ "habana_main" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. 
+ # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: false + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard (optional). + # Commenting out will disable upload of results to your repo's Code Scanning dashboard + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@v3 + with: + sarif_file: results.sarif From 5b7f685c3416d735c8866ad4f3282fe86b978739 Mon Sep 17 00:00:00 2001 From: Marceli Fylcek Date: Fri, 25 Oct 2024 14:35:13 +0200 Subject: [PATCH 303/819] Contiguous PA (#424) Contiguous cache fetching to avoid using costly gather operation. Requires changes in vllm-hpu-extension (https://github.com/HabanaAI/vllm-hpu-extension/pull/17) to work. Introduces redundant calculations in decoding phase. In all tested cases improves performance over the entire run (5-12%). For even better performance cache defragmentation is required. Only compatible with v2-block-manager. --- requirements-hpu.txt | 2 +- vllm/worker/hpu_model_runner.py | 54 +++++++++++++++++---------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 20f4dc74a3955..4719639da6188 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@6cb6e19 diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 382a0abb21240..4be0dc1a1abd8 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -199,10 +199,11 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, bs_buckets = warmup_range(bs_bucket_config) block_buckets = warmup_range(blocks_bucket_config) bmin, bstep, bmax = blocks_bucket_config - last_bucket = round_up(max_blocks, bstep) + last_bucket = max_blocks for bs in bs_buckets: for blocks in block_buckets: if blocks > last_bucket: + buckets.append((bs, last_bucket)) break buckets.append((bs, blocks)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -1002,39 +1003,40 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) - blocks_used = [len(bt) for bt in block_tables if bt] - block_list = [] - block_scales = [] + block_list = list(itertools.chain(*block_tables)) + + max_idx = max(block_list) + max_blocks = max(max_idx + 1, len(block_list)) + block_bucket_size = find_bucket( + max_blocks, self.bucketing_global_state.decode_block_bucket_cfg) + block_bucket_size = min(block_bucket_size, + self.cache_config.num_gpu_blocks) + + block_mapping: List[Union[None, int]] = [None] * block_bucket_size + block_usage: List[Union[None, int]] = [None] * block_bucket_size + block_scales: List[Union[None, float]] = [None] * 
block_bucket_size + for i, bt in enumerate(block_tables): - block_list.extend(bt) - blocks_in_group = len(bt) - if blocks_in_group > 0: + if bt: + blocks_in_group = len(bt) scale = 1.0 / blocks_in_group - block_scales.extend([scale] * blocks_in_group) + for b in bt: + if block_mapping[b] is None: + block_mapping[b] = i + block_usage[b] = self.block_size + block_scales[b] = scale - block_mapping_nested: List[List[int]] = [ - [i] * b_u for i, b_u in enumerate(blocks_used) - ] - block_mapping: List[int] = list( - itertools.chain.from_iterable(block_mapping_nested)) + block_mapping = [b if b is not None else -1 for b in block_mapping] + block_scales = [b if b is not None else 0.0 for b in block_scales] - last_block = [ - sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) - ] - block_usage = [[self.block_size] * (b_u - 1) + [lb] - for b_u, lb in zip(blocks_used, last_block)] - block_usage = list(itertools.chain(*block_usage)) + for bt, sl in zip(block_tables, slot_mapping): + if bt: + block_usage[bt[-1]] = sl[-1] % self.block_size + 1 + block_usage = [u if u is not None else 1 for u in block_usage] - block_bucket_size = find_bucket( - len(block_list), - self.bucketing_global_state.decode_block_bucket_cfg) block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) block_groups = pad_list(block_mapping, block_bucket_size, len(block_tables)) - block_mapping = pad_list(block_mapping, block_bucket_size, -1) - block_usage = pad_list(block_usage, block_bucket_size, 1) - block_scales = pad_list(block_scales, block_bucket_size, 0.0) - block_list = torch.tensor(block_list, dtype=torch.int, device=self.device) From e3ae2ebffcb233a67c63ab1fe9acab3dad1d53dc Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Fri, 25 Oct 2024 14:49:54 +0200 Subject: [PATCH 304/819] Revert "Contiguous PA" (#432) Reverts HabanaAI/vllm-fork#424 --- requirements-hpu.txt | 2 +- vllm/worker/hpu_model_runner.py | 54 ++++++++++++++++----------------- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 4719639da6188..20f4dc74a3955 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@6cb6e19 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 4be0dc1a1abd8..382a0abb21240 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -199,11 +199,10 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, bs_buckets = warmup_range(bs_bucket_config) block_buckets = warmup_range(blocks_bucket_config) bmin, bstep, bmax = blocks_bucket_config - last_bucket = max_blocks + last_bucket = round_up(max_blocks, bstep) for bs in bs_buckets: for blocks in block_buckets: if blocks > last_bucket: - buckets.append((bs, last_bucket)) break buckets.append((bs, blocks)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -1003,40 +1002,39 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) - block_list = list(itertools.chain(*block_tables)) - - max_idx = max(block_list) - max_blocks = max(max_idx + 1, len(block_list)) - block_bucket_size = find_bucket( - max_blocks, self.bucketing_global_state.decode_block_bucket_cfg) - block_bucket_size = min(block_bucket_size, - self.cache_config.num_gpu_blocks) - - block_mapping: List[Union[None, int]] = 
[None] * block_bucket_size - block_usage: List[Union[None, int]] = [None] * block_bucket_size - block_scales: List[Union[None, float]] = [None] * block_bucket_size - + blocks_used = [len(bt) for bt in block_tables if bt] + block_list = [] + block_scales = [] for i, bt in enumerate(block_tables): - if bt: - blocks_in_group = len(bt) + block_list.extend(bt) + blocks_in_group = len(bt) + if blocks_in_group > 0: scale = 1.0 / blocks_in_group - for b in bt: - if block_mapping[b] is None: - block_mapping[b] = i - block_usage[b] = self.block_size - block_scales[b] = scale + block_scales.extend([scale] * blocks_in_group) - block_mapping = [b if b is not None else -1 for b in block_mapping] - block_scales = [b if b is not None else 0.0 for b in block_scales] + block_mapping_nested: List[List[int]] = [ + [i] * b_u for i, b_u in enumerate(blocks_used) + ] + block_mapping: List[int] = list( + itertools.chain.from_iterable(block_mapping_nested)) - for bt, sl in zip(block_tables, slot_mapping): - if bt: - block_usage[bt[-1]] = sl[-1] % self.block_size + 1 - block_usage = [u if u is not None else 1 for u in block_usage] + last_block = [ + sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) + ] + block_usage = [[self.block_size] * (b_u - 1) + [lb] + for b_u, lb in zip(blocks_used, last_block)] + block_usage = list(itertools.chain(*block_usage)) + block_bucket_size = find_bucket( + len(block_list), + self.bucketing_global_state.decode_block_bucket_cfg) block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) block_groups = pad_list(block_mapping, block_bucket_size, len(block_tables)) + block_mapping = pad_list(block_mapping, block_bucket_size, -1) + block_usage = pad_list(block_usage, block_bucket_size, 1) + block_scales = pad_list(block_scales, block_bucket_size, 0.0) + block_list = torch.tensor(block_list, dtype=torch.int, device=self.device) From 93609a2a54aba5088f6fc94c49edad90ff4c3aa0 Mon Sep 17 00:00:00 2001 From: Tomasz Pawlowski Date: Fri, 25 Oct 2024 15:30:11 +0200 Subject: [PATCH 305/819] Enable Dynamic MoE for Mixtral on 1.19.0 (#425) Move Dynamic MoE implementation to habana_main. It was previously implemented for 1.18, but had to be modified as ops have been moved to [github.com/HabanaAI/vllm-hpu-extension](https://github.com/HabanaAI/vllm-hpu-extension). Works with bf16, uses static (legacy) mode when running with quantization. Related PRs: - https://github.com/HabanaAI/vllm-fork/pull/303 - https://github.com/HabanaAI/vllm-hpu-extension/pull/13 ---
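As a sketch of the selection rule this change introduces (pulled out of the diff below for readability; `select_fused_moe` is an illustrative helper, not an API in the tree), INC/FP8-quantized runs keep the static per-expert path while bf16 runs get the dynamic MoE op:

```python
from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE

from vllm.model_executor.layers.quantization.inc import INCConfig


def select_fused_moe(quant_config, num_experts: int):
    # Static (legacy) MoE when INC quantization is configured, dynamic otherwise.
    moe_cls = (StaticFusedMOE
               if isinstance(quant_config, INCConfig) else DynamicFusedMOE)
    return moe_cls(num_experts)
```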
--- requirements-hpu.txt | 2 +- vllm/model_executor/layers/fused_moe/layer.py | 22 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 20f4dc74a3955..4019950062efe 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f \ No newline at end of file diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 457450cda2ce6..8f6bdaa7ab44a 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -226,9 +226,13 @@ def __init__( self.num_expert_group = num_expert_group self.topk_group = topk_group self.custom_routing_function = custom_routing_function - if current_platform.is_hpu(): - from vllm_hpu_extension.ops import StaticFusedMOE - self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) + if is_hpu: + from vllm_hpu_extension.ops import DynamicFusedMOE, StaticFusedMOE + + from vllm.model_executor.layers.quantization.inc import INCConfig + selected_fused_moe = (StaticFusedMOE if isinstance( + quant_config, INCConfig) else DynamicFusedMOE) + self.hpu_static_fused_moe = selected_fused_moe(self.num_experts) if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -321,8 +325,10 @@ def _load_w13(self, expert_data.copy_(loaded_weight) if is_hpu: - self.hpu_static_fused_moe.w13_list[expert_id].set_weight( - orig_exp_data) + from vllm_hpu_extension.ops import StaticFusedMOE + if isinstance(self.hpu_static_fused_moe, StaticFusedMOE): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + orig_exp_data) def _load_w2(self, expert_data: torch.Tensor, @@ -341,8 +347,10 @@ def _load_w2(self, # w2, down_proj: Load into only logical weight of w2. 
expert_data.copy_(loaded_weight) if is_hpu: - self.hpu_static_fused_moe.w2_list[expert_id].set_weight( - expert_data) + from vllm_hpu_extension.ops import StaticFusedMOE + if isinstance(self.hpu_static_fused_moe, StaticFusedMOE): + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + expert_data) def _load_single_value(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int): From 3a55e77bc7d99b2ccbe3eb738fa9e8648dbf7f4e Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 28 Oct 2024 14:16:58 +0530 Subject: [PATCH 306/819] Support long contexts with LoRA (#418) This PR enables long-contexts support with LoRA --- tests/lora/test_long_context_hpu.py | 304 ++++++++++++++++++++++++++++ vllm/lora/punica.py | 23 ++- vllm/worker/hpu_model_runner.py | 29 ++- 3 files changed, 346 insertions(+), 10 deletions(-) create mode 100644 tests/lora/test_long_context_hpu.py diff --git a/tests/lora/test_long_context_hpu.py b/tests/lora/test_long_context_hpu.py new file mode 100644 index 0000000000000..33250edde00d3 --- /dev/null +++ b/tests/lora/test_long_context_hpu.py @@ -0,0 +1,304 @@ +import ast +from typing import List, Optional, Tuple + +import numpy as np +import pytest + +import vllm +from vllm import SamplingParams +from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora +from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.rotary_embedding import ( + LinearScalingRotaryEmbedding) + +from .data.long_context_test_data import prompts_and_responses + +context_len_to_scaling_factor = { + "16k": 4, + "32k": 8, +} + +# We use the same sampling params for all requests +sampling_params = SamplingParams( + temperature=0, + max_tokens=100, +) + + +def _create_lora_request(lora_id, long_context_infos): + context_len = long_context_infos[lora_id]["context_length"] + scaling_factor = context_len_to_scaling_factor[context_len] + return LoRARequest(f'{context_len}_{lora_id}', lora_id, + long_context_infos[lora_id]["lora"], None, + 4096 * scaling_factor) + + +def evaluate_json_response(model_response, golden_response): + """Evaluates the model response against the golden response. + + Returns a score between 0 and 1, where 1 is a perfect match and 0 is no + match. The score quantifies how well the model is able to extract the + golden JSON from the long context. + """ + try: + model_response = ast.literal_eval(model_response) + except Exception as e: + raise ValueError( + f"Model response is not a valid JSON. Expected {golden_response}, " + f"got {model_response}") from e + + # Normally, we would flatten the dictionary and compare the values, but in + # this case, we know that the dictionary is only 2 levels deep + positive_values = 0 + total_values = 0 + # We look at all the attributes of the person that we are extracting a + # biography of and copmare them to the golden response + for person_attribute, person_attribute_value in golden_response.items(): + if person_attribute in model_response: + if isinstance(person_attribute_value, dict): + for (sub_attribute, + sub_attribute_value) in person_attribute_value.items(): + total_values += 1 + if sub_attribute in model_response[ + person_attribute] and model_response[ + person_attribute][ + sub_attribute] == sub_attribute_value: + positive_values += 1 + else: + total_values += 1 + if model_response[person_attribute] == person_attribute_value: + positive_values += 1 + else: + # We count a missing sub-dict as a single missed value. 
+ total_values += 1 + + # Return a score between 0 and 1 + return positive_values / total_values + + +def generate( + llm: vllm.LLM, + inputs: Tuple[str, SamplingParams, Optional[LoRARequest]], +): + prompts, sampling_param, lora_request = inputs + outputs = llm.generate(prompts, sampling_param, lora_request=lora_request) + return outputs[0].outputs[0].text.strip() + + +def batched_generate( + llm: vllm.LLM, + inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]], +): + for input in inputs: + prompt, sampling_param, lora_req = input + # Add requests to the engine and run the engine + llm._validate_and_add_requests(prompt, + sampling_param, + lora_request=lora_req, + prompt_adapter_request=None) + + outputs = llm._run_engine(use_tqdm=True) + return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))] + + +@pytest.fixture(scope="module") +def lora_llm(long_context_infos): + scaling_factors = [ + context_len_to_scaling_factor[info["context_length"]] + for info in long_context_infos.values() + ] + + llm = vllm.LLM( + "meta-llama/Llama-2-13b-chat-hf", + enable_lora=True, + max_num_seqs=16, + max_loras=2, + long_lora_scaling_factors=tuple(scaling_factors), + max_num_batched_tokens=4096 * 8, + tensor_parallel_size=1, + enforce_eager=True, # TODO Remove after SW-205153 is fixed + dtype="bfloat16", + disable_async_output_proc=True, # TODO Remove after SW-204469 is fixed. + distributed_executor_backend="mp") + yield llm + del llm + + +def test_rotary_emb_replaced(dist_init): + """Verify rotary emb in all the layers are replaced""" + from vllm.engine.arg_utils import EngineArgs + from vllm.platforms import current_platform + if current_platform.is_hpu(): + from vllm.worker.hpu_model_runner import HPUModelRunner as ModelRunner + else: + from vllm.worker.model_runner import ModelRunner + engine_args = EngineArgs("meta-llama/Llama-2-7b-hf", + long_lora_scaling_factors=(4.0, ), + enable_lora=True) + engine_config = engine_args.create_engine_config() + model_runner = ModelRunner( + model_config=engine_config.model_config, + parallel_config=engine_config.parallel_config, + scheduler_config=engine_config.scheduler_config, + device_config=engine_config.device_config, + cache_config=engine_config.cache_config, + load_config=engine_config.load_config, + lora_config=engine_config.lora_config, + is_driver_worker=True, + ) + model_runner.load_model() + rotary_emb_count = 0 + model = model_runner.model.model if current_platform.is_hpu( + ) else model_runner.model + for module_name, module in model.named_modules(remove_duplicate=False): + if "rotary_emb" in module_name: + if "base_layer" not in module_name: + rotary_emb_count += 1 + assert isinstance(module, LinearScalingRotaryEmbeddingWithLora) + else: + assert isinstance(module, LinearScalingRotaryEmbedding) + # Llama 2 has 32 layers. + assert rotary_emb_count == 32 + + +@pytest.mark.skip_global_cleanup +def test_batched_rope_kernel(lora_llm, long_context_infos): + """We test the batched kernel by comparing the results of batched an + non-batched generation. 
+ """ + # Create non batched results first to compare against batched results + non_batched_results: List[str] = [] + + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + lora_prompt = (prompts_and_responses[context_len][0]["prompt"], + sampling_params, + _create_lora_request(lora_id, long_context_infos)) + lora_output = generate(lora_llm, lora_prompt) + non_batched_results.append(lora_output) + + # Create batched results + # Each element of the batch must be + # (prompt, prompt_sampling_params, prompt_lora_request) + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + batched_results = batched_generate(lora_llm, batched_prompts) + + # Results should be the same + for non_batched, batched in zip(non_batched_results, batched_results): + assert non_batched == batched, ( + "Non batched and batched results should be the " + f"same:\n{batched}\n{non_batched}") + + +@pytest.mark.skip_global_cleanup +def test_self_consistency(lora_llm, long_context_infos): + """We test consistency of the batched kernel by permuting batched + inputs and comparing the results to the non-permuted batched results. + """ + num_loras = len(long_context_infos) + + # Create results in order of long_context_infos + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + + batched_results = batched_generate(lora_llm, batched_prompts) + + permutation = np.random.default_rng(seed=42).permutation(num_loras) + + # Create results in random order of permutation + batched_prompts = [] + for i in permutation: + lora_id, info = list(long_context_infos.items())[i] + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + + permutated_batched_results = batched_generate(lora_llm, batched_prompts) + + # Results should be the same + for i in range(num_loras): + assert batched_results[i] == permutated_batched_results[ + permutation[i]], ( + f"Results should be the same:\n{batched_results[i]}" + f"\n{permutated_batched_results[permutation[i]]}") + + +@pytest.mark.skip_global_cleanup +def test_quality(lora_llm, long_context_infos): + """We test the quality of the answers given by the LoRA model by + comparing the generated text to the merged model's outputs. + + This is effectively a mini-benchmark over four prompts. + If this test fails, this indicates that the quality of the LoRA model + is suboptimal compared to the merged model. For example, if the model + does not output valid dictionaries, this test will fail. + + If needed for testing, the merged versions of the models are available + as part of the `conftest`. + + The test is expected to run for about 1 minute on a p4de.24xlarge + instance. 
+ """ + scores: List[float] = [] + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + for prompt_and_response in prompts_and_responses[context_len]: + lora_prompt = (prompt_and_response["prompt"], sampling_params, + _create_lora_request(lora_id, long_context_infos)) + response = generate(lora_llm, lora_prompt) + golden_answer = prompt_and_response["golden_answer"] + score = evaluate_json_response(response, golden_answer) + scores.append(score) + assert score > 0.3, ("Quality of the answer is not good enough. " + f"Expected {golden_answer}, got {response}") + assert np.mean(scores) > 0.5 + + +@pytest.mark.skip_global_cleanup +def test_max_len(lora_llm, long_context_infos): + """Test that we raise an ValueError when the input of a given LoRA + model exceeds the maximum length.""" + # Since each LoRA model has a different maximum length, we need to + # test each one separately + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + lora_request = _create_lora_request(lora_id, long_context_infos) + # Good prompt should be fine + good_prompt = prompts_and_responses[context_len][0]["prompt"] + generate(lora_llm, (good_prompt, sampling_params, lora_request)) + # Bad prompt should raise an error + bad_prompt = good_prompt * 2 + with pytest.raises(ValueError): + generate(lora_llm, (bad_prompt, sampling_params, lora_request)) + + # Also test batched + batched_prompts: List[Tuple[str, SamplingParams, + Optional[LoRARequest]]] = [] + for lora_id_with_bad_inputs in long_context_infos: + for lora_id, info in long_context_infos.items(): + context_len = info["context_length"] + batched_prompts.extend([ + (prompts_and_responses[context_len][0]["prompt"] * + (2 if lora_id == lora_id_with_bad_inputs else 1), + sampling_params, + _create_lora_request(lora_id, long_context_infos)) + ]) + # Turn good prompt into bad prompt inside of batched prompts + + with pytest.raises(ValueError): + batched_generate(lora_llm, batched_prompts) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index f22f92b6fe64b..1fdd15df99c19 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -103,10 +103,15 @@ def convert_mapping( embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None + + from vllm.platforms import current_platform if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=get_device(), - dtype=torch.long) + if current_platform.is_hpu(): + long_lora_offsets_list: List[int] = [] + else: + long_lora_offsets = torch.zeros(len(index_mapping_indices), + device=get_device(), + dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping @@ -119,10 +124,18 @@ def convert_mapping( embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 lora_indices[i] = lora_idx if long_lora_context: - assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset + if current_platform.is_hpu(): + long_lora_offsets_list.append(lora_offset) + else: + assert long_lora_offsets is not None + long_lora_offsets[i] = lora_offset + + if long_lora_context and current_platform.is_hpu(): + long_lora_offsets = torch.tensor(long_lora_offsets_list, + device=get_device(), + dtype=torch.long) indices_list: List[Union[List[int], torch.Tensor]] = [ index_mapping_indices, diff --git 
a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 382a0abb21240..b5100491c4135 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -37,6 +37,7 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model +from vllm.model_executor.models import supports_multimodal from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) from vllm.sampling_params import SamplingParams @@ -649,12 +650,30 @@ def load_model(self) -> None: assert hasattr( self.model, "embedding_padding_modules" ), "Model does not have embedding_padding_modules" + + if supports_multimodal(self.model): + logger.warning( + "Regarding multimodal models, vLLM currently " + "only supports adding LoRA to language model.") + # It's necessary to distinguish between the + # max_position_embeddings of VLMs and LLMs. + if hasattr(self.model.config, "max_position_embeddings"): + max_pos_embeddings = ( + self.model.config.max_position_embeddings) + else: + max_pos_embeddings = ( + self.model.config.text_config.max_position_embeddings) + self.lora_manager = LRUCacheWorkerLoRAManager( self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens, - self.vocab_size, self.lora_config, self.device, + self.vocab_size, + self.lora_config, + self.device, self.model.embedding_modules, - self.model.embedding_padding_modules) + self.model.embedding_padding_modules, + max_position_embeddings=max_pos_embeddings, + ) self.model = self.lora_manager.create_lora_manager(self.model) if self.model_config.quantization == 'inc': @@ -1314,7 +1333,8 @@ def profile_run(self) -> None: num_layers = self.model_config.get_num_layers(self.parallel_config) kv_caches = [None] * num_layers max_seq_len = self.bucketing_global_state.prompt_seq_bucket_cfg[-1] - max_batch_size = self.max_num_batched_tokens // max_seq_len + max_batch_size = min(self.max_num_batched_tokens // max_seq_len, + self.scheduler_config.max_num_seqs) self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, False, True) @@ -1333,7 +1353,6 @@ def warmup_scenario(self, f"bs{batch_size}_" f"seq{seq_len}_" f"graphs{'T' if use_graphs else 'F'}") - max_num_seqs = self.scheduler_config.max_num_seqs # This represents the maximum number of different requests # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request @@ -1355,7 +1374,7 @@ def warmup_scenario(self, dummy_lora_requests.append(dummy_lora_request) dummy_lora_requests_per_seq = [ dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) + for idx in range(batch_size) ] self.profiler.start('internal', scenario_name) times = 3 if use_graphs or is_pt_profiler_run else 1 From 4fd5c4c9601c82bd9240ba974310b20c9535d11c Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Mon, 28 Oct 2024 10:47:52 +0100 Subject: [PATCH 307/819] Add HPU specific changes to benchmark_latency.py (#436) Add support for HPU FP8 in `benchmark_latency.py` script. Limit `max_num_seqs` based on the `batch_size` as there will be no more requests. 
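As a sketch of the engine-level effect of this change (model name and batch size are placeholders; `fp8_inc` additionally needs INC quantization to be configured, e.g. via a `QUANT_CONFIG` file):

```python
from vllm import LLM

batch_size = 32  # placeholder; mirrors --batch-size in the benchmark
llm = LLM(
    "meta-llama/Llama-2-7b-chat-hf",  # placeholder model
    max_num_seqs=batch_size,  # the benchmark never submits more requests than this
    # kv_cache_dtype="fp8_inc",  # the newly accepted choice for Gaudi FP8 runs
)
```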
--- benchmarks/benchmark_latency.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 79a48b2a1a845..30373b119a2ca 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -47,6 +47,7 @@ def main(args: argparse.Namespace): distributed_executor_backend=args.distributed_executor_backend, otlp_traces_endpoint=args.otlp_traces_endpoint, enable_prefix_caching=args.enable_prefix_caching, + max_num_seqs=args.batch_size, ) sampling_params = SamplingParams( @@ -179,7 +180,7 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( '--kv-cache-dtype', type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], + choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], default="auto", help='Data type for kv cache storage. If "auto", will use model ' 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' From 96e0d6f1c344873258ce8cfe8067840ec980d14a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 28 Oct 2024 14:13:32 +0200 Subject: [PATCH 308/819] Rebase fix --- vllm/worker/hpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b5100491c4135..90cd70669837a 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -26,6 +26,7 @@ HabanaMemoryProfiler, format_bytes) from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.attention.backends.hpu_attn import HPUAttentionBackend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) @@ -575,12 +576,12 @@ def __init__( self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, self.model_config.is_attention_free, ) + assert self.attn_backend == HPUAttentionBackend # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None From ebebbbbcda77a77c3d740911124cf4a47543b035 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 28 Oct 2024 15:45:04 +0200 Subject: [PATCH 309/819] fix ci fails --- vllm/entrypoints/llm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 56a3a99302332..84a1cdd98ee22 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -214,6 +214,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer) def finish_measurements(self): + assert not envs.VLLM_USE_V1, "INC does not support vLLM V1" self.llm_engine.finish_measurements() @overload # LEGACY: single (prompt + optional token ids) From 4c0caa585939d85d1a318cd050c17d7c152618f3 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 28 Oct 2024 15:47:20 +0200 Subject: [PATCH 310/819] fix ci again --- vllm/entrypoints/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 84a1cdd98ee22..7d0553be260a7 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -215,7 +215,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: def finish_measurements(self): assert not envs.VLLM_USE_V1, "INC does not support vLLM V1" - self.llm_engine.finish_measurements() + self.llm_engine.finish_measurements() # type: ignore[attr-defined] 
@overload # LEGACY: single (prompt + optional token ids) def generate( From 72a2856ddf7f311849c348beef5c74419b65cc5b Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 28 Oct 2024 15:52:14 +0200 Subject: [PATCH 311/819] formatting --- vllm/entrypoints/llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 7d0553be260a7..93dfea67c38a6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -215,7 +215,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: def finish_measurements(self): assert not envs.VLLM_USE_V1, "INC does not support vLLM V1" - self.llm_engine.finish_measurements() # type: ignore[attr-defined] + self.llm_engine.finish_measurements() # type: ignore[attr-defined] @overload # LEGACY: single (prompt + optional token ids) def generate( From 2a38e6f575f86e0853d2b057cb004c48109f8b77 Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Mon, 28 Oct 2024 10:21:19 -0700 Subject: [PATCH 312/819] sarkar/Add htrandom generator for hpu (#246) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To repro: start server: `VLLM_SKIP_WARMUP=true python -m vllm.entrypoints.openai.api_server` send a request (this works fine): ``` curl -v http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "facebook/opt-125m","prompt": "The future of AI is ","max_tokens": 100,"temperature": 0}' ``` if request has a seed it fails: ``` curl -v http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "facebook/opt-125m","prompt": "The future of AI is ","max_tokens": 100,"temperature": 0, "seed" : 37}' ``` Failure happens here: [vllm-fork/vllm/model_executor/sampling_metadata.py at habana_main · HabanaAI/vllm-fork](https://github.com/HabanaAI/vllm-fork/blob/habana_main/vllm/model_executor/sampling_metadata.py#L220) ``` if sampling_params.seed is not None: seq_group_metadata.state.generator = torch.Generator( device=device).manual_seed(sampling_params.seed) ``` `RuntimeError: Device type HPU is not supported for torch.Generator() api.` This PR fixes above issue by using htrandom [Intel Gaudi PyTorch Python API (habana_frameworks.torch) — Gaudi Documentation 1.17.1 documentation](https://docs.habana.ai/en/latest/PyTorch/Reference/Python_Packages.html?highlight=htrandom#random-number-generator-apis) --- vllm/model_executor/sampling_metadata.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 84f35f75a0c32..d4a8024095286 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -4,6 +4,7 @@ import torch +from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams, SamplingType from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData, SequenceGroupMetadata) @@ -266,8 +267,14 @@ def _prepare_seq_groups( if seq_group_metadata.is_prompt: if sampling_params.seed is not None: - generator = torch.Generator(device=device).manual_seed( - sampling_params.seed) + if current_platform.is_hpu(): + import habana_frameworks.torch.hpu.random as htrandom + generator = \ + htrandom.default_generators[ + 0].manual_seed(sampling_params.seed) + else: + generator = torch.Generator(device=device).manual_seed( + sampling_params.seed) if generators is not None: generators[seq_group_metadata.request_id] = generator From 
3e135aea80d463d85416c08c9e0bf12d08f3ae3b Mon Sep 17 00:00:00 2001 From: yuwenzho Date: Tue, 29 Oct 2024 14:41:07 +0800 Subject: [PATCH 313/819] Fix one_hot bug in torch compile mode (#427) Fix one_hot bug in torch compile mode ``` > block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, num_classes=batch_size) E RuntimeError: Class values must be non-negative. ../../vllm/worker/hpu_model_runner.py:311: RuntimeError ``` --- vllm/worker/hpu_model_runner.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index b5100491c4135..78e8620d7c43c 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -318,18 +318,19 @@ def _set_block_mapping(self, metadata, batch_size, device, dtype): mask = mask >= metadata.block_usage.unsqueeze(-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) - if is_fake_hpu(): - # Unfortunately one_hot on CPU doesn't handle - # out of bounds classes. We need to mask those - # values manually - oob_values = metadata.block_mapping.lt(0) - block_mapping = metadata.block_mapping.masked_fill(oob_values, 0) - block_mapping = torch.nn.functional.one_hot(block_mapping, + + if not is_fake_hpu() and htorch.utils.internal.is_lazy(): + block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, num_classes=batch_size) - block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) else: - block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, + # Unfortunately one_hot on CPU/torch.compile mode/eager mode + # doesn't handle out of bounds classes, + # so we convert all negative values to 0. + block_mapping = torch.nn.functional.relu(metadata.block_mapping) + block_mapping = torch.nn.functional.one_hot(block_mapping, num_classes=batch_size) + oob_values = metadata.block_mapping.lt(0) + block_mapping.masked_fill_(oob_values.unsqueeze(-1), 0) block_mapping = block_mapping.to(dtype) metadata = metadata._replace(block_mapping=block_mapping, attn_bias=attn_bias) From 3203bd96072ebf3a0d4e40671c8d64706f9bdc39 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Tue, 29 Oct 2024 08:09:53 +0100 Subject: [PATCH 314/819] HPU: offload logits processing to CPU (#358) Due to high dynamicity on logits processing it's better to offload it completely to CPU instead of computing it on HPU. 
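As a sketch of the guided-decoding mask handling that the change below caches (illustrative only; `build_mask` is a stand-in for the cached `_create_mask_tensor` helper), the dynamic, data-dependent work of building the allowed-token mask happens once per FSM state, and applying it is a single static-shape add on the logits:

```python
import math

import torch


def build_mask(allowed_tokens, vocab_size: int, device) -> torch.Tensor:
    # Everything outside the allowed set is masked with -inf.
    mask = torch.full((vocab_size, ), -math.inf, device=device)
    mask[list(allowed_tokens)] = 0
    return mask


scores = torch.randn(1, 32000)  # device logits in practice; CPU here for the sketch
mask = build_mask({7, 42, 1999}, scores.size(-1), scores.device)
scores.add_(mask)  # in-place, fixed-shape update
```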
--- .../outlines_logits_processors.py | 58 +++++++++++++------ .../model_executor/layers/logits_processor.py | 23 +++++++- 2 files changed, 61 insertions(+), 20 deletions(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index 092c143bd59b0..e1b7c11eb00a6 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -30,11 +30,48 @@ from transformers import PreTrainedTokenizerBase +# Unfortunately we cannot use lru_cache as it breaks pickling +# so we use a simpler implementation +def _cached(fn): + cache = {} + + def cached_fn(*args): + if args in cache: + result = cache[args] + else: + result = fn(*args) + cache[args] = result + return result + + return cached_fn + + class BaseLogitsProcessor: def __init__(self, guide: Guide): self._guide: Guide = guide self._fsm_state: DefaultDict[int, int] = defaultdict(int) + self._cached_get_mask_tensor = _cached(self._get_mask_tensor) + + @staticmethod + @lru_cache(maxsize=128) + def _create_mask_tensor(allowed_tokens, vocab_size, device): + mask = torch.full((vocab_size, ), -math.inf, device=device) + mask[list(allowed_tokens)] = 0 + return mask + + def _get_mask_tensor(self, state_id, vocab_size, device): + instruction = self._guide.get_next_instruction(state=state_id) + if type(instruction) == Generate: # noqa: E721 + allowed_tokens = instruction.tokens + elif type(instruction) == Write: # noqa: E721 + # TODO: support fast forward tokens + allowed_tokens = [instruction.tokens[0]] + else: + raise TypeError( + f"Unsupported instruction type {type(instruction)}") + return BaseLogitsProcessor._create_mask_tensor(tuple(allowed_tokens), + vocab_size, device) def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor: @@ -64,23 +101,10 @@ def __call__(self, input_ids: List[int], import_paths=[grammars.GRAMMAR_PATH], ) - instruction = self._guide.get_next_instruction( - state=self._fsm_state[seq_id]) - - if type(instruction) == Generate: # noqa: E721 - allowed_tokens = instruction.tokens - elif type(instruction) == Write: # noqa: E721 - # TODO: support fast forward tokens - allowed_tokens = [instruction.tokens[0]] - else: - raise TypeError( - f"Unsupported instruction type {type(instruction)}") - - mask = torch.full((scores.shape[-1], ), - -math.inf, - device=scores.device) - mask[allowed_tokens] = 0 - scores = scores.add(mask) + state_id = self._fsm_state[seq_id] + mask = self._cached_get_mask_tensor(state_id, scores.size(-1), + scores.device) + scores.add_(mask) return scores diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index bee3d38565f4c..e0194b36652a2 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -118,12 +118,28 @@ def _prune_hidden_states( return hidden_states +def get_num_parameters(logits_processor): + """Extracts the number of parameters from the + signature and stores it for further use""" + if hasattr(logits_processor, 'num_parameters'): + return logits_processor.num_parameters + logits_processor.num_parameters = len( + inspect.signature(logits_processor).parameters) + return logits_processor.num_parameters + + def _apply_logits_processors( logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - found_logits_processors = False logits_processed = 0 + found_logits_processors = any( 
+ seq_group.sampling_params.logits_processors + for seq_group in sampling_metadata.seq_groups) + offload_to_cpu = current_platform.is_hpu() and found_logits_processors + if offload_to_cpu: + logits_device = logits.device + logits = logits.cpu() for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -138,8 +154,7 @@ def _apply_logits_processors( prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: + if get_num_parameters(logits_processor) == 3: logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, logits_row) @@ -155,4 +170,6 @@ def _apply_logits_processors( if found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] + if offload_to_cpu: + logits = logits.to(logits_device) return logits From 2fa54e20eae7847cba9ae53b64e615ce9252b037 Mon Sep 17 00:00:00 2001 From: Ruheena Suhani Shaik Date: Tue, 29 Oct 2024 13:53:47 +0530 Subject: [PATCH 315/819] Lora layers (#435) This PR supports the unit test test_layers with LoraMask based approach --- tests/lora/test_layers_hpu.py | 1341 +++++++++++++++++++++++++++++++++ 1 file changed, 1341 insertions(+) create mode 100644 tests/lora/test_layers_hpu.py diff --git a/tests/lora/test_layers_hpu.py b/tests/lora/test_layers_hpu.py new file mode 100644 index 0000000000000..7e33813c7a6a2 --- /dev/null +++ b/tests/lora/test_layers_hpu.py @@ -0,0 +1,1341 @@ +import random +from copy import deepcopy +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from unittest.mock import patch + +import habana_frameworks.torch.core as htcore +import pytest +import torch +import torch.nn.functional as F +from vllm_hpu_extension.ops import LoraMask +from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper + +from vllm.config import LoRAConfig +from vllm.lora.fully_sharded_layers import ( + ColumnParallelLinearWithShardedLoRA, + MergedColumnParallelLinearWithShardedLoRA, + MergedQKVParallelLinearWithShardedLora, QKVParallelLinearWithShardedLora, + RowParallelLinearWithShardedLoRA) +# yapf conflicts with isort for this block +# yapf: disable +from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, + LinearScalingRotaryEmbeddingWithLora, + LogitsProcessorWithLoRA, LoRAMapping, + MergedColumnParallelLinearWithLoRA, + MergedQKVParallelLinearWithLora, + QKVParallelLinearWithLora, + ReplicatedLinearWithLoRA, + RowParallelLinearWithLoRA, + VocabParallelEmbeddingWithLoRA) +# yapf: enable +from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, + PackedLoRALayerWeights) +from vllm.lora.punica import PunicaWrapper +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) +from vllm.model_executor.utils import set_random_seed +from vllm.platforms import current_platform +from vllm.utils import seed_everything + +from .utils import DummyLoRAManager + +TOLERANCES = { + torch.float16: (5e-3, 5e-3), + torch.float32: (5e-3, 5e-3), + torch.bfloat16: (3e-2, 2e-2), 
+} +if current_platform.is_hpu(): + CUDA_DEVICES = ["hpu"] +else: + CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) + ] +# We will launch different triton kernels between the prefill and decode +# stages, so we need to verify this. prefill stage(True) or decode stage(False) +STAGES = [True, False] + + +def get_random_id_to_index(num_loras: int, + num_slots: int, + log: bool = True) -> List[Optional[int]]: + """Creates a random lora_id_to_index mapping. + + Args: + num_loras: The number of active loras in the mapping. + num_slots: The number of slots in the mapping. Must be larger + than num_loras. + log: Whether to log the output. + """ + + if num_loras > num_slots: + raise ValueError( + f"num_loras is higher than num_slots: {num_loras} > {num_slots}. " + "num_loras must be less than or equal to num_slots.") + + slots: List[Optional[int]] = [None] * num_slots + random_slot_selections = (torch.randperm(num_slots)[:num_loras]).tolist() + for lora_id, slot_idx in enumerate(random_slot_selections, start=1): + slots[slot_idx] = lora_id + + if log: + print(f"Created lora_id_to_index mapping: {slots}.") + + return slots + + +def populate_loras( + id_to_index: List[Optional[int]], + layer: BaseLayerWithLoRA, + layer_weights: torch.Tensor, + generate_embeddings_tensor: int = 0, + repeats: int = 1, +) -> Tuple[Dict[int, LoRALayerWeights], Dict[int, List[LoRALayerWeights]]]: + """This method populates the lora layers with lora weights. + + Args: + id_to_index: a list of lora ids. The index of the lora id + represents which memory slot the lora matrices are + stored in. A None value indicates a free slot. + layer: the LoRAlayer to populate. + layer_weights: the PyTorch tensor containing the layer's + weights. + generate_embeddings_tensor: whether to generate an + embeddings tensor for each LoRA. + repeats: must only be set for column parallel packed + layers. Indicates the number of loras to compose + together to create a single lora layer. + """ + + # Dictionary that maps the lora ID to the + # corresponding lora weights. + lora_dict: Dict[int, LoRALayerWeights] = dict() + + # Dictionary that maps the lora ID to the + # corresponding subloras. + sublora_dict: Dict[int, List[LoRALayerWeights]] = dict() + + for slot_idx, lora_id in enumerate(id_to_index): + if lora_id is not None: + subloras: List[LoRALayerWeights] = [] + sublora_len = layer_weights.shape[0] // repeats + for i in range(repeats): + sublora = DummyLoRAManager().init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) + sublora.lora_b = sublora.lora_b[:, (sublora_len * + i):(sublora_len * (i + 1))] + sublora.optimize() + subloras.append(sublora) + + lora = PackedLoRALayerWeights.pack( + subloras) if repeats > 1 else subloras[0] + + layer.set_lora( + slot_idx, + lora_a=lora.lora_a, + lora_b=lora.lora_b, + embeddings_tensor=lora.embeddings_tensor, + ) + + lora_dict[lora_id] = lora + sublora_dict[lora_id] = subloras + + return lora_dict, sublora_dict + + +def create_random_inputs( + active_lora_ids: List[int], + num_inputs: int, + input_size: Tuple[int, ...], + input_range: Tuple[float, float], + input_type: torch.dtype = torch.int, +) -> Tuple[List[torch.Tensor], List[int], List[int]]: + """Creates random inputs. + + Args: + active_lora_ids: lora IDs of active lora weights. + num_inputs: the number of inputs to create. + input_size: the size of each individual input. + input_range: the range of values to include in the input. 
+ input_range[0] <= possible input values < input_range[1] + input_type: the type of values in the input. + """ + + low, high = input_range + + inputs: List[torch.Tensor] = [] + index_mapping: List[int] = [] + prompt_mapping: List[int] = [] + + for _ in range(num_inputs): + if input_type == torch.int: + inputs.append( + torch.randint(low=int(low), high=int(high), size=input_size)) + else: + inputs.append( + torch.rand(size=input_size, dtype=input_type) * high + low) + + lora_id = random.choice(active_lora_ids) + index_mapping += [lora_id] * input_size[0] + prompt_mapping += [lora_id] + + return inputs, index_mapping, prompt_mapping + + +def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank, + lora_dtype): + indices = indices.view(-1, 1) + mask = torch.arange(max_loras * max_lora_rank, device=indices.device) + mask = mask.view(1, -1) + mask = ((mask >= ((indices) * max_lora_rank)) * + (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype) + mask = mask.view(batch_size, 1, + -1).expand(batch_size, seq_len, + -1).reshape(batch_size * seq_len, -1) + return mask + + +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("stage", STAGES) +def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + + torch.set_default_device(torch.device("hpu")) + max_loras = 8 + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def create_random_embedding_layer(): + embedding = VocabParallelEmbedding(vocab_size, 256) + embedding.weight.data = torch.rand_like(embedding.weight.data) + embedding.weight.data[vocab_size:, :] = 0 + lora_embedding = VocabParallelEmbeddingWithLoRA(embedding) + lora_embedding.create_lora_weights(max_loras, lora_config) + + return embedding, lora_embedding + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + embedding, lora_embedding = create_random_embedding_layer() + lora_embedding.set_mapping(punica_wrapper) + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_embedding, + layer_weights=embedding.weight.T, + ) + + htcore.mark_step() + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + ) + + indices_list = [id_to_index.index(value) for value in index_mapping] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, indices.shape[0], 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + + lora_result = lora_embedding(torch.cat(inputs)) + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + lora = lora_dict[lora_id] + result = embedding(input_) + after_a = F.embedding( + input_, + lora.lora_a, + ) + result += (after_a @ lora.lora_b) + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + 
expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_embedding.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + ) + indices = torch.full((len(inputs) * len(inputs[0]), ), 0, device="hpu") + mask = createLoraMask(indices, indices.shape[0], 1, 8, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + + lora_result = lora_embedding(torch.cat(inputs)) + expected_result = embedding(torch.cat(inputs)) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@torch.inference_mode() +# @pytest.mark.skip( +# reason="Fails when loras are in any slot other than the first.") +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) +@pytest.mark.parametrize("stage", STAGES) +def test_embeddings_with_new_embeddings(dist_init, num_loras, device, + vocab_size, stage) -> None: + + torch.set_default_device(torch.device("hpu")) + max_loras = 8 + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def create_random_embedding_layer(): + embedding = VocabParallelEmbedding(vocab_size, 256) + embedding_data = torch.rand_like(embedding.weight.data) + embedding.weight.data = embedding_data + embedding.weight.data[vocab_size:, :] = 0 + expanded_embedding = VocabParallelEmbedding( + vocab_size + lora_config.lora_extra_vocab_size * max_loras, + 256, + org_num_embeddings=vocab_size) + expanded_embedding.weight.data[:vocab_size, :] = embedding_data + # We need to deepcopy the embedding as it will be modified + # in place + lora_embedding = VocabParallelEmbeddingWithLoRA( + deepcopy(expanded_embedding)) + lora_embedding.create_lora_weights(max_loras, lora_config) + + return expanded_embedding, lora_embedding + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + expanded_embedding, lora_embedding = create_random_embedding_layer() + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_embedding, + layer_weights=torch.zeros( + (256, vocab_size + lora_config.lora_extra_vocab_size)), + generate_embeddings_tensor=256, + ) + + lora_embedding.set_mapping(punica_wrapper) + # All embeddings tensors have the same shape. + embeddings_tensors = [ + lora_dict[id].embeddings_tensor for id in sorted(lora_dict.keys()) + ] + embeddings_tensor_len = embeddings_tensors[0].shape[0] + + # Add empty embeddings_tensors for unoccupied lora slots. 
+ for _ in range(max_loras - len(embeddings_tensors)): + embeddings_tensors.append(torch.zeros(embeddings_tensors[0].shape)) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + ) + indices_list = [id_to_index.index(value) for value in index_mapping] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, indices.shape[0], 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + original_inputs = deepcopy(inputs) + + # Force some of the inputs to be in the extended embeddings range + # to guarantee that their behavior is tested. + for input_, original_input_, lora_id in zip(inputs, original_inputs, + prompt_mapping): + embedding_id = lora_id - 1 + input_[-1] = vocab_size + (embedding_id * embeddings_tensor_len) + original_input_[-1] = vocab_size + input_[-2] = vocab_size + ( + (embedding_id + 1) * embeddings_tensor_len - 1) + original_input_[-2] = vocab_size + embeddings_tensor_len - 1 + + expanded_embedding.weight[vocab_size:vocab_size + + (embeddings_tensor_len * + max_loras)] = torch.cat(embeddings_tensors) + + lora_result = lora_embedding(torch.cat(original_inputs)) + + expected_results: List[torch.Tensor] = [] + for input_, original_input_, lora_id in zip(inputs, original_inputs, + prompt_mapping): + lora = lora_dict[lora_id] + result = expanded_embedding(input_) + after_a = F.embedding( + original_input_, + lora.lora_a, + ) + result += (after_a @ lora.lora_b) + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_embedding.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=num_loras * 3, + input_size=(200, ), + input_range=(1, vocab_size), + ) + indices = torch.full((len(inputs) * len(inputs[0]), ), 0, device="hpu") + mask = createLoraMask(indices, indices.shape[0], 1, 8, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + original_inputs = deepcopy(inputs) + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + vocab_size, + lora_config.lora_extra_vocab_size) + lora_result = lora_embedding(torch.cat(original_inputs)) + expected_result = expanded_embedding(torch.cat(inputs)) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) +@pytest.mark.parametrize("stage", STAGES) +def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, + stage) -> None: + + torch.set_default_device(torch.device("hpu")) + max_loras = 8 + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + lora_config = 
LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def _pretest(): + linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size, + 1024, + vocab_size, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + linear.weight.data[:, vocab_size:] = 0 + logits_processor = LogitsProcessor( + vocab_size + lora_config.lora_extra_vocab_size, vocab_size) + lora_logits_processor = LogitsProcessorWithLoRA( + logits_processor, 1024, linear.weight.dtype, linear.weight.device, + None) + lora_logits_processor.create_lora_weights(max_loras, lora_config) + + return linear, logits_processor, lora_logits_processor + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + linear, logits_processor, lora_logits_processor = _pretest() + lora_logits_processor.set_mapping(punica_wrapper) + # NOTE: all the generated loras share the same embeddings tensor. + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_logits_processor, + layer_weights=linear.weight, + generate_embeddings_tensor=1024, + ) + htcore.mark_step() + embeddings_tensor = list(lora_dict.values())[0].embeddings_tensor + embeddings_tensor_len = embeddings_tensor.shape[0] + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=8 * num_loras, # * 3, + input_size=(1, 1024), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices_list = [id_to_index.index(value) for value in index_mapping] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, indices.shape[0], 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) + input_ = torch.rand(20, 1024) + + lora_result = lora_logits_processor._get_logits( + hidden_states=torch.cat(inputs), + lm_head=linear, + embedding_bias=None) + + original_lm_head = deepcopy(linear) + + linear.weight[logits_processor. 
+ org_vocab_size:logits_processor.org_vocab_size + + embeddings_tensor_len] = embeddings_tensor + + logits_processor.org_vocab_size = (vocab_size + + lora_config.lora_extra_vocab_size) + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + lora = lora_dict[lora_id] + result = logits_processor._get_logits(hidden_states=input_, + lm_head=linear, + embedding_bias=None) + result[:, vocab_size + embeddings_tensor_len:] = float("-inf") + result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + expected_results.append(result) + expected_result = torch.cat(expected_results) + logits_processor.org_vocab_size = vocab_size + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_logits_processor.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=8 * num_loras * 3, + input_size=(1, 1024), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices = torch.full((len(inputs) * len(inputs[0]), ), 0, device="hpu") + mask = createLoraMask(indices, indices.shape[0], 1, 8, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + vocab_size, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_logits_processor._get_logits( + hidden_states=torch.cat(inputs), + lm_head=original_lm_head, + embedding_bias=None)[:, :vocab_size] + expected_result = logits_processor._get_logits( + hidden_states=torch.cat(inputs), + lm_head=original_lm_head, + embedding_bias=None) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) +def test_linear_replicated(dist_init, num_loras, device, stage) -> None: + + torch.set_default_device(torch.device("hpu")) + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + lora_dtype=torch.bfloat16) + + def create_random_linear_replicated_layer(): + + linear = ReplicatedLinear(4096, + 4096, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = ReplicatedLinearWithLoRA(linear) + + lora_linear.create_lora_weights(max_loras, lora_config) + + return linear, lora_linear + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + linear, lora_linear = create_random_linear_replicated_layer() + lora_linear.set_mapping(punica_wrapper) + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_linear, + layer_weights=linear.weight, + ) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices_list = [id_to_index.index(value) for value in index_mapping] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + 
is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + htcore.mark_step() + lora = lora_dict[lora_id] + result = linear(input_)[0] + result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_linear.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices = torch.full((len(inputs), ), 0, device="hpu") + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + 512, lora_config.lora_extra_vocab_size) + + lora_result = lora_linear(torch.cat(inputs))[0] + expected_result = linear(torch.cat(inputs))[0] + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@torch.inference_mode() +# @pytest.mark.skip( +# reason="Fails when fully_shard is True.") +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("orientation", ["row", "column"]) +@pytest.mark.parametrize("fully_shard", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) +def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, + device, stage) -> None: + + if fully_shard: + pytest.skip("Skipping the test when fully_shard is True") + + torch.set_default_device(torch.device("hpu")) + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.bfloat16) + + def create_random_linear_parallel_layer(): + if orientation == "row": + linear = RowParallelLinear(4096, + 4096, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (RowParallelLinearWithLoRA(linear) if not fully_shard + else RowParallelLinearWithShardedLoRA(linear)) + else: + linear = ColumnParallelLinear(4096, + 4096, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (ColumnParallelLinearWithLoRA(linear) + if not fully_shard else + ColumnParallelLinearWithShardedLoRA(linear)) + lora_linear.create_lora_weights(max_loras, lora_config) + + return linear, lora_linear + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + linear, lora_linear = create_random_linear_parallel_layer() + lora_linear.set_mapping(punica_wrapper) + lora_dict, _ = populate_loras( + id_to_index, + layer=lora_linear, + layer_weights=linear.weight, + ) + + inputs, index_mapping, prompt_mapping = 
create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices_list = [id_to_index.index(value) for value in index_mapping] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + htcore.mark_step() + lora = lora_dict[lora_id] + result = linear(input_)[0] + result += input_ @ lora.lora_a @ lora.lora_b * lora.scaling + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + # Check that resetting the lora weights succeeds + + for slot_idx in range(max_loras): + lora_linear.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices = torch.full((len(inputs), ), 0, device="hpu") + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata(lora_mapping, id_to_index, max_loras, + 512, lora_config.lora_extra_vocab_size) + + lora_result = lora_linear(torch.cat(inputs))[0] + expected_result = linear(torch.cat(inputs))[0] + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@torch.inference_mode() +# @pytest.mark.skip( +# reason="Fails when fully_shard is True.") +@pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) +@pytest.mark.parametrize("repeats", [1, 2, 3]) +@pytest.mark.parametrize("fully_shard", [True, False]) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("stage", STAGES) +def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, + device, stage) -> None: + + if fully_shard: + pytest.skip("Skipping the test when fully_shard is True") + + torch.set_default_device(torch.device("hpu")) + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + fully_sharded_loras=fully_shard, + lora_dtype=torch.bfloat16) + + def create_column_parallel_packed_layer(): + if repeats == 2: + linear = MergedColumnParallelLinear(4096, [4096] * repeats, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (MergedColumnParallelLinearWithLoRA(linear) + if not fully_shard else + MergedColumnParallelLinearWithShardedLoRA(linear)) + elif repeats == 3: + linear = QKVParallelLinear(4096, + 64, + 32, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = (MergedQKVParallelLinearWithLora(linear) + if not fully_shard else + 
MergedQKVParallelLinearWithShardedLora(linear)) + else: + linear = QKVParallelLinear(4096, + 64, + 32, + bias=False, + params_dtype=torch.bfloat16) + linear.weight.data = torch.rand_like(linear.weight.data) + lora_linear = QKVParallelLinearWithLora( + linear + ) if not fully_shard else QKVParallelLinearWithShardedLora(linear) + + @dataclass + class FakeConfig: + hidden_size = 4096 + num_key_value_heads = 32 + num_attention_heads = 32 + + lora_linear.create_lora_weights(max_loras, + lora_config, + model_config=FakeConfig()) + + return linear, lora_linear + + for i in range(10): + set_random_seed(i) + + id_to_index = get_random_id_to_index(num_loras, max_loras) + + linear, lora_linear = create_column_parallel_packed_layer() + lora_linear.set_mapping(punica_wrapper) + lora_dict, sublora_dict = populate_loras( + id_to_index, + layer=lora_linear, + layer_weights=linear.weight, + repeats=repeats, + ) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=list(lora_dict.keys()), + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices_list = [id_to_index.index(value) for value in index_mapping] + indices = torch.tensor(indices_list) + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + + lora_result = lora_linear(torch.cat(inputs))[0] + + expected_results: List[torch.Tensor] = [] + for input_, lora_id in zip(inputs, prompt_mapping): + htcore.mark_step() + result = linear(input_)[0] + subloras = sublora_dict[lora_id] + for i, sublora in enumerate(subloras): + result[:, sublora.lora_b.shape[1] * i:sublora.lora_b.shape[1] * + (i + 1)] += (input_ @ sublora.lora_a @ sublora.lora_b * + sublora.scaling) + expected_results.append(result) + expected_result = torch.cat(expected_results) + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + for slot_idx in range(max_loras): + lora_linear.reset_lora(slot_idx) + + inputs, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=32 * num_loras, + input_size=(1, 4096), + input_range=(0, 1), + input_type=torch.bfloat16, + ) + indices = torch.full((len(inputs), ), 0, device="hpu") + mask = createLoraMask(indices, len(inputs), 1, max_loras, 8, + torch.bfloat16) + LoraMask.setLoraMask(mask) + + lora_mapping = LoRAMapping(index_mapping, + prompt_mapping, + is_prefill=stage) + + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + ) + # lora_linear.set_mapping(*mapping_info) + + lora_result = lora_linear(torch.cat(inputs))[0] + expected_result = linear(torch.cat(inputs))[0] + + rtol, atol = TOLERANCES[lora_result.dtype] + torch.testing.assert_close(lora_result, + expected_result, + rtol=rtol, + atol=atol) + + +@torch.inference_mode() +@pytest.mark.parametrize("num_loras", [1, 8]) +@pytest.mark.parametrize("device", ["hpu"]) +@pytest.mark.parametrize("scaling_factors", [(1.0, ), (4.0, ), (4.0, 8.0), + (6.0, 1.0)]) +@pytest.mark.parametrize("max_position", [11, 4096, 32768]) +@pytest.mark.parametrize("is_neox_style", [True, False]) +@pytest.mark.parametrize("rotary_dim", [None, 32]) +@pytest.mark.parametrize("head_size", [32, 108]) 
+@pytest.mark.parametrize("seq_len", [11, 1024]) +def test_rotary_embedding_long_context(dist_init, num_loras, device, + scaling_factors, max_position, + is_neox_style, rotary_dim, head_size, + seq_len) -> None: + dtype = torch.bfloat16 + seed = 0 + seed_everything(seed) + torch.set_default_device(torch.device("hpu")) + if current_platform.is_hpu(): + punica_wrapper = GaudiPunicaWrapper(8192, 256, device="hpu") + else: + punica_wrapper = PunicaWrapper(8192, 256, device) + max_loras = 8 + lora_config = LoRAConfig(max_loras=max_loras, + max_lora_rank=8, + long_lora_scaling_factors=scaling_factors, + lora_dtype=dtype) + + if rotary_dim is None: + rotary_dim = head_size + base = 10000 + batch_size = 5 * num_loras + num_heads = 7 + + # Verify lora is equivalent to linear scaling rotary embedding. + rope = get_rope(head_size, + rotary_dim, + max_position, + base, + is_neox_style, + dtype=torch.bfloat16) + lora_rope = LinearScalingRotaryEmbeddingWithLora(rope) + lora_rope.set_mapping(punica_wrapper) + lora_rope.create_lora_weights(max_loras, lora_config) + linear_rope = get_rope(head_size, + rotary_dim, + max_position, + base, + is_neox_style, { + "type": "linear", + "factor": scaling_factors + }, + dtype=torch.bfloat16) + #linear_rope = linear_rope.to(dtype=dtype) + id_to_index = get_random_id_to_index(num_loras, max_loras) + _, index_mapping, prompt_mapping = create_random_inputs( + active_lora_ids=[0], + num_inputs=batch_size, + input_size=(seq_len, max_position), + input_range=(0, lora_config.lora_extra_vocab_size), + input_type=torch.bfloat16, + ) + + lora_mapping = LoRAMapping(index_mapping, prompt_mapping) + long_lora_context = LongContextLoRAContext(list(scaling_factors), + rotary_dim) + + next_expected_offset = 0 + # Make sure the offset is correct. 
+ scaling_factor_to_offset = lora_rope.scaling_factor_to_offset + for scaling_factor, offset in scaling_factor_to_offset.items(): + assert offset == next_expected_offset + next_expected_offset += scaling_factor * max_position + + for i in range(len(scaling_factors)): + long_lora_context.offsets_by_lora_id[i] = scaling_factor_to_offset.get( + scaling_factors[i], 0) + punica_wrapper.update_metadata( + lora_mapping, + id_to_index, + max_loras, + 512, + lora_config.lora_extra_vocab_size, + long_lora_context=long_lora_context, + ) + # lora_rope.set_mapping(*mapping_info) + + positions = torch.randint(0, max_position, (batch_size, seq_len)) + query = torch.randn(batch_size, + seq_len, + num_heads * head_size, + dtype=dtype) + key = torch.randn_like(query) + ref_q, ref_k = linear_rope(positions, query, key) + htcore.mark_step() + actual_q, actual_k = lora_rope(positions, query, key) + + torch.allclose(ref_q, actual_q) + torch.allclose(ref_k, actual_k) + + +@pytest.mark.parametrize("tp_size", [1, 2, 4, 8]) +@pytest.mark.parametrize("seed", list(range(256))) +def test_vocab_parallel_embedding_indices(tp_size, seed): + random.seed(seed) + vocab_size = random.randint(4000, 64000) + added_vocab_size = random.randint(0, 1024) + org_vocab_size = vocab_size - added_vocab_size + last_org_vocab_end_index = 0 + last_added_vocab_end_index = org_vocab_size + computed_vocab_size = 0 + computed_org_vocab_size = 0 + computed_added_vocab_size = 0 + vocab_size_padded = -1 + + all_org_tokens: List[int] = [] + all_added_tokens: List[int] = [] + token_ids: List[int] = [] + + for tp_rank in range(tp_size): + with patch( + "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank", + return_value=tp_rank + ), patch( + "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size", + return_value=tp_size): + vocab_embedding = VocabParallelEmbedding( + vocab_size, 1, org_num_embeddings=org_vocab_size) + vocab_size_padded = vocab_embedding.num_embeddings_padded + shard_indices = vocab_embedding.shard_indices + # Assert that the ranges are contiguous + assert shard_indices.org_vocab_start_index == last_org_vocab_end_index + assert (shard_indices.added_vocab_start_index == + last_added_vocab_end_index) + + # Ensure that we are not exceeding the vocab size + computed_vocab_size += shard_indices.num_elements_padded + computed_org_vocab_size += shard_indices.num_org_elements + computed_added_vocab_size += shard_indices.num_added_elements + + # Ensure that the ranges are not overlapping + all_org_tokens.extend( + range(shard_indices.org_vocab_start_index, + shard_indices.org_vocab_end_index)) + all_added_tokens.extend( + range(shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index)) + + token_ids.extend( + range(shard_indices.org_vocab_start_index, + shard_indices.org_vocab_end_index)) + token_ids.extend([-1] * (shard_indices.num_org_elements_padded - + shard_indices.num_org_elements)) + token_ids.extend( + range(shard_indices.added_vocab_start_index, + shard_indices.added_vocab_end_index)) + token_ids.extend([-1] * (shard_indices.num_added_elements_padded - + shard_indices.num_added_elements)) + + last_org_vocab_end_index = shard_indices.org_vocab_end_index + last_added_vocab_end_index = shard_indices.added_vocab_end_index + + assert computed_vocab_size == vocab_size_padded + assert computed_org_vocab_size == org_vocab_size + assert computed_added_vocab_size == added_vocab_size + + # Ensure that the ranges are not overlapping + assert 
len(all_org_tokens) == len(set(all_org_tokens)) + assert len(all_added_tokens) == len(set(all_added_tokens)) + assert not set(all_org_tokens).intersection(set(all_added_tokens)) + + token_ids_tensor = torch.tensor(token_ids, dtype=torch.long) + reindex_mapping = vocab_embedding.get_sharded_to_full_mapping() + assert reindex_mapping is not None or tp_size == 1 + if reindex_mapping is not None: + reindexed_token_ids = token_ids_tensor[reindex_mapping] + expected = torch.tensor(list(range(0, vocab_size))) + assert reindexed_token_ids[:vocab_size].equal(expected) + assert torch.all(reindexed_token_ids[vocab_size:] == -1) + + +def test_get_masked_input_and_mask(): + x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + + # base tp 1 case, no padding + modified_x, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(x, modified_x) + + # tp 2 case, no padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=0) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=8, + added_vocab_start_index=10, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5])) + + # tp 4 case, no padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=0) + modified_x_rank_1, _ = get_masked_input_and_mask(x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=0) + modified_x_rank_2, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=6, + added_vocab_start_index=10, + added_vocab_end_index=11, + num_org_vocab_padding=0) + modified_x_rank_3, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=6, + org_vocab_end_index=8, + added_vocab_start_index=11, + added_vocab_end_index=12, + num_org_vocab_padding=0) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0])) + assert torch.equal(modified_x_rank_2, + torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0])) + assert torch.equal(modified_x_rank_3, + torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2])) + + # base tp 1 case, with padding + modified_x, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=8, + added_vocab_start_index=8, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x, + torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13])) + + # tp 2 case, with padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=4, + added_vocab_start_index=8, + added_vocab_end_index=10, + num_org_vocab_padding=2) + modified_x_rank_1, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=8, + added_vocab_start_index=10, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 
6, 7, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7])) + + # tp 4 case, with padding + modified_x_rank_0, _ = get_masked_input_and_mask(x, + org_vocab_start_index=0, + org_vocab_end_index=2, + added_vocab_start_index=8, + added_vocab_end_index=9, + num_org_vocab_padding=2) + modified_x_rank_1, _ = get_masked_input_and_mask(x, + org_vocab_start_index=2, + org_vocab_end_index=4, + added_vocab_start_index=9, + added_vocab_end_index=10, + num_org_vocab_padding=2) + modified_x_rank_2, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=4, + org_vocab_end_index=6, + added_vocab_start_index=10, + added_vocab_end_index=11, + num_org_vocab_padding=2) + modified_x_rank_3, _ = get_masked_input_and_mask( + x, + org_vocab_start_index=6, + org_vocab_end_index=8, + added_vocab_start_index=11, + added_vocab_end_index=12, + num_org_vocab_padding=2) + assert torch.equal(modified_x_rank_0, + torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0])) + assert torch.equal(modified_x_rank_1, + torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0])) + assert torch.equal(modified_x_rank_2, + torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0])) + assert torch.equal(modified_x_rank_3, + torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4])) From 1dcdb37d928262c154589ef1a840d35feed8b57d Mon Sep 17 00:00:00 2001 From: HUIJONG JEONG <64083281+huijjj@users.noreply.github.com> Date: Tue, 29 Oct 2024 18:28:18 +0900 Subject: [PATCH 316/819] initial works on enabling automatic prefix caching (#162) This PR enables automatic prefix caching in intel gaudi HPUs. Please refer to this [RFC](https://github.com/vllm-project/vllm/issues/2614) for detailed informations about prefix caching. --- requirements-hpu.txt | 2 +- vllm/attention/backends/hpu_attn.py | 78 +++++++++++++++++----------- vllm/attention/ops/hpu_paged_attn.py | 36 ++++--------- vllm/worker/hpu_model_runner.py | 69 ++++++++++++++++++++---- 4 files changed, 118 insertions(+), 67 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 4019950062efe..20f4dc74a3955 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f \ No newline at end of file +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index f4674cedf01ce..2e987b039c220 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -48,16 +48,16 @@ def get_kv_cache_shape( def swap_blocks( src_kv_cache: torch.Tensor, dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_to_dsts: torch.Tensor, ) -> None: - HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + src_to_dsts: torch.Tensor, ) -> None: - HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) + HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts) @dataclass @@ -68,6 +68,7 @@ class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): is_prompt: bool attn_bias: Optional[torch.Tensor] seq_lens_tensor: Optional[torch.Tensor] + context_lens_tensor: Optional[torch.Tensor] class HPUAttentionImpl(AttentionImpl, torch.nn.Module): @@ -183,35 +184,52 @@ def forward( if 
attn_metadata.is_prompt: # Prompt run. - if not self.prefill_usefusedsdpa: - # TODO: move this outside of model - assert attn_metadata.attn_bias is not None, \ - 'attn_bias must be set before calling model.forward!' - attn_bias = attn_metadata.attn_bias - if self.alibi_slopes is not None: - position_bias = _make_alibi_bias(self.alibi_slopes, - self.num_kv_heads, - attn_bias.dtype, - attn_bias.shape[-1]) - attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) - attn_bias.add_(position_bias) - else: - attn_bias = None - query_shape = (batch_size, seq_len, self.num_heads, self.head_size) kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) - out = ops.prompt_attention( - query.view(query_shape), - key.view(kv_shape), - value.view(kv_shape), - attn_bias=attn_bias, - p=0.0, - scale=self.scale, - matmul_qk_op=self.matmul_qk, - softmax_op=self.softmax, - matmul_av_op=self.matmul_av, - ) + if attn_metadata is None or attn_metadata.block_list is None: + if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward' + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias( + self.alibi_slopes, self.num_kv_heads, + attn_bias.dtype, attn_bias.shape[-1]) + attn_bias = attn_bias.tile( + (1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) + else: + attn_bias = None + + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + ) + else: + # TODO: enable FusedSDPA + out = HPUPagedAttention.forward_prefix( + query=query.view(query_shape), + key=key.view(kv_shape), + value=value.view(kv_shape), + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + attn_bias=attn_metadata.attn_bias, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + softmax_op=self.softmax, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) output = out.reshape(batch_size, seq_len, hidden_size) else: # Decoding run. 
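As a side note (not part of the patch itself), the prefix-caching prefill path above is
taken only when a sequence already has cached context, i.e. when
`attn_metadata.block_list` is set; the attention bias it consumes is built later in this
patch in `_set_attn_bias`, roughly along these lines (a simplified standalone sketch with
made-up shapes, not the exact implementation):

```python
import math
import torch

def prefix_prefill_bias(seq_len, max_context_len, context_lens, dtype):
    # context_lens: (batch,) number of tokens already present in the KV cache.
    batch_size = context_lens.shape[0]
    # Cached positions beyond each sequence's real context length are masked out.
    past_mask = (torch.arange(max_context_len, dtype=torch.int32)
                 .view(1, -1).expand(batch_size, -1)
                 .ge(context_lens.view(-1, 1))
                 .view(batch_size, 1, 1, -1)
                 .expand(batch_size, 1, seq_len, -1))
    # New (query) tokens attend causally to each other.
    causal_mask = torch.triu(torch.ones(batch_size, 1, seq_len, seq_len,
                                        dtype=torch.bool),
                             diagonal=1)
    mask = torch.cat((past_mask, causal_mask), dim=-1)
    return torch.zeros_like(mask, dtype=dtype).masked_fill(mask, -math.inf)
```

The query-length masking and the block-size padding handled by the real code are omitted
here for brevity.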
diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index 603d3959377c4..e55a4de11fd6c 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -3,7 +3,7 @@ ############################################################################### from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple +from typing import List, Optional, Tuple import torch from vllm_hpu_extension import cache_ops, ops @@ -63,42 +63,28 @@ def forward_decode(**kwargs) -> torch.Tensor: return ops.flat_pa(**kwargs) @staticmethod - def forward_prefix( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - subquery_start_loc: torch.Tensor, - seq_lens_tensor: torch.Tensor, - context_lens: torch.Tensor, - max_query_len: int, - alibi_slopes: Optional[torch.Tensor], - sliding_window: Optional[int], - ) -> torch.Tensor: - raise NotImplementedError( - "forward_prefix is not implemented for HPUPagedAttention") + def forward_prefix(**kwargs) -> torch.Tensor: + return ops.prompt_attention_with_context(**kwargs) @staticmethod def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], + src_kv_cache: Tuple[torch.Tensor, torch.Tensor], + dst_kv_cache: Tuple[torch.Tensor, torch.Tensor], + src_to_dsts: torch.Tensor, ) -> None: src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] - cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts) src_value_cache = src_kv_cache[1] dst_value_cache = dst_kv_cache[1] - cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts) @staticmethod def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], + src_to_dsts: torch.Tensor, ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] - cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) + cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 78e8620d7c43c..56c1d3a4b79df 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -291,20 +291,38 @@ def __init__(self, model, block_size, dtype, enforce_eager): def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, dtype): - prefill_metadata = attn_metadata - if prefill_metadata is None or self.prefill_use_fusedsdpa: + if (attn_metadata is None or self.prefill_use_fusedsdpa + or not attn_metadata.is_prompt): return attn_metadata + prefill_metadata = attn_metadata + seq_lens_t = prefill_metadata.seq_lens_tensor + context_lens_t = prefill_metadata.context_lens_tensor + query_lens_t = seq_lens_t - context_lens_t + + block_list = attn_metadata.block_list + max_context_len = (block_list.size(-1) // + batch_size if block_list is not None else 0) + max_context_len = max_context_len * self.block_size + past_mask = torch.arange(0, + max_context_len, + dtype=torch.int32, + device=device) + past_mask = (past_mask.view(1, -1).expand(batch_size, -1).ge( + context_lens_t.view(-1, 1)).view(batch_size, 1, -1).expand( + batch_size, seq_len, -1).view(batch_size, 1, seq_len, -1)) + len_mask = (torch.arange(0, seq_len, device=device, 
dtype=torch.int32).view(1, seq_len).ge( - seq_lens_t.unsqueeze(-1)).view( + query_lens_t.unsqueeze(-1)).view( batch_size, 1, 1, seq_len)) causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), device=device, dtype=torch.bool), diagonal=1) mask = causal_mask.logical_or(len_mask) + mask = torch.concat((past_mask, mask), dim=-1) attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( mask, -math.inf)) attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) @@ -876,7 +894,7 @@ def _prepare_prompt( assert max_query_len > 0 max_prompt_len = max( - find_bucket(max(seq_lens), + find_bucket(max_query_len, self.bucketing_global_state.prompt_seq_bucket_cfg), self.block_size) @@ -889,12 +907,34 @@ def _prepare_prompt( if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) - lora_index_mapping += [lora_id] * (max_prompt_len - context_len) + lora_index_mapping += [lora_id] * max_prompt_len lora_prompt_mapping.extend( [lora_id] * - (max_prompt_len - context_len + (max_prompt_len if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + if any(context_lens): + assert not self.scheduler_config.chunked_prefill_enabled + # prefix caching + + max_num_block = max(len(bt) for bt in prefix_block_tables) + prefix_block_list = list( + itertools.chain.from_iterable( + bt if len(bt) == max_num_block else bt + + ([_PAD_BLOCK_ID] * (max_num_block - len(bt))) + for bt in prefix_block_tables)) + + # TODO: pad to proper len + pad_len = len(prefix_block_list) + prefix_block_list = pad_list(prefix_block_list, pad_len, + _PAD_BLOCK_ID) + + prefix_block_list_tensor = torch.tensor(prefix_block_list, + dtype=torch.long, + device=self.device) + else: + prefix_block_list_tensor = None + input_tokens = make_tensor_with_pad(input_tokens, max_len=max_prompt_len, pad=0, @@ -917,11 +957,15 @@ def _prepare_prompt( dtype=torch.long, device=self.device) + context_lens_tensor = torch.tensor(context_lens, + dtype=torch.long, + device=self.device) + block_indices, block_offsets = precompute_indices_and_offsets( self.block_size, slot_mapping, True) attn_metadata = self.attn_backend.make_metadata( is_prompt=True, - block_list=None, + block_list=prefix_block_list_tensor, block_mapping=None, block_usage=None, block_indices=block_indices, @@ -930,6 +974,7 @@ def _prepare_prompt( block_groups=None, attn_bias=None, seq_lens_tensor=seq_lens_tensor, + context_lens_tensor=context_lens_tensor, num_prefills=real_num_seqs, num_prefill_tokens=sum_query_len, num_decode_tokens=0, @@ -1089,6 +1134,7 @@ def _prepare_decode( block_groups=block_groups, attn_bias=None, seq_lens_tensor=None, + context_lens_tensor=None, num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, @@ -1200,7 +1246,7 @@ def prepare_input_tensors( # FIXME: We need to adjust selected_token_indices to accommodate # for padding max_len = input_tokens.size(1) - paddings = [max_len - s for s in seq_lens] + paddings = [max_len - q for q in query_lens] paddings = [0] + paddings[:-1] paddings = list(itertools.accumulate(paddings)) paddings_prompt_logprobs = [] @@ -1296,9 +1342,10 @@ def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: # input_hash(123) != input_hash(321) # input_hash("abc") != input_hash("cba") attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ - 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', - 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', - 'block_offsets', 'block_scales', 'block_groups' + 'attn_bias', 'seq_lens_tensor', 'context_lens_tensor', + 
'block_list', 'block_mapping', 'block_usage', 'slot_mapping', + 'is_prompt', 'block_indices', 'block_offsets', 'block_scales', + 'block_groups' ]) return attention_metadata From 78e947a9f773473cd7afe6d48da4f6b132ff8ef1 Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Tue, 29 Oct 2024 13:42:30 +0100 Subject: [PATCH 317/819] Multi step scheduling (#441) Implementation of multi-step scheduling. To use the feature, pass --num_scheduler_steps=[n] as a server parameter. In my tests, best results were achieved with n==64, but this will vary depending on the model. --------- Co-authored-by: Karol Damaszke Co-authored-by: jmaksymczuk --- vllm/executor/hpu_executor.py | 11 +- vllm/executor/ray_hpu_executor.py | 4 +- vllm/worker/hpu_model_runner.py | 348 ++++++++++++++++++--------- vllm/worker/multi_step_hpu_worker.py | 116 +++++++++ 4 files changed, 361 insertions(+), 118 deletions(-) create mode 100644 vllm/worker/multi_step_hpu_worker.py diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py index 34879bc4e7ef5..dc8a7b4b1d1c8 100644 --- a/vllm/executor/hpu_executor.py +++ b/vllm/executor/hpu_executor.py @@ -54,9 +54,16 @@ def _create_worker(self, local_rank: int = 0, rank: int = 0, distributed_init_method: Optional[str] = None): + if self.scheduler_config.is_multi_step: + module_name = "vllm.worker.multi_step_hpu_worker" + class_name = "MultiStepHPUWorker" + else: + module_name = "vllm.worker.hpu_worker" + class_name = "HPUWorker" + wrapper = WorkerWrapperBase( - worker_module_name="vllm.worker.hpu_worker", - worker_class_name="HPUWorker", + worker_module_name=module_name, + worker_class_name=class_name, ) wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, distributed_init_method)) diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index 775c0a5d95899..58b1447531841 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -87,8 +87,8 @@ def _get_worker_module_and_class( Type[WorkerBase]]]]: # noqa: F821 worker_class_fn = None if self.scheduler_config.is_multi_step: - raise NotImplementedError( - "Multi-step execution is not implemented for HPU") + worker_module_name = "vllm.worker.multi_step_hpu_worker" + worker_class_name = "MultiStepHPUWorker" elif self.speculative_config: raise NotImplementedError( "Speculative decoding is not implemented for HPU") diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 56c1d3a4b79df..559ed33548dea 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -38,11 +38,13 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.model_executor.models import supports_multimodal +from vllm.model_executor.sampling_metadata import SequenceGroupToSample from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs) from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, SequenceData, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, + Logprob, SequenceData, SequenceGroupMetadata, + SequenceOutput) from vllm.utils import (is_fake_hpu, is_pin_memory_available, make_tensor_with_pad) from vllm.worker.model_runner_base import ( @@ -472,6 +474,8 @@ class ModelInputForHPU(ModelRunnerInputBase): virtual_engine: int = 0 lora_ids: Optional[List[int]] = None async_callback: 
Optional[Callable] = None + is_first_multi_step: bool = True + is_last_step: bool = True def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { @@ -484,6 +488,8 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: "batch_size_padded": self.batch_size_padded, "virtual_engine": self.virtual_engine, "lora_ids": self.lora_ids, + "is_first_multi_step": self.is_first_multi_step, + "is_last_step": self.is_last_step, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) return tensor_dict @@ -615,6 +621,9 @@ def __init__( self._setup_buckets() self._set_gc_threshold() + # For multi-step scheduling + self.cached_step_outputs: List[torch.Tensor] = [] + def _set_gc_threshold(self) -> None: # Read https://docs.python.org/3/library/gc.html#gc.set_threshold # for comprehensive description of gc generations. @@ -997,6 +1006,7 @@ def _prepare_prompt( def _prepare_decode( self, seq_group_metadata_list: List[SequenceGroupMetadata], + output=None, ) -> PrepareDecodeMetadata: input_tokens: List[List[int]] = [] input_positions: List[List[int]] = [] @@ -1027,8 +1037,9 @@ def _prepare_decode( for seq_id in seq_ids: seq_data = seq_group_metadata.seq_data[seq_id] - generation_token = seq_data.get_last_token_id() - input_tokens.append([generation_token]) + if output is None: + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) seq_len = seq_data.get_len() position = seq_len - 1 @@ -1039,6 +1050,9 @@ def _prepare_decode( seq_lens.append(seq_len) block_table = seq_group_metadata.block_tables[seq_id] + num_fully_occupied_blocks = position // self.block_size + block_table = block_table[:num_fully_occupied_blocks + 1] + if len(block_table) == 0: block_number = _PAD_BLOCK_ID else: @@ -1058,9 +1072,14 @@ def _prepare_decode( block_table = block_table[-sliding_window_blocks:] block_tables.append(block_table) - input_tokens = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) + if output is None: + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + else: + real_batch_size = len(seq_group_metadata_list) + input_tokens = output[:real_batch_size] + input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device) @@ -1070,7 +1089,7 @@ def _prepare_decode( blocks_used = [len(bt) for bt in block_tables if bt] block_list = [] block_scales = [] - for i, bt in enumerate(block_tables): + for bt in block_tables: block_list.extend(bt) blocks_in_group = len(bt) if blocks_in_group > 0: @@ -1984,114 +2003,215 @@ def execute_model( num_steps: int = 1, warmup_mode=False, ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - if num_steps > 1: - raise ValueError( - "num_steps > 1 is not supported in HPUModelRunner") - - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - sampling_metadata = model_input.sampling_metadata - real_batch_size = model_input.real_batch_size - batch_size_padded = model_input.batch_size_padded - assert input_tokens is not None - assert input_positions is not None - assert sampling_metadata is not None - assert attn_metadata is not None - is_prompt = attn_metadata.is_prompt - assert is_prompt is not None - batch_size = input_tokens.size(0) - seq_len = self._seq_len(attn_metadata) 
- use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - self._check_config(batch_size, seq_len, is_prompt, warmup_mode) - - lora_mask: torch.Tensor = None - lora_logits_mask: torch.Tensor = None - if self.lora_config: - assert model_input.lora_ids is not None - lora_mask, lora_logits_mask = self.create_lora_mask( - input_tokens, model_input.lora_ids, attn_metadata.is_prompt) - - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors, - "lora_mask": lora_mask, - **(model_input.multi_modal_kwargs or {}), - } - if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) - - htorch.core.mark_step() - if self.is_driver_worker: - model_event_name = ("model_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") + if not model_input.is_first_multi_step: + if not model_input.is_last_step: + # not first or last multi-step + return [] + # last multi-step + output = self._decode_sampler_outputs(model_input) + if model_input.is_first_multi_step: + # first multi-step + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + assert input_tokens is not None + assert input_positions is not None + assert sampling_metadata is not None + assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt + assert is_prompt is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + if self.lora_config: + assert model_input.lora_ids is not None + lora_mask, lora_logits_mask = self.create_lora_mask( + input_tokens, model_input.lora_ids, + attn_metadata.is_prompt) + + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors, + "lora_mask": lora_mask, + **(model_input.multi_modal_kwargs or {}), + } + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update( + {"bypass_hpu_graphs": not use_graphs}) + + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + if num_steps > 1: + # in case of multi-step scheduling + # we only want to pythonize in the last step + sampling_metadata.skip_sampler_cpu_output = True + self.model.model.sampler.include_gpu_probs_tensor = True + for i in range(num_steps): + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. 
+ selected_token_indices) + + if self.lora_config: + LoraMask.setLoraMask( + lora_logits_mask.index_select( + 0, sampling_metadata.selected_token_indices)) + + # Compute the logits. + with self.profiler.record_event( + 'internal', + ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + if num_steps == 1: + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + if model_input.async_callback is not None: + model_input.async_callback() + # Sample the next token. + with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + if num_steps > 1: + output = output.sampled_token_ids + self.cached_step_outputs.append(output) + htorch.core.mark_step() + if i < num_steps - 1: + if i == 0: + import copy + ctx = model_input.async_callback.keywords[ # type: ignore + "ctx"] + seq_group_metadata_list = ctx.seq_group_metadata_list + seq_group_metadata_list = copy.deepcopy( + seq_group_metadata_list) + for seq_group_metadata in seq_group_metadata_list: + for data in seq_group_metadata.seq_data.values(): + max_output_len = sampling_metadata.seq_groups[ + 0].sampling_params.max_tokens + if len(data.output_token_ids) < max_output_len - 1: + # arbitrary value, this could be any token + dummy_token = (540, ) + data.output_token_ids += (dummy_token) + else: + if num_steps == 1: + return [output] + else: + return [] + + result = self._prepare_decode(seq_group_metadata_list, + output=output) + execute_model_kwargs.update({ + "input_ids": + result.input_tokens, + "positions": + result.input_positions, + "attn_metadata": + self.trim_attn_metadata(result.attn_metadata) + }) + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + if num_steps == 1: + return [output] + else: + return [] + return output if type(output) is list else [output] + + def _decode_sampler_outputs(self, model_input): + use_async_out_proc = model_input.async_callback is not None + sampler_outputs = [] + num_outputs = len(self.cached_step_outputs) + for i in range(num_outputs): + next_token_ids = self.cached_step_outputs.pop(0) + next_token_ids = next_token_ids.cpu().tolist() + sampler_output = self._make_decode_output( + next_token_ids, model_input.sampling_metadata.seq_groups) + sampler_outputs.append(sampler_output) + + if i < num_outputs - 1 and use_async_out_proc: + assert model_input.async_callback is not None + ctx = model_input.async_callback.keywords[ # type: ignore + "ctx"] + ctx.append_output( + outputs=[sampler_output], + seq_group_metadata_list=ctx.seq_group_metadata_list, + scheduler_outputs=ctx.scheduler_outputs, + is_async=False, + is_last_step=False, + is_first_step_output=False) + model_input.async_callback() + + if use_async_out_proc: + return [sampler_outputs[-1]] else: - model_event_name = 
'model_executable' - with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( - **execute_model_kwargs, - selected_token_indices=sampling_metadata.selected_token_indices - ) + return sampler_outputs - if self.lora_config: - LoraMask.setLoraMask( - lora_logits_mask.index_select( - 0, sampling_metadata.selected_token_indices)) - - # Compute the logits. - with self.profiler.record_event( - 'internal', ('compute_logits_' - f'{"prompt" if is_prompt else "decode"}_bs' - f'{batch_size}_' - f'seq{seq_len}')): - sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, - sampling_metadata) - htorch.core.mark_step() - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - with self.profiler.record_event( - 'internal', ('sample_' - f'{"prompt" if is_prompt else "decode"}_' - f'bs{batch_size}_' - f'seq{seq_len}')): - output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - output.outputs = output.outputs[:real_batch_size] - htorch.core.mark_step() - - if self.is_driver_worker and self.profiler.enabled: - # Stop recording 'execute_model' event - self.profiler.end() - event_end = self.profiler.get_timestamp_us() - counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end - self.event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - is_prompt=is_prompt) - self.profiler.record_counter(self.event_start, counters) - return [output] + def _make_decode_output( + self, + next_token_ids: List[List[int]], + seq_groups: List[SequenceGroupToSample], + ) -> SamplerOutput: + zero_logprob = Logprob(0.0) + sampler_outputs = [] + batch_idx = 0 + for seq_group in seq_groups: + seq_ids = seq_group.seq_ids + seq_outputs = [] + for seq_id in seq_ids: + next_token_id = next_token_ids[batch_idx][0] + seq_outputs.append( + SequenceOutput(seq_id, next_token_id, + {next_token_id: zero_logprob})) + batch_idx += 1 + sampler_outputs.append( + CompletionSequenceGroupOutput(seq_outputs, None)) + return SamplerOutput(sampler_outputs) def shutdown_inc(self): can_finalize_inc = False diff --git a/vllm/worker/multi_step_hpu_worker.py b/vllm/worker/multi_step_hpu_worker.py new file mode 100644 index 0000000000000..f2791a833c4b7 --- /dev/null +++ b/vllm/worker/multi_step_hpu_worker.py @@ -0,0 +1,116 @@ +import dataclasses +from typing import Dict, Optional, Tuple + +import torch + +from vllm.distributed import broadcast_tensor_dict +from vllm.sequence import ExecuteModelRequest +from vllm.worker.hpu_model_runner import ModelInputForHPU +from vllm.worker.hpu_worker import HPUWorker +from vllm.worker.worker_base import WorkerInput + + +class MultiStepHPUWorker(HPUWorker): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cached_model_input: Optional[ModelInputForHPU] = None + + def _get_driver_input_and_broadcast( + self, execute_model_req: ExecuteModelRequest + ) -> Tuple[ModelInputForHPU, WorkerInput, Dict[str, torch.Tensor]]: + """ + Get the driver input and broadcast it to other workers. 
+ """ + assert self.is_driver_worker + assert execute_model_req.virtual_engine == 0 + + is_first_multi_step = execute_model_req.is_first_multi_step + is_last_step = execute_model_req.is_last_step + + if is_first_multi_step: + # on first step we prepare the worker input and model input normally + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + worker_input = dataclasses.replace( + worker_input, + num_steps=execute_model_req.num_lookahead_slots + 1) + model_input: ModelInputForHPU = ( + self.model_runner.prepare_model_input( + execute_model_req.seq_group_metadata_list, + execute_model_req.virtual_engine, + execute_model_req.finished_requests_ids)) + + if execute_model_req.async_callback: + model_input = dataclasses.replace( + model_input, + async_callback=execute_model_req.async_callback) + else: + # on subsequent steps we reuse the worker input and model input + assert self.cached_model_input is not None + model_input = self.cached_model_input + worker_input = WorkerInput() + + model_input = dataclasses.replace( + model_input, + is_first_multi_step=is_first_multi_step, + is_last_step=is_last_step) + + if self.do_metadata_broadcast: + if is_first_multi_step: + broadcast_data = worker_input.as_broadcastable_tensor_dict() + broadcast_data.update( + model_input.as_broadcastable_tensor_dict()) + broadcast_tensor_dict(broadcast_data, src=0) + else: + broadcast_data = { + "is_first_multi_step": is_first_multi_step, + "is_last_step": is_last_step, + } + broadcast_tensor_dict(broadcast_data, src=0) + + # Returning empty dict here to keep this compatible with + # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast` + return model_input, worker_input, {} + + def prepare_input( + self, + execute_model_req: Optional[ExecuteModelRequest] = None, + ) -> Optional[Tuple[ModelInputForHPU, WorkerInput, Dict[str, + torch.Tensor]]]: + if self.is_driver_worker: + if execute_model_req is None: + if self.do_metadata_broadcast: + # This signals that there's no more requests to process for + # now. All workers are running infinite loop with + # broadcast_tensor_dict, and it stops the loop when the + # driver broadcasts an empty input. Send an empty input to + # notify all other workers to stop their execution loop. + broadcast_tensor_dict({}, src=0) + return None + model_input, worker_input, _ = self._get_driver_input_and_broadcast( + execute_model_req) + if model_input.is_first_multi_step: + self.cached_model_input = model_input + return model_input, worker_input, {} + else: + broadcast_data = broadcast_tensor_dict(src=0) + if not broadcast_data: + return None + + if len(broadcast_data) == 2: + assert self.cached_model_input is not None + self.cached_model_input = dataclasses.replace( + self.cached_model_input, + is_first_multi_step=broadcast_data["is_first_multi_step"], + is_last_step=broadcast_data["is_last_step"]) + empty_worker_input = WorkerInput() + return self.cached_model_input, empty_worker_input, {} + + worker_input = WorkerInput.from_broadcasted_tensor_dict( + broadcast_data) + model_input = ( + self.model_runner. 
+ make_model_input_from_broadcasted_tensor_dict(broadcast_data)) + self.cached_model_input = model_input + return model_input, worker_input, {} From a821717495325662dac466267407bf1819a6c1c8 Mon Sep 17 00:00:00 2001 From: Artur Fierka Date: Wed, 30 Oct 2024 10:58:48 +0100 Subject: [PATCH 318/819] Add fp8 test to jenkins CI (#429) --- .../configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml | 16 ++++++++++++++++ .jenkins/lm-eval-harness/configs/models-fp8.txt | 1 + .../lm-eval-harness/test_lm_eval_correctness.py | 15 +++++++++++++++ .jenkins/test_config.yaml | 5 +++++ 4 files changed, 37 insertions(+) create mode 100644 .jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml create mode 100644 .jenkins/lm-eval-harness/configs/models-fp8.txt diff --git a/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml new file mode 100644 index 0000000000000..80a8c522bc5a0 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Meta-Llama-3.1-8B-Instruct-fp8.yaml @@ -0,0 +1,16 @@ +# FIXME(kzawora): these scores were generated using vLLM on HPU, we need to confirm them on HF +# VLLM_SKIP_WARMUP=true bash run-lm-eval-gsm-cot-llama-vllm-baseline.sh -m "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" -b 128 -l 1319 -f 8 -t 1 +model_name: "/mnt/weka/data/pytorch/llama3.1/Meta-Llama-3.1-8B-Instruct" +tasks: +- name: "gsm8k_cot_llama" + metrics: + - name: "exact_match,strict-match" + value: 0.8317 + - name: "exact_match,flexible-extract" + value: 0.8355 +limit: null +num_fewshot: 8 +dtype: "bfloat16" +fewshot_as_multiturn: true +apply_chat_template: true +fp8: true \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-fp8.txt b/.jenkins/lm-eval-harness/configs/models-fp8.txt new file mode 100644 index 0000000000000..8a318a9ec936d --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-fp8.txt @@ -0,0 +1 @@ +Meta-Llama-3.1-8B-Instruct-fp8.yaml \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py index 421a949ab72e5..3df0621f49a72 100644 --- a/.jenkins/lm-eval-harness/test_lm_eval_correctness.py +++ b/.jenkins/lm-eval-harness/test_lm_eval_correctness.py @@ -27,6 +27,14 @@ TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) +def setup_fp8(model_path, device_type): + flavor = f"g{device_type[-1]}" + normalized_model_name = Path(model_path).parts[-1].lower() + os.environ[ + "QUANT_CONFIG"] = \ + f"/software/data/vllm-benchmarks/inc/{normalized_model_name}/maxabs_quant_{flavor}.json" + + def fail_on_exit(): os._exit(1) @@ -42,6 +50,10 @@ def launch_lm_eval(eval_config): f"max_model_len=4096," \ f"max_num_seqs={max_num_seqs}," \ f"trust_remote_code={trust_remote_code}" + if eval_config.get("fp8"): + model_args += ",quantization=inc," \ + "kv_cache_dtype=fp8_inc," \ + "weights_load_device=cpu" kwargs = {} if 'fewshot_as_multiturn' in eval_config: kwargs['fewshot_as_multiturn'] = eval_config['fewshot_as_multiturn'] @@ -134,6 +146,9 @@ def test_lm_eval_correctness(record_xml_attribute, record_property): f'tp{TP_SIZE}') record_xml_attribute("name", testname) + # Set up environment for FP8 inference + if eval_config.get("fp8"): + setup_fp8(eval_config["model_name"], platform) # Launch eval requests. 
start_time = time.perf_counter() results = launch_lm_eval(eval_config) diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml index f90cdb354d4f5..b32563d6222e9 100644 --- a/.jenkins/test_config.yaml +++ b/.jenkins/test_config.yaml @@ -22,3 +22,8 @@ stages: - name: gsm8k_large_g2_tp4 flavor: g2.m command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-large.txt -t 4 + - name: test_gsm8k_fp8 + steps: + - name: gsm8k_small_g3_tp1_fp8 + flavor: g3 + command: cd .jenkins/lm-eval-harness && bash run-tests.sh -c configs/models-fp8.txt -t 1 From 79dc102b85f937465c42fa44b8fcad477cbe3131 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 30 Oct 2024 12:57:29 +0100 Subject: [PATCH 319/819] Enable FusedSDPA prefill by default (#447) This removers the need to pass VLLM_PROMPT_USE_FUSEDSDPA environment variable in order to enable FusedSDPA attention. Fallback attention can still be used if VLLM_PROMPT_USE_FUSEDSDPA=0 is provided. --- vllm/worker/hpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 559ed33548dea..a92d952c308fd 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -282,7 +282,8 @@ class HpuModelAdapter(): def __init__(self, model, block_size, dtype, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] + '1').lower() in ['1', 'true'] \ + and not is_fake_hpu() self.block_size = block_size self.dtype = dtype if not is_fake_hpu() and not htorch.utils.internal.is_lazy( From 2f7f963e6495286f49c9cf66d40ee5894d7fb46b Mon Sep 17 00:00:00 2001 From: Marceli Fylcek Date: Wed, 30 Oct 2024 13:08:30 +0100 Subject: [PATCH 320/819] Contiguous PA (#433) Contiguous cache fetching to avoid using costly gather operation on Gaudi3. Requires changes in vllm-hpu-extension (https://github.com/HabanaAI/vllm-hpu-extension/pull/17) to work. Introduces redundant calculations in decoding phase. Feature improves the performance of all tested workloads over the entire benchmark (5-12%) on Gaudi3. PR https://github.com/HabanaAI/vllm-fork/pull/426 further improves the performance of this feature (9-22%). Only compatible with v2-block-manager. Feature negatively impacts the performance of Gaudi2. Use VLLM_CONTIGUOUS_PA=true environment variable to enable. 
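For illustration only (not part of this diff), a minimal offline-inference sketch of how the feature could be switched on; the model path below is a placeholder, and the environment variable must be exported before the engine (and hence the HPU model runner, which reads it once in its constructor) is created:

    import os

    # Must be set before engine construction; the HPU model runner reads
    # VLLM_CONTIGUOUS_PA a single time when it is instantiated.
    os.environ["VLLM_CONTIGUOUS_PA"] = "true"

    from vllm import LLM, SamplingParams

    # Placeholder model; contiguous PA targets Gaudi3 and assumes the
    # v2 block manager, which is the default in this fork.
    llm = LLM(model="meta-llama/Llama-2-7b-hf")
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=32))
    print(outputs[0].outputs[0].text)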
--- requirements-hpu.txt | 3 +- vllm/worker/hpu_model_runner.py | 104 ++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 33 deletions(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 20f4dc74a3955..a5c9d1a35260a 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,5 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@341a77f +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@aaba344 + diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index a92d952c308fd..c50e4e244dffe 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -202,10 +202,11 @@ def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, bs_buckets = warmup_range(bs_bucket_config) block_buckets = warmup_range(blocks_bucket_config) bmin, bstep, bmax = blocks_bucket_config - last_bucket = round_up(max_blocks, bstep) + last_bucket = max_blocks for bs in bs_buckets: for blocks in block_buckets: - if blocks > last_bucket: + if blocks >= last_bucket: + buckets.append((bs, last_bucket)) break buckets.append((bs, blocks)) return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) @@ -621,7 +622,8 @@ def __init__( self.bucketing_global_state = HPUBucketingGlobalState() self._setup_buckets() self._set_gc_threshold() - + self.use_contiguous_pa = os.environ.get('VLLM_CONTIGUOUS_PA', + 'false').lower() == 'true' # For multi-step scheduling self.cached_step_outputs: List[torch.Tensor] = [] @@ -1087,39 +1089,78 @@ def _prepare_decode( num_decode_tokens = sum(seq_lens) - blocks_used = [len(bt) for bt in block_tables if bt] - block_list = [] - block_scales = [] - for bt in block_tables: - block_list.extend(bt) - blocks_in_group = len(bt) - if blocks_in_group > 0: - scale = 1.0 / blocks_in_group - block_scales.extend([scale] * blocks_in_group) - - block_mapping_nested: List[List[int]] = [ - [i] * b_u for i, b_u in enumerate(blocks_used) - ] - block_mapping: List[int] = list( - itertools.chain.from_iterable(block_mapping_nested)) + block_mapping: Union[List[Union[None, int]], torch.Tensor] + block_usage: Union[List[Union[None, int]], torch.Tensor] + block_scales: Union[List[Union[None, float]], torch.Tensor] + block_list: Union[List[int], torch.Tensor] + + if self.use_contiguous_pa: + block_list = list(itertools.chain(*block_tables)) + max_idx = max(block_list) + max_blocks = max(max_idx + 1, len(block_list)) + block_bucket_size = find_bucket( + max_blocks, + self.bucketing_global_state.decode_block_bucket_cfg) + block_bucket_size = min(block_bucket_size, + self.cache_config.num_gpu_blocks) + + block_mapping = [None] * block_bucket_size + block_usage = [None] * block_bucket_size + block_scales = [None] * block_bucket_size + + for i, bt in enumerate(block_tables): + if bt: + blocks_in_group = len(bt) + scale = 1.0 / blocks_in_group + for b in bt: + if block_mapping[b] is None: + block_mapping[b] = i + block_usage[b] = self.block_size + block_scales[b] = scale + + block_mapping = [b if b is not None else -1 for b in block_mapping] + block_scales = [b if b is not None else 0.0 for b in block_scales] + + for bt, sl in zip(block_tables, slot_mapping): + if bt: + block_usage[bt[-1]] = sl[-1] % self.block_size + 1 + block_usage = [u if u is not None else 1 for u in block_usage] - last_block = [ - sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) - ] - block_usage = [[self.block_size] * (b_u - 1) + [lb] - 
for b_u, lb in zip(blocks_used, last_block)] - block_usage = list(itertools.chain(*block_usage)) + else: + blocks_used = [len(bt) for bt in block_tables if bt] + block_list = [] + block_scales = [] + for bt in block_tables: + block_list.extend(bt) + blocks_in_group = len(bt) + if blocks_in_group > 0: + scale = 1.0 / blocks_in_group + block_scales.extend([scale] * blocks_in_group) + + block_mapping_nested: List[List[int]] = [ + [i] * b_u for i, b_u in enumerate(blocks_used) + ] + block_mapping = list( + itertools.chain.from_iterable(block_mapping_nested)) + + last_block = [ + sl % self.block_size + 1 + for sl in itertools.chain(*slot_mapping) + ] + block_usage_ = [[self.block_size] * (b_u - 1) + [lb] + for b_u, lb in zip(blocks_used, last_block)] + block_usage = list(itertools.chain(*block_usage_)) + + block_bucket_size = find_bucket( + len(block_list), + self.bucketing_global_state.decode_block_bucket_cfg) + block_mapping = pad_list(block_mapping, block_bucket_size, -1) + block_usage = pad_list(block_usage, block_bucket_size, 1) + block_scales = pad_list(block_scales, block_bucket_size, 0.0) - block_bucket_size = find_bucket( - len(block_list), - self.bucketing_global_state.decode_block_bucket_cfg) block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) block_groups = pad_list(block_mapping, block_bucket_size, len(block_tables)) - block_mapping = pad_list(block_mapping, block_bucket_size, -1) - block_usage = pad_list(block_usage, block_bucket_size, 1) - block_scales = pad_list(block_scales, block_bucket_size, 0.0) - block_list = torch.tensor(block_list, dtype=torch.int, device=self.device) @@ -1132,7 +1173,6 @@ def _prepare_decode( block_usage = torch.tensor(block_usage, dtype=self.model_config.dtype, device=self.device) - slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) From 94858b5af49068330c6bdfeaa89b838244182f63 Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Wed, 30 Oct 2024 14:21:05 +0100 Subject: [PATCH 321/819] Fix default value for FSDPA (#448) --- vllm/attention/backends/hpu_attn.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 2e987b039c220..8f16081e2e2b5 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -16,6 +16,7 @@ from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, HPUPagedAttentionMetadata) from vllm.logger import init_logger +from vllm.utils import is_fake_hpu logger = init_logger(__name__) @@ -120,9 +121,10 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] - if self.prefill_usefusedsdpa: + self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '1').lower() in ['1', 'true'] \ + and not is_fake_hpu() + if self.prefill_use_fusedsdpa: assert alibi_slopes is None, \ 'Prefill with FusedSDPA not supported with alibi slopes!' 
@@ -188,7 +190,7 @@ def forward( kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, self.head_size) if attn_metadata is None or attn_metadata.block_list is None: - if not self.prefill_usefusedsdpa: + if not self.prefill_use_fusedsdpa: # TODO: move this outside of model assert attn_metadata.attn_bias is not None, \ 'attn_bias must be set before calling model.forward' From d3257b21c240e63a7075e8f9abe39a77cba4c3cc Mon Sep 17 00:00:00 2001 From: Karol Damaszke Date: Wed, 30 Oct 2024 16:58:19 +0100 Subject: [PATCH 322/819] Fix performance of top_p and top_k calculations (#449) This change is fixing the performance issue I have introduced in the PR #414 -- due to the usage of `torch.where` both functions have been called. Now we will run only the selected one. --- vllm/model_executor/layers/sampler.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 74c0416e4b379..1b6bc2b1848c1 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -267,12 +267,13 @@ def forward( if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: # If we have a scalar p and k, we can use the optimized version. - logits = torch.where( - self._scalar_p_and_k, - self._apply_top_k_top_p_opt(logits, self._top_p_scalar, - self._top_k_scalar), - _apply_top_k_top_p(logits, sampling_tensors.top_ps, - sampling_tensors.top_ks)) + if self._scalar_p_and_k.any(): + logits = self._apply_top_k_top_p_opt(logits, + self._top_p_scalar.item(), + self._top_k_scalar.item()) + else: + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) From d42c2a245eb9819274a6d01b493ad4c9c01f824c Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 31 Oct 2024 02:27:47 -0700 Subject: [PATCH 323/819] Reduce block fragmentation (#426) Change `NaiveBlockAllocator` to use a priority queue so that we always allocate the lowest block id first. This further increases the performance of contiguous paged attention. - [ ] Add an option or env variable to enable/disable this behavior. 
(Not sure if this is necessary) --------- Co-authored-by: Yang Wang --- vllm/core/block/naive_block.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 9341a518d11c6..25696d30b14ba 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,4 +1,4 @@ -from collections import deque +import heapq from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, @@ -36,7 +36,8 @@ def __init__( if block_ids is None: block_ids = range(num_blocks) - self._free_block_indices: Deque[BlockId] = deque(block_ids) + self._free_block_indices: Deque[BlockId] = block_ids[:] + heapq.heapify(self._free_block_indices) self._all_block_indices = frozenset(block_ids) assert len(self._all_block_indices) == num_blocks @@ -129,7 +130,7 @@ def _allocate_block_id(self) -> BlockId: if not self._free_block_indices: raise BlockAllocator.NoFreeBlocksError() - block_id = self._free_block_indices.popleft() + block_id = heapq.heappop(self._free_block_indices) self._refcounter.incr(block_id) return block_id @@ -139,7 +140,7 @@ def _free_block_id(self, block: Block) -> None: refcount = self._refcounter.decr(block_id) if refcount == 0: - self._free_block_indices.appendleft(block_id) + heapq.heappush(self._free_block_indices, block_id) block.block_id = None From 0cc72b914c0e0f4210fea09bd91582544ed3fba7 Mon Sep 17 00:00:00 2001 From: Sanju C Sudhakaran Date: Mon, 4 Nov 2024 07:04:31 +0200 Subject: [PATCH 324/819] Enable HPUGraphs for lora long-contexts tests --- tests/lora/test_long_context_hpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/lora/test_long_context_hpu.py b/tests/lora/test_long_context_hpu.py index 33250edde00d3..3c3e1b7c1e41c 100644 --- a/tests/lora/test_long_context_hpu.py +++ b/tests/lora/test_long_context_hpu.py @@ -116,7 +116,6 @@ def lora_llm(long_context_infos): long_lora_scaling_factors=tuple(scaling_factors), max_num_batched_tokens=4096 * 8, tensor_parallel_size=1, - enforce_eager=True, # TODO Remove after SW-205153 is fixed dtype="bfloat16", disable_async_output_proc=True, # TODO Remove after SW-204469 is fixed. 
distributed_executor_backend="mp") From 24ba4d41eaffb1bff5e9ec1eaec317ae55f6cdd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Kot=C5=82owski?= Date: Mon, 4 Nov 2024 09:43:06 +0100 Subject: [PATCH 325/819] [CI] Add Llama2 to torch compile tests (#446) --- .../lm-eval-harness/configs/Llama-2-7B-hf.yaml | 14 ++++++++++++++ .jenkins/lm-eval-harness/configs/models-llama2.txt | 1 + .jenkins/test_config_t_compile.yaml | 12 ++++++++++++ 3 files changed, 27 insertions(+) create mode 100644 .jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml create mode 100644 .jenkins/lm-eval-harness/configs/models-llama2.txt diff --git a/.jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml b/.jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml new file mode 100644 index 0000000000000..da048ba19305f --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/Llama-2-7B-hf.yaml @@ -0,0 +1,14 @@ +# These scores were chosen to place within 6% range of values achieved using vLLM on HPU: +# 0.148 - 0.164 +# where on https://www.llama.com/llama2/: 0.146 is given +model_name: "/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.155 + - name: "exact_match,flexible-extract" + value: 0.155 +limit: 250 +num_fewshot: 5 +dtype: "bfloat16" \ No newline at end of file diff --git a/.jenkins/lm-eval-harness/configs/models-llama2.txt b/.jenkins/lm-eval-harness/configs/models-llama2.txt new file mode 100644 index 0000000000000..7ae5af4cce4d3 --- /dev/null +++ b/.jenkins/lm-eval-harness/configs/models-llama2.txt @@ -0,0 +1 @@ +Llama-2-7B-hf.yaml \ No newline at end of file diff --git a/.jenkins/test_config_t_compile.yaml b/.jenkins/test_config_t_compile.yaml index 58fcb45a7edfb..da20c3486aa86 100644 --- a/.jenkins/test_config_t_compile.yaml +++ b/.jenkins/test_config_t_compile.yaml @@ -14,3 +14,15 @@ stages: - name: gsm8k_small_g2_tp2_tc flavor: g2.s command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-small.txt -t 2 + - name: gsm8k_llama2_g3_tp1_tc + flavor: g3 + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-llama2.txt -t 1 + - name: gsm8k_llama2_g3_tp2_tc + flavor: g3.s + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-llama2.txt -t 2 + - name: gsm8k_lama2_g2_tp1_tc + flavor: g2 + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-llama2.txt -t 1 + - name: gsm8k_lama2_g2_tp2_tc + flavor: g2.s + command: cd .jenkins/lm-eval-harness && PT_HPU_LAZY_MODE=0 bash run-tests.sh -c configs/models-llama2.txt -t 2 \ No newline at end of file From ac12d535811bbd10055d0a25de50841faec221e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Durejko?= Date: Tue, 5 Nov 2024 09:30:27 +0100 Subject: [PATCH 326/819] Fix SchedulerConfig params (#459) max_num_prefill_seqs parameter is used only when use_padding_aware_scheduling is True. use_padding_aware_scheduling default value is False, so max_num_prefill_seqs shouldn't be required to pass each time SchedulerConfig is initialized. Dozens of tests in tests/core are failing due to these parameters issue. --- vllm/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 67a4ec0761cc3..68957771fa7e4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -940,9 +940,6 @@ class SchedulerConfig: a single iteration. 
max_num_seqs: Maximum number of sequences to be processed in a single iteration. - max_num_prefill_seqs: Maximum number of prefill sequences to be - processed in a single iteration. Used only with padding-aware - scheduling. max_model_len: Maximum length of a sequence (including prompt and generated text). use_v2_block_manager: Whether to use the BlockSpaceManagerV2 or not. @@ -966,6 +963,9 @@ class SchedulerConfig: when SPMD worker architecture is enabled. I.e., VLLM_USE_RAY_SPMD_WORKER=1 policy: The scheduling policy to use. "fcfs" (default) or "priority". + max_num_prefill_seqs: Maximum number of prefill sequences to be + processed in a single iteration. Used only with padding-aware + scheduling. use_padding_aware_scheduling: If True, scheduler will consider padded tokens in prefill. """ @@ -973,7 +973,6 @@ class SchedulerConfig: def __init__(self, max_num_batched_tokens: Optional[int], max_num_seqs: int, - max_num_prefill_seqs: Optional[int], max_model_len: int, use_v2_block_manager: bool = True, num_lookahead_slots: int = 0, @@ -986,6 +985,7 @@ def __init__(self, multi_step_stream_outputs: bool = False, send_delta_data: bool = False, policy: str = "fcfs", + max_num_prefill_seqs: Optional[int] = None, use_padding_aware_scheduling=False) -> None: if max_num_batched_tokens is None: if enable_chunked_prefill: From 653e56c387a1435fa16ed099dbace5bc852abbfc Mon Sep 17 00:00:00 2001 From: Tomasz Zielinski <85164140+tzielinski-habana@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:02:11 +0100 Subject: [PATCH 327/819] Tensor parallelism for multi-step scheduling (#457) This PR implements tensor parallelism for multi-step scheduling. --- vllm/worker/hpu_model_runner.py | 36 +++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index c50e4e244dffe..fec5f3d01cff8 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -29,6 +29,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) +from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.lora.layers import LoRAMapping @@ -97,7 +98,10 @@ def subtuple(obj: object, if to_override is None: to_override = {} fields = set(to_copy) | set(to_override.keys()) - values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if type(obj) is dict: + values = {key: obj[key] for key in fields if key in obj} + else: + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} if typename not in _TYPE_CACHE: _TYPE_CACHE[typename] = collections.namedtuple(typename, ' '.join(fields)) @@ -2049,7 +2053,9 @@ def execute_model( # not first or last multi-step return [] # last multi-step - output = self._decode_sampler_outputs(model_input) + output = self._decode_sampler_outputs( + model_input) if self.is_driver_worker else [] + torch.hpu.synchronize() if model_input.is_first_multi_step: # first multi-step if self.lora_config: @@ -2110,6 +2116,20 @@ def execute_model( sampling_metadata.skip_sampler_cpu_output = True self.model.model.sampler.include_gpu_probs_tensor = True for i in range(num_steps): + if i != 0 and not self.is_driver_worker: + broadcast_data = broadcast_tensor_dict(src=0) + if 'early_exit' in broadcast_data and broadcast_data[ + 'early_exit']: + return [output] if num_steps == 1 else [] + 
execute_model_kwargs.update({ + "input_ids": + broadcast_data["input_ids"], + "positions": + broadcast_data["positions"], + "attn_metadata": + self.trim_attn_metadata( + broadcast_data["attn_metadata"]) + }) with self.profiler.record_event('internal', model_event_name): hidden_states = self.model.forward( **execute_model_kwargs, @@ -2135,7 +2155,7 @@ def execute_model( htorch.core.mark_step() # Only perform sampling in the driver worker. if not self.is_driver_worker: - return [] + continue if model_input.async_callback is not None: model_input.async_callback() @@ -2170,6 +2190,8 @@ def execute_model( dummy_token = (540, ) data.output_token_ids += (dummy_token) else: + broadcast_tensor_dict({'early_exit': True}, + src=0) if num_steps == 1: return [output] else: @@ -2185,6 +2207,12 @@ def execute_model( "attn_metadata": self.trim_attn_metadata(result.attn_metadata) }) + model_kwargs_broadcast_data = { + "input_ids": result.input_tokens, + "positions": result.input_positions, + "attn_metadata": vars(result.attn_metadata) + } + broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) if self.is_driver_worker and self.profiler.enabled: # Stop recording 'execute_model' event @@ -2199,7 +2227,7 @@ def execute_model( is_prompt=is_prompt) self.profiler.record_counter(self.event_start, counters) if num_steps == 1: - return [output] + return [output] if self.is_driver_worker else [] else: return [] return output if type(output) is list else [output] From 1033c3ebc58dd08b8eeb4f6d001f8d76511398fd Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Tue, 5 Nov 2024 13:10:05 +0100 Subject: [PATCH 328/819] Set tokenizers version to <0.20.2 (#460) 0.20.2 had some changes that break lm_eval API --- .jenkins/requirements-test-hpu.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.jenkins/requirements-test-hpu.txt b/.jenkins/requirements-test-hpu.txt index e0710d3775957..523eb0d39d145 100644 --- a/.jenkins/requirements-test-hpu.txt +++ b/.jenkins/requirements-test-hpu.txt @@ -1,2 +1,3 @@ lm_eval -pytest \ No newline at end of file +pytest +tokenizers<0.20.2 From d397ba57868714bb767465a72bd706318b09205c Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 5 Nov 2024 15:12:06 +0200 Subject: [PATCH 329/819] fix hpu execution --- vllm/distributed/parallel_state.py | 4 +++ vllm/executor/hpu_executor.py | 8 +---- vllm/worker/hpu_model_runner.py | 50 +++++++++--------------------- vllm/worker/hpu_worker.py | 37 ++++------------------ 4 files changed, 26 insertions(+), 73 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index d3ccd49797068..efa3525910a5e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -370,6 +370,10 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # TPU handles Dynamo with its own logic. 
return self.tpu_communicator.all_reduce(input_) + if self.hpu_communicator is not None and \ + not self.hpu_communicator.disabled: + return self.hpu_communicator.all_reduce(input_) + if self.ca_comm is not None and \ not self.ca_comm.disabled and \ self.ca_comm.should_custom_ar(input_): diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py index dc8a7b4b1d1c8..21dac2be598c4 100644 --- a/vllm/executor/hpu_executor.py +++ b/vllm/executor/hpu_executor.py @@ -37,16 +37,10 @@ def _get_worker_kwargs( distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, is_driver_worker=rank == 0, ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index fb76447cce0d0..e9f5b755dca91 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -29,7 +29,7 @@ from vllm.attention.backends.hpu_attn import HPUAttentionBackend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) + PromptAdapterConfig, SchedulerConfig, VllmConfig) from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger @@ -559,38 +559,21 @@ class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - kv_cache_dtype: Optional[str] = "auto", + vllm_config: VllmConfig, is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config + ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.is_driver_worker = is_driver_worker - self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states - self.observability_config = observability_config - self.sliding_window = (model_config.get_sliding_window() - if model_config is not None else None) - self.device_config = (device_config - if device_config is not None else DeviceConfig()) + self.sliding_window = (self.model_config.get_sliding_window() + if self.model_config is not None else None) + self.device_config = (self.device_config + if self.device_config is not None else DeviceConfig()) if is_fake_hpu(): - device_config.device = torch.device('cpu') - device_config.device_type = 'cpu' + self.device_config.device = torch.device('cpu') + self.device_config.device_type = 'cpu' self.device = self.device_config.device self.enforce_eager = self.model_config.enforce_eager self.max_num_seqs = self.scheduler_config.max_num_seqs @@ -600,10 +583,10 @@ def __init__( self.max_model_len = 
self.scheduler_config.max_model_len self.max_num_batched_tokens = \ self.scheduler_config.max_num_batched_tokens - self.block_size = cache_config.block_size + self.block_size = self.cache_config.block_size self.pin_memory = is_pin_memory_available() - self.kv_cache_dtype = kv_cache_dtype + self.kv_cache_dtype = self.cache_config.cache_dtype self.attn_backend = get_attn_backend( self.model_config.get_head_size(), @@ -665,13 +648,7 @@ def load_model(self) -> None: htcore.hpu_set_env() with HabanaMemoryProfiler() as m: with HabanaMemoryProfiler() as m_getmodel: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) msg = ("Pre-loading model weights on " f"{next(self.model.parameters()).device} " f"took {m_getmodel.get_summary_string()}") @@ -978,6 +955,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) + block_indices, block_offsets = precompute_indices_and_offsets( self.block_size, slot_mapping, True) attn_metadata = self.attn_backend.make_metadata( @@ -996,6 +974,7 @@ def _prepare_prompt( num_prefill_tokens=sum_query_len, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None # FIXME(kzawora): mutli-modality will not work here ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) @@ -1204,6 +1183,7 @@ def _prepare_decode( num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None ) return PrepareDecodeMetadata(input_tokens=input_tokens, input_positions=input_positions, diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 8badc5f6bdb43..d61243a511b90 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -15,7 +15,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) + SpeculativeConfig, VllmConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -27,7 +27,7 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase -from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput +from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerBase, WorkerInput logger = init_logger(__name__) @@ -42,34 +42,18 @@ class HPUWorker(LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, model_runner_cls: Optional[Type[ModelRunnerBase]] = None, - observability_config: Optional[ObservabilityConfig] = None, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config=vllm_config) 
self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.load_config = load_config - self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -80,17 +64,8 @@ def __init__( init_cached_hf_modules() self.model_runner: HPUModelRunner = HPUModelRunner( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=load_config, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - observability_config=observability_config) + vllm_config = vllm_config, + is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[HPUCacheEngine] From 4c0647f9a227d604c526ab8620d21331b4e700f1 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 5 Nov 2024 15:21:24 +0200 Subject: [PATCH 330/819] format.sh --- vllm/engine/multiprocessing/engine.py | 1 + .../guided_decoding/outlines_logits_processors.py | 2 +- vllm/worker/hpu_model_runner.py | 15 ++++++--------- vllm/worker/hpu_worker.py | 11 ++++------- vllm/worker/model_runner_base.py | 2 +- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 2271827aad749..5369aaff756f4 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -31,6 +31,7 @@ logger = init_logger(__name__) +POLLING_TIMEOUT_MS = 10000 HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), ) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index f23b1a051f480..e1b7c11eb00a6 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -15,11 +15,11 @@ # limitations under the License. 
import copy import json +import math from collections import defaultdict from functools import lru_cache from typing import Callable, DefaultDict, Dict, List, Union -import numpy as np import torch from lark import Lark from outlines import grammars diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index e9f5b755dca91..4ba1cc5b96de3 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -27,9 +27,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.hpu_attn import HPUAttentionBackend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, VllmConfig) +from vllm.config import DeviceConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger @@ -569,8 +567,8 @@ def __init__( self.sliding_window = (self.model_config.get_sliding_window() if self.model_config is not None else None) - self.device_config = (self.device_config - if self.device_config is not None else DeviceConfig()) + self.device_config = (self.device_config if self.device_config + is not None else DeviceConfig()) if is_fake_hpu(): self.device_config.device = torch.device('cpu') self.device_config.device_type = 'cpu' @@ -955,7 +953,6 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - block_indices, block_offsets = precompute_indices_and_offsets( self.block_size, slot_mapping, True) attn_metadata = self.attn_backend.make_metadata( @@ -974,7 +971,8 @@ def _prepare_prompt( num_prefill_tokens=sum_query_len, num_decode_tokens=0, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None # FIXME(kzawora): mutli-modality will not work here + multi_modal_placeholder_index_maps= + None # FIXME(kzawora): mutli-modality will not work here ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) @@ -1183,8 +1181,7 @@ def _prepare_decode( num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None - ) + multi_modal_placeholder_index_maps=None) return PrepareDecodeMetadata(input_tokens=input_tokens, input_positions=input_positions, attn_metadata=attn_metadata, diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index d61243a511b90..4b73a54c283a8 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -12,10 +12,7 @@ from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, VllmConfig) +from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -27,7 +24,8 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.hpu_model_runner import HPUModelRunner from vllm.worker.model_runner_base import ModelRunnerBase -from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerBase, WorkerInput +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -64,8 +62,7 @@ def __init__( init_cached_hf_modules() self.model_runner: HPUModelRunner = HPUModelRunner( - 
vllm_config = vllm_config, - is_driver_worker=is_driver_worker) + vllm_config=vllm_config, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[HPUCacheEngine] diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index fbcd69918d8d0..9e529f86b46bb 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -4,7 +4,7 @@ from datetime import datetime from functools import wraps from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, - Optional, Type, TypeVar, Union, get_args, get_origin) + Optional, Type, TypeVar) import torch from torch import is_tensor From c41788fbba0dcbbd12448db2e6a9812f6f17c5e2 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Tue, 5 Nov 2024 15:28:30 +0200 Subject: [PATCH 331/819] fix type checks --- vllm/core/block/naive_block.py | 5 +++-- .../guided_decoding/outlines_logits_processors.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 25696d30b14ba..f6ef4cbae7627 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,5 +1,5 @@ import heapq -from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple +from typing import FrozenSet, Iterable, List, Optional, Tuple from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) @@ -36,7 +36,8 @@ def __init__( if block_ids is None: block_ids = range(num_blocks) - self._free_block_indices: Deque[BlockId] = block_ids[:] + self._free_block_indices: List[ + BlockId] = block_ids[:] # type: ignore[index] heapq.heapify(self._free_block_indices) self._all_block_indices = frozenset(block_ids) assert len(self._all_block_indices) == num_blocks diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index e1b7c11eb00a6..b096c59c1ba6a 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -18,7 +18,7 @@ import math from collections import defaultdict from functools import lru_cache -from typing import Callable, DefaultDict, Dict, List, Union +from typing import Any, Callable, DefaultDict, Dict, List, Union import torch from lark import Lark @@ -33,7 +33,7 @@ # Unfortunately we cannot use lru_cache as it breaks pickling # so we use a simpler implementation def _cached(fn): - cache = {} + cache: Dict[Any, Any] = {} def cached_fn(*args): if args in cache: From c3c0e90282cb7a86df9fc6aa597eb832701babe5 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Wed, 6 Nov 2024 03:45:14 -0600 Subject: [PATCH 332/819] [BugFix][Habana_main][Multistep]Fix multistep deepcopy overhead (#452) --- vllm/worker/hpu_model_runner.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index fec5f3d01cff8..e6015d6844b62 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2115,6 +2115,19 @@ def execute_model( # we only want to pythonize in the last step sampling_metadata.skip_sampler_cpu_output = True self.model.model.sampler.include_gpu_probs_tensor = True + cache_orig_output_tokens_len: List[Dict] = [] + + def try_revert_dummy_output_tokens(): + if len(cache_orig_output_tokens_len) > 0: + # Reuse the original 
output token ids length + for i, seq_group_metadata in enumerate( + seq_group_metadata_list): + for j, data in seq_group_metadata.seq_data.items(): + orig_output_tokens_len = \ + cache_orig_output_tokens_len[i][j] + data.output_token_ids = \ + data.output_token_ids[:orig_output_tokens_len] + for i in range(num_steps): if i != 0 and not self.is_driver_worker: broadcast_data = broadcast_tensor_dict(src=0) @@ -2175,17 +2188,22 @@ def execute_model( htorch.core.mark_step() if i < num_steps - 1: if i == 0: - import copy ctx = model_input.async_callback.keywords[ # type: ignore "ctx"] seq_group_metadata_list = ctx.seq_group_metadata_list - seq_group_metadata_list = copy.deepcopy( - seq_group_metadata_list) + # Cache the original output token ids + for i, seq_group_metadata in enumerate( + seq_group_metadata_list): + cache_orig_output_tokens_len.append({}) + for j, data in seq_group_metadata.seq_data.items(): + cache_orig_output_tokens_len[i][j] = \ + len(data.output_token_ids) for seq_group_metadata in seq_group_metadata_list: for data in seq_group_metadata.seq_data.values(): max_output_len = sampling_metadata.seq_groups[ 0].sampling_params.max_tokens if len(data.output_token_ids) < max_output_len - 1: + # add a place holder for prepare_decode # arbitrary value, this could be any token dummy_token = (540, ) data.output_token_ids += (dummy_token) @@ -2195,6 +2213,7 @@ def execute_model( if num_steps == 1: return [output] else: + try_revert_dummy_output_tokens() return [] result = self._prepare_decode(seq_group_metadata_list, @@ -2213,6 +2232,8 @@ def execute_model( "attn_metadata": vars(result.attn_metadata) } broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) + else: + try_revert_dummy_output_tokens() if self.is_driver_worker and self.profiler.enabled: # Stop recording 'execute_model' event From 2003cc35135319b240230e686f26f13524403ee0 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Nov 2024 17:49:19 +0800 Subject: [PATCH 333/819] [Model][LoRA]LoRA support added for LlamaEmbeddingModel (#10071) Signed-off-by: Jee Jee Li --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/llama.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 55835d945b00c..87f45cf695c8d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -333,7 +333,7 @@ Text Embedding * - :code:`MistralModel` - Mistral-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - + - ✅ - ✅ .. important:: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6c0a8b5ef8451..d768a57b7ef8a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -627,7 +627,7 @@ def permute(w: torch.Tensor, n_heads: int): return name, loaded_weight -class LlamaEmbeddingModel(nn.Module, SupportsPP): +class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): """ A model that uses Llama with additional embedding functionalities. @@ -638,6 +638,19 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP): model: An instance of LlamaModel used for forward operations. _pooler: An instance of Pooler used for pooling operations. 
""" + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + } + embedding_padding_modules = [] def __init__( self, @@ -679,3 +692,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_kv_cache_scales(self, quantization_param_path: str) -> None: self.model.load_kv_cache_scales(quantization_param_path) + + # LRUCacheWorkerLoRAManager instantiation requires model config. + @property + def config(self): + return self.model.config From dc5cdfbb81175f5760419adb1a17592ce8b03cfc Mon Sep 17 00:00:00 2001 From: Michal Adamczyk Date: Wed, 6 Nov 2024 11:23:59 +0100 Subject: [PATCH 334/819] Set vllm-hpu-extension to 0063520 (#455) --- requirements-hpu.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-hpu.txt b/requirements-hpu.txt index a5c9d1a35260a..b9eb1b3ae07fa 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,5 +8,5 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@aaba344 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0063520 From a5bba7d234b4e0d82e6a64de82a8497760ed44cf Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Nov 2024 19:41:17 +0800 Subject: [PATCH 335/819] [Model] Add Idefics3 support (#9767) Signed-off-by: Jee Jee Li Signed-off-by: B-201 Co-authored-by: B-201 --- docs/source/models/supported_models.rst | 6 + examples/offline_inference_vision_language.py | 17 + ...e_inference_vision_language_multi_image.py | 25 + .../vision_language/test_models.py | 16 + vllm/entrypoints/chat_utils.py | 2 + .../models/idefics2_vision_model.py | 25 +- vllm/model_executor/models/idefics3.py | 632 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 8 files changed, 723 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/idefics3.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 87f45cf695c8d..cdcea70c6cb7d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -446,6 +446,12 @@ Text Generation - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - ✅ + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. 
+ - + - * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4fd002caf1763..8d17ce3754515 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -377,6 +377,22 @@ def run_glm4v(question: str, modality: str): return llm, prompt, stop_token_ids +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM(model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -397,6 +413,7 @@ def run_glm4v(question: str, modality: str): "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, + "idefics3": run_idefics3, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index d99684078ff3d..7e883568995a4 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -290,6 +290,30 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + model_example_map = { "phi3_v": load_phi3v, "h2ovl_chat": load_h2onvl, @@ -298,6 +322,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: "qwen2_vl": load_qwen2_vl, "qwen_vl_chat": load_qwenvl_chat, "mllama": load_mllama, + "idefics3": load_idefics3, } diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index cfd2d61f2b633..3dbfaafb781af 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -327,6 +327,22 @@ vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), + "idefics3": VLMTestInfo( + models=["HuggingFaceM4/Idefics3-8B-Llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + marks=[ + pytest.mark.skipif( + transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0" + ), + large_gpu_mark(min_gb=48), + ], + ), ### Tensor parallel / multi-gpu broadcast tests "broadcast-chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 
0ada0aaacda24..ed4e4399d5514 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -187,6 +187,8 @@ def _placeholder_str(self, modality: ModalityStr, return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" + if model_type == "idefics3": + return "" raise TypeError(f"Unknown {modality} model type: {model_type}") elif modality == "audio": diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 53869b8fa6bd8..b21bc2a3f9ce1 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -15,7 +15,7 @@ # limitations under the License. """PyTorch Idefics2 model.""" -from typing import Optional +from typing import Iterable, Optional, Tuple import torch from torch import nn @@ -29,6 +29,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader class Idefics2VisionEmbeddings(nn.Module): @@ -329,3 +330,25 @@ def forward( encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py new file mode 100644 index 0000000000000..e4c98f22fb16f --- /dev/null +++ b/vllm/model_executor/models/idefics3.py @@ -0,0 +1,632 @@ +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Idefics3 model compatible with HuggingFace weights.""" + +import math +from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, + TypedDict, Union) + +import torch +import torch.utils.checkpoint +from PIL import Image +from torch import nn +# Temporary solution for transformers below 4.46.0. 
+from transformers import PretrainedConfig as Idefics3Config + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal.image import cached_get_image_processor +from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of + +# yapf: disable +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable +from .interfaces import SupportsMultiModal +from .llama import LlamaModel +from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings + +logger = init_logger(__name__) + + +class Idefics3ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + """ + rows: List[int] + cols: List[int] + pixel_attention_mask: Optional[torch.BoolTensor] + + +class Idefics3ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] + + +def input_mapper_for_idefics3( + ctx: InputContext, + data: object, +): + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + + if isinstance(data, Image.Image): + images = [[data]] + elif is_list_of(data, Image.Image): + images = [data] + else: + raise TypeError(f"Invalid image type: {type(data)}") + + try: + batch_data = image_processor(images, + return_tensors="pt", + return_row_col_info=True).data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + + return MultiModalInputs(batch_data) + + +def _resize_output_size(height: int, + width: int, + max_len: Optional[int] = None, + min_len: Optional[int] = 1, + max_size: Optional[int] = None) -> Tuple[int, int]: + # Set default value for max_len if not provided + max_len = max(height, width) if max_len is None else max_len + aspect_ratio = width / height + + # Handle the maximum size constraint + if max_size is not None: + max_len = min(max_len, max_size) + + # Adjust dimensions according to the aspect ratio + if width >= height: + width = max_len + height = int(width / aspect_ratio) + else: + height = max_len + width = int(height * aspect_ratio) + + # Ensure both width and height are even (if needed) + height += 1 if height % 2 != 0 else 0 + width += 1 if width % 2 != 0 else 0 + + # Ensure dimensions are not smaller than the minimum length + height = max(height, min_len) + width = max(width, min_len) + + return height, width + + +def _get_resize_output_image_size( + image_size: Tuple[int, int], + resolution_max_side: int, + max_image_size: int = 1820, +) -> Tuple[int, int]: + if resolution_max_side > max_image_size: + raise ValueError( + "`resolution_max_side` cannot be larger than `max_image_size`") + + height, width = image_size + + # Find the output size, when rescaling the longest edge to max_len and + # preserving the aspect ratio + height, width = _resize_output_size(height, + width, + max_len=resolution_max_side) + + return height, width + + +def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int, + fake_token_around_image: str, image_token: str, + global_img_token: str) -> str: + """ + Prompt with expanded image tokens for when the image is split + into patches. 
+ """ + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += (fake_token_around_image + + f"" + + image_token * image_seq_len) + text_split_images += "\n" + + text_split_images += "\n" + _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token) + return text_split_images + + +def _prompt_single_image(image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + """Prompt with expanded image tokens for a single image.""" + return (fake_token_around_image + global_img_token + + image_token * image_seq_len + fake_token_around_image) + + +def _get_image_prompt_string(image_rows: int, image_cols: int, + image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + if image_rows == 0 and image_cols == 0: + return _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token, + ) + return _prompt_split_image(image_seq_len, image_rows, image_cols, + fake_token_around_image, image_token, + global_img_token) + + +def input_processor_for_idefics3(ctx: InputContext, inputs: DecoderOnlyInputs): + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + model_config = ctx.model_config + processor = cached_get_processor(model_config.model) + image_processor = processor.image_processor + tokenizer = processor.tokenizer + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_list = [image_data] + elif is_list_of(image_data, Image.Image): + image_list = image_data + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_rows = [] + image_cols = [] + for image in image_list: + height, width = _get_resize_output_image_size(image.size, size) + + rows = math.ceil(height / max_image_size) + cols = math.ceil(width / max_image_size) + image_rows.append(rows) + image_cols.append(cols) + image_rows = [image_rows] + image_cols = [image_cols] + + n_images_in_text = [] + + text = inputs.get("prompt") + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. 
Please provide a string, " + "or a list of strings") + + fake_image_token = processor.fake_image_token.content + image_token = processor.image_token.content + global_img_token = processor.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, + image_cols): + n_images_in_text.append(sample.count(image_token)) + + # Replace the image token with fake tokens around the expanded + # image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = _get_image_prompt_string( + n_rows, + n_cols, + processor.image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError( + "The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) + + prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + + return token_inputs( + prompt_token_ids=prompt_token_ids, + prompt=prompt_strings[0], + multi_modal_data=multi_modal_data, + ) + + +def get_max_idefics3_image_tokens(ctx: InputContext, + *, + num_crops: Optional[int] = None): + model_config = ctx.model_config + processor = cached_get_processor(model_config.model) + image_seq_len = processor.image_seq_len + image_processor = processor.image_processor + + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + resized_height, resized_width = size, size + + grid_h = resized_height // max_image_size + grid_w = resized_width // max_image_size + + return (grid_h * grid_w + 1) * image_seq_len + + +def dummy_data_for_idefics3(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]) -> DummyData: + hf_config = ctx.get_hf_config() + num_images = mm_counts["image"] + + processor = cached_get_processor(ctx.model_config.model) + image_seq_len = processor.image_seq_len + max_llm_image_tokens = 17 * image_seq_len * num_images + + seq_data = SequenceData.from_prompt_token_counts( + (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) + + width = height = hf_config.vision_config.image_size + image = Image.new("RGB", (width, height), color=0) + mm_data = {"image": [image] if num_images == 1 else [image] * num_images} + + return DummyData(seq_data, mm_data) + + +class Idefics3SimpleMLP(nn.Module): + + def __init__(self, config): + super().__init__() + input_size = config.vision_config.hidden_size * (config.scale_factor** + 2) + output_size = config.text_config.hidden_size + self.proj = ReplicatedLinear(input_size, output_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out, _ = self.proj(x) + return out + + +class Idefics3Connector(nn.Module): + + def __init__(self, config): + super().__init__() + self.scale_factor = config.scale_factor + self.modality_projection = Idefics3SimpleMLP(config) + + def pixel_shuffle(self, + x: torch.Tensor, + scale_factor: int = 2) -> torch.Tensor: + bsz, seq, embed_dim = x.size() + height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) + x = x.view(bsz, height, int(width / scale_factor), + embed_dim * scale_factor) + x = x.permute(0, 2, 1, 3) + x = 
x.reshape( + bsz, + int(width / scale_factor), + int(height / scale_factor), + embed_dim * (scale_factor**2), + ) + x = x.permute(0, 2, 1, 3) + x = x.reshape(bsz, int(seq / (scale_factor**2)), + embed_dim * (scale_factor**2)) + return x + + def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor: + image_hidden_states = self.pixel_shuffle(image_hidden_states, + self.scale_factor) + image_hidden_states = self.modality_projection(image_hidden_states) + return image_hidden_states + + +class Idefics3Model(nn.Module): + + def __init__( + self, + config: Idefics3Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.padding_idx = self.config.text_config.pad_token_id + self.vocab_size = self.config.text_config.vocab_size + + self.vision_model = Idefics3VisionTransformer(config.vision_config, + quant_config) + self.connector = Idefics3Connector(config) + self.text_model = LlamaModel(config.text_config, cache_config, + quant_config) + + self.image_seq_len = int( + ((config.vision_config.image_size // + config.vision_config.patch_size)**2) / (config.scale_factor**2)) + self.image_token_id = self.config.image_token_id + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + rows = kwargs.pop("rows", None) + cols = kwargs.pop("cols", None) + pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) + + if pixel_values is None and image_embeds is None: + return None + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return Idefics3ImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + return Idefics3ImagePixelInputs(type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, + concat=True)), + rows=rows, + cols=cols, + pixel_attention_mask=flatten_bn( + pixel_attention_mask, + concat=True)) + + raise AssertionError("This line should be unreachable.") + + def _image_pixels_to_features( + self, + pixel_values: torch.Tensor, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + batch_size, num_images, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.to( + dtype=self.vision_model.embeddings.patch_embedding.weight.dtype + ) # fp16 compatibility + pixel_values = pixel_values.view(batch_size * num_images, + *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. + nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3)) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=(pixel_values.size(0), pixel_values.size(2), + pixel_values.size(3)), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask + pixel_attention_mask = pixel_attention_mask.view( + batch_size * num_images, *pixel_attention_mask.shape[2:]) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold(dimension=1, + size=patch_size, + step=patch_size) + patches_subgrid = patches_subgrid.unfold(dimension=2, + size=patch_size, + step=patch_size) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + return image_hidden_states + + def _process_image_pixels( + self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + pixel_attention_mask = inputs["pixel_attention_mask"] + + return self._image_pixels_to_features(pixel_values, + pixel_attention_mask) + + def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + return self.connector(image_features) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + input_ids = None + inputs_embeds = None + else: + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + 
else: + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + input_ids = None + + hidden_states = self.text_model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) +@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): + + def __init__( + self, + config: Idefics3Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + self.model = Idefics3Model(config, cache_config, quant_config) + self.image_token_id = self.config.image_token_id + + self.lm_head = ParallelLMHead( + config.text_config.vocab_size, + config.text_config.hidden_size, + quant_config=quant_config, + ) + if self.config.text_config.tie_word_embeddings: + self.lm_head.weight = self.model.text_model.wte.weight + self.logits_processor = LogitsProcessor(config.text_config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + **kwargs, + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 792c6cec34ae0..32750602b988c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -120,6 +120,7 @@ "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), + "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 From 7578f3b32c359d49cb836d973ae8049f09e21d9a Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 6 Nov 2024 13:21:18 +0100 Subject: [PATCH 336/819] Oct 28 rebase (#439) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Max de Bayser Signed-off-by: Max de Bayser Signed-off-by: Joe Runde Signed-off-by: Russell Bryant Signed-off-by: Thomas Parnell 
Signed-off-by: Russell Bryant Signed-off-by: Varad Ahirwadkar Signed-off-by: Wallas Santos Signed-off-by: Travis Johnson Signed-off-by: Rafael Vasquez Signed-off-by: Yuan Zhou Signed-off-by: luka Signed-off-by: Alex-Brooks Signed-off-by: youkaichao Signed-off-by: Tyler Michael Smith Signed-off-by: mgoin Signed-off-by: Vinay Damodaran Signed-off-by: Woosuk Kwon Signed-off-by: Jee Jee Li Signed-off-by: Harry Mellor Signed-off-by: charlifu Signed-off-by: Sam Stoelinga Signed-off-by: Vasily Alexeev Signed-off-by: Kevin-Yang Signed-off-by: Abatom Signed-off-by: Bill Nell Signed-off-by: wangshuai09 <391746016@qq.com> Signed-off-by: Qishuai Ferdinandzhong@gmail.com Signed-off-by: yuze.zyz Signed-off-by: Yannick Schnider Signed-off-by: Kunjan Patel Signed-off-by: simon-mo Signed-off-by: kevin Signed-off-by: YiSheng5 Signed-off-by: yan ma Signed-off-by: Went-Liang Signed-off-by: Roger Wang Signed-off-by: sasha0552 Signed-off-by: mzusman Signed-off-by: Prashant Gupta Signed-off-by: AndrĂ© Jonasson Signed-off-by: Gene Su Signed-off-by: dependabot[bot] Signed-off-by: Peter Salas Signed-off-by: Nick Hill Signed-off-by: Nick Hill Signed-off-by: Michael Green Signed-off-by: Shanshan Wang Signed-off-by: Gregory Shtrasberg Signed-off-by: daitran2k1 Signed-off-by: MengqingCao Signed-off-by: chaunceyjiang Signed-off-by: Robert Shaw Signed-off-by: Hissu Hyvarinen Signed-off-by: rshaw@neuralmagic.com Signed-off-by: Linkun Chen Signed-off-by: Tomer Asida Signed-off-by: DarkLight1337 Co-authored-by: sasha0552 Co-authored-by: Woosuk Kwon Co-authored-by: Li, Jiang Co-authored-by: Kuntai Du Co-authored-by: Daniele <36171005+dtrifiro@users.noreply.github.com> Co-authored-by: Cyrus Leung Co-authored-by: Luka Govedič Co-authored-by: bnellnm <49004751+bnellnm@users.noreply.github.com> Co-authored-by: Kai Wu Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Shashwat Srijan <119712013+sssrijan-amazon@users.noreply.github.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Co-authored-by: Andrew Feldman Co-authored-by: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com> Co-authored-by: laishzh Co-authored-by: Max de Bayser Co-authored-by: Max de Bayser Co-authored-by: Dipika Sikka Co-authored-by: Joe Runde Co-authored-by: Haoyu Wang <30562758+blueyo0@users.noreply.github.com> Co-authored-by: Russell Bryant Co-authored-by: Nick Hill Co-authored-by: tomeras91 <57313761+tomeras91@users.noreply.github.com> Co-authored-by: Tyler Michael Smith Co-authored-by: Michael Goin Co-authored-by: Kunjan Co-authored-by: Kunjan Patel Co-authored-by: Cody Yu Co-authored-by: Thomas Parnell Co-authored-by: Chih-Chieh Yang Co-authored-by: Yue Zhang <130511128+yue-anyscale@users.noreply.github.com> Co-authored-by: Chen Zhang Co-authored-by: Andy Dai <76841985+Imss27@users.noreply.github.com> Co-authored-by: Dhia Eddine Rhaiem <163106757+dhiaEddineRhaiem@users.noreply.github.com> Co-authored-by: yudian0504 <138860534+yudian0504@users.noreply.github.com> Co-authored-by: Varad Ahirwadkar <86718090+varad-ahirwadkar@users.noreply.github.com> Co-authored-by: youkaichao Co-authored-by: Baoyuan Qi Co-authored-by: Wallas Henrique Co-authored-by: Travis Johnson Co-authored-by: Cyrus Leung Co-authored-by: ngrozae <104074686+ngrozae@users.noreply.github.com> Co-authored-by: Falko1 <61779598+Falko1@users.noreply.github.com> Co-authored-by: Rafael Vasquez Co-authored-by: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com> Co-authored-by: wangshuai09 <391746016@qq.com> Co-authored-by: Jee 
Jee Li Co-authored-by: xendo Co-authored-by: Jerzy Zagorski Co-authored-by: gopalsarda Co-authored-by: Yuan Co-authored-by: Gubrud, Aaron D Co-authored-by: adgubrud <96072084+adgubrud@users.noreply.github.com> Co-authored-by: Yuhong Guo Co-authored-by: Yuhong Guo Co-authored-by: Ronen Schaffer Co-authored-by: Aurick Qiao Co-authored-by: Jeremy Arnold <103538711+JArnoldAMD@users.noreply.github.com> Co-authored-by: Lucas Wilkinson Co-authored-by: yulei Co-authored-by: Seth Kimmel Co-authored-by: Kaunil Dhruv Co-authored-by: Flex Wang Co-authored-by: Mengqing Cao Co-authored-by: Alex Brooks Co-authored-by: Yongzao <532741407@qq.com> Co-authored-by: Yunfei Chu Co-authored-by: Vinay R Damodaran Co-authored-by: Yan Ma Co-authored-by: Zhuohan Li Co-authored-by: litianjian <45817262+litianjian@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Charlie Fu Co-authored-by: Kevin H. Luu Co-authored-by: Will Johnson Co-authored-by: pavlo-ruban Co-authored-by: Sam Stoelinga Co-authored-by: ErkinSagiroglu <52523336+MErkinSag@users.noreply.github.com> Co-authored-by: Vasiliy Alekseev Co-authored-by: kakao-kevin-us Co-authored-by: Kevin-Yang Co-authored-by: 科英 Co-authored-by: madt2709 <55849102+madt2709@users.noreply.github.com> Co-authored-by: litianjian Co-authored-by: Zhong Qishuai Co-authored-by: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Co-authored-by: Sven Seeberg Co-authored-by: yannicks1 <43552841+yannicks1@users.noreply.github.com> Co-authored-by: Junichi Sato Co-authored-by: Kunjan Co-authored-by: Will Eaton Co-authored-by: Simon Mo Co-authored-by: Lily Liu Co-authored-by: YiSheng5 Co-authored-by: Went-Liang Co-authored-by: Elfie Guo <164945471+elfiegg@users.noreply.github.com> Co-authored-by: Harsha vardhan manoj Bikki <39381063+hbikki@users.noreply.github.com> Co-authored-by: Guillaume Calmettes Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> Co-authored-by: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Co-authored-by: Mor Zusman Co-authored-by: Prashant Gupta Co-authored-by: Patrick von Platen Co-authored-by: AndrĂ© Jonasson Co-authored-by: Pavani Majety Co-authored-by: Gene Der Su Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Peter Salas Co-authored-by: sroy745 <142070531+sroy745@users.noreply.github.com> Co-authored-by: Michael Green <59619482+mikegre-google@users.noreply.github.com> Co-authored-by: Nick Hill Co-authored-by: Nikita Furin Co-authored-by: shanshan wang Co-authored-by: Roger Wang Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Co-authored-by: Yang Zheng <50227060+zhengy001@users.noreply.github.com> Co-authored-by: Yang Zheng(SW)(Alex) Co-authored-by: Tran Quang Dai <62875701+daitran2k1@users.noreply.github.com> Co-authored-by: Chauncey Co-authored-by: hissu-hyvarinen Co-authored-by: lkchen Co-authored-by: Linkun Chen Co-authored-by: Linkun Chen Co-authored-by: Gene Der Su --- ...2-1B-Instruct-INT8-compressed-tensors.yaml | 11 + .../lm-eval-harness/configs/models-small.txt | 2 +- .buildkite/release-pipeline.yaml | 4 +- .buildkite/run-amd-test.sh | 15 +- .buildkite/run-cpu-test.sh | 8 +- .buildkite/run-tpu-test.sh | 2 +- .buildkite/test-pipeline.yaml | 73 +- .dockerignore | 31 +- .github/dependabot.yml | 25 + .github/mergify.yml | 58 + .github/workflows/actionlint.yml | 1 + .github/workflows/add_label_automerge.yml | 2 +- .github/workflows/clang-format.yml 
| 6 +- .github/workflows/matchers/mypy.json | 16 + .github/workflows/matchers/ruff.json | 17 + .github/workflows/mypy.yaml | 7 +- .github/workflows/publish.yml | 12 +- .github/workflows/ruff.yml | 7 +- .github/workflows/scripts/build.sh | 7 +- .github/workflows/stale.yml | 52 + .github/workflows/yapf.yml | 4 +- .readthedocs.yaml | 4 +- CMakeLists.txt | 50 +- CONTRIBUTING.md | 12 +- DCO | 34 + Dockerfile | 16 +- Dockerfile.cpu | 18 +- Dockerfile.neuron | 16 +- Dockerfile.openvino | 22 +- Dockerfile.ppc64le | 7 +- Dockerfile.rocm | 7 +- Dockerfile.tpu | 15 +- Dockerfile.xpu | 5 +- README.md | 15 +- benchmarks/backend_request_func.py | 2 +- benchmarks/benchmark_latency.py | 160 +- benchmarks/benchmark_prefix_caching.py | 28 +- benchmarks/benchmark_prioritization.py | 134 +- benchmarks/benchmark_serving.py | 181 ++- benchmarks/benchmark_throughput.py | 370 ++--- benchmarks/kernels/benchmark_layernorm.py | 6 +- benchmarks/kernels/benchmark_moe.py | 40 +- .../kernels/benchmark_paged_attention.py | 5 +- benchmarks/kernels/benchmark_quant.py | 6 +- benchmarks/kernels/benchmark_rope.py | 9 +- benchmarks/overheads/benchmark_hashing.py | 4 - cmake/cpu_extension.cmake | 40 +- cmake/utils.cmake | 6 +- collect_env.py | 27 +- csrc/activation_kernels.cu | 42 + csrc/core/scalar_type.hpp | 209 +-- csrc/core/torch_bindings.cpp | 16 - csrc/cpu/cpu_types_x86.hpp | 41 +- csrc/cpu/quant.cpp | 417 +++++- csrc/cpu/torch_bindings.cpp | 15 + csrc/mamba/causal_conv1d/causal_conv1d.cu | 71 +- csrc/mamba/causal_conv1d/causal_conv1d.h | 1 + csrc/mamba/mamba_ssm/selective_scan.h | 1 + csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 24 +- csrc/moe/marlin_moe_ops.cu | 15 +- .../moe_align_sum_kernels.cu} | 98 +- csrc/moe/moe_ops.h | 7 + csrc/moe/torch_bindings.cpp | 19 +- csrc/ops.h | 64 +- .../compressed_tensors/int8_quant_kernels.cu | 42 +- .../cutlass_w8a8/scaled_mm_entry.cu | 8 +- csrc/quantization/fp8/common.cu | 6 +- csrc/quantization/gptq_marlin/gptq_marlin.cu | 23 +- csrc/quantization/machete/machete_pytorch.cu | 16 +- .../marlin/sparse/marlin_24_cuda_kernel.cu | 15 +- csrc/torch_bindings.cpp | 70 +- docs/requirements-docs.txt | 2 + docs/source/conf.py | 2 +- .../input_processing/model_inputs_index.rst | 2 +- docs/source/dev/pooling_params.rst | 5 + .../getting_started/cpu-installation.rst | 37 +- docs/source/getting_started/debugging.rst | 8 +- docs/source/getting_started/installation.rst | 36 +- docs/source/getting_started/quickstart.rst | 97 +- .../getting_started/tpu-installation.rst | 162 ++- .../getting_started/xpu-installation.rst | 18 + docs/source/index.rst | 2 + docs/source/models/adding_model.rst | 6 +- docs/source/models/spec_decode.rst | 9 +- docs/source/models/supported_models.rst | 240 ++- docs/source/models/vlm.rst | 74 +- docs/source/serving/compatibility_matrix.rst | 2 +- docs/source/serving/deploying_with_nginx.rst | 142 ++ docs/source/serving/distributed_serving.rst | 5 +- .../serving/openai_compatible_server.md | 113 +- docs/source/serving/run_on_sky.rst | 6 +- docs/source/serving/tensorizer.rst | 5 +- examples/florence2_inference.py | 44 + examples/offline_inference_audio_language.py | 56 +- examples/offline_inference_mlpspeculator.py | 2 - examples/offline_inference_openai.md | 8 +- examples/offline_inference_vision_language.py | 77 +- ...ine_inference_vision_language_embedding.py | 170 +++ ...e_inference_vision_language_multi_image.py | 65 +- examples/offline_inference_with_prefix.py | 15 +- examples/offline_profile.py | 282 ++++ examples/openai_audio_api_client.py | 90 -- 
...i_chat_completion_client_for_multimodal.py | 236 +++ ...ai_chat_embedding_client_for_multimodal.py | 33 + examples/openai_example_batch.jsonl | 4 +- examples/openai_vision_api_client.py | 126 -- examples/template_vlm2vec.jinja | 16 + .../tool_chat_template_granite_20b_fc.jinja | 130 ++ format.sh | 24 +- pyproject.toml | 5 +- python_only_dev.py | 1 - requirements-build.txt | 18 +- requirements-common.txt | 5 +- requirements-cuda.txt | 8 +- requirements-lint.txt | 2 +- requirements-openvino.txt | 2 +- requirements-test.in | 37 + requirements-test.txt | 594 +++++++- requirements-xpu.txt | 2 +- setup.py | 16 +- tests/async_engine/test_async_llm_engine.py | 4 +- .../test_basic_correctness.py | 6 +- .../basic_correctness/test_chunked_prefill.py | 25 +- tests/basic_correctness/test_cpu_offload.py | 4 +- tests/compile/piecewise/__init__.py | 0 .../piecewise_compilation_config.json | 4 + tests/compile/piecewise/test_simple.py | 108 ++ tests/compile/piecewise/test_toy_llama.py | 346 +++++ tests/compile/test_basic_correctness.py | 136 +- tests/compile/test_full_graph.py | 2 +- tests/compile/utils.py | 28 +- tests/conftest.py | 303 ++-- tests/core/block/e2e/conftest.py | 5 +- tests/core/block/e2e/test_correctness.py | 78 +- .../e2e/test_correctness_sliding_window.py | 19 +- ...ck_manager_v2.py => test_block_manager.py} | 57 +- tests/core/test_block_manager.py | 637 -------- tests/core/test_chunked_prefill_scheduler.py | 222 +-- tests/core/test_num_computed_tokens_update.py | 1 - tests/core/test_scheduler.py | 374 +---- tests/core/test_scheduler_encoder_decoder.py | 7 +- tests/core/utils.py | 57 +- tests/data/test_config.yaml | 2 + tests/distributed/test_pipeline_parallel.py | 127 +- tests/encoder_decoder/test_e2e_correctness.py | 94 +- .../output_processor/test_stop_checker.py | 3 +- tests/engine/test_short_mm_context.py | 29 + tests/entrypoints/llm/test_chat.py | 92 ++ tests/entrypoints/llm/test_encode.py | 5 +- tests/entrypoints/llm/test_generate.py | 93 +- .../llm/test_generate_multiple_loras.py | 5 +- tests/entrypoints/llm/test_guided_generate.py | 5 +- tests/entrypoints/llm/test_init.py | 22 + tests/entrypoints/llm/test_lazy_outlines.py | 9 +- .../entrypoints/llm/test_prompt_validation.py | 8 +- .../offline_mode/test_offline_mode.py | 61 +- tests/entrypoints/openai/test_accuracy.py | 17 +- tests/entrypoints/openai/test_audio.py | 32 +- tests/entrypoints/openai/test_basic.py | 13 +- tests/entrypoints/openai/test_chat.py | 142 +- .../entrypoints/openai/test_chunked_prompt.py | 126 ++ tests/entrypoints/openai/test_completion.py | 52 +- tests/entrypoints/openai/test_embedding.py | 137 +- tests/entrypoints/openai/test_metrics.py | 25 +- .../openai/test_prompt_validation.py | 35 + tests/entrypoints/openai/test_serving_chat.py | 3 +- tests/entrypoints/openai/test_shutdown.py | 2 +- tests/entrypoints/openai/test_tokenization.py | 32 +- tests/entrypoints/openai/test_vision.py | 107 +- .../openai/test_vision_embedding.py | 99 ++ tests/entrypoints/test_chat_utils.py | 253 +++- tests/kernels/quant_utils.py | 17 +- tests/kernels/test_activation.py | 29 +- tests/kernels/test_attention.py | 27 +- tests/kernels/test_attention_selector.py | 44 +- tests/kernels/test_awq_marlin.py | 21 +- tests/kernels/test_awq_triton.py | 6 +- tests/kernels/test_blocksparse_attention.py | 11 +- tests/kernels/test_cache.py | 33 +- tests/kernels/test_causal_conv1d.py | 210 +-- tests/kernels/test_encoder_decoder_attn.py | 236 ++- tests/kernels/test_flash_attn.py | 35 +- tests/kernels/test_flashinfer.py | 10 +- 
tests/kernels/test_fp8_quant.py | 8 +- tests/kernels/test_gguf.py | 6 +- tests/kernels/test_int8_quant.py | 10 +- tests/kernels/test_layernorm.py | 4 +- tests/kernels/test_machete_gemm.py | 9 +- tests/kernels/test_mamba_ssm.py | 136 +- tests/kernels/test_marlin_gemm.py | 16 +- tests/kernels/test_moe.py | 19 +- tests/kernels/test_pos_encoding.py | 15 +- tests/kernels/test_prefix_prefill.py | 7 +- tests/kernels/utils.py | 92 +- tests/lora/conftest.py | 36 +- tests/lora/test_baichuan.py | 9 +- tests/lora/test_gemma.py | 5 +- tests/lora/test_layers.py | 6 +- tests/lora/test_llama.py | 9 +- tests/lora/test_long_context.py | 39 +- tests/lora/test_minicpmv.py | 8 +- tests/lora/test_punica_sizes.py | 10 +- tests/lora/test_punica_variation.py | 12 +- tests/lora/test_quant_model.py | 13 +- tests/lora/test_worker.py | 17 +- tests/metrics/test_metrics.py | 61 +- .../model_executor/test_enabled_custom_ops.py | 92 ++ .../audio_language/test_ultravox.py | 95 +- .../decoder_only/language/test_big_models.py | 56 +- .../decoder_only/language/test_danube3_4b.py | 52 - .../decoder_only/language/test_jamba.py | 25 + .../decoder_only/language/test_mamba.py | 2 +- .../decoder_only/language/test_phimoe.py | 4 +- .../models/decoder_only/language/test_qwen.py | 34 + .../mm_processor_kwargs/__init__.py | 0 .../mm_processor_kwargs/test_llava_next.py | 68 + .../mm_processor_kwargs/test_phi3v.py | 181 +++ .../mm_processor_kwargs/test_qwen.py | 144 ++ .../mm_processor_kwargs/test_qwen2_vl.py | 160 ++ .../vision_language/test_blip2.py | 101 -- .../vision_language/test_broadcast.py | 42 - .../vision_language/test_chameleon.py | 125 -- .../decoder_only/vision_language/test_fuyu.py | 139 -- .../decoder_only/vision_language/test_glm4.py | 133 -- .../vision_language/test_h2ovl.py | 130 ++ .../vision_language/test_intern_vit.py | 7 +- .../vision_language/test_internvl.py | 285 +--- .../vision_language/test_llava.py | 313 ---- .../test_llava_image_embeds.py | 158 -- .../vision_language/test_llava_next.py | 283 ---- .../vision_language/test_llava_next_video.py | 226 --- .../vision_language/test_llava_onevision.py | 349 ----- .../vision_language/test_minicpmv.py | 199 --- .../vision_language/test_models.py | 643 ++++++++ .../vision_language/test_paligemma.py | 164 --- .../vision_language/test_phi3v.py | 196 +-- .../decoder_only/vision_language/test_qwen.py | 374 ----- .../vision_language/vlm_utils/__init__.py | 0 .../vision_language/vlm_utils/builders.py | 235 +++ .../vlm_utils/case_filtering.py | 157 ++ .../vision_language/vlm_utils/core.py | 141 ++ .../vlm_utils/custom_inputs.py | 102 ++ .../vision_language/vlm_utils/model_utils.py | 409 ++++++ .../vision_language/vlm_utils/runners.py | 139 ++ .../vision_language/vlm_utils/types.py | 185 +++ .../embedding/language/test_cls_models.py | 53 + .../embedding/language/test_embedding.py | 42 +- tests/models/embedding/utils.py | 30 + .../embedding/vision_language/__init__.py | 0 .../vision_language/test_llava_next.py | 138 ++ .../embedding/vision_language/test_phi3v.py | 124 ++ .../vision_language/test_florence2.py | 102 ++ .../vision_language/test_mllama.py | 87 +- tests/models/utils.py | 14 +- tests/mq_llm_engine/test_error_handling.py | 78 +- tests/mq_llm_engine/utils.py | 2 +- .../multi_step/test_correctness_async_llm.py | 1 - tests/multi_step/test_correctness_llm.py | 4 - tests/multimodal/test_mapper.py | 4 + tests/multimodal/test_processor_kwargs.py | 37 +- tests/multimodal/test_utils.py | 96 +- .../my_gemma_embedding.py | 2 +- .../test_disable_sliding_window.py | 6 +- 
tests/prefix_caching/test_prefix_caching.py | 115 +- tests/quantization/test_bitsandbytes.py | 3 +- tests/quantization/test_compressed_tensors.py | 3 +- tests/quantization/test_configs.py | 3 +- tests/samplers/test_no_bad_words.py | 185 +++ tests/spec_decode/e2e/conftest.py | 4 +- tests/spec_decode/e2e/test_compatibility.py | 68 +- .../spec_decode/e2e/test_eagle_correctness.py | 18 - tests/spec_decode/e2e/test_integration.py | 8 - .../e2e/test_integration_dist_tp2.py | 10 +- .../e2e/test_integration_dist_tp4.py | 6 - tests/spec_decode/e2e/test_logprobs.py | 14 - .../e2e/test_medusa_correctness.py | 21 - tests/spec_decode/e2e/test_mlp_correctness.py | 27 - .../e2e/test_multistep_correctness.py | 36 - .../spec_decode/e2e/test_ngram_correctness.py | 16 - tests/spec_decode/e2e/test_seed.py | 3 - tests/spec_decode/utils.py | 7 +- tests/tensorizer_loader/conftest.py | 13 +- tests/test_cache_block_hashing.py | 7 +- tests/test_config.py | 61 +- tests/test_scalartype.py | 4 +- tests/test_sharded_state_loader.py | 10 +- tests/test_utils.py | 18 +- tests/tokenization/test_detokenize.py | 86 +- tests/tool_use/test_chat_completions.py | 8 +- tests/tool_use/test_jamba_tool_parser.py | 275 ++++ tests/tool_use/test_parallel_tool_calls.py | 8 +- tests/tool_use/test_tool_calls.py | 8 +- tests/tool_use/utils.py | 12 + tests/tracing/test_tracing.py | 30 +- tests/utils.py | 179 ++- .../test_encoder_decoder_model_runner.py | 20 +- tests/worker/test_model_input.py | 3 + tests/worker/test_model_runner.py | 10 +- tests/worker/test_profile.py | 65 + tests/worker/test_swap.py | 7 +- tools/check_repo.sh | 14 + tools/mypy.sh | 16 +- tools/profiler/print_layerwise_table.py | 77 + tools/profiler/visualize_layerwise_profile.py | 522 +++++++ tools/report_build_time_ninja.py | 1 - vllm/_core_ext.py | 278 ---- vllm/_custom_ops.py | 169 +-- vllm/attention/backends/abstract.py | 18 +- vllm/attention/backends/blocksparse_attn.py | 3 + vllm/attention/backends/flash_attn.py | 451 ++++-- vllm/attention/backends/flashinfer.py | 149 +- vllm/attention/backends/ipex_attn.py | 2 +- vllm/attention/backends/openvino.py | 2 +- vllm/attention/backends/pallas.py | 105 +- vllm/attention/backends/placeholder_attn.py | 24 +- vllm/attention/backends/rocm_flash_attn.py | 13 +- vllm/attention/backends/torch_sdpa.py | 10 +- vllm/attention/backends/utils.py | 230 ++- vllm/attention/backends/xformers.py | 183 +-- vllm/attention/layer.py | 9 +- .../ops/blocksparse_attention/interface.py | 22 +- vllm/attention/selector.py | 34 +- vllm/beam_search.py | 14 +- vllm/compilation/backends.py | 399 +++-- vllm/compilation/config.py | 154 ++ vllm/compilation/counter.py | 30 + vllm/compilation/decorators.py | 86 +- vllm/compilation/levels.py | 3 +- vllm/compilation/wrapper.py | 2 +- vllm/config.py | 439 ++++-- vllm/core/block/naive_block.py | 5 +- vllm/core/block/prefix_caching_block.py | 2 +- vllm/core/block/utils.py | 24 +- .../{block_manager_v2.py => block_manager.py} | 2 +- vllm/core/block_manager_v1.py | 746 ---------- vllm/core/{evictor_v2.py => evictor.py} | 0 vllm/core/evictor_v1.py | 106 -- vllm/core/interfaces.py | 10 +- vllm/core/scheduler.py | 28 +- .../device_communicators/custom_all_reduce.py | 14 +- .../device_communicators/shm_broadcast.py | 15 +- vllm/distributed/parallel_state.py | 83 +- vllm/engine/arg_utils.py | 204 ++- vllm/engine/async_llm_engine.py | 137 +- vllm/engine/llm_engine.py | 286 +++- vllm/engine/metrics.py | 116 +- vllm/engine/metrics_types.py | 7 + vllm/engine/multiprocessing/client.py | 198 +-- 
vllm/engine/multiprocessing/engine.py | 131 +- vllm/engine/output_processor/multi_step.py | 25 +- vllm/engine/output_processor/single_step.py | 129 +- vllm/engine/output_processor/stop_checker.py | 4 +- vllm/engine/output_processor/util.py | 13 +- vllm/engine/protocol.py | 178 ++- vllm/entrypoints/chat_utils.py | 253 +++- vllm/entrypoints/llm.py | 88 +- vllm/entrypoints/openai/api_server.py | 126 +- vllm/entrypoints/openai/protocol.py | 128 +- vllm/entrypoints/openai/run_batch.py | 34 +- vllm/entrypoints/openai/serving_chat.py | 364 ++--- vllm/entrypoints/openai/serving_completion.py | 214 ++- vllm/entrypoints/openai/serving_embedding.py | 86 +- vllm/entrypoints/openai/serving_engine.py | 173 ++- .../openai/serving_tokenization.py | 87 +- .../openai/tool_parsers/__init__.py | 7 +- .../granite_20b_fc_tool_parser.py | 251 ++++ .../openai/tool_parsers/hermes_tool_parser.py | 13 +- .../tool_parsers/internlm2_tool_parser.py | 4 +- .../openai/tool_parsers/jamba_tool_parser.py | 300 ++++ .../openai/tool_parsers/llama_tool_parser.py | 36 +- .../tool_parsers/mistral_tool_parser.py | 10 +- vllm/entrypoints/openai/tool_parsers/utils.py | 36 +- vllm/envs.py | 45 +- vllm/executor/cpu_executor.py | 9 +- vllm/executor/executor_base.py | 37 +- vllm/executor/gpu_executor.py | 11 +- vllm/executor/hpu_executor.py | 8 +- vllm/executor/multiproc_worker_utils.py | 8 +- vllm/executor/neuron_executor.py | 6 +- vllm/executor/openvino_executor.py | 28 +- vllm/executor/ray_utils.py | 17 +- vllm/executor/tpu_executor.py | 7 +- vllm/executor/xpu_executor.py | 54 +- vllm/inputs/__init__.py | 39 +- vllm/inputs/data.py | 118 +- vllm/inputs/parse.py | 20 +- vllm/inputs/preprocess.py | 291 ++-- vllm/inputs/registry.py | 55 +- vllm/logger.py | 4 +- vllm/logits_process.py | 119 ++ vllm/lora/models.py | 6 +- vllm/model_executor/custom_op.py | 74 +- .../guided_decoding/__init__.py | 3 +- .../lm_format_enforcer_decoding.py | 3 +- .../outlines_logits_processors.py | 4 +- vllm/model_executor/layers/activation.py | 69 +- .../layers/fused_moe/__init__.py | 28 +- .../layers/fused_moe/fused_marlin_moe.py | 70 +- .../layers/fused_moe/fused_moe.py | 119 +- vllm/model_executor/layers/fused_moe/layer.py | 31 +- vllm/model_executor/layers/layernorm.py | 3 +- .../model_executor/layers/logits_processor.py | 10 +- .../layers/mamba/mamba_mixer.py | 217 +++ .../layers/mamba/ops/causal_conv1d.py | 53 +- .../layers/mamba/ops/mamba_ssm.py | 70 +- vllm/model_executor/layers/pooler.py | 75 +- .../model_executor/layers/quantization/awq.py | 20 +- .../layers/quantization/awq_marlin.py | 42 +- .../layers/quantization/bitsandbytes.py | 40 +- .../compressed_tensors/compressed_tensors.py | 41 +- .../compressed_tensors_moe.py | 18 +- .../schemes/compressed_tensors_w8a16_fp8.py | 3 +- .../schemes/compressed_tensors_w8a8_fp8.py | 8 +- .../schemes/compressed_tensors_w8a8_int8.py | 3 +- .../schemes/compressed_tensors_wNa16.py | 3 +- .../quantization/compressed_tensors/utils.py | 102 +- .../layers/quantization/fbgemm_fp8.py | 3 +- .../model_executor/layers/quantization/fp8.py | 23 +- .../layers/quantization/gptq_marlin.py | 6 +- .../quantization/kernels/MPLinearKernel.py | 4 + .../layers/quantization/kernels/__init__.py | 8 +- .../layers/quantization/kernels/exllama.py | 140 ++ .../layers/quantization/kernels/machete.py | 14 +- .../layers/quantization/modelopt.py | 7 +- .../layers/quantization/utils/quant_utils.py | 12 +- .../layers/quantization/utils/w8a8_utils.py | 39 +- vllm/model_executor/layers/resampler.py | 49 +- 
.../model_executor/layers/rotary_embedding.py | 50 +- vllm/model_executor/layers/sampler.py | 7 +- vllm/model_executor/model_loader/__init__.py | 20 +- vllm/model_executor/model_loader/loader.py | 253 ++-- vllm/model_executor/model_loader/neuron.py | 35 +- vllm/model_executor/model_loader/openvino.py | 8 +- .../model_loader/weight_utils.py | 4 +- vllm/model_executor/models/__init__.py | 2 +- vllm/model_executor/models/arctic.py | 2 + vllm/model_executor/models/baichuan.py | 10 +- vllm/model_executor/models/bart.py | 2 - vllm/model_executor/models/bert.py | 424 ++++++ vllm/model_executor/models/blip.py | 115 +- vllm/model_executor/models/blip2.py | 39 +- vllm/model_executor/models/bloom.py | 2 + vllm/model_executor/models/chameleon.py | 40 +- vllm/model_executor/models/chatglm.py | 147 +- vllm/model_executor/models/clip.py | 158 +- vllm/model_executor/models/commandr.py | 2 + vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/eagle.py | 2 +- vllm/model_executor/models/exaone.py | 6 +- vllm/model_executor/models/falcon.py | 4 +- vllm/model_executor/models/florence2.py | 261 ++++ vllm/model_executor/models/fuyu.py | 46 +- vllm/model_executor/models/gemma.py | 63 +- vllm/model_executor/models/gemma2.py | 67 +- .../model_executor/models/gemma2_embedding.py | 57 - vllm/model_executor/models/gpt2.py | 2 + vllm/model_executor/models/gpt_bigcode.py | 2 + vllm/model_executor/models/gpt_j.py | 2 + vllm/model_executor/models/gpt_neox.py | 2 + vllm/model_executor/models/granite.py | 6 +- vllm/model_executor/models/granitemoe.py | 3 + vllm/model_executor/models/h2ovl.py | 401 +++++ .../models/idefics2_vision_model.py | 51 +- vllm/model_executor/models/intern_vit.py | 72 +- vllm/model_executor/models/internlm2.py | 58 +- vllm/model_executor/models/internlm2_ve.py | 174 +++ vllm/model_executor/models/internvl.py | 126 +- vllm/model_executor/models/jais.py | 4 +- vllm/model_executor/models/jamba.py | 246 +--- vllm/model_executor/models/llama.py | 94 +- vllm/model_executor/models/llama_embedding.py | 59 - vllm/model_executor/models/llava.py | 180 ++- vllm/model_executor/models/llava_next.py | 151 +- .../model_executor/models/llava_next_video.py | 76 +- vllm/model_executor/models/llava_onevision.py | 189 +-- vllm/model_executor/models/mamba.py | 354 +---- vllm/model_executor/models/mamba_cache.py | 186 +-- vllm/model_executor/models/minicpm.py | 15 +- vllm/model_executor/models/minicpmv.py | 239 ++- vllm/model_executor/models/mixtral.py | 2 + vllm/model_executor/models/mllama.py | 603 ++++++-- vllm/model_executor/models/molmo.py | 1293 +++++++++++++++++ vllm/model_executor/models/mpt.py | 2 + vllm/model_executor/models/nemotron.py | 2 + vllm/model_executor/models/nvlm_d.py | 38 +- vllm/model_executor/models/olmo.py | 2 + vllm/model_executor/models/olmoe.py | 2 + vllm/model_executor/models/opt.py | 38 +- vllm/model_executor/models/orion.py | 18 +- vllm/model_executor/models/paligemma.py | 32 +- vllm/model_executor/models/persimmon.py | 2 + vllm/model_executor/models/phi.py | 9 +- vllm/model_executor/models/phi3_small.py | 2 +- vllm/model_executor/models/phi3v.py | 148 +- vllm/model_executor/models/phimoe.py | 2 + vllm/model_executor/models/pixtral.py | 534 ++++++- vllm/model_executor/models/qwen.py | 142 +- vllm/model_executor/models/qwen2.py | 71 +- vllm/model_executor/models/qwen2_audio.py | 469 ++++++ vllm/model_executor/models/qwen2_cls.py | 110 ++ vllm/model_executor/models/qwen2_moe.py | 2 + vllm/model_executor/models/qwen2_rm.py | 13 +- vllm/model_executor/models/qwen2_vl.py | 
279 ++-- vllm/model_executor/models/registry.py | 62 +- vllm/model_executor/models/siglip.py | 114 +- vllm/model_executor/models/solar.py | 6 +- vllm/model_executor/models/starcoder2.py | 2 + vllm/model_executor/models/ultravox.py | 82 +- vllm/model_executor/models/utils.py | 220 ++- vllm/model_executor/models/xverse.py | 3 + vllm/model_executor/utils.py | 25 +- vllm/multimodal/__init__.py | 7 +- vllm/multimodal/base.py | 214 ++- vllm/multimodal/image.py | 8 +- vllm/multimodal/registry.py | 18 +- vllm/multimodal/utils.py | 96 +- vllm/multimodal/video.py | 20 +- vllm/outputs.py | 43 +- vllm/platforms/__init__.py | 23 + vllm/platforms/cuda.py | 12 +- vllm/platforms/interface.py | 22 + vllm/platforms/neuron.py | 9 + vllm/platforms/openvino.py | 33 + vllm/platforms/tpu.py | 2 +- vllm/platforms/xpu.py | 4 + vllm/plugins/__init__.py | 33 +- vllm/pooling_params.py | 4 +- vllm/profiler/__init__.py | 5 + vllm/profiler/layerwise_profile.py | 354 +++++ vllm/profiler/utils.py | 145 ++ vllm/sampling_params.py | 44 +- vllm/scalar_type.py | 303 +++- vllm/sequence.py | 394 ++--- vllm/spec_decode/draft_model_runner.py | 38 +- vllm/spec_decode/ngram_worker.py | 19 +- vllm/spec_decode/spec_decode_worker.py | 49 +- vllm/spec_decode/target_model_runner.py | 34 +- vllm/transformers_utils/__init__.py | 2 +- vllm/transformers_utils/config.py | 108 +- vllm/transformers_utils/configs/__init__.py | 8 +- vllm/transformers_utils/configs/h2ovl.py | 13 + vllm/transformers_utils/configs/qwen2vl.py | 131 -- vllm/transformers_utils/detokenizer.py | 170 +-- vllm/transformers_utils/detokenizer_utils.py | 167 +++ vllm/transformers_utils/processor.py | 4 + vllm/transformers_utils/tokenizer.py | 5 + vllm/transformers_utils/tokenizers/mistral.py | 109 +- vllm/triton_utils/importing.py | 11 +- vllm/utils.py | 368 +++-- vllm/v1/attention/__init__.py | 0 vllm/v1/attention/backends/__init__.py | 0 vllm/v1/attention/backends/flash_attn.py | 243 ++++ vllm/v1/core/__init__.py | 0 vllm/v1/core/kv_cache_manager.py | 108 ++ vllm/v1/core/scheduler.py | 412 ++++++ vllm/v1/engine/__init__.py | 0 vllm/v1/engine/llm_engine.py | 513 +++++++ vllm/v1/executor/__init__.py | 0 vllm/v1/executor/gpu_executor.py | 77 + vllm/v1/outputs.py | 37 + vllm/v1/request.py | 92 ++ vllm/v1/sample/__init__.py | 0 vllm/v1/sample/metadata.py | 21 + vllm/v1/sample/sampler.py | 158 ++ vllm/v1/tokenizer/__init__.py | 0 vllm/v1/tokenizer/detokenizer.py | 215 +++ vllm/v1/worker/__init__.py | 0 vllm/v1/worker/gpu_model_runner.py | 678 +++++++++ vllm/v1/worker/gpu_worker.py | 229 +++ vllm/worker/cache_engine.py | 1 - vllm/worker/cpu_model_runner.py | 78 +- vllm/worker/cpu_worker.py | 38 +- vllm/worker/embedding_model_runner.py | 26 +- vllm/worker/enc_dec_model_runner.py | 86 +- vllm/worker/hpu_model_runner.py | 56 +- vllm/worker/hpu_worker.py | 40 +- vllm/worker/model_runner.py | 146 +- vllm/worker/model_runner_base.py | 31 +- vllm/worker/multi_step_model_runner.py | 5 +- vllm/worker/multi_step_worker.py | 10 +- vllm/worker/neuron_model_runner.py | 16 +- vllm/worker/neuron_worker.py | 20 +- vllm/worker/openvino_model_runner.py | 77 +- vllm/worker/openvino_worker.py | 51 +- vllm/worker/tpu_model_runner.py | 41 +- vllm/worker/tpu_worker.py | 43 +- vllm/worker/worker.py | 117 +- vllm/worker/worker_base.py | 18 +- vllm/worker/xpu_model_runner.py | 86 +- vllm/worker/xpu_worker.py | 60 +- 590 files changed, 30447 insertions(+), 15168 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml create mode 100644 
.github/mergify.yml create mode 100644 .github/workflows/matchers/mypy.json create mode 100644 .github/workflows/matchers/ruff.json create mode 100644 .github/workflows/stale.yml create mode 100644 DCO delete mode 100644 csrc/core/torch_bindings.cpp rename csrc/{moe_align_block_size_kernels.cu => moe/moe_align_sum_kernels.cu} (59%) create mode 100644 docs/source/dev/pooling_params.rst create mode 100644 docs/source/serving/deploying_with_nginx.rst create mode 100644 examples/florence2_inference.py create mode 100644 examples/offline_inference_vision_language_embedding.py create mode 100644 examples/offline_profile.py delete mode 100644 examples/openai_audio_api_client.py create mode 100644 examples/openai_chat_completion_client_for_multimodal.py create mode 100644 examples/openai_chat_embedding_client_for_multimodal.py delete mode 100644 examples/openai_vision_api_client.py create mode 100644 examples/template_vlm2vec.jinja create mode 100644 examples/tool_chat_template_granite_20b_fc.jinja create mode 100644 requirements-test.in create mode 100644 tests/compile/piecewise/__init__.py create mode 100644 tests/compile/piecewise/piecewise_compilation_config.json create mode 100644 tests/compile/piecewise/test_simple.py create mode 100644 tests/compile/piecewise/test_toy_llama.py rename tests/core/block/{test_block_manager_v2.py => test_block_manager.py} (91%) delete mode 100644 tests/core/test_block_manager.py create mode 100644 tests/engine/test_short_mm_context.py create mode 100644 tests/entrypoints/llm/test_chat.py create mode 100644 tests/entrypoints/llm/test_init.py create mode 100644 tests/entrypoints/openai/test_chunked_prompt.py create mode 100644 tests/entrypoints/openai/test_vision_embedding.py create mode 100644 tests/model_executor/test_enabled_custom_ops.py delete mode 100644 tests/models/decoder_only/language/test_danube3_4b.py create mode 100644 tests/models/decoder_only/language/test_qwen.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py delete mode 100644 tests/models/decoder_only/vision_language/test_blip2.py delete mode 100644 tests/models/decoder_only/vision_language/test_broadcast.py delete mode 100644 tests/models/decoder_only/vision_language/test_chameleon.py delete mode 100644 tests/models/decoder_only/vision_language/test_fuyu.py delete mode 100644 tests/models/decoder_only/vision_language/test_glm4.py create mode 100644 tests/models/decoder_only/vision_language/test_h2ovl.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_image_embeds.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_next.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_next_video.py delete mode 100644 tests/models/decoder_only/vision_language/test_llava_onevision.py delete mode 100644 tests/models/decoder_only/vision_language/test_minicpmv.py create mode 100644 tests/models/decoder_only/vision_language/test_models.py delete mode 100644 tests/models/decoder_only/vision_language/test_paligemma.py delete mode 100644 
tests/models/decoder_only/vision_language/test_qwen.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/__init__.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/builders.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/case_filtering.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/core.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/custom_inputs.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/model_utils.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/runners.py create mode 100644 tests/models/decoder_only/vision_language/vlm_utils/types.py create mode 100644 tests/models/embedding/language/test_cls_models.py create mode 100644 tests/models/embedding/utils.py create mode 100644 tests/models/embedding/vision_language/__init__.py create mode 100644 tests/models/embedding/vision_language/test_llava_next.py create mode 100644 tests/models/embedding/vision_language/test_phi3v.py create mode 100644 tests/models/encoder_decoder/vision_language/test_florence2.py create mode 100644 tests/samplers/test_no_bad_words.py create mode 100644 tests/tool_use/test_jamba_tool_parser.py create mode 100644 tests/worker/test_profile.py create mode 100644 tools/check_repo.sh create mode 100644 tools/profiler/print_layerwise_table.py create mode 100644 tools/profiler/visualize_layerwise_profile.py delete mode 100644 vllm/_core_ext.py create mode 100644 vllm/compilation/config.py create mode 100644 vllm/compilation/counter.py rename vllm/core/{block_manager_v2.py => block_manager.py} (99%) delete mode 100644 vllm/core/block_manager_v1.py rename vllm/core/{evictor_v2.py => evictor.py} (100%) delete mode 100644 vllm/core/evictor_v1.py create mode 100644 vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py create mode 100644 vllm/logits_process.py create mode 100644 vllm/model_executor/layers/mamba/mamba_mixer.py create mode 100644 vllm/model_executor/layers/quantization/kernels/exllama.py create mode 100644 vllm/model_executor/models/bert.py create mode 100644 vllm/model_executor/models/florence2.py delete mode 100644 vllm/model_executor/models/gemma2_embedding.py create mode 100644 vllm/model_executor/models/h2ovl.py create mode 100644 vllm/model_executor/models/internlm2_ve.py delete mode 100644 vllm/model_executor/models/llama_embedding.py create mode 100644 vllm/model_executor/models/molmo.py create mode 100644 vllm/model_executor/models/qwen2_audio.py create mode 100644 vllm/model_executor/models/qwen2_cls.py create mode 100644 vllm/platforms/neuron.py create mode 100644 vllm/platforms/openvino.py create mode 100644 vllm/profiler/__init__.py create mode 100644 vllm/profiler/layerwise_profile.py create mode 100644 vllm/profiler/utils.py create mode 100644 vllm/transformers_utils/configs/h2ovl.py delete mode 100644 vllm/transformers_utils/configs/qwen2vl.py create mode 100644 vllm/transformers_utils/detokenizer_utils.py create mode 100644 vllm/v1/attention/__init__.py create mode 100644 vllm/v1/attention/backends/__init__.py create mode 100644 vllm/v1/attention/backends/flash_attn.py create mode 100644 vllm/v1/core/__init__.py create mode 100644 vllm/v1/core/kv_cache_manager.py create mode 100644 vllm/v1/core/scheduler.py create mode 100644 vllm/v1/engine/__init__.py create mode 100644 vllm/v1/engine/llm_engine.py create mode 100644 
vllm/v1/executor/__init__.py create mode 100644 vllm/v1/executor/gpu_executor.py create mode 100644 vllm/v1/outputs.py create mode 100644 vllm/v1/request.py create mode 100644 vllm/v1/sample/__init__.py create mode 100644 vllm/v1/sample/metadata.py create mode 100644 vllm/v1/sample/sampler.py create mode 100644 vllm/v1/tokenizer/__init__.py create mode 100644 vllm/v1/tokenizer/detokenizer.py create mode 100644 vllm/v1/worker/__init__.py create mode 100644 vllm/v1/worker/gpu_model_runner.py create mode 100644 vllm/v1/worker/gpu_worker.py diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000000000..78347f63fa793 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.356 + - name: "exact_match,flexible-extract" + value: 0.358 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 64a0f428587af..6057229ac50f3 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -1,6 +1,6 @@ Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml -Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index e72138e29dd65..3b7fa0f2d94b3 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -3,7 +3,7 @@ steps: agents: queue: cpu_queue commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" # rename the files to change linux -> manylinux1 @@ -22,7 +22,7 @@ steps: agents: queue: cpu_queue commands: - - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" # rename the files to change linux -> manylinux1 diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index df201cdc7c554..860272e71fd84 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -31,8 +31,8 @@ cleanup_docker() { echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - # Remove unused volumes - docker volume prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." @@ -107,11 +107,12 @@ fi PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then + # assign job count as the number of shards used + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do - #replace shard arguments - commands=${commands//"--shard-id= "/"--shard-id=${GPU} "} - commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} - echo "Shard ${GPU} commands:$commands" + # assign shard-id for each shard + commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + echo "Shard ${GPU} commands:$commands_gpu" docker run \ --device /dev/kfd --device /dev/dri \ --network host \ @@ -123,7 +124,7 @@ if [[ $commands == *"--shard-id="* ]]; then -e HF_HOME=${HF_MOUNT} \ --name ${container_name}_${GPU} \ ${image_name} \ - /bin/bash -c "${commands}" \ + /bin/bash -c "${commands_gpu}" \ |& while read -r line; do echo ">>Shard $GPU: $line"; done & PIDS+=($!) done diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index c2818c38965ea..c331a9c49c0d0 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -32,10 +32,10 @@ docker exec cpu-test bash -c " --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # Run compressed-tensor test -# docker exec cpu-test bash -c " -# pytest -s -v \ -# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ -# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token" +docker exec cpu-test bash -c " + pytest -s -v \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test docker exec cpu-test bash -c " diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 6989c94d46a89..988d5aef5fb8c 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -12,4 +12,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. 
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 4c2fe41c739b1..9444dc43ea97e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -9,6 +9,7 @@ # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only +# nightly(bool): run this test in nightly pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for test. incompatbile with command. @@ -77,8 +78,8 @@ steps: - vllm/ - tests/basic_correctness/test_chunked_prefill commands: - - VLLM_ATTENTION_BACKEND=XFORMERS VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - label: Core Test # 10min mirror_hardwares: [amd] @@ -88,11 +89,7 @@ steps: - vllm/distributed - tests/core commands: - - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core/test_scheduler.py - - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/test_chunked_prefill_scheduler.py - - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness.py - - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s core core/block/e2e/test_correctness_sliding_window.py - - pytest -v -s core --ignore=core/block/e2e/test_correctness.py --ignore=core/test_scheduler.py --ignore=core/test_chunked_prefill_scheduler.py --ignore=core/block/e2e/test_correctness.py --ignore=core/block/e2e/test_correctness_sliding_window.py + - pytest -v -s core - label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" @@ -184,6 +181,7 @@ steps: - python3 offline_inference_vision_language_multi_image.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py + - python3 offline_profile.py --model facebook/opt-125m - label: Prefix Caching Test # 9min #mirror_hardwares: [amd] @@ -191,8 +189,7 @@ steps: - vllm/ - 
tests/prefix_caching commands: - - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s prefix_caching/test_prefix_caching.py - - pytest -v -s prefix_caching --ignore=prefix_caching/test_prefix_caching.py + - pytest -v -s prefix_caching - label: Samplers Test # 36min source_file_dependencies: @@ -216,8 +213,7 @@ steps: - tests/spec_decode commands: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - - VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest -v -s spec_decode/e2e/test_compatibility.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_compatibility.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py - label: LoRA Test %N # 15min each mirror_hardwares: [amd] @@ -234,15 +230,16 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py + # these tests need to be separated, cannot combine + - pytest -v -s compile/piecewise/test_simple.py + - pytest -v -s compile/piecewise/test_toy_llama.py -# TODO: re-write in comparison tests, and fix symbolic shape -# for quantization ops. -# - label: "PyTorch Fullgraph Test" # 18min -# source_file_dependencies: -# - vllm/ -# - tests/compile -# commands: -# - pytest -v -s compile/test_full_graph.py +- label: "PyTorch Fullgraph Test" # 18min + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py - label: Kernels Test %N # 1h each mirror_hardwares: [amd] @@ -317,33 +314,57 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test # 1h36min +- label: Decoder-only Language Models Test (Standard) # 35min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language + - pytest -v -s models/decoder_only/language/test_models.py + - pytest -v -s models/decoder_only/language/test_big_models.py -- label: Decoder-only Multi-Modal Models Test # 1h31min +- label: Decoder-only Language Models Test (Extended) # 1h20min + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + commands: + - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py + +- label: Decoder-only Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language commands: - - pytest -v -s models/decoder_only/audio_language - - pytest -v -s models/decoder_only/vision_language + - pytest -v -s models/decoder_only/audio_language -m core_model + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model + +- label: Decoder-only Multi-Modal Models Test (Extended) + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + commands: + - pytest -v -s models/decoder_only/audio_language -m 'not core_model' + # HACK - run phi3v tests separately to sidestep this transformers bug + # https://github.com/huggingface/transformers/issues/34307 + - pytest -v -s models/decoder_only/vision_language/test_phi3v.py + - pytest -v -s --ignore 
models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model' - label: Other Models Test # 6min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/embedding/language + - tests/models/embedding/vision_language - tests/models/encoder_decoder/language - tests/models/encoder_decoder/vision_language commands: - pytest -v -s models/embedding/language + - pytest -v -s models/embedding/vision_language - pytest -v -s models/encoder_decoder/language - pytest -v -s models/encoder_decoder/vision_language @@ -402,11 +423,11 @@ steps: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - - TARGET_TEST_SUITE=L4 VLLM_ALLOW_DEPRECATED_BLOCK_MANAGER_V1=1 pytest basic_correctness/ -v -s -m distributed_2_gpus + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus # Avoid importing model tests that cause CUDA reinitialization error - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus + - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py diff --git a/.dockerignore b/.dockerignore index 17ed0d97c88b3..3863656915d03 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,33 @@ -/.github/ /.venv /build dist -Dockerfile* vllm/*.so + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +.mypy_cache + +# Distribution / packaging +.Python +/build/ +cmake-build-*/ +CMakeUserPresets.json +develop-eggs/ +/dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 6fddca0d6e4b9..4f54eea564ecb 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,3 +5,28 @@ updates: directory: "/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + labels: ["dependencies"] + open-pull-requests-limit: 5 + reviewers: ["khluu", "simon-mo"] + allow: + - dependency-type: "all" + ignore: + - dependency-name: "torch" + - dependency-name: "torchvision" + - dependency-name: "xformers" + - dependency-name: "lm-format-enforcer" + - dependency-name: "gguf" + - dependency-name: "compressed-tensors" + - dependency-name: "ray[adag]" + - dependency-name: "lm-eval" + groups: + patch-update: + applies-to: version-updates + update-types: ["patch"] + minor-update: + applies-to: version-updates + update-types: ["minor"] diff --git a/.github/mergify.yml b/.github/mergify.yml new file mode 100644 index 0000000000000..1ce5039a061b2 --- /dev/null +++ b/.github/mergify.yml @@ -0,0 +1,58 @@ +pull_request_rules: +- name: label-documentation + description: Automatically apply documentation label + conditions: + - or: + - files~=^[^/]+\.md$ + - files~=^docs/ + actions: + label: + add: + - documentation + +- name: label-ci-build + description: Automatically apply ci/build label + conditions: + - or: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - 
files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py + actions: + label: + add: + - ci/build + +- name: label-frontend + description: Automatically apply frontend label + conditions: + - files~=^vllm/entrypoints/ + actions: + label: + add: + - frontend + +- name: ping author on conflicts and add 'needs-rebase' label + conditions: + - conflict + - -closed + actions: + label: + add: + - needs-rebase + comment: + message: | + This pull request has merge conflicts that must be resolved before it can be + merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + +- name: remove 'needs-rebase' label when conflict is resolved + conditions: + - -conflict + - -closed + actions: + label: + remove: + - needs-rebase diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 2a0e3239f58da..b80749aaa8fec 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -34,4 +34,5 @@ jobs: - name: "Run actionlint" run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" tools/actionlint.sh -color diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 2e7c7f7f087af..c9d6d4259df99 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 341fc0665a402..77c007c2ad1ad 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -38,4 +38,4 @@ jobs: ) find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror \ No newline at end of file + | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000000000..f048fce528941 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 0000000000000..f6d4479ee1996 --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 053684bebb6f2..5d73daf09b1ce 100644 --- a/.github/workflows/mypy.yaml 
+++ b/.github/workflows/mypy.yaml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -32,4 +32,5 @@ jobs: pip install types-setuptools - name: Mypy run: | - tools/mypy.sh + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 96549b3f99181..f959a1cacf866 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Extract branch info shell: bash @@ -30,7 +30,7 @@ jobs: - name: Create Release id: create_release - uses: "actions/github-script@v7" + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: RELEASE_TAG: ${{ env.release_tag }} with: @@ -54,10 +54,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Setup ccache - uses: hendrikmuhs/ccache-action@v1.2 + uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 with: create-symlink: true key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} @@ -92,7 +92,7 @@ jobs: echo "asset_name=${asset_name}" >> "$GITHUB_ENV" - name: Upload Release Asset - uses: actions/upload-release-asset@v1 + uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 98c5570f61aff..c65730f77a6a6 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -28,7 +28,8 @@ jobs: pip install -r requirements-lint.txt - name: Analysing the code with ruff run: | - ruff check . + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . 
- name: Spelling check with codespell run: | codespell --toml pyproject.toml diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index cda0c28c75c2a..122e4e101e201 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -eux python_executable=python$1 cuda_home=/usr/local/cuda-$2 @@ -8,13 +9,15 @@ PATH=${cuda_home}/bin:$PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH # Install requirements -$python_executable -m pip install wheel packaging 'setuptools-scm>=8' -$python_executable -m pip install -r requirements-cuda.txt +$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 # Make sure release wheels are built for the following architectures export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real" + +bash tools/check_repo.sh + # Build $python_executable setup.py bdist_wheel --dist-dir=dist diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000000000..81e7c9b050760 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,52 @@ +name: 'Close inactive issues and PRs' + +on: + schedule: + # Daily at 1:30 AM UTC + - cron: '30 1 * * *' + +jobs: + close-issues-and-pull-requests: + permissions: + issues: write + pull-requests: write + actions: write + runs-on: ubuntu-latest + steps: + - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + with: + # Increasing this value ensures that changes to this workflow + # propagate to all issues and PRs in days rather than months + operations-per-run: 1000 + + exempt-draft-pr: true + exempt-issue-labels: 'keep-open' + exempt-pr-labels: 'keep-open' + + labels-to-add-when-unstale: 'unstale' + labels-to-remove-when-stale: 'unstale' + + days-before-issue-stale: 90 + days-before-issue-close: 30 + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not + had any activity within 90 days. It will be automatically closed if no + further activity occurs within 30 days. Leave a comment if + you feel this issue should remain open. Thank you! + close-issue-message: > + This issue has been automatically closed due to inactivity. Please + feel free to reopen if you feel it is still relevant. Thank you! + + days-before-pr-stale: 90 + days-before-pr-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it + has not had any activity within 90 days. It will be automatically + closed if no further activity occurs within 30 days. Leave a comment + if you feel this pull request should remain open. Thank you! + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it. + Thank you! 
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 68eb06dea47a3..f422588a5f73b 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.readthedocs.yaml b/.readthedocs.yaml index f1959ad2743f3..42cbf18a0f712 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -13,10 +13,10 @@ sphinx: fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF -formats: - - pdf +formats: [] # Optionally declare the Python requirements required to build your docs python: install: - requirements: docs/requirements-docs.txt + diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a424ad7b110f..943424bc4edfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") # # Try to find python package with an executable that exactly matches @@ -83,24 +83,6 @@ endif() # find_package(Torch REQUIRED) -# -message(STATUS "Enabling core extension.") - -# Define _core_C extension -# built for (almost) every target platform, (excludes TPU and Neuron) - -set(VLLM_EXT_SRC - "csrc/core/torch_bindings.cpp") - -define_gpu_extension_target( - _core_C - DESTINATION vllm - LANGUAGE CXX - SOURCES ${VLLM_EXT_SRC} - COMPILE_FLAGS ${CXX_COMPILE_FLAGS} - USE_SABI 3 - WITH_SOABI) - # # Forward the non-CUDA device extensions to external CMake scripts. # @@ -187,12 +169,12 @@ endif() # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. -# Configure it to place files in vllm/.deps, in order to play nicely with sccache. +# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. +# Each dependency that produces build artifacts should override its BINARY_DIR to avoid +# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/. 
# include(FetchContent) -get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) -file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}") -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps") +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") # @@ -213,7 +195,6 @@ set(VLLM_EXT_SRC "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" "csrc/cuda_utils_kernels.cu" - "csrc/moe_align_block_size_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/torch_bindings.cpp") @@ -270,7 +251,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() # @@ -286,10 +267,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1") message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}") else() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't - # build any 3x kernels - set(SCALED_MM_3X_ARCHS) - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS) message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " @@ -299,13 +276,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Not building scaled_mm_c3x as no compatible archs found " "in CUDA target architectures") endif() + + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # build any 3x kernels + set(SCALED_MM_3X_ARCHS) endif() # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.9;9.0;9.0a" "${CUDA_ARCHS}") + "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -423,6 +404,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" + "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") set_gencode_flags_for_srcs( @@ -450,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() endif() @@ -525,8 +507,10 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd + GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5f79356bd32f7..b39fd75b5fb70 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,12 +11,14 @@ We also believe in the power of community support; thus, answering queries, offe Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. 
Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! +## License + +See [LICENSE](LICENSE). ## Developing Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. - ## Testing ```bash @@ -33,6 +35,14 @@ pytest tests/ ## Contribution Guidelines +### DCO and Signed-off-by + +When contributing changes to this project, you must agree to the [DCO](DCO). +Commits must include a `Signed-off-by:` header which certifies agreement with +the terms of the [DCO](DCO). + +Using `-s` with `git commit` will automatically add this header. + ### Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/DCO b/DCO new file mode 100644 index 0000000000000..49b8cb0549267 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile b/Dockerfile index 8405e0a88a106..343364da2ebf5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -70,16 +70,10 @@ COPY requirements-build.txt requirements-build.txt RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-build.txt -# files and directories related to build wheels -COPY csrc csrc -COPY setup.py setup.py -COPY cmake cmake -COPY CMakeLists.txt CMakeLists.txt -COPY README.md README.md -COPY requirements-common.txt requirements-common.txt -COPY requirements-cuda.txt requirements-cuda.txt -COPY pyproject.toml pyproject.toml -COPY vllm vllm +COPY . . 
+ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi # max jobs used by Ninja to build extensions ARG max_jobs=2 @@ -212,7 +206,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10 + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index b9134d4ae41cb..f1a21d6bd13fc 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --upgrade pip && \ pip install -r requirements-build.txt -# install oneDNN -RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git - -RUN --mount=type=cache,target=/root/.cache/ccache \ - cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \ - -DONEDNN_BUILD_DOC=OFF \ - -DONEDNN_BUILD_EXAMPLES=OFF \ - -DONEDNN_BUILD_TESTS=OFF \ - -DONEDNN_BUILD_GRAPH=OFF \ - -DONEDNN_ENABLE_WORKLOAD=INFERENCE \ - -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \ - cmake --build ./oneDNN/build --target install --config Release - FROM cpu-test-1 AS build WORKDIR /workspace/vllm @@ -55,7 +42,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ pip install -v -r requirements-cpu.txt -COPY ./ ./ +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... ARG VLLM_CPU_DISABLE_AVX512 diff --git a/Dockerfile.neuron b/Dockerfile.neuron index adae6db87ba87..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -17,7 +17,7 @@ RUN apt-get update && \ # When launching the container, mount the code directory to /app ARG APP_MOUNT=/app VOLUME [ ${APP_MOUNT} ] -WORKDIR ${APP_MOUNT} +WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas @@ -25,17 +25,17 @@ RUN python3 -m pip install sentencepiece transformers==4.36.2 -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -COPY . /app/vllm +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi -RUN cd /app/vllm \ - && python3 -m pip install -U \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ +RUN python3 -m pip install -U \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ - cd /app/vllm \ - && pip install --no-build-isolation -v -e . \ - && cd .. + pip install --no-build-isolation -v -e . 
CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino index 95714a3d17188..a05ff452cd36e 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -9,23 +9,17 @@ RUN apt-get update -y && \ ffmpeg libsm6 libxext6 libgl1 WORKDIR /workspace -# copy requirements -COPY requirements-build.txt /workspace/vllm/ -COPY requirements-common.txt /workspace/vllm/ -COPY requirements-openvino.txt /workspace/vllm/ - -COPY vllm/ /workspace/vllm/vllm -COPY csrc/core /workspace/vllm/csrc/core -COPY cmake/utils.cmake /workspace/vllm/cmake/ -COPY CMakeLists.txt /workspace/vllm/ -COPY setup.py /workspace/vllm/ +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi # install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace -COPY examples/ /workspace/vllm/examples -COPY benchmarks/ /workspace/vllm/benchmarks +COPY examples/ /workspace/examples +COPY benchmarks/ /workspace/benchmarks CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index 1f374b01b9bc0..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -14,11 +14,14 @@ RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p COPY ./ /workspace/vllm WORKDIR /workspace/vllm +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 @@ -30,4 +33,4 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 496e6bed7c022..8fb79afaebe97 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ torch==2.6.0.dev20240918 \ - setuptools-scm>=8 \ + 'setuptools-scm>=8' \ torchvision==0.20.0.dev20240918 \ --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ *) ;; esac @@ -117,6 +117,11 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \ FROM base AS final # Import the vLLM development directory from the build context COPY . . 
+ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +RUN python3 -m pip install --upgrade pip # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ diff --git a/Dockerfile.tpu b/Dockerfile.tpu index d8f1a42c45177..b43442e4c0af1 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,8 +1,8 @@ -ARG NIGHTLY_DATE="20240828" +ARG NIGHTLY_DATE="20241017" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE -WORKDIR /workspace +WORKDIR /workspace/vllm # Install some basic utilities RUN apt-get update && apt-get install -y \ @@ -16,14 +16,17 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html # Build vLLM. -COPY . /workspace/vllm +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi + ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ - cd /workspace/vllm && \ python3 -m pip install \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-tpu.txt -RUN cd /workspace/vllm && python3 setup.py develop +RUN python3 setup.py develop CMD ["/bin/bash"] diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 83db341556eaf..0ecb46df6256c 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -33,7 +33,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \ --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ -r requirements-xpu.txt -COPY ./ /workspace/vllm +COPY . . +ARG GIT_REPO_CHECK +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi ENV VLLM_TARGET_DEVICE=xpu diff --git a/README.md b/README.md index 7768cbfa06749..f41749ecb1148 100644 --- a/README.md +++ b/README.md @@ -13,9 +13,19 @@ Easy, fast, and cheap LLM serving for everyone | IntelÂź GaudiÂź README | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

+--- + +**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo** + +We are excited to announce the last in-person vLLM meetup of the year! +Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist! +Register [here](https://lu.ma/h0qvrajz) and be a part of the event! + +--- + *Latest News* 🔥 -- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). @@ -43,7 +53,7 @@ vLLM is fast with: - Speculative decoding - Chunked prefill -**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. +**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. vLLM is flexible and easy to use with: @@ -128,5 +138,6 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs * For technical questions and feature requests, please use Github issues or discussions. * For discussing with fellow users, please use Discord. +* For coordinating contributions and development, please use Slack. * For security disclosures, please use Github's security advisory feature. * For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
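The benchmark diffs that follow all apply the same refactor: each script drops its hand-rolled engine flags and delegates them to `EngineArgs.add_cli_args` / `EngineArgs.from_cli_args`. A minimal sketch of the resulting pattern, assuming only the helpers visible in these diffs; the `--output-len` flag and the prompt string are illustrative, not part of the patch:

```python
import dataclasses

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

# Benchmark-specific flags stay on the parser; EngineArgs.add_cli_args then
# registers every engine flag (--model, --dtype, --tensor-parallel-size, ...).
parser = FlexibleArgumentParser(description="EngineArgs-driven benchmark sketch")
parser.add_argument("--output-len", type=int, default=128)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()

# Rebuild the engine configuration from the parsed flags and hand it to LLM
# as keyword arguments, mirroring the refactored benchmark scripts.
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))

sampling_params = SamplingParams(n=1,
                                 temperature=1.0,
                                 top_p=1.0,
                                 ignore_eos=True,
                                 max_tokens=args.output_len)
llm.generate(["The future of AI is"], sampling_params, use_tqdm=True)
```

The same pattern appears below in benchmark_latency.py, benchmark_prefix_caching.py, benchmark_prioritization.py and benchmark_throughput.py; benchmark_throughput.py registers the async variant via `AsyncEngineArgs.add_cli_args` so one set of flags drives both the synchronous and async engines.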
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 4813fde27f0bc..0a903877f000d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -324,7 +324,7 @@ async def async_request_openai_chat_completions( }, ], "temperature": 0.0, - "max_tokens": request_func_input.output_len, + "max_completion_tokens": request_func_input.output_len, "stream": True, "ignore_eos": request_func_input.ignore_eos, } diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 30373b119a2ca..0a14aedd5feba 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,5 +1,6 @@ """Benchmark the latency of processing a single batch of requests.""" import argparse +import dataclasses import json import time from pathlib import Path @@ -10,45 +11,19 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): print(args) + engine_args = EngineArgs.from_cli_args(args) + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - max_num_seqs=args.batch_size, - ) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams( n=args.n, @@ -127,19 +102,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--input-len', type=int, default=32) parser.add_argument('--output-len', type=int, default=128) parser.add_argument('--batch-size', type=int, 
default=8) @@ -156,45 +118,6 @@ def run_to_completion(profile_dir: Optional[str] = None): type=int, default=30, help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', @@ -205,81 +128,12 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument('--use-v2-block-manager', - action='store_true', - default=EngineArgs.use_v2_block_manager) - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' 
- 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index f14092d347343..1aac029992dbf 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -25,6 +25,7 @@ --input-length-range 128:256 """ +import dataclasses import json import random import time @@ -130,13 +131,9 @@ def main(args): filtered_datasets = [(PROMPT, prompt_len, args.output_len) ] * args.num_prompts - llm = LLM(model=args.model, - tokenizer_mode='auto', - trust_remote_code=True, - enforce_eager=True, - use_v2_block_manager=args.use_v2_block_manager, - tensor_parallel_size=args.tensor_parallel_size, - enable_prefix_caching=args.enable_prefix_caching) + engine_args = EngineArgs.from_cli_args(args) + + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) @@ -164,22 +161,11 @@ def main(args): parser = FlexibleArgumentParser( description= 'Benchmark the performance with or without automatic prefix caching.') - parser.add_argument('--model', - type=str, - default='baichuan-inc/Baichuan2-13B-Chat') parser.add_argument("--dataset-path", type=str, default=None, help="Path to the dataset.") - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='enable prefix caching') - parser.add_argument('--use-v2-block-manager', - action='store_true', - default=EngineArgs.use_v2_block_manager, - help='Use BlockSpaceMangerV2') parser.add_argument('--num-prompts', type=int, default=1, @@ -196,9 +182,7 @@ def main(args): default='128:256', help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') - parser.add_argument("--seed", - type=int, - default=0, - help='Random seed for reproducibility') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff 
--git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 8843e3a927a01..e0c9e6a6db502 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,5 +1,6 @@ """Benchmark offline prioritization.""" import argparse +import dataclasses import json import random import time @@ -7,7 +8,8 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -62,46 +64,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - gpu_memory_utilization: float = 0.9, - download_dir: Optional[str] = None, + engine_args: EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - disable_log_stats=False, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts = [] @@ -142,16 +109,8 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.trust_remote_code, - args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, - args.enable_chunked_prefill, - args.max_num_batched_tokens, - args.gpu_memory_utilization, args.download_dir) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -173,7 +132,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -191,13 +150,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. 
Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -206,81 +158,13 @@ def main(args: argparse.Namespace): type=int, default=200, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 04999518b7138..ff06622628219 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -53,6 +53,8 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + @dataclass class BenchmarkMetrics: @@ -60,6 +62,7 @@ class BenchmarkMetrics: total_input: int total_output: int request_throughput: float + request_goodput: float output_throughput: float total_token_throughput: float mean_ttft_ms: float @@ -202,6 +205,7 @@ def sample_hf_requests( dataset_split: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, + random_seed: int, fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: dataset = load_dataset(dataset_path, @@ -210,8 +214,8 @@ def sample_hf_requests( streaming=True) assert "conversations" in dataset.features, ( "HF Dataset must have 'conversations' column.") - filtered_dataset = dataset.shuffle().filter( - lambda x: len(x["conversations"]) >= 2) + filter_func = lambda x: len(x["conversations"]) >= 2 + filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func) sampled_requests: List[Tuple[str, int, int, Dict[str, Collection[str]]]] = [] for data in filtered_dataset: @@ -315,12 +319,15 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], + gootput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 completed = 0 + good_completed = 0 itls: List[float] = [] tpots: List[float] = [] + all_tpots: List[float] = [] ttfts: List[float] = [] e2els: List[float] = [] for i in range(len(outputs)): @@ -334,9 +341,13 @@ def calculate_metrics( add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] + tpot = 0 if output_len > 1: - tpots.append( - (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - + 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) itls += outputs[i].itl ttfts.append(outputs[i].ttft) e2els.append(outputs[i].latency) @@ -344,6 +355,28 @@ def calculate_metrics( else: actual_output_lens.append(0) + if gootput_config_dict: + valid_metrics = [] + 
slo_values = [] + + if "ttft" in gootput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(gootput_config_dict["ttft"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in gootput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(gootput_config_dict["tpot"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in gootput_config_dict: + valid_metrics.append(e2els) + slo_values.append(gootput_config_dict["e2el"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + if completed == 0: warnings.warn( "All requests failed. This is likely due to a misconfiguration " @@ -354,6 +387,7 @@ def calculate_metrics( total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * @@ -372,9 +406,9 @@ def calculate_metrics( median_itl_ms=np.median(itls or 0) * 1000, percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], - mean_e2el_ms=np.median(e2els or 0) * 1000, + mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.mean(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], ) @@ -397,6 +431,8 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, + gootput_config_dict: Dict[str, float], + max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -431,42 +467,56 @@ async def benchmark( if profile: print("Starting profiler...") - profile_input = RequestFuncInput( - model=model_id, - prompt=test_prompt, - api_url=base_url + "/start_profile", - prompt_len=test_prompt_len, - output_len=test_output_len, - logprobs=logprobs, - best_of=best_of, - multi_modal_content=test_mm_content, - ) + profile_input = RequestFuncInput(model=model_id, + prompt=test_prompt, + api_url=base_url + "/start_profile", + prompt_len=test_prompt_len, + output_len=test_output_len, + logprobs=logprobs, + best_of=best_of, + multi_modal_content=test_mm_content, + ignore_eos=ignore_eos) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") print(f"Traffic request rate: {request_rate}") + print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. 
+ # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len, mm_content = request - request_func_input = RequestFuncInput( - model=model_id, - prompt=prompt, - api_url=api_url, - prompt_len=prompt_len, - output_len=output_len, - logprobs=logprobs, - best_of=best_of, - multi_modal_content=mm_content, - ) + request_func_input = RequestFuncInput(model=model_id, + prompt=prompt, + api_url=api_url, + prompt_len=prompt_len, + output_len=output_len, + logprobs=logprobs, + best_of=best_of, + multi_modal_content=mm_content, + ignore_eos=ignore_eos) tasks.append( asyncio.create_task( - request_func(request_func_input=request_func_input, - pbar=pbar))) + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -496,6 +546,7 @@ async def benchmark( tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, + gootput_config_dict=gootput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -507,6 +558,9 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) + if gootput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", @@ -518,6 +572,8 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, + "request_goodput:": + metrics.request_goodput if gootput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -571,6 +627,41 @@ def process_one_metric( return result +def check_goodput_args(args): + # Check and parse goodput arguments + gootput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + gootput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in gootput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative.") + return gootput_config_dict + + +def parse_goodput(slo_pairs): + gootput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + gootput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. 
" + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return gootput_config_dict + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -648,6 +739,7 @@ def main(args: argparse.Namespace): dataset_split=args.hf_split, num_requests=args.num_prompts, tokenizer=tokenizer, + random_seed=args.seed, fixed_output_len=args.hf_output_len, ) @@ -664,6 +756,8 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") + gootput_config_dict = check_goodput_args(args) + benchmark_result = asyncio.run( benchmark( backend=backend, @@ -682,6 +776,8 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, + gootput_config_dict=gootput_config_dict, + max_concurrency=args.max_concurrency, )) # Save config and results to json @@ -711,13 +807,16 @@ def main(args: argparse.Namespace): # Traffic result_json["request_rate"] = ( args.request_rate if args.request_rate < float("inf") else "inf") + result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result result_json = {**result_json, **benchmark_result} # Save to file base_model_id = model_id.split("/")[-1] - file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = (f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None else "") + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa if args.result_filename: file_name = args.result_filename if args.result_dir: @@ -768,6 +867,19 @@ def main(args: argparse.Namespace): default=None, help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( "--model", type=str, @@ -881,6 +993,17 @@ def main(args: argparse.Namespace): "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help="Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is in " + "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + "separated by spaces. Allowed request level metric names are " + "\"ttft\", \"tpot\", \"e2el\". 
For more context on the definition of " + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve") # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index e1a359b871e71..a49f37c7d797a 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,9 +1,10 @@ """Benchmark offline inference throughput.""" import argparse +import dataclasses import json import random import time -from typing import List, Optional, Tuple +from typing import List, Optional import torch import uvloop @@ -11,20 +12,38 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.inputs import TextPrompt +from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + + def sample_requests( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> List[SampleRequest]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -41,7 +60,7 @@ def sample_requests( random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_dataset: List[SampleRequest] = [] for i in range(len(dataset)): if len(filtered_dataset) == num_requests: break @@ -60,83 +79,34 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. 
continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, + requests: List[SampleRequest], n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - use_v2_block_manager: bool = False, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, - weights_load_device: str = None, - use_padding_aware_scheduling: bool = False, - max_num_seqs: int = 256, - max_num_prefill_seqs: int = None, + engine_args: EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - use_v2_block_manager=use_v2_block_manager, - disable_async_output_proc=disable_async_output_proc, - weights_load_device=weights_load_device, - use_padding_aware_scheduling=use_padding_aware_scheduling, - max_num_seqs=max_num_seqs, - max_num_prefill_seqs=max_num_prefill_seqs, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) use_beam_search = False @@ -146,11 +116,11 @@ def run_vllm( llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] # output_len should be the same for all requests. 
output_len = requests[0][2] - for prompt, input_len, _output_len in requests: - assert _output_len == output_len + for request in requests: + assert request.expected_output_len == output_len start = time.perf_counter() llm.beam_search( prompts, @@ -164,30 +134,9 @@ def run_vllm( async def run_vllm_async( - requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, + requests: List[SampleRequest], n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - use_v2_block_manager: bool = False, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, weights_load_device: str = None, use_padding_aware_scheduling: bool = False, @@ -195,51 +144,22 @@ async def run_vllm_async( max_num_prefill_seqs: int = None, ) -> float: from vllm import SamplingParams - engine_args = AsyncEngineArgs( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - use_v2_block_manager=use_v2_block_manager, - disable_async_output_proc=disable_async_output_proc, - worker_use_ray=False, - disable_log_requests=True, - weights_load_device=weights_load_device, - use_padding_aware_scheduling=use_padding_aware_scheduling, - max_num_prefill_seqs=max_num_prefill_seqs, - ) async with build_async_engine_client_from_engine_args( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. 
- prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append(TextPrompt(prompt=request.prompt)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) generators = [] @@ -255,7 +175,7 @@ async def run_vllm_async( def run_hf( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -313,14 +233,14 @@ def run_hf( def run_mii( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, ) -> float: from mii import client, serve llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] start = time.perf_counter() llm.generate(prompts, max_new_tokens=output_len) @@ -339,34 +259,38 @@ def main(args: argparse.Namespace): args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: # Synthesize a prompt with the given input length. - prompt = "hi" * (args.input_len - 1) - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for i in range(-10, 10): + prompt = "hi " * (args.input_len + i) + tokenized_prompt = tokenizer(prompt).input_ids + if len(tokenized_prompt) == args.input_len: + break + else: + raise ValueError( + f"Failed to synthesize a prompt with {args.input_len} tokens.") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len) + for _ in range(args.num_prompts) + ] else: requests = sample_requests(args.dataset, args.num_prompts, tokenizer, args.output_len) if args.backend == "vllm": - run_args = [ - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.num_scheduler_steps, - args.use_v2_block_manager, args.download_dir, args.load_format, - args.disable_async_output_proc, args.weights_load_device, - args.use_padding_aware_scheduling, args.max_num_seqs, - args.max_num_prefill_seqs - ] - if args.async_engine: - run_args.append(args.disable_frontend_multiprocessing) - elapsed_time = uvloop.run(run_vllm_async(*run_args)) + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + )) else: - elapsed_time = run_vllm(*run_args) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -376,10 +300,13 @@ def main(args: argparse.Namespace): args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + 
for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} tokens/s") + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s") # Output JSON results if specified if args.output_json: @@ -403,7 +330,9 @@ def main(args: argparse.Namespace): parser.add_argument("--dataset", type=str, default=None, - help="Path to the dataset.") + help="Path to the dataset. The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, @@ -413,13 +342,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -428,127 +350,15 @@ def main(args: argparse.Namespace): type=int, default=1000, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.") - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument( - "--num-scheduler-steps", - type=int, - default=1, - help="Maximum number of forward steps per scheduler call.") - parser.add_argument("--use-v2-block-manager", - action='store_true', - default=EngineArgs.use_v2_block_manager, - help="Enable block manager v2.") - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - "--disable-async-output-proc", - action='store_true', - default=False, - help="Disable async output processor for vLLM backend.") parser.add_argument("--async-engine", action='store_true', default=False, @@ -557,23 +367,7 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") - parser.add_argument("--weights-load-device", - type=str, - default=None, - choices=DEVICE_OPTIONS, - help='Device on which weights are loaded.') - parser.add_argument("--use-padding-aware-scheduling", - action='store_true', - default=False, - help="Enable padding-aware scheduling.") - parser.add_argument("--max-num-seqs", - type=int, - default=256, - help="Maximum number of requests for single decode.") - parser.add_argument("--max-num-prefill-seqs", - type=int, - default=None, - help="Maximum number of requests for single prefill.") + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 92f6053cc6d7e..7acea6087fdfd 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -3,8 +3,8 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -16,7 +16,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") layer = RMSNorm(hidden_size).to(dtype=dtype) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2ad98b7e2656..8f538c21f7f7e 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -10,7 +10,8 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -88,22 +89,23 @@ def prepare(i: int): input_gating.copy_(gating_output[i]) def run(): - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - override_config=config, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - ) + from vllm.model_executor.layers.fused_moe import override_config + with override_config(config): + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) # JIT compilation & warmup run() @@ -166,7 +168,7 @@ class BenchmarkWorker: def __init__(self, seed: int) -> None: torch.set_default_device("cuda") - seed_everything(seed) + current_platform.seed_everything(seed) self.seed = seed def benchmark( @@ -180,7 +182,7 @@ def benchmark( use_fp8_w8a8: 
bool, use_int8_w8a16: bool, ) -> Tuple[Dict[str, int], float]: - seed_everything(self.seed) + current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 87864d038d593..14eef00b855ac 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -5,8 +5,9 @@ import torch from vllm import _custom_ops as ops +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - create_kv_caches_with_random, seed_everything) + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -28,7 +29,7 @@ def main( device: str = "cuda", kv_cache_dtype: Optional[str] = None, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) scale = float(1.0 / (head_size**0.5)) query = torch.empty(num_seqs, diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 743a5744e8614..1d62483448946 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -3,8 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -17,7 +17,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") x = torch.randn(num_tokens, hidden_size, dtype=dtype) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 73fc9e9dbf461..250d505168d09 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -6,7 +6,8 @@ from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -31,7 +32,7 @@ def benchmark_rope_kernels_multi_lora( # batched RoPE can take multiple scaling factors batched_rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "type": "linear", + "rope_type": "linear", "factor": tuple(scaling_factors) }) # non-batched RoPE takes only one scaling factor, we create multiple @@ -41,7 +42,7 @@ def benchmark_rope_kernels_multi_lora( non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "type": "linear", + "rope_type": "linear", "factor": (scaling_factor, ) })) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index 203699e9a8d06..d16d6f9fba442 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -16,7 +16,6 @@ def main(args): enforce_eager=True, enable_prefix_caching=True, tensor_parallel_size=args.tensor_parallel_size, - 
use_v2_block_manager=args.use_v2_block_manager, ) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) @@ -56,8 +55,5 @@ def main(args): parser.add_argument('--enable-prefix-caching', action='store_true', help='enable prefix caching') - parser.add_argument('--use-v2-block-manager', - action='store_true', - help='Use BlockSpaceMangerV2') args = parser.parse_args() main(args) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index bc5f24d3f591c..7237d246ddf55 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -1,5 +1,8 @@ +include(FetchContent) + +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_CXX_STANDARD 17) # # Define environment variables for special configurations @@ -82,15 +85,40 @@ else() message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") endif() +# +# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms) +# +if (AVX512_FOUND AND NOT AVX512_DISABLED) + FetchContent_Declare( + oneDNN + GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git + GIT_TAG v3.5.3 + GIT_PROGRESS TRUE + GIT_SHALLOW TRUE + ) + + set(ONEDNN_LIBRARY_TYPE "STATIC") + set(ONEDNN_BUILD_DOC "OFF") + set(ONEDNN_BUILD_EXAMPLES "OFF") + set(ONEDNN_BUILD_TESTS "OFF") + set(ONEDNN_ENABLE_WORKLOAD "INFERENCE") + set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") + set(ONEDNN_BUILD_GRAPH "OFF") + set(ONEDNN_ENABLE_JIT_PROFILING "OFF") + set(ONEDNN_ENABLE_ITT_TASKS "OFF") + set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF") + set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF") + set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) + + FetchContent_MakeAvailable(oneDNN) + + list(APPEND LIBS dnnl) +endif() + message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") list(APPEND LIBS numa) -# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture. -if (AVX2_FOUND OR AVX512_FOUND) - list(APPEND LIBS dnnl) -endif() - # # _C extension # diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 24bb7299338ac..40430dae10c5b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of # dependencies that are not necessary and may not be installed. 
if (GPU_LANGUAGE STREQUAL "CUDA") - if ("${CUDA_CUDA_LIB}" STREQUAL "") - set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}") - endif() - target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB} - ${CUDA_LIBRARIES}) + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) else() target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) endif() diff --git a/collect_env.py b/collect_env.py index ae7f97f355253..80403d576d78f 100644 --- a/collect_env.py +++ b/collect_env.py @@ -267,23 +267,16 @@ def get_neuron_sdk_version(run_lambda): def get_vllm_version(): - version = "" - try: - import vllm - version = vllm.__version__ - except Exception: - pass - commit = "" - try: - import vllm - commit = vllm.__commit__ - except Exception: - pass - if version != "" and commit != "": - return f"{version}@{commit}" - if version == "" and commit == "": - return "N/A" - return version or commit + from vllm import __version__, __version_tuple__ + + if __version__ == "dev": + return "N/A (dev)" + + if len(__version_tuple__) == 4: # dev build + git_sha = __version_tuple__[-1][1:] # type: ignore + return f"{__version__} (git sha: {git_sha}" + + return __version__ def summarize_vllm_build_flags(): # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 5ed1dc3b8f792..839dc36ba4e29 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -89,6 +89,48 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] namespace vllm { +template +__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { + const float f = (float)x; + return (T)(f > threshold ? f : 0.0f); +} + +template +__global__ void act_and_mul_kernel_with_param( + scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, + const float param) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); + const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); + out[token_idx * d + idx] = ACT_FN(x, param) * y; + } +} + +} // namespace vllm + +#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, \ + PARAM); \ + }); + +void fatrelu_and_mul(torch::Tensor& out, // [..., d], + torch::Tensor& input, // [..., 2 * d] + double threshold) { + LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); +} +namespace vllm { + // Element-wise activation kernel template. template __global__ void activation_kernel( diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 0e1f360d74bd5..408e736d5bc0f 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -1,6 +1,7 @@ #pragma once -#include +// For TORCH_CHECK +#include namespace vllm { @@ -9,12 +10,7 @@ namespace vllm { // in particular it can be used to represent sub-byte data types (something // that torch.dtype currently does not support). 
// -// ScalarTypeTorch is a subclass of ScalarType that is compatible with -// TORCH_LIBRARY, making it accessible from Python as well meaning this class -// can be used as a argument for custom operators, helping to simplify these -// interfaces. -// -// The type definitions on the Python side can be found in: vllm/_core_ext.pyi +// The type definitions on the Python side can be found in: vllm/scalar_type.py // these type definitions should be kept up to date with any Python API changes // here. // @@ -308,204 +304,7 @@ class ScalarType { } }; -// Create a TORCH_LIBRARY compatible version of ScalarType (i.e. inherit from -// torch::CustomClassHolder), we use multiple inheritance here since we cannot -// have ScalarType inherit from torch::CustomClassHolder and have a constexpr -// constructor at the same time (torch::CustomClassHolder does not have a -// constexpr destructor) -// See also: -// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA -class ScalarTypeTorch : public torch::CustomClassHolder, public ScalarType { - public: - ScalarTypeTorch(int64_t exponent, int64_t mantissa, int64_t bias, - bool _signed) - : ScalarType(exponent, mantissa, bias, _signed){}; - - ScalarTypeTorch(ScalarType type) : ScalarType(type){}; - - using Base = ScalarType; - using Self = ScalarTypeTorch; - using SelfPtr = c10::intrusive_ptr; - - static void check_size_bits(int64_t size_bits, bool signed_) { - TORCH_CHECK( - size_bits <= - std::numeric_limits().mantissa)>::max(), - "size_bits bit width is too large to be represented"); - } - - static void check_bias(int64_t bias) { - using Bias = decltype(std::declval().bias); - TORCH_CHECK(bias <= std::numeric_limits::max() && - bias >= std::numeric_limits::min(), - "bias too large or small to be represented"); - } - - static void check_exponent(int64_t exponent) { - TORCH_CHECK( - exponent <= - std::numeric_limits().exponent)>::max(), - "exponent bit width is too large to be represented"); - } - - static void check_mantissa(int64_t mantissa) { - TORCH_CHECK( - mantissa <= - std::numeric_limits().mantissa)>::max(), - "mantissa bit width is too large to be represented"); - } - - static SelfPtr int_(int64_t size_bits, c10::optional bias) { - check_size_bits(size_bits, true); - check_bias(bias.value_or(0)); - return c10::make_intrusive( - ScalarType::int_(size_bits, bias.value_or(0))); - } - - static SelfPtr uint(int64_t size_bits, c10::optional bias) { - check_size_bits(size_bits, true); - check_bias(bias.value_or(0)); - return c10::make_intrusive( - ScalarType::uint(size_bits, bias.value_or(0))); - } - - static SelfPtr float_IEEE754(int64_t exponent, int64_t mantissa) { - check_mantissa(mantissa); - check_exponent(exponent); - return c10::make_intrusive( - ScalarType::float_IEEE754(exponent, mantissa)); - } - - static SelfPtr float_(int64_t exponent, int64_t mantissa, - bool finite_values_only, int64_t nan_repr) { - check_mantissa(mantissa); - check_exponent(exponent); - return c10::make_intrusive(ScalarType::float_( - exponent, mantissa, finite_values_only, NanRepr(nan_repr))); - } - - // This needs to be implemented and throw a TypeError in order for - // PyTorch's opcheck to work on ops that use ScalarTypes. - int64_t len() const { - throw c10::TypeError({__func__, __FILE__, static_cast(__LINE__)}, - "__len__ not implemented"); - return 0; - } - - // Serialize a ScalarType into a tuple of pairs. Where each pair - // is a (fieldname, value). - // For simplicity, we are just going to convert to a ScalarTypeId. 
- std::tuple> obj_flatten() const { - return {{"ScalarType", id()}}; - } - - // Deserialize a scalar type that has been serialized by obj_flatten, - // ostensibly from a tuple of (member name, value) pairs, but in reality - // just a ScalarTypeId. - static SelfPtr obj_unflatten( - std::tuple> const& flat_type) { - return c10::make_intrusive( - from_id(std::get<1>(std::get<0>(flat_type)))); - } - - template - static void bind_readonly_property(torch::class_& cls, - std::string const& name, T Base::*field) { - auto getter_func_helper = [field = std::move(field)](SelfPtr const& self) { - if constexpr (std::is_member_function_pointer_v) { - return (self.get()->*field)(); - } else { - return self.get()->*field; - } - }; - - auto getter_func = [field = std::move(field), - getter_func_helper = std::move(getter_func_helper)]( - SelfPtr const& self) { - auto val = getter_func_helper(self); - // upconvert uint8_t, int32_t etc. to int64_t for python - if constexpr (std::is_integral_v) { - return static_cast(val); - } else { - return val; - } - }; - - cls.def_property(name, getter_func); - } - - template - static void bind_function(torch::class_& cls, const std::string& name, - MemberFunc Cls::*member) { - cls.def(name, [member = std::move(member)](SelfPtr const& self) { - return (self.get()->*member)(); - }); - } - - template - static void bind_function(torch::class_& cls, const std::string& name, - Func func) { - cls.def(name, func); - } - - template - static void bind_static_function(torch::class_& cls, - const std::string& name, Func func) { - cls.def_static(name, func); - } - - static void bind_class(torch::Library& lib) { - auto cls = lib.class_("ScalarType") - .def(torch::init()); - - // Bind Properties - bind_readonly_property(cls, "mantissa", &Base::mantissa); - bind_readonly_property(cls, "exponent", &Base::exponent); - bind_readonly_property(cls, "bias", &Base::bias); - bind_readonly_property(cls, "signed", &Base::is_signed); - bind_readonly_property(cls, "size_bits", &Base::size_bits); - - // Bind member functions - bind_function(cls, "is_signed", &Base::is_signed); - bind_function(cls, "is_integer", &Base::is_integer); - bind_function(cls, "is_floating_point", &Base::is_floating_point); - bind_function(cls, "is_ieee_754", &Base::is_ieee_754); - bind_function(cls, "has_nans", &Base::has_nans); - bind_function(cls, "has_infs", &Base::has_infs); - bind_function(cls, "has_bias", &Base::has_bias); - - bind_function(cls, "max", [](SelfPtr const& self) { - return std::visit([](auto arg) { return c10::IValue(arg); }, - self.get()->max()); - }); - bind_function(cls, "min", [](SelfPtr const& self) { - return std::visit([](auto arg) { return c10::IValue(arg); }, - self.get()->min()); - }); - - bind_function(cls, "__len__", &ScalarTypeTorch::len); - bind_function(cls, "__str__", &Base::str); - bind_function(cls, "__eq__", [](SelfPtr const& self, SelfPtr const& other) { - return *self == *other; - }); - bind_function(cls, "__repr__", [](SelfPtr const& self) { - return "ScalarType." 
+ self.get()->str(); - }); - - bind_function(cls, "__obj_flatten__", &ScalarTypeTorch::obj_flatten); - bind_static_function(cls, "__obj_unflatten__", - &ScalarTypeTorch::obj_unflatten); - - // Bind static functions (convenience constructors) - bind_static_function(cls, "int_", &ScalarTypeTorch::int_); - bind_static_function(cls, "uint", &ScalarTypeTorch::uint); - bind_static_function(cls, "float_IEEE754", &ScalarTypeTorch::float_IEEE754); - bind_static_function(cls, "float_", &ScalarTypeTorch::float_); - } -}; - -using ScalarTypeId = int64_t; -using ScalarTypeTorchPtr = c10::intrusive_ptr; +using ScalarTypeId = ScalarType::Id; // "rust style" names generally following: // https://github.com/pytorch/pytorch/blob/6d9f74f0af54751311f0dd71f7e5c01a93260ab3/torch/csrc/api/include/torch/types.h#L60-L70 diff --git a/csrc/core/torch_bindings.cpp b/csrc/core/torch_bindings.cpp deleted file mode 100644 index f60254189a2f7..0000000000000 --- a/csrc/core/torch_bindings.cpp +++ /dev/null @@ -1,16 +0,0 @@ -#include - -#include "scalar_type.hpp" -#include "registration.h" - -// Note the CORE exstension will be built for (almost) all hardware targets so -// new additions must account for this. (currently not built for TPU and Neuron) - -TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, lib) { - // ScalarType, a custom class for representing data types that supports - // quantized types, declared here so it can be used when creating interfaces - // for custom ops. - vllm::ScalarTypeTorch::bind_class(lib); -} - -REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 5b1d3d6442b2b..a325153b470cc 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec { void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } }; +#ifdef __AVX512F__ +struct INT32Vec16: public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512i reg; + int32_t values[VEC_ELEM_NUM]; + }; + + __m512i reg; + + explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { + _mm512_storeu_epi32(ptr, reg); + } + + void save(int32_t* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm512_mask_storeu_epi32(ptr, mask, reg); + } +}; +#endif + #ifdef __AVX512F__ struct FP32Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; @@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} - explicit FP32Vec16(const FP32Vec4 &data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( @@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const INT32Vec16 &v) + : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + FP32Vec16 operator*(const FP32Vec16 &b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } @@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg)); } + FP32Vec16 min(const FP32Vec16& b) const { + return FP32Vec16(_mm512_min_ps(reg, b.reg)); + } + + FP32Vec16 min(const FP32Vec16& b, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); + } + FP32Vec16 abs() 
const { return FP32Vec16(_mm512_abs_ps(reg)); } @@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec { float reduce_max() const { return _mm512_reduce_max_ps(reg); } + float reduce_min() const { return _mm512_reduce_min_ps(reg); } + template float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index 2d7abe6145fee..b493fd793818a 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -5,25 +5,29 @@ namespace { template struct KernelVecType { using load_vec_type = void; + using azp_adj_load_vec_type = void; using cvt_vec_type = void; }; template <> struct KernelVecType { using load_vec_type = vec_op::FP32Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; using cvt_vec_type = vec_op::FP32Vec16; }; template <> struct KernelVecType { using load_vec_type = vec_op::BF16Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; using cvt_vec_type = vec_op::FP32Vec16; }; #ifdef __AVX512F__ -template +template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int num_tokens, + const float* scale, const int32_t* azp, + const int num_tokens, const int hidden_size) { using load_vec_t = typename KernelVecType::load_vec_type; using cvt_vec_t = typename KernelVecType::cvt_vec_type; @@ -37,62 +41,110 @@ void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, const cvt_vec_t i8_min_vec(i8_min); const cvt_vec_t i8_max_vec(i8_max); + cvt_vec_t zp_vec; + if constexpr (AZP) { + zp_vec = cvt_vec_t(static_cast(*azp)); + } + #pragma omp parallel for for (int i = 0; i < num_tokens; ++i) { int j = 0; for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { load_vec_t elems(input + i * hidden_size + j); cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec); + elems_fp32 = elems_fp32 * inv_scale; + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; + } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); vec_op::INT8Vec16 elems_int8(elems_fp32); elems_int8.save(output + i * hidden_size + j); } load_vec_t elems(input + i * hidden_size + j); cvt_vec_t elems_fp32(elems); - elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec); - vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_fp32 = elems_fp32 * inv_scale; - if (j + vec_elem_num == hidden_size) { - elems_int8.save(output + i * hidden_size + j); - } else { - elems_int8.save(output + i * hidden_size + j, hidden_size - j); + if constexpr (AZP) { + elems_fp32 = elems_fp32 + zp_vec; } + + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j, hidden_size - j); } } -template +template void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, const int num_tokens, + float* scale, int32_t* azp, + const int num_tokens, const int hidden_size) { using load_vec_t = typename KernelVecType::load_vec_type; using cvt_vec_t = typename KernelVecType::cvt_vec_type; constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + constexpr float i8_min = + static_cast(std::numeric_limits::min()); + constexpr float i8_max = + static_cast(std::numeric_limits::max()); + const cvt_vec_t i8_min_vec(i8_min); + const cvt_vec_t i8_max_vec(i8_max); + #pragma omp parallel for for (int i = 0; i < num_tokens; ++i) { - cvt_vec_t max_abs(0.0); + cvt_vec_t max_value(std::numeric_limits::lowest()); 
+ cvt_vec_t min_value(std::numeric_limits::max()); { int j = 0; for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { load_vec_t elems(input + i * hidden_size + j); cvt_vec_t elems_fp32(elems); - max_abs = max_abs.max(elems_fp32.abs()); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } } load_vec_t elems(input + i * hidden_size + j); cvt_vec_t elems_fp32(elems); if (j + vec_elem_num == hidden_size) { - max_abs = max_abs.max(elems_fp32.abs()); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32); + min_value = min_value.min(elems_fp32); + } else { + max_value = max_value.max(elems_fp32.abs()); + } } else { - max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j); + if constexpr (AZP) { + max_value = max_value.max(elems_fp32, hidden_size - j); + min_value = min_value.min(elems_fp32, hidden_size - j); + } else { + max_value = max_value.max(elems_fp32.abs(), hidden_size - j); + } } } - float scale_val = max_abs.reduce_max() / 127.0f; - scale[i] = scale_val; + float scale_val, azp_val; + if constexpr (AZP) { + float max_scalar = max_value.reduce_max(); + float min_scalar = min_value.reduce_min(); + scale_val = (max_scalar - min_scalar) / 255.0f; + azp_val = std::nearbyint(-128.0f - min_scalar / scale_val); + azp[i] = static_cast(azp_val); + scale[i] = scale_val; + } else { + scale_val = max_value.reduce_max() / 127.0f; + scale[i] = scale_val; + } + const cvt_vec_t inv_scale(1.0 / scale_val); + const cvt_vec_t azp_vec(azp_val); { int j = 0; @@ -100,6 +152,11 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, load_vec_t elems(input + i * hidden_size + j); cvt_vec_t elems_fp32(elems); elems_fp32 = (elems_fp32 * inv_scale); + + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; + } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); vec_op::INT8Vec16 elems_int8(elems_fp32); elems_int8.save(output + i * hidden_size + j); } @@ -107,34 +164,111 @@ void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, load_vec_t elems(input + i * hidden_size + j); cvt_vec_t elems_fp32(elems); elems_fp32 = (elems_fp32 * inv_scale); - vec_op::INT8Vec16 elems_int8(elems_fp32); - if (j + vec_elem_num == hidden_size) { - elems_int8.save(output + i * hidden_size + j); - } else { - elems_int8.save(output + i * hidden_size + j, hidden_size - j); + if constexpr (AZP) { + elems_fp32 = elems_fp32 + azp_vec; } + elems_fp32 = elems_fp32.clamp(i8_min_vec, i8_max_vec); + vec_op::INT8Vec16 elems_int8(elems_fp32); + elems_int8.save(output + i * hidden_size + j, hidden_size - j); } } } -template -void dynamic_output_scale_impl(const float* input, scalar_t* output, - const float* scale, const scalar_t* bias, - const int num_tokens, const int hidden_size) { +template +void static_quant_epilogue(const float* input, scalar_t* output, + const float a_scale, const float* b_scale, + const int32_t* azp_with_adj, const int num_tokens, + const int hidden_size) { CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl) using load_vec_t = typename KernelVecType::load_vec_type; + using azp_adj_load_vec_t = + typename KernelVecType::azp_adj_load_vec_type; + using cvt_vec_t = typename KernelVecType::cvt_vec_type; + constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; + + #pragma omp parallel for + for (int i = 0; i < num_tokens; ++i) { + cvt_vec_t a_scale_vec(a_scale); + cvt_vec_t b_scale_vec(*b_scale); + cvt_vec_t scale_vec = a_scale_vec * b_scale_vec; + + int j = 0; 
+ for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { + cvt_vec_t elems_fp32(input + i * hidden_size + j); + azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + + if constexpr (PerChannel) { + b_scale_vec = cvt_vec_t(b_scale + j); + scale_vec = b_scale_vec * a_scale_vec; + } + + elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j); + } + + cvt_vec_t elems_fp32(input + i * hidden_size + j); + azp_adj_load_vec_t azp_adj_vec(azp_with_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + + if constexpr (PerChannel) { + b_scale_vec = cvt_vec_t(b_scale + j); + scale_vec = b_scale_vec * a_scale_vec; + } + + elems_fp32 = elems_fp32 - scale_vec * azp_adj_fp32; + + load_vec_t elems_out(elems_fp32); + elems_out.save(output + i * hidden_size + j, hidden_size - j); + } +} + +template +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const float* b_scale, + const int32_t* azp, const int32_t* azp_adj, + const scalar_t* bias, const int num_tokens, + const int hidden_size) { + CPU_KERNEL_GUARD_IN(dynamic_quant_epilogue) + using load_vec_t = typename KernelVecType::load_vec_type; + using azp_adj_load_vec_t = + typename KernelVecType::azp_adj_load_vec_type; using cvt_vec_t = typename KernelVecType::cvt_vec_type; constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM; #pragma omp parallel for for (int i = 0; i < num_tokens; ++i) { int j = 0; - cvt_vec_t token_scale_vec(scale[i]); + cvt_vec_t token_scale_vec(a_scale[i]); + cvt_vec_t token_zp_scale_vec; + if constexpr (AZP) { + float zp_scale_val = a_scale[i] * static_cast(azp[i]); + if constexpr (!PerChannel) { + zp_scale_val *= *b_scale; + } + token_zp_scale_vec = cvt_vec_t(zp_scale_val); + } + for (; j < hidden_size - vec_elem_num; j += vec_elem_num) { cvt_vec_t elems_fp32(input + i * hidden_size + j); elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + azp_adj_load_vec_t azp_adj_vec(azp_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; + + if constexpr (PerChannel) { + cvt_vec_t b_scale_vec(b_scale + j); + azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; + } + + elems_fp32 = elems_fp32 - azp_adj_fp32; + } + if constexpr (Bias) { load_vec_t bias_vec(bias + j); cvt_vec_t bias_vec_fp32(bias_vec); @@ -148,6 +282,19 @@ void dynamic_output_scale_impl(const float* input, scalar_t* output, cvt_vec_t elems_fp32(input + i * hidden_size + j); elems_fp32 = elems_fp32 * token_scale_vec; + if constexpr (AZP) { + azp_adj_load_vec_t azp_adj_vec(azp_adj + j); + cvt_vec_t azp_adj_fp32(azp_adj_vec); + azp_adj_fp32 = azp_adj_fp32 * token_zp_scale_vec; + + if constexpr (PerChannel) { + cvt_vec_t b_scale_vec(b_scale + j); + azp_adj_fp32 = azp_adj_fp32 * b_scale_vec; + } + + elems_fp32 = elems_fp32 - azp_adj_fp32; + } + if constexpr (Bias) { load_vec_t bias_vec(bias + j); cvt_vec_t bias_vec_fp32(bias_vec); @@ -155,32 +302,41 @@ void dynamic_output_scale_impl(const float* input, scalar_t* output, } load_vec_t elems_out(elems_fp32); - - if (j + vec_elem_num == hidden_size) { - elems_out.save(output + i * hidden_size + j); - } else { - elems_out.save(output + i * hidden_size + j, hidden_size - j); - } + elems_out.save(output + i * hidden_size + j, hidden_size - j); } } #else template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - const float* scale, const int num_tokens, + const float* scale, const int32_t* azp, + const int 
num_tokens, const int hidden_size) { TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.") } template void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, - float* scale, const int num_tokens, + float* scale, int32_t* azp, + const int num_tokens, const int hidden_size) { TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.") } +template +void static_quant_epilogue(const float* input, scalar_t* output, + const float a_scale, const float* b_scale, + const int32_t* azp_with_adj, const int num_tokens, + const int hidden_size) { + TORCH_CHECK(false, "static_quant_epilogue requires AVX512 support.") +} + template -void dynamic_output_scale_impl() { - TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.") +void dynamic_quant_epilogue(const float* input, scalar_t* output, + const float* a_scale, const float* b_scale, + const int32_t* azp, const int32_t* azp_with_adj, + const scalar_t* bias, const int num_tokens, + const int hidden_size) { + TORCH_CHECK(false, "dynamic_quant_epilogue requires AVX512 support.") } #endif } // namespace @@ -214,39 +370,52 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major bias->dim() == 1); } - VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] { + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm", [&] { if (a_scales.numel() != 1) { // per-token // Note: oneDNN doesn't support per-token activation quantization + // Ideally we want to fuse the GEMM and the scale procedure with oneDNN + // JIT, the intermediate data is cached in registers or L1. But for now + // the oneDNN GEMM code generation only supports two quantization + // patterns: per-tensor or per-output-channel of weight. + // So we have to apply the per-token scale with a 'epilogue'. In C=s_a * + // s_b * (A@B) + bias, the C_inter = s_b * (A@B) is computed by oneDNN + // GEMM, then the per-token scale (and bias) is applied with the epilogue + // C=s_a * C_inter + bias. 
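The comment above decomposes per-token W8A8 GEMM into a per-tensor/per-channel oneDNN GEMM followed by an epilogue. A minimal NumPy sketch of that decomposition, for reference only (shapes and names are illustrative assumptions, not the kernel interface):

    import numpy as np

    def per_token_w8a8_gemm(A_q, B_q, s_a, s_b, bias=None):
        # A_q: [M, K] int8 activations, s_a: [M] per-token activation scales
        # B_q: [K, N] int8 weights,     s_b: scalar or [N] per-output-channel weight scales
        C_inter = (A_q.astype(np.int32) @ B_q.astype(np.int32)) * s_b  # what the oneDNN GEMM produces
        C = s_a[:, None] * C_inter                                     # per-token scale applied as an epilogue
        return C + bias if bias is not None else C

This is roughly the C = s_a * s_b * (A@B) + bias factorization that the code below realizes with gemm_s8s8_jit plus dynamic_quant_epilogue.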
torch::Tensor tmp_fp32_out = torch::empty_like(c, ::at::ScalarType::Float); - DNNLPrimitiveHelper::gemm_s8s8_jit( + // Compute C_inter=s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( a.data_ptr(), b.data_ptr(), - tmp_fp32_out.data_ptr(), (void*)(0), a.size(0), b.size(1), - a.size(1), (float*)(0), b_scales.data_ptr(), 0, - b_scales.numel()); + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); if (bias.has_value()) { - dynamic_output_scale_impl( + // Compute C=s_a * C_inter + bias + dynamic_quant_epilogue( tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), bias->data_ptr(), c.size(0), - c.size(1)); + a_scales.data_ptr(), nullptr, nullptr, nullptr, + bias->data_ptr(), c.size(0), c.size(1)); } else { - dynamic_output_scale_impl( + // Compute C=s_a * C_inter + dynamic_quant_epilogue( tmp_fp32_out.data_ptr(), c.data_ptr(), - a_scales.data_ptr(), (scalar_t*)(0), c.size(0), c.size(1)); + a_scales.data_ptr(), nullptr, nullptr, nullptr, nullptr, + c.size(0), c.size(1)); } } else { // per-tensor if (bias.has_value()) { + // Compute C=s_a * s_b * (A@B) + bias DNNLPrimitiveHelper::gemm_s8s8_jit( a.data_ptr(), b.data_ptr(), c.data_ptr(), bias->data_ptr(), a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); } else { - DNNLPrimitiveHelper::gemm_s8s8_jit( + // Compute C=s_a * s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( a.data_ptr(), b.data_ptr(), c.data_ptr(), - (void*)(0), a.size(0), b.size(1), a.size(1), + nullptr, a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); } @@ -254,6 +423,127 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major }); } +void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major + const torch::Tensor& a, // [M, IC], row-major + const torch::Tensor& b, // [IC, OC], column-major + const torch::Tensor& a_scales, // [1] or [M] + const torch::Tensor& b_scales, // [1] or [OC] + const torch::Tensor& azp_adj, // [OC] + const c10::optional& azp, // [1] or [M] + const c10::optional& bias // [OC] +) { + CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) + // Checks for conformality + TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8, + "int8_scaled_mm_azp only supports INT8 inputs.") + TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); + TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && + b.size(1) == c.size(1)); + TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0)); + TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1)); + + // Check for strides and alignment + TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major + TORCH_CHECK(b.stride(0) == 1); // Column-major + TORCH_CHECK(c.stride(0) % 16 == 0 && + b.stride(1) % 16 == 0); // 16 Byte Alignment + TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous()); + + if (bias) { + TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous()); + } + if (azp) { + TORCH_CHECK(azp->numel() == a.size(0) && azp->is_contiguous()); + } + TORCH_CHECK(azp_adj.numel() == b.size(1) && azp_adj.is_contiguous()); + + // azp & bias types + TORCH_CHECK(azp_adj.dtype() == torch::kInt32); + TORCH_CHECK(!azp || azp->dtype() == torch::kInt32); + TORCH_CHECK(!bias || bias->dtype() == c.dtype(), + "currently bias dtype must match output dtype ", c.dtype()); + + VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "int8_scaled_mm_azp", [&] { + torch::Tensor tmp_fp32_out = 
torch::empty_like(c, ::at::ScalarType::Float); + if (a_scales.numel() != 1) { + // per-token + // Note: oneDNN doesn't support per-token activation quantization + // Compute C_inter=s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), nullptr, b_scales.data_ptr(), 0, b_scales.numel()); + if (bias.has_value()) { + // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + bias + if (b_scales.numel() != 1) { + // Per-Channel + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } else { + // Per-Tensor + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), + bias->data_ptr(), c.size(0), c.size(1)); + } + } else { + // Compute C=s_a * C_inter - s_a * s_b * azp * azp_adj + if (b_scales.numel() != 1) { + // Per-Channel + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), nullptr, + c.size(0), c.size(1)); + } else { + // Per-Tensor + dynamic_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + a_scales.data_ptr(), b_scales.data_ptr(), + azp->data_ptr(), azp_adj.data_ptr(), nullptr, + c.size(0), c.size(1)); + } + } + } else { + // per-tensor + if (bias.has_value()) { + // Compute C_inter=s_a * s_b * (A@B) + bias + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), bias->data_ptr(), + a.size(0), b.size(1), a.size(1), a_scales.data_ptr(), + b_scales.data_ptr(), a_scales.numel(), b_scales.numel()); + } else { + // Compute C_inter=s_a * s_b * (A@B) + DNNLPrimitiveHelper::gemm_s8s8_jit( + a.data_ptr(), b.data_ptr(), + tmp_fp32_out.data_ptr(), nullptr, a.size(0), b.size(1), + a.size(1), a_scales.data_ptr(), b_scales.data_ptr(), + a_scales.numel(), b_scales.numel()); + } + + // Compute C=C_inter - s_a * s_b * azp_adj + if (b_scales.numel() != 1) { + // Per-Channel + static_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + *a_scales.data_ptr(), b_scales.data_ptr(), + azp_adj.data_ptr(), a.size(0), b.size(1)); + } else { + // Per-Tensor + static_quant_epilogue( + tmp_fp32_out.data_ptr(), c.data_ptr(), + *a_scales.data_ptr(), b_scales.data_ptr(), + azp_adj.data_ptr(), a.size(0), b.size(1)); + } + } + }); +} + // static-per-tensor quantization. 
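For reference, the asymmetric (zero-point) int8 quantization used by the entry points below can be summarized in a few lines of NumPy; this is an illustrative sketch of the arithmetic, not the vectorized implementation:

    import numpy as np

    def static_int8_quant(x, scale, azp=0):
        # q = round(x / scale + azp), clamped to the int8 range
        return np.clip(np.rint(x / scale + azp), -128, 127).astype(np.int8)

    def dynamic_int8_quant_azp(row):
        # per-token: derive scale and zero point from the row's min/max
        lo, hi = row.min(), row.max()
        scale = (hi - lo) / 255.0
        azp = int(np.rint(-128.0 - lo / scale))
        return static_int8_quant(row, scale, azp), scale, azp

The dynamic path mirrors the scale_val/azp_val computation in dynamic_scaled_int8_quant_impl; the symmetric (no-azp) path simply uses max(|x|)/127 with no offset.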
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] @@ -263,15 +553,22 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scale.numel() == 1); - TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU."); + TORCH_CHECK(!azp.has_value() || azp->numel() == 1); const int hidden_size = input.size(-1); const int num_tokens = input.numel() / hidden_size; VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "static_scaled_int8_quant_impl", [&] { - static_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), num_tokens, hidden_size); + if (azp.has_value()) { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + hidden_size); + } else { + static_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, hidden_size); + } }); } @@ -284,14 +581,20 @@ void dynamic_scaled_int8_quant( CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); - TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU."); int const hidden_size = input.size(-1); int const num_tokens = input.numel() / hidden_size; VLLM_DISPATCH_FLOATING_TYPES( input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] { - dynamic_scaled_int8_quant_impl( - input.data_ptr(), out.data_ptr(), - scale.data_ptr(), num_tokens, hidden_size); + if (azp.has_value()) { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), azp->data_ptr(), num_tokens, + hidden_size); + } else { + dynamic_scaled_int8_quant_impl( + input.data_ptr(), out.data_ptr(), + scale.data_ptr(), nullptr, num_tokens, hidden_size); + } }); } diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index ab697e3e6aef7..03beefbc6de7d 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -11,6 +11,13 @@ void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b_scales, const c10::optional& bias); +void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, + const torch::Tensor& b, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, + const torch::Tensor& azp_adj, + const c10::optional& azp, + const c10::optional& bias); + TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops @@ -111,6 +118,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor b, Tensor a_scales," " Tensor b_scales, Tensor? bias) -> ()"); ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); + // w8a8 GEMM, supporting asymmetric per-tensor or per-row/column + // quantization. + ops.def( + "cutlass_scaled_mm_azp(Tensor! out, Tensor a," + " Tensor b, Tensor a_scales," + " Tensor b_scales, Tensor azp_adj," + " Tensor? azp, Tensor? 
bias) -> ()"); + ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp); #endif } diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 30831efdfa1a2..498d069c05f0d 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -55,6 +55,7 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, const at::Tensor out, const c10::optional& bias, bool silu_activation, + int64_t pad_slot_id, const c10::optional& query_start_loc = std::nullopt, const c10::optional& cache_indices = std::nullopt, const c10::optional& has_initial_state = std::nullopt) { @@ -66,6 +67,7 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, params.dim = dim; params.seqlen = seqlen; params.width = width; + params.pad_slot_id = pad_slot_id; params.silu_activation = silu_activation; @@ -90,14 +92,16 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, } -at::Tensor -causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, +void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, const c10::optional &bias_, const c10::optional &conv_states, const c10::optional &query_start_loc, const c10::optional &cache_indices, const c10::optional &has_initial_state, - bool silu_activation) { + bool silu_activation, + // used to identify padding entries if cache_indices provided + // in case of padding, the kernel will return early + int64_t pad_slot_id) { auto input_type = x.scalar_type(); auto weight_type = weight.scalar_type(); TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); @@ -153,12 +157,13 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, CHECK_SHAPE(cache_indices_, batch_size); } - at::Tensor out = torch::empty_like(x); + at::Tensor out = x; ConvParamsBase params; set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out, bias_, silu_activation, + pad_slot_id, query_start_loc, cache_indices, has_initial_state @@ -183,18 +188,19 @@ causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] { causal_conv1d_fwd_cuda(params, stream); }); - return out; } -at::Tensor -causal_conv1d_update(const at::Tensor &x, +void causal_conv1d_update(const at::Tensor &x, const at::Tensor &conv_state, const at::Tensor &weight, const c10::optional &bias_, bool silu_activation, const c10::optional &cache_seqlens_, - const c10::optional &conv_state_indices_) { + const c10::optional &conv_state_indices_, + // used to identify padding entries if cache_indices provided + // in case of padding, the kernel will return early + int64_t pad_slot_id) { auto input_type = x.scalar_type(); auto weight_type = weight.scalar_type(); TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); @@ -227,12 +233,13 @@ causal_conv1d_update(const at::Tensor &x, CHECK_SHAPE(bias, dim); } - at::Tensor out = torch::empty_like(x); + at::Tensor out = x; ConvParamsBase params; set_conv_params_fwd(params, batch_size, dim, seqlen, width, x, weight, out, bias_, - silu_activation); + silu_activation, + pad_slot_id); params.conv_state_ptr = conv_state.data_ptr(); params.conv_state_len = conv_state_len; // All stride are in elements, not bytes. 
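causal_conv1d_fwd and causal_conv1d_update now write their result back into x (out = x) and take a pad_slot_id sentinel: batch entries whose cache index equals pad_slot_id are padding, and the kernel returns early for them. A rough NumPy reference of the single-token update path, ignoring bias and the SiLU activation (names and shapes are assumptions for illustration only):

    import numpy as np

    def causal_conv1d_update_ref(x, conv_state, weight, cache_indices, pad_slot_id):
        # x:          [batch, dim]              new token activations, updated in place like the kernel
        # conv_state: [num_slots, dim, width-1] rolling per-slot history
        # weight:     [dim, width]              depthwise causal filter
        for b, slot in enumerate(cache_indices):
            if slot == pad_slot_id:            # padding entry: skip, mirroring the kernel's early return
                continue
            hist = np.concatenate([conv_state[slot], x[b][:, None]], axis=1)  # [dim, width]
            x[b] = (hist * weight).sum(axis=1)                                # depthwise causal conv
            conv_state[slot] = hist[:, 1:]                                    # roll the state forward
        return x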
@@ -274,7 +281,6 @@ causal_conv1d_update(const at::Tensor &x, DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] { causal_conv1d_update_cuda(params, stream); }); - return out; } template @@ -340,7 +346,10 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr : reinterpret_cast(params.cache_indices_ptr); int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; - + // cache_index == params.pad_slot_id is defined as padding, so we exit early + if (cache_index == params.pad_slot_id){ + return; + } input_t *conv_states = params.conv_states_ptr == nullptr ? nullptr : reinterpret_cast(params.conv_states_ptr) + cache_index * params.conv_states_batch_stride + channel_id * params.conv_states_c_stride; @@ -409,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize); } out += kChunkSize; + + int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); + // in case the final state is separated between the last "smem_exchange" and + // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), + // (which occurs when `final_state_position` is a non-positivie index) + // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it + if (final_state_position < 0 && seqlen > kWidth){ + input_t vals_load[kNElts] = {0}; + if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){ + // chunk = n_chunks - 2, a segment of the final state sits in the last index + reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1]; + #pragma unroll + for (int w = 0; w < -final_state_position; ++w){ + conv_states[w] = vals_load[kNElts + final_state_position + w]; + } + } + if ((chunk == n_chunks - 1) && tidx == 0){ + // chunk = n_chunks - 1, the second segment of the final state first positions + reinterpret_cast(vals_load)[0] = smem_exchange[0]; + for (int w = -final_state_position; w < kWidth - 1; ++w){ + conv_states[w] = vals_load[w + final_state_position]; + } + return; + } + } } // Final state is stored in the smem_exchange last token slot, // in case seqlen < kWidth, we would need to take the final state from the @@ -437,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { } else { // in case the final state is in between the threads data - reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; - reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; const int offset = ((seqlen - (kWidth - 1)) % (kNElts)); + if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){ + // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a + // illegal access error on H100. + // Therefore, we access last_thread + 1, only if the final state data sits there + reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; + } + reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; #pragma unroll for (int w = 0; w < kWidth - 1; ++w){ conv_states[w] = x_vals_load[offset + w ]; @@ -528,6 +567,10 @@ void causal_conv1d_update_kernel(ConvParamsBase params) { const int conv_state_batch_coord = params.conv_state_indices_ptr == nullptr ? 
batch_id : params.conv_state_indices_ptr[batch_id]; + // conv_state_batch_coord == params.pad_slot_id is defined as padding so we exit early + if (conv_state_batch_coord == params.pad_slot_id){ + return; + } input_t *conv_state = reinterpret_cast(params.conv_state_ptr) + conv_state_batch_coord * params.conv_state_batch_stride + channel_id * params.conv_state_c_stride; diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.h b/csrc/mamba/causal_conv1d/causal_conv1d.h index 49e37ee4528be..e26684a2b98b8 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.h +++ b/csrc/mamba/causal_conv1d/causal_conv1d.h @@ -13,6 +13,7 @@ struct ConvParamsBase { using index_t = uint32_t; int batch, dim, seqlen, width; + int64_t pad_slot_id; bool silu_activation; index_t x_batch_stride; diff --git a/csrc/mamba/mamba_ssm/selective_scan.h b/csrc/mamba/mamba_ssm/selective_scan.h index 580d0b2e17e74..563d2fe4ef65b 100644 --- a/csrc/mamba/mamba_ssm/selective_scan.h +++ b/csrc/mamba/mamba_ssm/selective_scan.h @@ -21,6 +21,7 @@ struct SSMParamsBase { int dim_ngroups_ratio; bool is_variable_B; bool is_variable_C; + int64_t pad_slot_id; bool delta_softplus; diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 6b225b41d295d..71624696338d0 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -115,6 +115,10 @@ void selective_scan_fwd_kernel(SSMParamsBase params) { const int* cache_indices = params.cache_indices_ptr == nullptr ? nullptr : reinterpret_cast(params.cache_indices_ptr); const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id]; + // cache_index == params.pad_slot_id is defined as padding, so we exit early + if (cache_index == params.pad_slot_id){ + return; + } input_t *u = reinterpret_cast(params.u_ptr) + sequence_start_index * params.u_batch_stride + dim_id * kNRows * params.u_d_stride; input_t *delta = reinterpret_cast(params.delta_ptr) + sequence_start_index * params.delta_batch_stride @@ -387,7 +391,6 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const size_t seqlen, const size_t dstate, const size_t n_groups, - const size_t n_chunks, const bool is_variable_B, const bool is_variable_C, // device pointers @@ -407,7 +410,8 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const c10::optional& query_start_loc, const c10::optional& cache_indices, const c10::optional& has_initial_state, - bool varlen) { + bool varlen, + int64_t pad_slot_id) { // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -417,8 +421,8 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, params.seqlen = seqlen; params.dstate = dstate; params.n_groups = n_groups; - params.n_chunks = n_chunks; params.dim_ngroups_ratio = dim / n_groups; + params.pad_slot_id = pad_slot_id; params.delta_softplus = delta_softplus; @@ -507,7 +511,10 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const c10::optional &query_start_loc, const c10::optional &cache_indices, const c10::optional &has_initial_state, - const torch::Tensor &ssm_states) { + const torch::Tensor &ssm_states, + // used to identify padding entries if cache_indices provided + // in case of padding, the kernel will return early + int64_t pad_slot_id) { auto input_type = u.scalar_type(); auto weight_type = A.scalar_type(); TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16); @@ -618,18 +625,14 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor 
&delta, out_z = z; - const int n_chunks = (seqlen + 2048 - 1) / 2048; - // const int n_chunks = (seqlen + 1024 - 1) / 1024; - // at::Tensor out = torch::empty_like(u); // Right now u has BHL layout and delta has HBL layout, and we want out to have HBL layout at::Tensor out = delta; TORCH_CHECK(ssm_states.scalar_type() == input_type); TORCH_CHECK(ssm_states.is_cuda()); TORCH_CHECK(ssm_states.stride(-1) == 1); - CHECK_SHAPE(ssm_states, batch_size, dim, dstate); SSMParamsBase params; - set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C, + set_ssm_params_fwd(params, batch_size, dim, seqlen, dstate, n_groups, is_variable_B, is_variable_C, u, delta, A, B, C, out, z, out_z, D_, delta_bias_, @@ -639,7 +642,8 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, query_start_loc, cache_indices, has_initial_state, - varlen + varlen, + pad_slot_id ); diff --git a/csrc/moe/marlin_moe_ops.cu b/csrc/moe/marlin_moe_ops.cu index e2db4e4196b6f..5f12483e951e8 100644 --- a/csrc/moe/marlin_moe_ops.cu +++ b/csrc/moe/marlin_moe_ops.cu @@ -484,21 +484,22 @@ torch::Tensor marlin_gemm_moe( const torch::Tensor& topk_ids, const torch::Tensor& b_scales, torch::Tensor& b_zeros, const torch::Tensor& g_idx, const torch::Tensor& perm, torch::Tensor& workspace, - vllm::ScalarTypeTorchPtr const& b_q_type, int64_t size_m, int64_t size_n, + vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool replicate_input, bool apply_weights) { + vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); bool has_zp = b_zeros.size(1) != 0; if (has_zp) { TORCH_CHECK( - *b_q_type == vllm::kU4, - "b_q_type must be u4 when has_zp = True. Got = ", b_q_type->str()); + b_q_type == vllm::kU4, + "b_q_type must be u4 when has_zp = True. Got = ", b_q_type.str()); } else { TORCH_CHECK( - *b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, - "b_q_type must be uint4b8 or uint8b128. Got = ", b_q_type->str()); + b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128, + "b_q_type must be uint4b8 or uint8b128. 
Got = ", b_q_type.str()); } - int pack_factor = 32 / b_q_type->size_bits(); + int pack_factor = 32 / b_q_type.size_bits(); int max_par = 4; @@ -575,7 +576,7 @@ torch::Tensor marlin_gemm_moe( topk_weights.data_ptr(), topk_ids.data_ptr(), b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), expert_offsets.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), - *b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, + b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, num_experts, topk, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, max_par, replicate_input, apply_weights); diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu similarity index 59% rename from csrc/moe_align_block_size_kernels.cu rename to csrc/moe/moe_align_sum_kernels.cu index 1f8d75da83bb8..fff7ce34c838a 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -1,15 +1,17 @@ #include #include +#include #include #include -#include "cuda_compat.h" -#include "dispatch_utils.h" +#include "../cuda_compat.h" +#include "../dispatch_utils.h" #define CEILDIV(x, y) (((x) + (y) - 1) / (y)) namespace vllm { +namespace moe { namespace { __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, @@ -32,10 +34,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, extern __shared__ int32_t shared_mem[]; int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) + shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) int32_t* cumsum = - shared_mem + (num_experts + 1) * - num_experts; // 1d tensor with shape (num_experts + 1) + shared_mem + + (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -53,10 +55,12 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, __syncthreads(); // For each expert we accumulate the token counts from the different threads. - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += - tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; + if (threadIdx.x < num_experts) { + tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[index(num_experts, i, threadIdx.x)] += + tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; + } } __syncthreads(); @@ -79,9 +83,11 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, * For each expert, each thread processes the tokens of the corresponding * blocks and stores the corresponding expert_id for each block. 
*/ - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } } /** @@ -106,6 +112,24 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; } } + +template +__global__ void moe_sum_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., topk, d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + scalar_t x = 0.0; +#pragma unroll + for (int k = 0; k < TOPK; ++k) { + x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); + } + out[token_idx * d + idx] = x; + } +} + +} // namespace moe } // namespace vllm void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, @@ -117,18 +141,62 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors + const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); const int32_t shared_mem = - ((num_experts + 1) * num_experts + (num_experts + 1)) * + ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); // set dynamic shared mem - auto kernel = vllm::moe_align_block_size_kernel; + auto kernel = vllm::moe::moe_align_block_size_kernel; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( (void*)kernel, shared_mem)); - kernel<<<1, num_experts, shared_mem, stream>>>( + kernel<<<1, num_thread, shared_mem, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, topk_ids.numel()); }); } + +void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] + torch::Tensor& output) // [num_tokens, hidden_size] +{ + const int hidden_size = input.size(-1); + const int num_tokens = output.numel() / hidden_size; + const int topk = input.size(1); + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(output)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (topk) { + case 2: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + case 3: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + case 4: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + default: + at::sum_out(output, input, 1); + break; + } +} diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index a251730aa765a..596cc0aa6c855 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -5,3 +5,10 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, torch::Tensor& token_expert_indices, torch::Tensor& gating_output); + +void moe_sum(torch::Tensor& input, torch::Tensor& output); + +void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, 
torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 18fbc57ac7834..f3a558c14ab93 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -8,13 +8,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "token_expert_indices, Tensor gating_output) -> ()"); m.impl("topk_softmax", torch::kCUDA, &topk_softmax); + // Calculate the result of moe by summing up the partial results + // from all selected experts. + m.def("moe_sum(Tensor! input, Tensor output) -> ()"); + m.impl("moe_sum", torch::kCUDA, &moe_sum); + + // Aligning the number of tokens to be processed by each expert such + // that it is divisible by the block size. + m.def( + "moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + #ifndef USE_ROCM m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " "Tensor! topk_weights, Tensor! topk_ids, Tensor! b_scales, Tensor! " "b_zeros, Tensor! g_idx, Tensor! perm, Tensor! workspace, " - "__torch__.torch.classes._core_C.ScalarType b_q_type, int size_m, " - "int size_n, int size_k, bool is_k_full, int num_experts, int topk, " + "int b_q_type, SymInt size_m, " + "SymInt size_n, SymInt size_k, bool is_k_full, int num_experts, int " + "topk, " "int moe_block_size, bool replicate_input, bool apply_weights)" " -> Tensor"); // conditionally compiled so impl registration is in source file diff --git a/csrc/ops.h b/csrc/ops.h index fce545f95a7cc..c50eb39a3dacc 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -5,6 +5,30 @@ #include "core/scalar_type.hpp" +#include + +torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { + // Ensure tensor is on CUDA + if (!tensor.is_cuda()) { + throw std::runtime_error("Tensor must be on CUDA device"); + } + + // Get the raw data pointer + void* data_ptr = tensor.data_ptr(); + + // Get tensor sizes and strides + std::vector sizes = tensor.sizes().vec(); + std::vector strides = tensor.strides().vec(); + + // Get tensor options (dtype, device) + auto options = tensor.options(); + + // Create a new tensor from the raw data pointer + auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options); + + return new_tensor; +} + void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, @@ -48,6 +72,9 @@ void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); +void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, + double threshold); + void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); @@ -142,11 +169,6 @@ void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, c10::optional const& scale_ub); -void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, - int64_t block_size, torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad); - void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C, @@ -157,21 +179,23 @@ void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const 
c10::optional& query_start_loc, const c10::optional& cache_indices, const c10::optional& has_initial_state, - const torch::Tensor& ssm_states); - -at::Tensor causal_conv1d_update( - const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight, - const c10::optional& bias_, bool silu_activation, - const c10::optional& cache_seqlens_, - const c10::optional& conv_state_indices_); - -at::Tensor causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, - const c10::optional& bias_, - const c10::optional& conv_states, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, - bool silu_activation); + const torch::Tensor& ssm_states, int64_t pad_slot_id); + +void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, + const at::Tensor& weight, + const c10::optional& bias_, + bool silu_activation, + const c10::optional& cache_seqlens_, + const c10::optional& conv_state_indices_, + int64_t pad_slot_id); + +void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, + const c10::optional& bias_, + const c10::optional& conv_states, + const c10::optional& query_start_loc, + const c10::optional& cache_indices, + const c10::optional& has_initial_state, + bool silu_activation, int64_t pad_slot_id); #ifndef USE_ROCM using fptr_t = int64_t; diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index aec9fa002f96e..e9987535bd3ea 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -96,12 +96,15 @@ __global__ void static_scaled_int8_quant_kernel( scalar_t const* __restrict__ input, int8_t* __restrict__ out, scale_type const* scale_ptr, const int hidden_size) { int const tid = threadIdx.x; - int const token_idx = blockIdx.x; + int64_t const token_idx = blockIdx.x; scale_type const scale = *scale_ptr; + // Must be performed using 64-bit math to avoid integer overflow. + out += token_idx * hidden_size; + input += token_idx * hidden_size; + for (int i = tid; i < hidden_size; i += blockDim.x) { - out[token_idx * hidden_size + i] = float_to_int8_rn( - static_cast(input[token_idx * hidden_size + i]) / scale); + out[i] = float_to_int8_rn(static_cast(input[i]) / scale); } } @@ -111,14 +114,18 @@ __global__ void static_scaled_int8_azp_quant_kernel( scale_type const* scale_ptr, azp_type const* azp_ptr, const int hidden_size) { int const tid = threadIdx.x; - int const token_idx = blockIdx.x; + int64_t const token_idx = blockIdx.x; scale_type const scale = *scale_ptr; azp_type const azp = *azp_ptr; + // Must be performed using 64-bit math to avoid integer overflow. + out += token_idx * hidden_size; + input += token_idx * hidden_size; + for (int i = tid; i < hidden_size; i += blockDim.x) { - auto const val = static_cast(input[token_idx * hidden_size + i]); + auto const val = static_cast(input[i]); auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale) + azp); - out[token_idx * hidden_size + i] = quant_val; + out[i] = quant_val; } } @@ -127,12 +134,16 @@ __global__ void dynamic_scaled_int8_quant_kernel( scalar_t const* __restrict__ input, int8_t* __restrict__ out, scale_type* scale, const int hidden_size) { int const tid = threadIdx.x; - int const token_idx = blockIdx.x; + int64_t const token_idx = blockIdx.x; float absmax_val = 0.0f; float const zero = 0.0f; + // Must be performed using 64-bit math to avoid integer overflow. 
+ out += token_idx * hidden_size; + input += token_idx * hidden_size; + for (int i = tid; i < hidden_size; i += blockDim.x) { - float val = static_cast(input[token_idx * hidden_size + i]); + float val = static_cast(input[i]); val = val > zero ? val : -val; absmax_val = val > absmax_val ? val : absmax_val; } @@ -150,8 +161,7 @@ __global__ void dynamic_scaled_int8_quant_kernel( float const tmp_scale = 127.0f / block_absmax_val; for (int i = tid; i < hidden_size; i += blockDim.x) { - out[token_idx * hidden_size + i] = float_to_int8_rn( - static_cast(input[token_idx * hidden_size + i]) * tmp_scale); + out[i] = float_to_int8_rn(static_cast(input[i]) * tmp_scale); } } @@ -159,13 +169,17 @@ template __global__ void dynamic_scaled_int8_azp_quant_kernel( scalar_t const* __restrict__ input, int8_t* __restrict__ out, scale_type* scale, azp_type* azp, const int hidden_size) { - int const token_idx = blockIdx.x; + int64_t const token_idx = blockIdx.x; + + // Must be performed using 64-bit math to avoid integer overflow. + out += token_idx * hidden_size; + input += token_idx * hidden_size; // Scan for the min and max value for this token float max_val = std::numeric_limits::min(); float min_val = std::numeric_limits::max(); for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { - auto val = static_cast(input[token_idx * hidden_size + i]); + auto val = static_cast(input[i]); max_val = std::max(max_val, val); min_val = std::min(min_val, val); } @@ -200,10 +214,10 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( // Quantize the values for (int i = threadIdx.x; i < hidden_size; i += blockDim.x) { - auto const val = static_cast(input[token_idx * hidden_size + i]); + auto const val = static_cast(input[i]); auto const quant_val = int32_to_int8(float_to_int32_rn(val / scale_val) + azp_val); - out[token_idx * hidden_size + i] = quant_val; + out[i] = quant_val; } } diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 1657f7d0b16e8..97a969cf5e3e0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -137,9 +137,11 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, return; } - // Turing - TORCH_CHECK(version_num >= 75); - cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + if (version_num >= 75) { + // Turing + cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + return; + } #endif TORCH_CHECK_NOT_IMPLEMENTED( diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index 7e23f92257769..f2c609c1b68c3 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -204,8 +204,10 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel( int const tid = threadIdx.x; int const token_idx = blockIdx.x; - scalar_t const* __restrict__ token_input = &input[token_idx * hidden_size]; - FP8_TYPE* __restrict__ token_output = &out[token_idx * hidden_size]; + // Use int64 to avoid overflowing an int32 when calculating this offset + int64_t offset = static_cast(token_idx) * hidden_size; + scalar_t const* __restrict__ token_input = &input[offset]; + FP8_TYPE* __restrict__ token_output = &out[offset]; // For vectorization, token_input and token_output pointers need to be // aligned at 8-byte and 4-byte addresses respectively. 
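The int8 and FP8 quantization changes above all apply the same fix: the per-token offset ``token_idx * hidden_size`` is now computed in 64-bit before the base pointers are advanced, because for large batches of long sequences the product can exceed ``INT32_MAX`` and wrap around. A minimal sketch of the arithmetic, with illustrative numbers only (this is not vLLM code):

.. code-block:: python

    def wrap_int32(x: int) -> int:
        """Emulate two's-complement wraparound of a signed 32-bit product."""
        x &= 0xFFFFFFFF
        return x - 0x1_0000_0000 if x >= 0x8000_0000 else x

    hidden_size = 16384      # illustrative hidden size
    token_idx = 200_000      # illustrative token index within the batch

    offset = token_idx * hidden_size   # exact: Python ints do not overflow
    print(offset)                      # 3276800000
    print(wrap_int32(offset))          # -1018167296, what 32-bit math would yield

Indexing the input/output buffers with the wrapped value would read and write far outside the token's row, which is why the kernels now add a single 64-bit offset to the base pointers and keep only the small per-element loop index in 32-bit.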
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 5efe15d2b2f6b..6dbf9594e8492 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -80,7 +80,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_scales, torch::Tensor& b_zeros, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, - vllm::ScalarTypeTorchPtr const& b_q_type, + vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, bool has_zp) { TORCH_CHECK_NOT_IMPLEMENTED(false, @@ -2132,22 +2132,23 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_scales, torch::Tensor& b_zeros, torch::Tensor& g_idx, torch::Tensor& perm, torch::Tensor& workspace, - vllm::ScalarTypeTorchPtr const& b_q_type, + vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, bool has_zp, bool use_fp32_reduce) { + vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); if (has_zp) { - TORCH_CHECK(*b_q_type == vllm::kU4 || *b_q_type == vllm::kU8, - "b_q_type must be u4 or u8 when has_zp = True. Got = ", - b_q_type->str()); + TORCH_CHECK( + b_q_type == vllm::kU4 || b_q_type == vllm::kU8, + "b_q_type must be u4 or u8 when has_zp = True. Got = ", b_q_type.str()); } else { TORCH_CHECK( - *b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, + b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128, "b_q_type must be uint4b8 or uint8b128 when has_zp = False. Got = ", - b_q_type->str()); + b_q_type.str()); } - int pack_factor = 32 / b_q_type->size_bits(); + int pack_factor = 32 / b_q_type.size_bits(); // Verify A TORCH_CHECK(a.size(0) == size_m, "Shape mismatch: a.size(0) = ", a.size(0), @@ -2279,7 +2280,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, c_tmp.data_ptr(), b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, - workspace.data_ptr(), *b_q_type, has_act_order, is_k_full, has_zp, + workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else if (a.scalar_type() == at::ScalarType::BFloat16) { @@ -2288,7 +2289,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, c.data_ptr(), c_tmp.data_ptr(), b_scales.data_ptr(), b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, - workspace.data_ptr(), *b_q_type, has_act_order, is_k_full, has_zp, + workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); } else { @@ -2302,4 +2303,4 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("gptq_marlin_gemm", &gptq_marlin_gemm); -} \ No newline at end of file +} diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index ff037756f55ab..9f9073ded6191 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -38,9 +38,10 @@ static auto scalar_type_dispatch(ScalarType const& type, Fn fn) { // Interface // -std::vector 
supported_schedules(ScalarTypeTorchPtr const& btype) { +std::vector supported_schedules(ScalarTypeId const btype_id) { #if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 - return scalar_type_dispatch(*btype, [&](auto BType) { + vllm::ScalarType b_type = ScalarType::from_id(btype_id); + return scalar_type_dispatch(b_type, [&](auto BType) { return GemmDispatcher::supported_schedules(); }); #else @@ -49,7 +50,7 @@ std::vector supported_schedules(ScalarTypeTorchPtr const& btype) { } torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, - ScalarTypeTorchPtr const& btype, + ScalarTypeId const btype_id, c10::optional const& scales, c10::optional const& zeros, c10::optional group_size, @@ -57,6 +58,7 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, c10::optional alpha, c10::optional beta, c10::optional schedule) { #if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 + ScalarType const btype = ScalarType::from_id(btype_id); auto args = PyTorchArguments{.A = A, .B = B, .scales = scales, @@ -67,7 +69,7 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, .beta = beta, .schedule = schedule}; - return scalar_type_dispatch(*btype, [&](auto BType) { + return scalar_type_dispatch(btype, [&](auto BType) { return AT_DISPATCH_SUPPORTED_COMPUTE_TYPES( A.scalar_type(), "machete_gemm", [&] { using ComputeType = equivalent_cutlass_type_t; @@ -79,9 +81,9 @@ torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, #endif } -torch::Tensor prepack_B(torch::Tensor const& B, - vllm::ScalarTypeTorchPtr const& btype) { - return scalar_type_dispatch(*btype, [&](auto BType) { +torch::Tensor prepack_B(torch::Tensor const& B, ScalarTypeId const btype_id) { + ScalarType const btype = ScalarType::from_id(btype_id); + return scalar_type_dispatch(btype, [&](auto BType) { return PrepackBDispatcher::dispatch(B); }); } diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index 908e4f70ab1e6..a33e2660d760e 100644 --- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -89,7 +89,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_meta, torch::Tensor& b_scales, torch::Tensor& workspace, - vllm::ScalarTypeTorchPtr const& b_q_type, + vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k) { TORCH_CHECK_NOT_IMPLEMENTED( @@ -1029,13 +1029,14 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& b_meta, torch::Tensor& b_scales, torch::Tensor& workspace, - vllm::ScalarTypeTorchPtr const& b_q_type, + vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k) { + vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); // Verify num_bits - TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, - "num_bits must be uint4b8 or uint8b128. Got = ", b_q_type->str()); - int pack_factor = 32 / b_q_type->size_bits(); + TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128, + "num_bits must be uint4b8 or uint8b128. 
Got = ", b_q_type.str()); + int pack_factor = 32 / b_q_type.size_bits(); // Verify M TORCH_CHECK(size_m == a.size(0), @@ -1130,8 +1131,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, marlin_24::marlin_cuda_2_4( a.data_ptr(), b_q_weight.data_ptr(), b_meta.data_ptr(), c.data_ptr(), b_scales.data_ptr(), size_n, size_m, size_k, workspace.data_ptr(), - b_q_type->size_bits(), groupsize, dev, - at::cuda::getCurrentCUDAStream(dev), thread_k, thread_m, sms, max_par); + b_q_type.size_bits(), groupsize, dev, at::cuda::getCurrentCUDAStream(dev), + thread_k, thread_m, sms, max_par); return c; } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index a0100b4a85edd..b8185c24d5628 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -18,6 +18,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); + ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); + // Attention ops // Compute the attention between an input query and the cached // keys/values using PagedAttention. @@ -60,6 +63,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul); + // FATReLU implementation. + ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); + ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); @@ -140,13 +147,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Quantized GEMM for AWQ. ops.def( "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, int split_k_iters) -> Tensor"); + "Tensor _zeros, SymInt split_k_iters) -> Tensor"); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); // Dequantization for AWQ. ops.def( "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " - "Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor"); + "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor"); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); // Note about marlin kernel 'workspace' arguments: @@ -166,32 +173,26 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Marlin (Dense) Optimized Quantized GEMM for GPTQ. ops.def( "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " - "Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor"); + "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> " + "Tensor"); // conditionally compiled so impl in source file // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. ops.def( "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "Tensor b_scales, Tensor workspace, " - "__torch__.torch.classes._core_C.ScalarType b_q_type, " - "int size_m, int size_n, int size_k) -> Tensor"); + "int b_q_type, " + "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor"); // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. + ops.def("machete_supported_schedules(int btype) -> str[]"); ops.def( - "machete_supported_schedules(" - " __torch__.torch.classes._core_C.ScalarType btype" - ") -> str[]"); - ops.def( - "machete_gemm(Tensor A, Tensor B," - " __torch__.torch.classes._core_C.ScalarType btype," - " Tensor? scales, Tensor? zeros, int? 
group_size," + "machete_gemm(Tensor A, Tensor B, int btype, " + " Tensor? scales, Tensor? zeros, int? group_size, " " Tensor? C, float? alpha, float? beta, str? schedule)" "-> Tensor"); - ops.def( - "machete_prepack_B(Tensor B," - " __torch__.torch.classes._core_C.ScalarType btype)" - "-> Tensor"); + ops.def("machete_prepack_B(Tensor B, int btype) -> Tensor"); // conditionally compiled so impl registration is in source file ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor"); @@ -201,8 +202,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def( "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, " - "__torch__.torch.classes._core_C.ScalarType b_q_type, " - "int size_m, int size_n, int size_k, bool is_k_full, " + "int b_q_type, " + "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " "bool has_zp, bool use_fp32_reduce) -> Tensor"); // conditionally compiled so impl registration is in source file @@ -219,32 +220,33 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // conditionally compiled so impl registrations are in source file // Dequantization for GGML. - ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor"); + ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor"); ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); // mmvq kernel for GGML. ops.def( - "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) " + "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) " "-> Tensor"); ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8); // mmq kernel for GGML. - ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor"); + ops.def( + "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor"); ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8); // fp8_marlin Optimized Quantized GEMM for FP8 weight-only. ops.def( "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " - "Tensor! workspace, int num_bits, int size_m, int size_n, " - "int size_k) -> Tensor"); + "Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, " + "SymInt size_k) -> Tensor"); // conditionally compiled so impl registration is in source file // marlin_qqq_gemm for QQQ. ops.def( "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " "Tensor s_tok, Tensor s_ch, Tensor s_group, " - "Tensor! workspace, int size_m, int size_n, " - "int size_k) -> Tensor"); + "Tensor! workspace, SymInt size_m, SymInt size_n, " + "SymInt size_k) -> Tensor"); // conditionally compiled so impl registration is in source file // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column @@ -278,7 +280,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor? query_start_loc," "Tensor? cache_indices," "Tensor? has_initial_state," - "Tensor! ssm_states) -> ()"); + "Tensor! ssm_states," + "int pad_slot_id) -> ()"); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); ops.def( @@ -288,7 +291,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor? bias_," "bool silu_activation," "Tensor? cache_seqlens_," - "Tensor? conv_state_indices) -> Tensor"); + "Tensor? conv_state_indices," + "int pad_slot_id) -> ()"); ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update); ops.def( @@ -298,7 +302,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor? query_start_loc," "Tensor? cache_indices," "Tensor? 
has_initial_state," - "bool silu_activation) -> Tensor"); + "bool silu_activation," + "int pad_slot_id) -> ()"); ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); #endif @@ -334,15 +339,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, &dynamic_per_token_scaled_fp8_quant); - // Aligning the number of tokens to be processed by each expert such - // that it is divisible by the block size. - ops.def( - "moe_align_block_size(Tensor topk_ids, int num_experts," - " int block_size, Tensor! sorted_token_ids," - " Tensor! experts_ids," - " Tensor! num_tokens_post_pad) -> ()"); - ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); - // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index d58f226136918..e3e35844405ac 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -13,5 +13,7 @@ torch py-cpuinfo transformers mistral_common >= 1.3.4 +aiohttp +starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py index 8435129e752e1..c7b638473a931 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -96,7 +96,6 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ - "aiohttp", "compressed_tensors", "cpuinfo", "cv2", @@ -143,6 +142,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: "python": ("https://docs.python.org/3", None), "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "aiohttp": ("https://docs.aiohttp.org/en/stable", None), "pillow": ("https://pillow.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "torch": ("https://pytorch.org/docs/stable", None), diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst index 5d895837590ba..f0ec1fea15ddb 100644 --- a/docs/source/dev/input_processing/model_inputs_index.rst +++ b/docs/source/dev/input_processing/model_inputs_index.rst @@ -25,7 +25,7 @@ Module Contents LLM Engine Inputs ----------------- -.. autoclass:: vllm.inputs.LLMInputs +.. autoclass:: vllm.inputs.DecoderOnlyInputs :members: :show-inheritance: diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst new file mode 100644 index 0000000000000..334e0287aff09 --- /dev/null +++ b/docs/source/dev/pooling_params.rst @@ -0,0 +1,5 @@ +Pooling Parameters +================== + +.. autoclass:: vllm.PoolingParams + :members: diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index c8947beb34942..d12aeebbbc184 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,7 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. 
vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel (``-tp = N``) +- Quantization (``INT8 W8A8, AWQ``) + +.. note:: + FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. Table of contents: @@ -59,20 +65,6 @@ Build from source $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -- Third, build and install oneDNN library from source: - -.. code-block:: console - - $ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git - $ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \ - -DONEDNN_BUILD_DOC=OFF \ - -DONEDNN_BUILD_EXAMPLES=OFF \ - -DONEDNN_BUILD_TESTS=OFF \ - -DONEDNN_BUILD_GRAPH=OFF \ - -DONEDNN_ENABLE_WORKLOAD=INFERENCE \ - -DONEDNN_ENABLE_PRIMITIVE=MATMUL - $ cmake --build ./oneDNN/build --target install --config Release - - Finally, build and install vLLM CPU backend: .. code-block:: console @@ -155,5 +147,20 @@ Performance tips - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. +CPU Backend Considerations +-------------------------- + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. + +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. + + * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + + .. code-block:: console + + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_. 
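The tensor-parallel recommendation above also applies to offline inference. Below is a rough Python counterpart of the serving command, assuming a two-NUMA-node machine; the environment variables and arguments are the ones mentioned in this section, while the core ranges and model name are placeholders:

.. code-block:: python

    import os

    # Reserve KV-cache space and bind one OpenMP thread range per TP rank
    # (i.e. per NUMA node); both variables are described above.
    os.environ["VLLM_CPU_KVCACHE_SPACE"] = "40"
    os.environ["VLLM_CPU_OMP_THREADS_BIND"] = "0-31|32-63"

    from vllm import LLM

    llm = LLM(
        model="meta-llama/Llama-2-7b-chat-hf",
        tensor_parallel_size=2,              # one rank per NUMA node
        distributed_executor_backend="mp",
    )
    print(llm.generate("Hello, my name is")[0].outputs[0].text)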
\ No newline at end of file diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index cfd2dcb3bd5d3..91978065faf42 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -107,15 +107,15 @@ If GPU/CPU communication cannot be established, you can use the following Python If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: -.. code-block:: shell +.. code-block:: console - NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py + $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: -.. code-block:: shell +.. code-block:: console - NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py + $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py If the script runs successfully, you should see the message ``sanity check is successful!``. diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 99c695ac4ddb1..a706b285edede 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -7,14 +7,14 @@ Installation vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. Requirements -=========================== +============ * OS: Linux -* Python: 3.8 -- 3.12 +* Python: 3.8 - 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions -=========================== +========================= You can install vLLM using pip: @@ -51,9 +51,9 @@ You can install vLLM using pip: .. _install-the-latest-code: Install the latest code -========================= +======================= -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on x86 platform with cuda 12 for every commit since v0.5.3. You can download and install the latest one with the following command: +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: .. code-block:: console @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 abi (see `PEP 425 `_ for more details about abi), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. 
+Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Another way to access the latest code is to use the docker images: @@ -77,17 +77,17 @@ Another way to access the latest code is to use the docker images: These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. -Latest code can contain bugs and may not be stable. Please use it with caution. +The latest code can contain bugs and may not be stable. Please use it with caution. .. _build_from_source: Build from source -================== +================= .. _python-only-build: Python-only build (without compilation) ----------------------------------------- +--------------------------------------- If you only need to change Python code, you can simply build vLLM without compilation. @@ -116,28 +116,28 @@ The script will: Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. -Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev``(or ``-q`` for short) flag: +Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: .. code-block:: console $ python python_only_dev.py --quit-dev -The script with ``--quit-dev`` flag will: +The ``--quit-dev`` flag will: * Remove the symbolic link from the current directory to the vLLM package. * Restore the original vLLM package from the backup. -If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to start `all above <#python-only-build>`_ over again. +If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again. .. note:: There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. - It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. Full build (with compilation) ---------------------------------- +----------------------------- -If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: .. code-block:: console @@ -153,7 +153,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. 
T Use an existing PyTorch installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: * Building vLLM with PyTorch nightly or a custom PyTorch build. @@ -171,7 +171,7 @@ To build vLLM using an existing PyTorch installation: Troubleshooting -~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~ To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: @@ -207,7 +207,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build ----------------------- +-------------------- vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 71f4e4a1b6656..e9775a20d72d1 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -1,38 +1,51 @@ .. _quickstart: +========== Quickstart ========== -This guide shows how to use vLLM to: +This guide will help you quickly get started with vLLM to: -* run offline batched inference on a dataset; -* build an API server for a large language model; -* start an OpenAI-compatible API server. +* :ref:`Run offline batched inference ` +* :ref:`Run OpenAI-compatible inference ` Be sure to complete the `Gaudi installation instructions `_ before continuing with this guide. +Prerequisites +-------------- +- OS: Linux +- Python: 3.8 - 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -.. note:: +Installation +-------------- + +You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. + +.. code-block:: console - By default, vLLM downloads model from `HuggingFace `_. If you would like to use models from `ModelScope `_ in the following examples, please set the environment variable: + $ conda create -n myenv python=3.10 -y + $ conda activate myenv + $ pip install vllm - .. code-block:: shell +Please refer to the :ref:`installation documentation ` for more details on installing vLLM. - export VLLM_USE_MODELSCOPE=True +.. _offline_batched_inference: Offline Batched Inference ------------------------- -We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__. + +The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: -Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM. -The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine. -The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process. +- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. .. 
code-block:: python from vllm import LLM, SamplingParams -Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. +The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. .. code-block:: python @@ -44,46 +57,46 @@ Define the list of input prompts and the sampling parameters for generation. The ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. +The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. .. code-block:: python llm = LLM(model="facebook/opt-125m") -Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. +.. note:: + + By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. + +Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. .. code-block:: python outputs = llm.generate(prompts, sampling_params) - # Print the outputs. for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -The code example can also be found in `examples/offline_inference.py `_. +.. _openai_compatible_server: OpenAI-Compatible Server ------------------------ vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models `_, `create chat completion `_, and `create completion `_ endpoints. We are actively adding support for more endpoints. +By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. -Start the server: +Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: .. code-block:: console - $ vllm serve facebook/opt-125m + $ vllm serve Qwen/Qwen2.5-1.5B-Instruct -By default, the server uses a predefined chat template stored in the tokenizer. 
You can override this template by using the ``--chat-template`` argument: - -.. code-block:: console +.. note:: - $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja + By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. -This server can be queried in the same format as OpenAI API. For example, list the models: +This server can be queried in the same format as OpenAI API. For example, to list the models: .. code-block:: console @@ -91,17 +104,17 @@ This server can be queried in the same format as OpenAI API. For example, list t You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. -Using OpenAI Completions API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OpenAI Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Query the model with input prompts: +Once your server is started, you can query the model with input prompts: .. code-block:: console $ curl http://localhost:8000/v1/completions \ $ -H "Content-Type: application/json" \ $ -d '{ - $ "model": "facebook/opt-125m", + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", $ "prompt": "San Francisco is a", $ "max_tokens": 7, $ "temperature": 0 @@ -120,36 +133,32 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="facebook/opt-125m", + completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", prompt="San Francisco is a") print("Completion result:", completion) -For a more detailed client example, refer to `examples/openai_completion_client.py `_. - -Using OpenAI Chat API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A more detailed client example can be found `here `__. -The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +OpenAI Chat Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Querying the model using OpenAI Chat API: +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. -You can use the `create chat completion `_ endpoint to communicate with the model in a chat-like interface: +You can use the `create chat completion `_ endpoint to interact with the model: .. code-block:: console $ curl http://localhost:8000/v1/chat/completions \ $ -H "Content-Type: application/json" \ $ -d '{ - $ "model": "facebook/opt-125m", + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", $ "messages": [ $ {"role": "system", "content": "You are a helpful assistant."}, $ {"role": "user", "content": "Who won the world series in 2020?"} $ ] $ }' -Python Client Example: - -Using the `openai` python package, you can also communicate with the model in a chat-like manner: +Alternatively, you can use the ``openai`` python package: .. 
code-block:: python @@ -164,12 +173,10 @@ Using the `openai` python package, you can also communicate with the model in a ) chat_response = client.chat.completions.create( - model="facebook/opt-125m", + model="Qwen/Qwen2.5-1.5B-Instruct", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, ] ) print("Chat response:", chat_response) - -For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation. diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 217028839e347..f0c812b941c1f 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -1,35 +1,167 @@ .. _installation_tpu: +##################### Installation with TPU -===================== +##################### -vLLM supports Google Cloud TPUs using PyTorch XLA. +Tensor Processing Units (TPUs) are Google's custom-developed application-specific +integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs +are available in different versions each with different hardware specifications. +For more information about TPUs, see `TPU System Architecture `_. +For more information on the TPU versions supported with vLLM, see: + +* `TPU v6e `_ +* `TPU v5e `_ +* `TPU v5p `_ +* `TPU v4 `_ + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. For more +information see: + +* `TPU v6e topologies `_ +* `TPU v5e topologies `_ +* `TPU v5p topologies `_ +* `TPU v4 topologies `_ + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GPC project and are specified in terms of TPU version, the number of TPU you +want to use, and quota type. For more information, see `TPU quota `_. + +For TPU pricing information, see `Cloud TPU pricing `_. + +You may need additional persistent storage for your TPU VMs. For more +information, see `Storage options for Cloud TPU data `_. Requirements ------------ -* Google Cloud TPU VM (single & multi host) -* TPU versions: v5e, v5p, v4 -* Python: 3.10 +* Google Cloud TPU VM +* TPU versions: v6e, v5e, v5p, v4 +* Python: 3.10 or newer + +Provision Cloud TPUs +==================== + +You can provision Cloud TPUs using the `Cloud TPU API `_` +or the `queued resources `_` +API. This section shows how to create TPUs using the queued resource API. +For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +`Queued resources `_ +enable you to request Cloud TPU resources in a queued manner. When you request +queued resources, the request is added to a queue maintained by the Cloud TPU +service. When the requested resource becomes available, it's assigned to your +Google Cloud project for your immediate exclusive use. + +Provision a Cloud TPU with the queued resource API +-------------------------------------------------- +Create a TPU v5e with 4 TPU chips: + +.. code-block:: console + + gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ + --node-id TPU_NAME \ + --project PROJECT_ID \ + --zone ZONE \ + --accelerator-type ACCELERATOR_TYPE \ + --runtime-version RUNTIME_VERSION \ + --service-account SERVICE_ACCOUNT + +.. 
list-table:: Parameter descriptions + :header-rows: 1 + + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The `zone `_ where you + want to create your Cloud TPU. + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, followed by a + '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU + with 4 cores. For more information, see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` + +Connect to your TPU using SSH: + +.. code-block:: bash + + gcloud compute tpus tpu-vm ssh TPU_NAME + +Create and activate a Conda environment for vLLM: + +.. code-block:: bash -Installation options: + conda create -n vllm python=3.10 -y + conda activate vllm -1. :ref:`Build a docker image with Dockerfile `. -2. :ref:`Build from source `. +Clone the vLLM repository and go to the vLLM directory: + +.. code-block:: bash + + git clone https://github.com/vllm-project/vllm.git && cd vllm + +Uninstall the existing `torch` and `torch_xla` packages: + +.. code-block:: bash + + pip uninstall torch torch-xla -y + +Install `torch` and `torch_xla` + +.. code-block:: bash + + pip install --pre torch==2.6.0.dev20241028+cpu torchvision==0.20.0.dev20241028+cpu --index-url https://download.pytorch.org/whl/nightly/cpu + pip install 'torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev-cp310-cp310-linux_x86_64.whl' -f https://storage.googleapis.com/libtpu-releases/index.html + +Install JAX and Pallas: + +.. code-block:: bash + + pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + pip install jaxlib==0.4.32.dev20240829 jax==0.4.32.dev20240829 -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + +Install other build dependencies: + +.. code-block:: bash + + pip install -r requirements-tpu.txt + VLLM_TARGET_DEVICE="tpu" python setup.py develop + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + +Provision Cloud TPUs with GKE +----------------------------- + +For more information about using TPUs with GKE, see +https://cloud.google.com/kubernetes-engine/docs/how-to/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus .. _build_docker_tpu: Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------ -`Dockerfile.tpu `_ is provided to build a docker image with TPU support. +You can use `Dockerfile.tpu `_ +to build a Docker image with TPU support. .. code-block:: console $ docker build -f Dockerfile.tpu -t vllm-tpu . - -You can run the docker image with the following command: +Run the Docker image with the following command: .. code-block:: console @@ -56,8 +188,8 @@ First, install the dependencies: $ pip uninstall torch torch-xla -y $ # Install PyTorch and PyTorch XLA. 
- $ export DATE="20240828" - $ export TORCH_VERSION="2.5.0" + $ export DATE="20241017" + $ export TORCH_VERSION="2.6.0" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl @@ -75,14 +207,12 @@ Next, build vLLM from source. This will only take a few seconds: $ VLLM_TARGET_DEVICE="tpu" python setup.py develop - .. note:: Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. The compilation time may take 20~30 minutes in the first run. However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). - .. tip:: If you encounter the following error: @@ -93,7 +223,7 @@ Next, build vLLM from source. This will only take a few seconds: ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory - Please install OpenBLAS with the following command: + Install OpenBLAS with the following command: .. code-block:: console diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst index 151ebb5f1811f..b1868acbc84b0 100644 --- a/docs/source/getting_started/xpu-installation.rst +++ b/docs/source/getting_started/xpu-installation.rst @@ -60,3 +60,21 @@ Build from source - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. + +Distributed inference and serving +--------------------------------- + +XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: + +.. code-block:: console + + $ python -m vllm.entrypoints.openai.api_server \ + $ --model=facebook/opt-13b \ + $ --dtype=bfloat16 \ + $ --device=xpu \ + $ --max_model_len=1024 \ + $ --distributed-executor-backend=ray \ + $ --pipeline-parallel-size=2 \ + $ -tp=8 + +By default, a ray instance will be launched automatically if no existing one is detected in system, with ``num-gpus`` equals to ``parallel_config.world_size``. We recommend properly starting a ray cluster before execution, referring helper `script `_. 
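For reference, a rough offline-inference counterpart of the XPU serving command above (a sketch, not an officially documented recipe): the keyword arguments mirror the CLI flags ``--dtype``, ``--device``, ``--max_model_len``, ``-tp`` and ``--distributed-executor-backend``. Pipeline parallelism is left out because the section describes it as an online-serving beta feature.

.. code-block:: python

    from vllm import LLM

    # vLLM attaches to an existing Ray cluster if one is detected (starting one
    # beforehand is recommended above); otherwise it launches Ray itself.
    llm = LLM(
        model="facebook/opt-13b",
        dtype="bfloat16",
        device="xpu",
        max_model_len=1024,
        tensor_parallel_size=8,
        distributed_executor_backend="ray",
    )
    print(llm.generate("Hello, my name is")[0].outputs[0].text)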
diff --git a/docs/source/index.rst b/docs/source/index.rst index dc6807deb8261..cf1377014be0b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -82,6 +82,7 @@ Documentation serving/openai_compatible_server serving/deploying_with_docker serving/deploying_with_k8s + serving/deploying_with_nginx serving/distributed_serving serving/metrics serving/env_vars @@ -135,6 +136,7 @@ Documentation :caption: Developer Documentation dev/sampling_params + dev/pooling_params dev/offline_inference/offline_index dev/engine/engine_index dev/kernel/paged_attention diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index ae09259c0756c..c6d88cc38e99b 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -133,7 +133,9 @@ If you are running api server with :code:`vllm serve `, you can wrap the e from vllm import ModelRegistry from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') + + if __name__ == '__main__': + import runpy + runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') Save the above code in a file and run it with :code:`python your_file.py `. diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst index 50468f25b922a..b02c80aebec69 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/models/spec_decode.rst @@ -30,7 +30,6 @@ The following code configures vLLM in an offline mode to use speculative decodin tensor_parallel_size=1, speculative_model="facebook/opt-125m", num_speculative_tokens=5, - use_v2_block_manager=True, ) outputs = llm.generate(prompts, sampling_params) @@ -44,10 +43,10 @@ To perform the same with an online mode launch the server: .. code-block:: bash python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ - --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ - --num_speculative_tokens 5 --gpu_memory_utilization 0.8 + --seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \ + --num_speculative_tokens 5 --gpu_memory_utilization 0.8 - Then use a client: +Then use a client: .. code-block:: python @@ -104,7 +103,6 @@ matching n-grams in the prompt. For more information read `this thread. `_. -The following is the list of model architectures that are currently supported by vLLM. +vLLM supports a variety of generative and embedding models from `HuggingFace (HF) Transformers `_. +This page lists the model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. +For other models, you can check the :code:`config.json` file inside the model repository. +If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. + +.. tip:: + The easiest way to check if your model is really supported at runtime is to run the program below: + + .. code-block:: python + + from vllm import LLM + + llm = LLM(model=...) # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + + If vLLM successfully generates text, it indicates that your model is supported. + +Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` +for instructions on how to implement your model in vLLM. 
+Alternatively, you can `open an issue on GitHub `_ to request vLLM support. + +.. note:: + To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: + + .. code-block:: shell + + $ export VLLM_USE_MODELSCOPE=True + + And use with :code:`trust_remote_code=True`. + + .. code-block:: python + + from vllm import LLM + + llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + Text-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -19,7 +56,7 @@ Text Generation * - Architecture - Models - - Example HuggingFace Models + - Example HF Models - :ref:`LoRA ` - :ref:`PP ` * - :code:`AquilaForCausalLM` @@ -87,6 +124,11 @@ Text Generation - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - ✅ + * - :code:`FalconMambaForCausalLM` + - FalconMamba + - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. + - ✅ + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -118,13 +160,13 @@ Text Generation - - ✅ * - :code:`GraniteForCausalLM` - - PowerLM - - :code:`ibm/PowerLM-3b` etc. + - Granite 3.0, PowerLM + - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - ✅ - ✅ * - :code:`GraniteMoeForCausalLM` - - PowerMoE - - :code:`ibm/PowerMoE-3b` etc. + - Granite 3.0 MoE, PowerMoE + - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. - ✅ - ✅ * - :code:`InternLMForCausalLM` @@ -139,7 +181,7 @@ Text Generation - ✅ * - :code:`JAISLMHeadModel` - Jais - - :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc. + - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. - - ✅ * - :code:`JambaForCausalLM` @@ -155,11 +197,11 @@ Text Generation * - :code:`MambaForCausalLM` - Mamba - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. - - ✅ + - - * - :code:`MiniCPMForCausalLM` - MiniCPM - - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc. + - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. - ✅ - ✅ * - :code:`MiniCPM3ForCausalLM` @@ -235,11 +277,11 @@ Text Generation * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - + - ✅ - ✅ * - :code:`Qwen2ForCausalLM` - Qwen2 - - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - ✅ - ✅ * - :code:`Qwen2MoeForCausalLM` @@ -280,7 +322,7 @@ Text Embedding * - Architecture - Models - - Example HuggingFace Models + - Example HF Models - :ref:`LoRA ` - :ref:`PP ` * - :code:`Gemma2Model` @@ -294,6 +336,10 @@ Text Embedding - - ✅ +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + Reward Modeling --------------- @@ -303,7 +349,7 @@ Reward Modeling * - Architecture - Models - - Example HuggingFace Models + - Example HF Models - :ref:`LoRA ` - :ref:`PP ` * - :code:`Qwen2ForRewardModel` @@ -315,8 +361,45 @@ Reward Modeling .. 
note:: As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. +Classification +--------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForSequenceClassification` + - Qwen2-based + - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. + - + - ✅ + +.. note:: + As an interim measure, these models are supported via Embeddings API. It will be supported via Classification API in the future (no reference APIs exist now). + + Multimodal Language Models -^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The following modalities are supported depending on the model: + +- **T**\ ext +- **I**\ mage +- **V**\ ideo +- **A**\ udio + +Any combination of modalities joined by :code:`+` are supported. + +- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by :code:`/` are mutually exclusive. + +- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. .. _supported_vlms: @@ -324,120 +407,138 @@ Text Generation --------------- .. list-table:: - :widths: 25 25 25 25 5 5 + :widths: 25 25 15 25 5 5 :header-rows: 1 * - Architecture - Models - - Modalities - - Example HuggingFace Models + - Inputs + - Example HF Models - :ref:`LoRA ` - :ref:`PP ` * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - - Image\ :sup:`E` + - T + I\ :sup:`E` - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - ✅ * - :code:`ChameleonForConditionalGeneration` - Chameleon - - Image + - T + I - :code:`facebook/chameleon-7b` etc. - - ✅ * - :code:`FuyuForCausalLM` - Fuyu - - Image + - T + I - :code:`adept/fuyu-8b` etc. - - ✅ * - :code:`ChatGLMModel` - GLM-4V - - Image + - T + I - :code:`THUDM/glm-4v-9b` etc. - - ✅ + * - :code:`H2OVLChatModel` + - H2OVL + - T + I\ :sup:`E+` + - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. + - + - ✅ * - :code:`InternVLChatModel` - InternVL2 - - Image\ :sup:`E+` - - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - T + I\ :sup:`E+` + - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. - - ✅ * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - - Image\ :sup:`E+` + - T + I\ :sup:`E+` - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. - - ✅ * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT - - Image\ :sup:`E+` + - T + I\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅ * - :code:`LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - - Video + - T + V - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅ * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - - Image\ :sup:`+` / Video + - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅ * - :code:`MiniCPMV` - MiniCPM-V - - Image\ :sup:`E+` + - T + I\ :sup:`E+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - ✅ - ✅ * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - - Image + - T + I\ :sup:`+` - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. 
- - + * - :code:`MolmoForCausalLM` + - Molmo + - T + I + - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. + - + - ✅ * - :code:`NVLM_D_Model` - NVLM-D 1.0 - - Image\ :sup:`E+` + - T + I\ :sup:`E+` - :code:`nvidia/NVLM-D-72B`, etc. - - ✅ * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - - Image\ :sup:`E` + - T + I\ :sup:`E` - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. - - ✅ * - :code:`Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - - Image\ :sup:`E+` + - T + I\ :sup:`E+` - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - - ✅ * - :code:`PixtralForConditionalGeneration` - Pixtral - - Image\ :sup:`+` - - :code:`mistralai/Pixtral-12B-2409` + - T + I\ :sup:`+` + - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - ✅ * - :code:`QWenLMHeadModel` - Qwen-VL - - Image\ :sup:`E+` + - T + I\ :sup:`E+` - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. + - ✅ + - ✅ + * - :code:`Qwen2AudioForConditionalGeneration` + - Qwen2-Audio + - T + A\ :sup:`+` + - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - ✅ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - Image\ :sup:`E+` / Video\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅ * - :code:`UltravoxModel` - Ultravox - - Audio\ :sup:`E+` + - T + A\ :sup:`E+` - :code:`fixie-ai/ultravox-v0_3` - - ✅ @@ -445,47 +546,42 @@ Text Generation | :sup:`E` Pre-computed embeddings can be inputted for this modality. | :sup:`+` Multiple items can be inputted per text prompt for this modality. +.. note:: + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + .. note:: For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 ----- - -If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. -Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` -for instructions on how to implement support for your model. -Alternatively, you can raise an issue on our `GitHub `_ project. - -.. tip:: - The easiest way to check if your model is supported is to run the program below: - - .. code-block:: python - - from vllm import LLM - - llm = LLM(model=...) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) - - If vLLM successfully generates text, it indicates that your model is supported. - -.. tip:: - To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: - - .. code-block:: shell - - $ export VLLM_USE_MODELSCOPE=True - - And use with :code:`trust_remote_code=True`. +Multimodal Embedding +-------------------- - .. code-block:: python - - from vllm import LLM +.. 
list-table:: + :widths: 25 25 15 25 5 5 + :header-rows: 1 - llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) + * - Architecture + - Models + - Inputs + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - :code:`royokong/e5-v` + - + - ✅ + * - :code:`Phi3VForCausalLM` + - Phi-3-Vision-based + - T + I + - :code:`TIGER-Lab/VLM2Vec-Full` + - 🚧 + - ✅ +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. Model Support Policy ===================== diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index a3ee5da044220..112e9db6a41de 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -181,11 +181,11 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc .. code-block:: bash - vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ - --trust-remote-code --limit-mm-per-prompt image=2 + vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 .. important:: - Since OpenAI Vision API is based on `Chat Completions `_ API, + Since OpenAI Vision API is based on `Chat Completions API `_, a chat template is **required** to launch the API server. Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. @@ -240,16 +240,74 @@ To consume the server, you can use the OpenAI client like in the example below: ) print("Chat completion output:", chat_response.choices[0].message.content) +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. -A full code example can be found in `examples/openai_vision_api_client.py `_. +.. tip:: + Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, + and pass the file path as ``url`` in the API request. + +.. tip:: + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. .. note:: By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: - .. code-block:: shell + .. code-block:: console - export VLLM_IMAGE_FETCH_TIMEOUT= + $ export VLLM_IMAGE_FETCH_TIMEOUT= -.. note:: - There is no need to format the prompt in the API request since it will be handled by the server. +Chat Embeddings API +^^^^^^^^^^^^^^^^^^^ + +vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. + +.. tip:: + The schema of ``messages`` is exactly the same as in Chat Completions API. + +In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. + +.. code-block:: bash + + vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja + +.. 
important:: + + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` + to run this model in embedding mode instead of text generation mode. + +.. important:: + + VLM2Vec does not expect chat-based input. We use a `custom chat template `_ + to combine the text and images together. + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: + +.. code-block:: python + + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + +A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index cac0605ca132b..cab19e4ec5b6c 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -283,7 +283,7 @@ Feature x Feature - ✅ - ✅ - ✅ - - ✗ + - `✗ `__ - ? - ✅ - ✅ diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst new file mode 100644 index 0000000000000..b5dff02b6bae6 --- /dev/null +++ b/docs/source/serving/deploying_with_nginx.rst @@ -0,0 +1,142 @@ +.. _nginxloadbalancer: + +Deploying with Nginx Loadbalancer +================================= + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +#. :ref:`Build Nginx Container ` +#. :ref:`Create Simple Nginx Config file ` +#. :ref:`Build vLLM Container ` +#. :ref:`Create Docker Network ` +#. :ref:`Launch vLLM Containers ` +#. :ref:`Launch Nginx ` +#. :ref:`Verify That vLLM Servers Are Ready ` + +.. _nginxloadbalancer_nginx_build: + +Build Nginx Container +--------------------- + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. + +.. code-block:: console + + export vllm_root=`pwd` + +Create a file named ``Dockerfile.nginx``: + +.. code-block:: console + + FROM nginx:latest + RUN rm /etc/nginx/conf.d/default.conf + EXPOSE 80 + CMD ["nginx", "-g", "daemon off;"] + +Build the container: + +.. code-block:: console + + docker build . -f Dockerfile.nginx --tag nginx-lb + +.. _nginxloadbalancer_nginx_conf: + +Create Simple Nginx Config file +------------------------------- + +Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. + +.. 
code-block:: console + + upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; + } + server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + +.. _nginxloadbalancer_nginx_vllm_container: + +Build vLLM Container +-------------------- + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm + + +If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below: + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy + +.. _nginxloadbalancer_nginx_docker_network: + +Create Docker Network +--------------------- + +.. code-block:: console + + docker network create vllm_nginx + + +.. _nginxloadbalancer_nginx_launch_container: + +Launch vLLM Containers +---------------------- + +Notes: + +* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. +* If you don't have an existing HuggingFace cache, start ``vllm0`` first and wait for the model to finish downloading and the server to become ready. This ensures that ``vllm1`` can reuse the model you just downloaded, so it won't have to be downloaded again. +* The example below assumes a GPU backend is used. If you are using a CPU backend, remove ``--gpus all`` and add the ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. +* Adjust the model name used by your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. + +.. code-block:: console + + mkdir -p ~/.cache/huggingface/hub/ + hf_cache_dir=~/.cache/huggingface/ + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf + +.. note:: + If you are behind a proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. + +.. _nginxloadbalancer_nginx_launch_nginx: + +Launch Nginx +------------ + +.. code-block:: console + + docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest + +.. _nginxloadbalancer_nginx_verify_nginx: + +Verify That vLLM Servers Are Ready +---------------------------------- + +.. code-block:: console + + docker logs vllm0 | grep Uvicorn + docker logs vllm1 | grep Uvicorn + +Both outputs should look like this: + +.. 
code-block:: console + + INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst index fcb2646df50d3..4d57206e53a05 100644 --- a/docs/source/serving/distributed_serving.rst +++ b/docs/source/serving/distributed_serving.rst @@ -22,7 +22,7 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh Details for Distributed Inference and Serving ---------------------------------------------- -vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. @@ -49,9 +49,6 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip $ --tensor-parallel-size 4 \ $ --pipeline-parallel-size 2 -.. note:: - Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models. - Multi-Node Inference and Serving -------------------------------- diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 9132e12a36ba5..0b5f75caf2475 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -26,13 +26,26 @@ print(completion.choices[0].message) ``` ## API Reference -Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except: -- Chat: `tools`, and `tool_choice`. -- Completions: `suffix`. -vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). +We currently support the following OpenAI APIs: + +- [Completions API](https://platform.openai.com/docs/api-reference/completions) + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst). + - *Note: `image_url.detail` parameter is not supported.* + - We also support `audio_url` content type for audio files. + - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. 
+ - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) + - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), + which will be treated as a single prompt to the model according to its chat template. + - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). + - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Extra Parameters + vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. Or directly merge them into the JSON payload if you are using HTTP call directly. @@ -49,7 +62,26 @@ completion = client.chat.completions.create( ) ``` -### Extra Parameters for Chat API +### Extra Parameters for Completions API + +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + +### Extra Parameters for Chat Completions API + The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py @@ -66,21 +98,22 @@ The following extra parameters are supported: :end-before: end-chat-completion-extra-params ``` -### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +### Extra Parameters for Embeddings API + +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params +:start-after: begin-embedding-pooling-params +:end-before: end-embedding-pooling-params ``` The following extra parameters are supported: ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params +:start-after: begin-embedding-extra-params +:end-before: end-embedding-extra-params ``` ## Chat Template @@ -103,6 +136,23 @@ vllm serve --chat-template ./path-to-chat-template.jinja vLLM community provides a set of chat templates for popular models. You can find them in the examples directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +both a `type` and a `text` field. 
An example is provided below: +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} + ] +) +``` +Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which +format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify +between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match +this, unless explicitly specified. + + ## Command line arguments for the server ```{argparse} @@ -157,7 +207,7 @@ vLLM will use guided decoding to ensure the response matches the tool parameter To enable this feature, you should set the following flags: * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use - currently either `hermes` or `mistral` or `llama3_json` or `internlm`. Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. * `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. * `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages @@ -168,7 +218,9 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! -#### Hermes Models + +#### Hermes Models (`hermes`) + All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` @@ -180,7 +232,9 @@ step in their creation_. Flags: `--tool-call-parser hermes` -#### Mistral Models + +#### Mistral Models (`mistral`) + Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. @@ -199,7 +253,9 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` -#### Llama Models + +#### Llama Models (`llama3_json`) + Supported models: * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` @@ -219,17 +275,38 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` -#### Internlm Models + +#### InternLM Models (`internlm`) + Supported models: * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: -* Although this implementation also supports Internlm2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. 
+* Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` +#### Jamba Models (`jamba`) +AI21's Jamba-1.5 models are supported. +* `ai21labs/AI21-Jamba-1.5-Mini` +* `ai21labs/AI21-Jamba-1.5-Large` + + +Flags: `--tool-call-parser jamba` + + +#### IBM Granite (`granite-20b-fc`) + +Supported models: +* `ibm-granite/granite-20b-functioncalling` + +Flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +The example chat template deviates slightly from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + + ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -287,5 +364,5 @@ Then you can use this plugin in the command line like this. --tool-parser-plugin --tool-call-parser example \ --chat-template \ -``` +``` diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst index 674b14a879bc3..227e6fd2a7818 100644 --- a/docs/source/serving/run_on_sky.rst +++ b/docs/source/serving/run_on_sky.rst @@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 .. raw:: html @@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. @@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica. messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst index a44696507fb9a..96a93db94871b 100644 --- a/docs/source/serving/tensorizer.rst +++ b/docs/source/serving/tensorizer.rst @@ -9,4 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to `CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the `vLLM example script `_. \ No newline at end of file +the `vLLM example script `_. + +.. note:: + Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. 
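To tie the tool calling section of the OpenAI-compatible server document above together, here is a minimal client-side sketch (not part of the patch) of invoking a tool-enabled server with the OpenAI Python client. It assumes the server was started with `--enable-auto-tool-choice` and a matching `--tool-call-parser` (for example the `hermes` parser serving a Hermes-2-Pro model) and listens on localhost:8000; the `get_weather` tool definition is purely illustrative.

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# One illustrative tool definition, following the OpenAI function-calling schema.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="NousResearch/Hermes-2-Pro-Llama-3-8B",  # must match the served model
    messages=[{"role": "user", "content": "What is the weather like in Dallas?"}],
    tools=tools,
    tool_choice="auto",
)

# With auto tool choice enabled, the parsed calls appear in `tool_calls`.
print(response.choices[0].message.tool_calls)
```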
diff --git a/examples/florence2_inference.py b/examples/florence2_inference.py new file mode 100644 index 0000000000000..b58ac2e1f7ed4 --- /dev/null +++ b/examples/florence2_inference.py @@ -0,0 +1,44 @@ +''' +Demonstrate prompting of text-to-text +encoder/decoder models, specifically Florence-2 +''' +# TODO(Isotr0py): +# Move to offline_inference_vision_language.py after porting vision backbone +from vllm import LLM, SamplingParams + +dtype = "float" + +# Create a Florence-2 encoder/decoder model instance +llm = LLM( + model="microsoft/Florence-2-base", + tokenizer="facebook/bart-base", + dtype=dtype, + trust_remote_code=True, +) + +prompts = [ + "", "", "", + "", "", "", + "", "", "" +] +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, +) + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Encoder prompt: {encoder_prompt!r}, " + f"Decoder prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 1c6ac06123bbb..050b791b62adb 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -12,14 +12,15 @@ from vllm.utils import FlexibleArgumentParser audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] -question_per_audio_count = [ - "What is recited in the audio?", - "What sport and what nursery rhyme are referenced?" -] +question_per_audio_count = { + 0: "What is 1+1?", + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" 
+} # Ultravox 0.3 -def run_ultravox(question, audio_count): +def run_ultravox(question: str, audio_count: int): model_name = "fixie-ai/ultravox-v0_3" tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -33,18 +34,34 @@ def run_ultravox(question, audio_count): tokenize=False, add_generation_prompt=True) + llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count}) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Qwen2-Audio +def run_qwen2_audio(question: str, audio_count: int): + model_name = "Qwen/Qwen2-Audio-7B-Instruct" + llm = LLM(model=model_name, - enforce_eager=True, - enable_chunked_prefill=False, - max_model_len=8192, + max_model_len=4096, + max_num_seqs=5, limit_mm_per_prompt={"audio": audio_count}) + + audio_in_prompt = "".join([ + f"Audio {idx+1}: " + f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") stop_token_ids = None return llm, prompt, stop_token_ids -model_example_map = { - "ultravox": run_ultravox, -} +model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio} def main(args): @@ -54,7 +71,7 @@ def main(args): audio_count = args.num_audios llm, prompt, stop_token_ids = model_example_map[model]( - question_per_audio_count[audio_count - 1], audio_count) + question_per_audio_count[audio_count], audio_count) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. @@ -62,16 +79,17 @@ def main(args): max_tokens=64, stop_token_ids=stop_token_ids) - assert args.num_prompts > 0 - inputs = { - "prompt": prompt, - "multi_modal_data": { + mm_data = {} + if audio_count > 0: + mm_data = { "audio": [ asset.audio_and_sample_rate for asset in audio_assets[:audio_count] ] - }, - } + } + + assert args.num_prompts > 0 + inputs = {"prompt": prompt, "multi_modal_data": mm_data} if args.num_prompts > 1: # Batch inference inputs = [inputs] * args.num_prompts @@ -100,7 +118,7 @@ def main(args): parser.add_argument("--num-audios", type=int, default=1, - choices=[1, 2], + choices=[0, 1, 2], help="Number of audio items per prompt.") args = parser.parse_args() diff --git a/examples/offline_inference_mlpspeculator.py b/examples/offline_inference_mlpspeculator.py index 5dec4a76afb2f..8f0eb65e47f6a 100644 --- a/examples/offline_inference_mlpspeculator.py +++ b/examples/offline_inference_mlpspeculator.py @@ -50,8 +50,6 @@ def time_generation(llm: LLM, prompts: List[str], llm = LLM( model="meta-llama/Llama-2-13b-chat-hf", speculative_model="ibm-fms/llama-13b-accelerator", - # These are currently required for MLPSpeculator decoding - use_v2_block_manager=True, ) print("With speculation") diff --git a/examples/offline_inference_openai.md b/examples/offline_inference_openai.md index ea34374edd3f9..4c64197975534 100644 --- a/examples/offline_inference_openai.md +++ b/examples/offline_inference_openai.md @@ -35,8 +35,8 @@ ``` $ cat openai_example_batch.jsonl -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": 
"system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` ### Step 2: Run the batch @@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create ``` $ cat openai_example_batch.jsonl -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` Now upload your batch file to your S3 bucket. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 8d6818e7dfd3e..4fd002caf1763 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -1,6 +1,6 @@ """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
@@ -176,6 +176,31 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +# H2OVL-Mississippi +def run_h2ovl(question: str, modality: str): + assert modality == "image" + + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + # InternVL def run_internvl(question: str, modality: str): assert modality == "image" @@ -262,11 +287,15 @@ def run_qwen2_vl(question: str, modality: str): model_name = "Qwen/Qwen2-VL-7B-Instruct" - # Tested on L40 llm = LLM( model=model_name, - max_model_len=8192, + max_model_len=4096, max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, ) prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" @@ -277,6 +306,22 @@ def run_qwen2_vl(question: str, modality: str): return llm, prompt, stop_token_ids +# Pixtral HF-format +def run_pixtral_hf(question: str, modality: str): + assert modality == "image" + + model_name = "mistral-community/pixtral-12b" + + llm = LLM( + model=model_name, + max_model_len=8192, + ) + + prompt = f"[INST]{question}\n[IMG][/INST]" + stop_token_ids = None + return llm, prompt, stop_token_ids + + # LLama 3.2 def run_mllama(question: str, modality: str): assert modality == "image" @@ -300,6 +345,23 @@ def run_mllama(question: str, modality: str): return llm, prompt, stop_token_ids +# Molmo +def run_molmo(question, modality): + assert modality == "image" + + model_name = "allenai/Molmo-7B-D-0924" + + llm = LLM( + model=model_name, + trust_remote_code=True, + dtype="bfloat16", + ) + + prompt = question + stop_token_ids = None + return llm, prompt, stop_token_ids + + # GLM-4v def run_glm4v(question: str, modality: str): assert modality == "image" @@ -326,11 +388,14 @@ def run_glm4v(question: str, modality: str): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "h2ovl_chat": run_h2ovl, "internvl_chat": run_internvl, "NVLM_D": run_nvlm_d, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, + "pixtral_hf": run_pixtral_hf, "mllama": run_mllama, + "molmo": run_molmo, "glm4v": run_glm4v, } @@ -415,7 +480,7 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models') + 'vision language models for text generation') parser.add_argument('--model-type', '-m', type=str, @@ -436,4 +501,4 @@ def main(args): default=16, help='Number of frames to extract from the video.') args = parser.parse_args() - main(args) + main(args) \ No newline at end of file diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py new file mode 100644 index 0000000000000..e1732d045f949 --- /dev/null +++ b/examples/offline_inference_vision_language_embedding.py @@ -0,0 +1,170 @@ +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. 
+ +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args + +from PIL.Image import Image + +from vllm import LLM +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser + + +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + image: Optional[Image] + + +def run_e5_v(query: Query): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if query["modality"] == "text": + text = query["text"] + prompt = llama3_template.format( + f"{text}\nSummary above sentence in one word: ") + image = None + elif query["modality"] == "image": + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="royokong/e5-v", + task="embedding", + max_model_len=4096, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def run_vlm2vec(query: Query): + if query["modality"] == "text": + text = query["text"] + prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 + image = None + elif query["modality"] == "image": + prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501 + image = query["image"] + elif query["modality"] == "text+image": + text = query["text"] + prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="TIGER-Lab/VLM2Vec-Full", + task="embedding", + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def get_query(modality: QueryModality): + if modality == "text": + return TextQuery(modality="text", text="A dog sitting in the grass") + + if modality == "image": + return ImageQuery( + modality="image", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + ), + ) + + if modality == "text+image": + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + ), + ) + + msg = f"Modality {modality} is not supported." 
+ raise ValueError(msg) + + +def run_encode(model: str, modality: QueryModality): + query = get_query(modality) + req_data = model_example_map[model](query) + + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.encode({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) + + for output in outputs: + print(output.outputs.embedding) + + +def main(args: Namespace): + run_encode(args.model_name, args.modality) + + +model_example_map = { + "e5_v": run_e5_v, + "vlm2vec": run_vlm2vec, +} + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--model-name', + '-m', + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help='The name of the embedding model.') + parser.add_argument('--modality', + type=str, + default="image", + choices=get_args(QueryModality), + help='Modality of the input.') + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index c4e4cdc0db95f..d99684078ff3d 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -1,7 +1,7 @@ """ This example shows how to use vLLM for running offline inference with -multi-image input on vision language models, using the chat template defined -by the model. +multi-image input on vision language models for text generation, +using the chat template defined by the model. """ from argparse import Namespace from typing import List, NamedTuple, Optional @@ -107,6 +107,40 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"max_dynamic_patch": 4}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -234,12 +268,36 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ) +def load_mllama(question, image_urls: List[str]) -> ModelRequestData: + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + + # The configuration below has been confirmed to launch on a single L40 GPU. 
+ llm = LLM( + model=model_name, + max_model_len=4096, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + prompt = f"<|image|><|image|><|begin_of_text|>{question}" + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + model_example_map = { "phi3_v": load_phi3v, + "h2ovl_chat": load_h2onvl, "internvl_chat": load_internvl, "NVLM_D": load_nvlm_d, "qwen2_vl": load_qwen2_vl, "qwen_vl_chat": load_qwenvl_chat, + "mllama": load_mllama, } @@ -311,7 +369,8 @@ def main(args: Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input') + 'vision language models that support multi-image input for text ' + 'generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 3b3e0ae64a037..67b755a155966 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,4 +1,5 @@ from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory # NOTE: This is just a running example. For benchmarking purpose, # please see benchmarks/benchmark_prefix_caching.py @@ -28,12 +29,9 @@ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0) -# Create an LLM. +# Create an LLM without prefix caching as a baseline. regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) print("Results without `enable_prefix_caching`") # Generate texts from the prompts. The output is a list of RequestOutput objects @@ -50,6 +48,15 @@ print("-" * 80) +# Destroy the LLM object and free up the GPU memory. +del regular_llm +cleanup_dist_env_and_memory() + +# Create an LLM with prefix caching enabled. +prefix_cached_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4) + # Warmup so that the shared prompt's KV cache is computed. 
prefix_cached_llm.generate(generating_prompts[0], sampling_params) diff --git a/examples/offline_profile.py b/examples/offline_profile.py new file mode 100644 index 0000000000000..1d415b82cddb6 --- /dev/null +++ b/examples/offline_profile.py @@ -0,0 +1,282 @@ +import inspect +import json +import os +import sys +from argparse import RawTextHelpFormatter +from dataclasses import asdict, dataclass +from typing import Optional + +import torch + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.profiler import layerwise_profile +from vllm.utils import FlexibleArgumentParser + +BATCH_SIZE_DEFAULT = 1 +PROMPT_LEN_DEFAULT = 256 +OUTPUT_LEN_DEFAULT = 2 + + +@dataclass +class ProfileContext: + engine_args: EngineArgs + prompt_len: int + output_len: int + batch_size: int + save_chrome_traces_folder: Optional[str] + + +def get_dtype(dtype: str): + if dtype == "torch.float": + return torch.float + else: + return dtype + + +def run_profile(context: ProfileContext, csv_output: Optional[str], + json_output: Optional[str]): + print("Run profile with:") + for key, value in asdict(context).items(): + print(f" {key} = {value}") + + # Create sampling params + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=args.output_len, + ignore_eos=True) + + # Create LLM + llm = LLM(**asdict(context.engine_args)) + batch_size = context.batch_size + prompt_len = context.prompt_len + output_len = context.output_len + + scheduler_config = llm.llm_engine.scheduler_config + max_model_len = llm.llm_engine.model_config.max_model_len + max_num_batched_tokens = scheduler_config.max_num_batched_tokens + max_num_seqs = scheduler_config.max_num_seqs + + if batch_size * prompt_len > max_num_batched_tokens: + print(f"ERROR: chosen batch_size * prompt_len " + f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is " + f"larger than max_num_batched_tokens ({max_num_batched_tokens}) " + f"and therefore cannot be run in a single profile step, please " + f"choose a smaller batch size or prompt length, or increase " + f"--max-num-batched-tokens") + sys.exit(-1) + if batch_size >= max_num_seqs: + print( + f"ERROR: chosen batch_size ({batch_size}) is larger than " + f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a " + f"single profile step, please choose a smaller batch size") + sys.exit(-1) + print("llm.llm_engine.model_config.max_model_len: ", + llm.llm_engine.model_config.max_model_len) + if prompt_len + output_len > llm.llm_engine.model_config.max_model_len: + print( + f"ERROR: chosen prompt_len + output_len ({prompt_len} + " + f"{output_len} = {prompt_len + output_len}) is larger than the " + f"model's max_model_len ({max_model_len}), please choose a smaller " + f"prompt_len or output_len, or increase --max-model-len") + sys.exit(-1) + + def add_requests(): + for i in range(batch_size): + prompt_token_ids = torch.randint( + llm.llm_engine.model_config.get_vocab_size(), + size=(prompt_len, )).tolist() + + llm.llm_engine.add_request( + request_id=f"seq{i}", + prompt={'prompt_token_ids': prompt_token_ids}, + params=sampling_params) + + def abort_requests(): + for i in range(batch_size): + llm.llm_engine.abort_request(f"seq{i}") + + # Warm up run + print("Warm up run ...") + add_requests() + llm.llm_engine.step() # Prefill + llm.llm_engine.step() # Decode + abort_requests() + + print("Profile run ...") + add_requests() + + with layerwise_profile() as prefill_prof: + llm.llm_engine.step() # First step is prefill + + decode_profs = [] + for x in 
range(args.output_len - 1): + with layerwise_profile() as decode_prof: + llm.llm_engine.step() + decode_profs.append(decode_prof) + + decode_results_list = [prof.results for prof in decode_profs] + prefill_results = prefill_prof.results + has_decode = len(decode_results_list) > 0 + + LINE_WIDTH = 80 + print("=" * LINE_WIDTH) + print(f"= Prefill Model Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * LINE_WIDTH) + print() + prefill_results.print_model_table() + + if has_decode: + print() + print("=" * LINE_WIDTH) + print(f"= First Decode Step Model Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * LINE_WIDTH) + print() + decode_results_list[0].print_model_table() + + print() + print("=" * LINE_WIDTH) + print(f"= Prefill Summary Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * LINE_WIDTH) + print() + prefill_results.print_summary_table() + + if has_decode: + print() + print("=" * LINE_WIDTH) + print(f"= First Decode Step Summary Table " + f"(prompt_len={prompt_len}, batch_size={batch_size})") + print("=" * LINE_WIDTH) + print() + decode_results_list[0].print_summary_table() + + if csv_output: + csv_filename_base = csv_output.rstrip(".csv") + prefill_results.export_model_stats_table_csv( + csv_filename_base + "_prefill_model_table.csv") + prefill_results.export_summary_stats_table_csv( + csv_filename_base + "_prefill_summary_table.csv") + + if has_decode: + decode_results_list[0].export_model_stats_table_csv(\ + csv_filename_base + "_decode_model_table.csv") + decode_results_list[0].export_summary_stats_table_csv( + csv_filename_base + "_decode_summary_table.csv") + + if json_output: + cuda_devices = [ + torch.cuda.get_device_properties(dev_idx) + for dev_idx in range(torch.cuda.device_count()) + ] + + json_dict = { + "context": { + "python_version": f"{sys.version}", + "torch_version": f"{torch.__version__}", + "torch_cuda_version": f"{torch.version.cuda}", + "cuda_devices": f"{cuda_devices}", + **asdict(context) + }, + "prefill": prefill_results.convert_stats_to_dict(), + } + + if has_decode: + for idx, dr in enumerate(decode_results_list): + json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() + + for idx, dr in enumerate(decode_results_list[1:]): + json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict() + + with open(json_output.rstrip(".json") + ".json", "w+") as f: + json.dump(json_dict, f, indent=2) + pass + + if context.save_chrome_traces_folder is not None: + os.makedirs(context.save_chrome_traces_folder, exist_ok=True) + prefill_prof.profiler.export_chrome_trace( + context.save_chrome_traces_folder + "/prefill.json") + for idx, decode_prof in enumerate(decode_profs): + decode_prof.profiler.export_chrome_trace( + context.save_chrome_traces_folder + f"/decode_{idx + 1}.json") + print("Traces saved as prefill.json and decode_1.json, etc." 
+ f" in folder {context.save_chrome_traces_folder}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description=""" +Profile a model + + example: + ``` + python examples/offline_profile.py \\ + --model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\ + --prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\ + --enforce-eager + ``` + + then you can use various tools to analyze the json output + terminal ascii tables: + ``` + python tools/profiler/print_layerwise_table.py \\ + --json-trace Llama31-8b-FP8.json --phase prefill --table summary + ``` + or create matplotlib stacked bar charts: + ``` + python tools/profiler/visualize_layerwise_profile.py \\ + --json-trace Llama31-8b-FP8.json \\ + --output-directory profile_breakdown --plot-metric pct_cuda_time + ``` +""", + formatter_class=RawTextHelpFormatter) + parser.add_argument( + "--csv", + type=str, + default=None, + help="Export the results as multiple csv file. This should be the root " + "filename, will create _prefill_model_table.csv, " + "_prefill_summary_table.csv, " + "_decode_model_table.csv, and " + "_decode_summary_table.csv") + parser.add_argument( + "--json", + type=str, + default=None, + help="Export the results as a json file. This should be the filename") + parser.add_argument("--save-chrome-traces-folder", + type=str, + help="Save chrome traces for the prefill and decode " + "will save traces as prefill.json and decode_1.json, " + "etc. inside this folder") + parser.add_argument( + "--prompt-len", + type=int, + default=PROMPT_LEN_DEFAULT, + help=f"Length of the random prompt to use when profiling, all batched " + f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}") + parser.add_argument("--batch-size", + type=int, + default=BATCH_SIZE_DEFAULT, + help=f"Number of requests to run as a single batch, " + f"default={BATCH_SIZE_DEFAULT}") + parser.add_argument( + "--output-len", + type=int, + default=OUTPUT_LEN_DEFAULT, + help="Number of llm steps to run (includes prefill and decode) " + "- default={OUTPUT_LEN_DEFAULT}") + + EngineArgs.add_cli_args(parser) + + args = parser.parse_args() + + context = ProfileContext( + engine_args=EngineArgs.from_cli_args(args), + **{ + k: v + for k, v in vars(args).items() + if k in inspect.signature(ProfileContext).parameters + }) + run_profile(context, csv_output=args.csv, json_output=args.json) diff --git a/examples/openai_audio_api_client.py b/examples/openai_audio_api_client.py deleted file mode 100644 index 80a972683871f..0000000000000 --- a/examples/openai_audio_api_client.py +++ /dev/null @@ -1,90 +0,0 @@ -"""An example showing how to use vLLM to serve VLMs. - -Launch the vLLM server with the following command: -vllm serve fixie-ai/ultravox-v0_3 -""" -import base64 - -import requests -from openai import OpenAI - -from vllm.assets.audio import AudioAsset - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -# Any format supported by librosa is supported -audio_url = AudioAsset("winning_call").url - -# Use audio url in the payload -chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" 
- }, - { - "type": "audio_url", - "audio_url": { - "url": audio_url - }, - }, - ], - }], - model=model, - max_tokens=64, -) - -result = chat_completion_from_url.choices[0].message.content -print(f"Chat completion output:{result}") - - -# Use base64 encoded audio in the payload -def encode_audio_base64_from_url(audio_url: str) -> str: - """Encode an audio retrieved from a remote url to base64 format.""" - - with requests.get(audio_url) as response: - response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') - - return result - - -audio_base64 = encode_audio_base64_from_url(audio_url=audio_url) -chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this audio?" - }, - { - "type": "audio_url", - "audio_url": { - # Any format supported by librosa is supported - "url": f"data:audio/ogg;base64,{audio_base64}" - }, - }, - ], - }], - model=model, - max_tokens=64, -) - -result = chat_completion_from_base64.choices[0].message.content -print(f"Chat completion output:{result}") diff --git a/examples/openai_chat_completion_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py new file mode 100644 index 0000000000000..0ec4f71dddf93 --- /dev/null +++ b/examples/openai_chat_completion_client_for_multimodal.py @@ -0,0 +1,236 @@ +"""An example showing how to use vLLM to serve multimodal models +and run online inference with OpenAI client. + +Launch the vLLM server with the following command: + +(single image inference with Llava) +vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja + +(multi-image inference with Phi-3.5-vision-instruct) +vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + +(audio inference with Ultravox) +vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 +""" +import base64 + +import requests +from openai import OpenAI + +from vllm.assets.audio import AudioAsset +from vllm.utils import FlexibleArgumentParser + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + + +def encode_base64_content_from_url(content_url: str) -> str: + """Encode a content retrieved from a remote url to base64 format.""" + + with requests.get(content_url) as response: + response.raise_for_status() + result = base64.b64encode(response.content).decode('utf-8') + + return result + + +# Text-only inference +def run_text_only() -> None: + chat_completion = client.chat.completions.create( + messages=[{ + "role": "user", + "content": "What's the capital of France?" + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion.choices[0].message.content + print("Chat completion output:", result) + + +# Single-image input inference +def run_single_image() -> None: + + ## Use image url in the payload + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" 
+ }, + { + "type": "image_url", + "image_url": { + "url": image_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from image url:", result) + + ## Use base64 encoded image in the payload + image_base64 = encode_base64_content_from_url(image_url) + chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from base64 encoded image:", result) + + +# Multi-image input inference +def run_multi_image() -> None: + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What are the animals in these images?" + }, + { + "type": "image_url", + "image_url": { + "url": image_url_duck + }, + }, + { + "type": "image_url", + "image_url": { + "url": image_url_lion + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output:", result) + + +# Audio input inference +def run_audio() -> None: + # Any format supported by librosa is supported + audio_url = AudioAsset("winning_call").url + + # Use audio url in the payload + chat_completion_from_url = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" + }, + { + "type": "audio_url", + "audio_url": { + "url": audio_url + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_url.choices[0].message.content + print("Chat completion output from audio url:", result) + + audio_base64 = encode_base64_content_from_url(audio_url) + chat_completion_from_base64 = client.chat.completions.create( + messages=[{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "What's in this audio?" 
+ }, + { + "type": "audio_url", + "audio_url": { + # Any format supported by librosa is supported + "url": f"data:audio/ogg;base64,{audio_base64}" + }, + }, + ], + }], + model=model, + max_completion_tokens=64, + ) + + result = chat_completion_from_base64.choices[0].message.content + print("Chat completion output from base64 encoded audio:", result) + + +example_function_map = { + "text-only": run_text_only, + "single-image": run_single_image, + "multi-image": run_multi_image, + "audio": run_audio, +} + + +def main(args) -> None: + chat_type = args.chat_type + example_function_map[chat_type]() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using OpenAI client for online inference with ' + 'multimodal language models served with vLLM.') + parser.add_argument( + '--chat-type', + '-c', + type=str, + default="single-image", + choices=["text-only", "single-image", "multi-image", "audio"], + help='Conversation type with multimodal data.') + args = parser.parse_args() + main(args) diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py new file mode 100644 index 0000000000000..effb588e1387f --- /dev/null +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -0,0 +1,33 @@ +import requests + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + +response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." 
+ }, + ], + }], + "encoding_format": + "float", + }, +) +response.raise_for_status() +response_json = response.json() + +print("Embedding output:", response_json["data"][0]["embedding"]) diff --git a/examples/openai_example_batch.jsonl b/examples/openai_example_batch.jsonl index 5aa7e185c180a..54ac8c813ddb7 100644 --- a/examples/openai_example_batch.jsonl +++ b/examples/openai_example_batch.jsonl @@ -1,2 +1,2 @@ -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py deleted file mode 100644 index 71ae03e4d148b..0000000000000 --- a/examples/openai_vision_api_client.py +++ /dev/null @@ -1,126 +0,0 @@ -"""An example showing how to use vLLM to serve VLMs. - -Launch the vLLM server with the following command: - -(single image inference with Llava) -vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja - -(multi-image inference with Phi-3.5-vision-instruct) -vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ - --trust-remote-code --limit-mm-per-prompt image=2 -""" -import base64 - -import requests -from openai import OpenAI - -# Modify OpenAI's API key and API base to use vLLM's API server. -openai_api_key = "EMPTY" -openai_api_base = "http://localhost:8000/v1" - -client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") - api_key=openai_api_key, - base_url=openai_api_base, -) - -models = client.models.list() -model = models.data[0].id - -# Single-image input inference -image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - -## Use image url in the payload -chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this image?" 
- }, - { - "type": "image_url", - "image_url": { - "url": image_url - }, - }, - ], - }], - model=model, - max_tokens=64, -) - -result = chat_completion_from_url.choices[0].message.content -print("Chat completion output:", result) - - -## Use base64 encoded image in the payload -def encode_image_base64_from_url(image_url: str) -> str: - """Encode an image retrieved from a remote url to base64 format.""" - - with requests.get(image_url) as response: - response.raise_for_status() - result = base64.b64encode(response.content).decode('utf-8') - - return result - - -image_base64 = encode_image_base64_from_url(image_url=image_url) -chat_completion_from_base64 = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What's in this image?" - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, - }, - ], - }], - model=model, - max_tokens=64, -) - -result = chat_completion_from_base64.choices[0].message.content -print(f"Chat completion output:{result}") - -# Multi-image input inference -image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" -image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" -chat_completion_from_url = client.chat.completions.create( - messages=[{ - "role": - "user", - "content": [ - { - "type": "text", - "text": "What are the animals in these images?" - }, - { - "type": "image_url", - "image_url": { - "url": image_url_duck - }, - }, - { - "type": "image_url", - "image_url": { - "url": image_url_lion - }, - }, - ], - }], - model=model, - max_tokens=64, -) - -result = chat_completion_from_url.choices[0].message.content -print("Chat completion output:", result) diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja new file mode 100644 index 0000000000000..489b99604af38 --- /dev/null +++ b/examples/template_vlm2vec.jinja @@ -0,0 +1,16 @@ +{%- if messages | length > 1 -%} + {{ raise_exception('Embedding models should only embed one message at a time') }} +{%- endif -%} + +{% set vars = namespace(parts=[], next_image_id=1) %} +{%- for message in messages -%} + {%- for content in message['content'] -%} + {%- if content['type'] == 'text' -%} + {%- set vars.parts = vars.parts + [content['text']] %} + {%- elif content['type'] == 'image' -%} + {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} + {%- set vars.next_image_id = vars.next_image_id + 1 %} + {%- endif -%} + {%- endfor -%} +{%- endfor -%} +{{ vars.parts | join(' ') }} diff --git a/examples/tool_chat_template_granite_20b_fc.jinja b/examples/tool_chat_template_granite_20b_fc.jinja new file mode 100644 index 0000000000000..cb52188ec72d9 --- /dev/null +++ b/examples/tool_chat_template_granite_20b_fc.jinja @@ -0,0 +1,130 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif 
json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + +{%- if not full_function_description is defined %} + {%- set full_function_description = false %} +{%- endif %} + +{%- macro full_description(tool) %} + {{- tool.name + '(' }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- endif %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} +{%- endmacro %} + +{%- macro simple_description(tool) %} + {{- tool.description }} +{%- endmacro %} + +{%- macro function_description(tool) %} + {%- if full_function_description %} + {{- full_description(tool) }} + {%- else %} + {{- simple_description(tool) }} + {%- endif %} +{%- endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set sys_prompt = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {% set sys_prompt = 'You are a helpful assistant with access to the following function calls. Your task is to understand the given conversation with function calls and responses and generate natural language response as the ASSISTANT to continue the conversation. You may use the following function calls to understand how to respond to the user query.' %} +{%- endif %} + +{{ 'SYSTEM: ' + sys_prompt }} +{% if tools is iterable and tools | length > 0 %} +<|function_call_library|> + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + function_description(tool) }} + {{- ', "parameters": ' }} + {%- if not tool.parameters is defined or tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +If none of the functions are relevant or the given question lacks the parameters required by the function, please output \" {\"name\": \"no_function\", \"arguments\": {}}\". 
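+{#- Illustrative note (added for clarity, not part of the upstream template): in the message loop below, an assistant tool call is rendered as ASSISTANT: {"name": <function name>, "arguments": <arguments JSON>} followed by <|endoftext|>. -#}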
+{%- endif %} + + + +{% for message in messages %} + {% if message['role'] == 'user' %} + {{- '\nUSER: ' + message['content'] }} + {% elif message['role'] == 'assistant' and message.tool_calls is defined %} + {{- '\nASSISTANT:' }} + {% for tc in message.tool_calls %} + {{- ' ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} + {% endfor %} + {{- '<|endoftext|>' }} + {% elif message['role'] == 'assistant' %} + {{- '\nASSISTANT: ' + message['content'] + ' <|endoftext|>' }} + {% elif message['role'] == 'tool' %} + {{- ' ' + message['content'] }} + {%- else %} + {{- raise_exception("Unexpected combination of role and message content") }} + {% endif %} + {% if loop.last and add_generation_prompt %} + {{- '\nASSISTANT: ' }} + {% endif %} +{% endfor %} diff --git a/format.sh b/format.sh index 1ac028d00e3a4..be6ee0ce46dcb 100755 --- a/format.sh +++ b/format.sh @@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")" ROOT="$(git rev-parse --show-toplevel)" builtin cd "$ROOT" || exit 1 +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" + exit 1 + fi +} + +check_command yapf +check_command ruff +check_command mypy +check_command codespell +check_command isort +check_command clang-format + YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') @@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { if [[ $2 != $3 ]]; then - echo "Wrong $1 version installed: $3 is required, not $2." + echo "❓❓Wrong $1 version installed: $3 is required, not $2." exit 1 fi } @@ -281,10 +295,12 @@ tools/actionlint.sh -color echo 'vLLM actionlint: Done' if ! git diff --quiet &>/dev/null; then - echo 'Reformatted files. Please review and stage the changes.' - echo 'Changes not staged for commit:' - echo + echo + echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" git --no-pager diff --name-only + echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." exit 1 +else + echo "✹🎉 Format check passed! Congratulations! 🎉✹" fi diff --git a/pyproject.toml b/pyproject.toml index c9057b061aad9..0bbab3cd3fbc3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,12 +6,15 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.4.0", + "torch == 2.5.1", "wheel", "jinja2", ] build-backend = "setuptools.build_meta" +[tool.setuptools_scm] +# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` + [tool.ruff] # Allow lines to be as long as 80. 
line-length = 80 diff --git a/python_only_dev.py b/python_only_dev.py index 72d4e78ee14f6..4ab203bb6f9d6 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -39,7 +39,6 @@ files_to_copy = [ "vllm/_C.abi3.so", - "vllm/_core_C.abi3.so", "vllm/_moe_C.abi3.so", "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", "vllm/vllm_flash_attn/flash_attn_interface.py", diff --git a/requirements-build.txt b/requirements-build.txt index 6144a56da8c47..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,9 +1,9 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=61 -setuptools-scm>=8 -torch==2.4.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=61 +setuptools-scm>=8 +torch==2.5.1 +wheel +jinja2 diff --git a/requirements-common.txt b/requirements-common.txt index aa165ff6d6a5e..ef5ed8b645158 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -4,13 +4,13 @@ numpy < 2.0.0 requests >= 2.26.0 tqdm py-cpuinfo -transformers >= 4.45.0 # Required for Llama 3.2. +transformers >= 4.45.2 # Required for Llama 3.2 and Qwen2-VL. tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' aiohttp -openai >= 1.40.0 # Ensure modern openai package (ensure types module present) +openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) uvicorn[standard] pydantic >= 2.9 # Required for fastapi >= 0.113.0 pillow # Required for image processing @@ -31,3 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. +compressed-tensors == 0.7.1 # required for compressed-tensors diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 3b3c2f876919e..058ab7c1ee9df 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -3,8 +3,8 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 -nvidia-ml-py # for pynvml package -torch == 2.4.0 +nvidia-ml-py >= 12.560.30 # for pynvml package +torch == 2.5.1 # These must be updated alongside torch -torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0 +torchvision == 0.20.1 # Required for phi3v processor. 
See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 diff --git a/requirements-lint.txt b/requirements-lint.txt index 07f738873e1a8..f9132bbf96437 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,7 +1,7 @@ # formatting yapf==0.32.0 toml==0.10.2 -tomli==2.0.1 +tomli==2.0.2 ruff==0.6.5 codespell==2.3.0 isort==5.13.2 diff --git a/requirements-openvino.txt b/requirements-openvino.txt index ac54cf0c3288f..95e5914757812 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,7 +1,7 @@ # Common dependencies -r requirements-common.txt -torch == 2.4.0 # should be aligned with "common" vLLM torch version +torch == 2.5.1 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version diff --git a/requirements-test.in b/requirements-test.in new file mode 100644 index 0000000000000..560c005fd6157 --- /dev/null +++ b/requirements-test.in @@ -0,0 +1,37 @@ +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +einops # required for MPT, qwen-vl and Mamba +httpx +librosa # required for audio tests +opencv-python # required for video tests +peft +requests +ray[adag]==2.35 +sentence-transformers # required for embedding +soundfile # required for audio test +timm # required for internvl test +torch==2.5.1 +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +datamodel_code_generator # required for minicpm3 test +lm-eval[api]==0.4.4 # required for model evaluation test + +# TODO: Add this after fully implementing llava(mantis) +# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test + +# Benchmarking +aiohttp + +# quantization +bitsandbytes>=0.44.0 +buildkite-test-collector==0.1.9 + +numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index 997df9afac763..518e81021cbcb 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,35 +1,561 @@ -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# pip-compile --output-file=requirements-test.txt requirements-test.in +# +absl-py==2.1.0 + # via rouge-score +accelerate==1.0.1 + # via + # lm-eval + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # -r requirements-test.in + # datasets + # fsspec + # lm-eval +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via httpx +argcomplete==3.5.1 + # via datamodel-code-generator +attrs==24.2.0 + # via + # aiohttp + # jsonlines + # jsonschema + # referencing +audioread==3.0.1 + # via librosa +awscli==1.35.19 + # via -r requirements-test.in +bitsandbytes==0.44.1 + # via -r requirements-test.in +black==24.10.0 + # via datamodel-code-generator +boto3==1.35.53 + # via tensorizer +botocore==1.35.53 + # via + # awscli + # boto3 + # s3transfer +buildkite-test-collector==0.1.9 + # via -r requirements-test.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via soundfile +chardet==5.2.0 + # via mbstrdecoder 
+charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # black + # nltk + # ray +colorama==0.4.6 + # via + # awscli + # sacrebleu + # tqdm-multiprocess +contourpy==1.3.0 + # via matplotlib +cupy-cuda12x==13.3.0 + # via ray +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.26.2 + # via -r requirements-test.in +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +datasets==3.0.2 + # via + # evaluate + # lm-eval +decorator==5.1.1 + # via librosa +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +dnspython==2.7.0 + # via email-validator +docutils==0.16 + # via awscli +einops==0.8.0 + # via -r requirements-test.in +email-validator==2.2.0 + # via pydantic +evaluate==0.4.3 + # via lm-eval +fastrlock==0.8.2 + # via cupy-cuda12x +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers + # triton +fonttools==4.54.1 + # via matplotlib +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec[http]==2024.9.0 + # via + # datasets + # evaluate + # huggingface-hub + # torch +genson==1.3.0 + # via datamodel-code-generator +h11==0.14.0 + # via httpcore +hiredis==3.0.0 + # via tensorizer +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via -r requirements-test.in +huggingface-hub==0.26.2 + # via + # accelerate + # datasets + # evaluate + # peft + # sentence-transformers + # timm + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # email-validator + # httpx + # requests + # yarl +inflect==5.6.2 + # via datamodel-code-generator +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via + # datamodel-code-generator + # torch +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.4.2 + # via + # librosa + # nltk + # scikit-learn +jsonlines==4.0.0 + # via lm-eval +jsonschema==4.23.0 + # via ray +jsonschema-specifications==2024.10.1 + # via jsonschema +kiwisolver==1.4.7 + # via matplotlib +lazy-loader==0.4 + # via librosa +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements-test.in +llvmlite==0.43.0 + # via numba +lm-eval[api]==0.4.4 + # via -r requirements-test.in +lxml==5.3.0 + # via sacrebleu +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.9.2 + # via -r requirements-test.in +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +more-itertools==10.5.0 + # via lm-eval +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via + # librosa + # ray +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 + # via black +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via rouge-score +numba==0.60.0 + # via librosa +numexpr==2.10.1 + # via lm-eval +numpy==1.26.4 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # contourpy + # cupy-cuda12x + # datasets + # evaluate + # librosa + # matplotlib + # numba + # numexpr + # opencv-python + # pandas + # peft + # rouge-score + # sacrebleu + # scikit-learn + # scipy + # soxr + # tensorizer + # torchvision + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + 
# nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opencv-python==4.10.0.84 + # via -r requirements-test.in +packaging==24.1 + # via + # accelerate + # black + # datamodel-code-generator + # datasets + # evaluate + # huggingface-hub + # lazy-loader + # matplotlib + # peft + # pooch + # pytest + # pytest-rerunfailures + # ray + # transformers + # typepy +pandas==2.2.3 + # via + # datasets + # evaluate +pathspec==0.12.1 + # via black +pathvalidate==3.2.1 + # via pytablewriter +peft==0.13.2 + # via + # -r requirements-test.in + # lm-eval +pillow==11.0.0 + # via + # matplotlib + # sentence-transformers + # torchvision +platformdirs==4.3.6 + # via + # black + # pooch +pluggy==1.5.0 + # via pytest +pooch==1.8.2 + # via librosa +portalocker==2.10.1 + # via sacrebleu +propcache==0.2.0 + # via yarl +protobuf==5.28.3 + # via + # ray + # tensorizer +psutil==6.1.0 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +pyarrow==18.0.0 + # via datasets +pyasn1==0.6.1 + # via rsa +pybind11==2.13.6 + # via lm-eval +pycparser==2.22 + # via cffi +pydantic[email]==2.9.2 + # via datamodel-code-generator +pydantic-core==2.23.4 + # via pydantic +pyparsing==3.2.0 + # via matplotlib +pytablewriter==1.2.0 + # via lm-eval +pytest==8.3.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # pytest-asyncio + # pytest-forked + # pytest-rerunfailures + # pytest-shard +pytest-asyncio==0.24.0 + # via -r requirements-test.in +pytest-forked==1.6.0 + # via -r requirements-test.in +pytest-rerunfailures==14.0 + # via -r requirements-test.in +pytest-shard==0.1.2 + # via -r requirements-test.in +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas + # typepy +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via + # accelerate + # awscli + # datamodel-code-generator + # datasets + # huggingface-hub + # peft + # ray + # timm + # transformers +ray[adag]==2.35.0 + # via -r requirements-test.in +redis==5.2.0 + # via tensorizer +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.9.11 + # via + # nltk + # sacrebleu + # tiktoken + # transformers +requests==2.32.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # datasets + # evaluate + # huggingface-hub + # lm-eval + # pooch + # ray + # tiktoken + # transformers +rouge-score==0.1.2 + # via lm-eval +rpds-py==0.20.1 + # via + # jsonschema + # referencing +rsa==4.7.2 + # via awscli +s3transfer==0.10.3 + # via + # awscli + # boto3 +sacrebleu==2.4.3 + # via lm-eval +safetensors==0.4.5 + # via + # accelerate + # peft + # timm + # transformers +scikit-learn==1.5.2 + # via + # librosa + # lm-eval + # sentence-transformers +scipy==1.13.1 + # via + # librosa + # scikit-learn + # sentence-transformers +sentence-transformers==3.2.1 + # via -r requirements-test.in +six==1.16.0 + # via + # python-dateutil + # rouge-score +sniffio==1.3.1 + # via + # anyio + # httpx +soundfile==0.12.1 + # via + # -r requirements-test.in + # librosa +soxr==0.5.0.post1 + # via librosa +sqlitedict==2.1.0 + # via lm-eval +sympy==1.13.1 + # via torch +tabledata==1.3.3 + # via pytablewriter +tabulate==0.9.0 + # via sacrebleu +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via lm-eval +tensorizer==2.9.0 + # via -r requirements-test.in +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.8.0 + # via lm-eval 
+timm==1.0.11 + # via -r requirements-test.in +tokenizers==0.20.1 + # via transformers +torch==2.5.1 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # lm-eval + # peft + # sentence-transformers + # tensorizer + # timm + # torchvision +torchvision==0.20.1 + # via timm +tqdm==4.66.6 + # via + # datasets + # evaluate + # huggingface-hub + # lm-eval + # nltk + # peft + # sentence-transformers + # tqdm-multiprocess + # transformers +tqdm-multiprocess==0.0.11 + # via lm-eval +transformers==4.45.2 + # via + # lm-eval + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements-test.in +triton==3.1.0 + # via torch +typepy[datetime]==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # huggingface-hub + # librosa + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==1.26.20 + # via + # botocore + # requests +word2number==1.1 + # via lm-eval +xxhash==3.5.0 + # via + # datasets + # evaluate +yarl==1.17.1 + # via aiohttp +zstandard==0.23.0 + # via lm-eval -# testing utils -awscli -einops # required for MPT, qwen-vl and Mamba -httpx -librosa # required for audio tests -opencv-python # required for video tests -peft -requests -ray[adag]==2.35 -sentence-transformers # required for embedding -soundfile # required for audio test -compressed-tensors==0.4.0 # required for compressed-tensors -timm # required for internvl test -transformers_stream_generator # required for qwen-vl test -matplotlib # required for qwen-vl test -datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test - -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - -# Benchmarking -aiohttp - -# quantization -bitsandbytes>=0.44.0 -buildkite-test-collector==0.1.8 +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/requirements-xpu.txt b/requirements-xpu.txt index ce83a178c618f..eb76a33dab5c2 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -13,4 +13,4 @@ torch == 2.3.1+cxx11.abi intel-extension-for-pytorch == 2.3.110+xpu oneccl_bind_pt == 2.3.100+xpu -triton-xpu == 3.0.0b2 +triton-xpu == 3.0.0b1 diff --git a/setup.py b/setup.py index c3a462cae8a5d..13d2d8faa8805 100644 --- a/setup.py +++ b/setup.py @@ -157,6 +157,14 @@ def configure(self, ext: CMakeExtension) -> None: # on subsequent calls to python. cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] + # Override the base directory for FetchContent downloads to $ROOT/.deps + # This allows sharing dependencies between profiles, + # and plays more nicely with sccache. + # To override this, set the FETCHCONTENT_BASE_DIR environment variable. 
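+        # e.g. (illustrative): FETCHCONTENT_BASE_DIR=/path/to/shared/.deps pip install -e .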
+ fc_base_dir = os.path.join(ROOT_DIR, ".deps") + fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir) + cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)] + # # Setup parallelism and build tool # @@ -308,11 +316,6 @@ def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() -def _build_core_ext() -> bool: - return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu() - or _is_hpu()) - - def get_hipcc_rocm_version(): # Run the hipcc --version command result = subprocess.run(['hipcc', '--version'], @@ -500,9 +503,6 @@ def _read_requirements(filename: str) -> List[str]: ext_modules = [] -if _build_core_ext(): - ext_modules.append(CMakeExtension(name="vllm._core_C")) - if _is_cuda() or _is_hip(): ext_modules.append(CMakeExtension(name="vllm._moe_C")) diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1903a7582dc89..8a04693ba676d 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -12,11 +12,11 @@ from vllm import SamplingParams from vllm.config import ParallelConfig +from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine from vllm.outputs import RequestOutput as RealRequestOutput from vllm.sampling_params import RequestOutputKind -from ..conftest import cleanup from ..utils import wait_for_gpu_memory_to_clear @@ -157,7 +157,7 @@ async def async_engine(): engine.shutdown_background_loop() del engine await asyncio.sleep(0.1) - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture() diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0fe88e792520a..79647589d5204 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -11,7 +11,7 @@ import pytest from vllm import LLM -from vllm.utils import is_hip +from vllm.platforms import current_platform from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..models.utils import check_outputs_equal @@ -19,7 +19,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") @@ -51,7 +51,7 @@ def test_models( enforce_eager: bool, ) -> None: - if backend == "FLASHINFER" and is_hip(): + if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index e8819688c9e83..cc5bc2aca27c9 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -11,21 +11,17 @@ import pytest +from tests.kernels.utils import override_backend_env_variable + from ..models.utils import check_logprobs_close, check_outputs_equal -from ..utils import check_deprecated_block_manager_usage, multi_gpu_test +from ..utils import multi_gpu_test MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] -@pytest.fixture(scope="module", autouse=True) -def check_deprecated_block_manager(): - check_deprecated_block_manager_usage( - 'tests/basic_correctness/test_chunked_prefill.py') - - @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", 
[32]) @@ -34,6 +30,7 @@ def check_deprecated_block_manager(): # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models( hf_runner, vllm_runner, @@ -44,11 +41,15 @@ def test_models( chunked_prefill_token_size: int, enforce_eager: bool, tensor_parallel_size: int, + attention_backend: str, + monkeypatch, ) -> None: """ Checks exact match decode between huggingface model and vllm runner with chunked prefill. """ + override_backend_env_variable(monkeypatch, attention_backend) + max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size @@ -77,13 +78,18 @@ def test_models( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models_distributed( hf_runner, vllm_runner, example_prompts, model: str, distributed_executor_backend: str, + attention_backend: str, + monkeypatch, ) -> None: + override_backend_env_variable(monkeypatch, attention_backend) + if (model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray"): # test ray adag @@ -197,7 +203,6 @@ def test_models_with_fp8_kv_cache( @pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("enforce_eager", [False]) @pytest.mark.parametrize("chunk_size", [30, 32]) -@pytest.mark.parametrize("use_v2_block_manager", [False, True]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
@pytest.mark.parametrize("tensor_parallel_size", [1]) @@ -206,7 +211,6 @@ def test_with_prefix_caching( max_tokens: int, enforce_eager: bool, chunk_size: int, - use_v2_block_manager: bool, tensor_parallel_size: int, ) -> None: """ @@ -234,7 +238,6 @@ def test_with_prefix_caching( enable_chunked_prefill=True, enable_prefix_caching=enable, tensor_parallel_size=tensor_parallel_size, - use_v2_block_manager=use_v2_block_manager, enforce_eager=enforce_eager, max_num_seqs=max_num_seqs, ) as vllm_model: diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index a5df5639cf948..d7f36a7812802 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -2,5 +2,5 @@ def test_cpu_offload(): - compare_two_settings("meta-llama/Llama-2-7b-hf", [], - ["--cpu-offload-gb", "4"]) + compare_two_settings("meta-llama/Llama-3.2-1B", [], + ["--cpu-offload-gb", "1"]) diff --git a/tests/compile/piecewise/__init__.py b/tests/compile/piecewise/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json new file mode 100644 index 0000000000000..03d077b76f627 --- /dev/null +++ b/tests/compile/piecewise/piecewise_compilation_config.json @@ -0,0 +1,4 @@ +{ + "use_cudagraph": true, + "non_cudagraph_ops": ["silly.attention"] +} \ No newline at end of file diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py new file mode 100644 index 0000000000000..d151d62516b07 --- /dev/null +++ b/tests/compile/piecewise/test_simple.py @@ -0,0 +1,108 @@ +""" +Test the piecewise compilation with a simple model so that we +can exactly calculate the expected output and side effects. 
+""" +import os + +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.compilation.levels import CompilationLevel +from vllm.utils import direct_register_custom_op + +os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + +global_counter = 0 + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + global global_counter + global_counter += 1 + print(f"{global_counter=}") + out.copy_(q) + out[0] += 1 + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@support_torch_compile +class SillyModel(nn.Module): + + def __init__(self) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Overall effect: + x += 1 + x[0] += 2 + global_counter += 2 + """ + x = x + 1 + x = x + 2 + out = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out) + x = out + x = x - 2 + x = x - 1 + out = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out) + x = out + x = x + 1 + return x + + +def test_simple_piecewise_compile(): + + model = SillyModel() + + directory = os.path.dirname(__file__) + config = os.path.join(directory, "piecewise_compilation_config.json") + os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config + + input_buffer = torch.randn(100).cuda() + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + num_inductor_compilations=3, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + + with set_compile_context([1, 2]): + model(input_buffer) + + model(input_buffer[:2]) + model(input_buffer[:1]) + + input_buffer[:2].zero_() + global global_counter + global_counter = 0 + output = model(input_buffer[:2]) + assert global_counter == 2 + assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) + + # clean up to avoid side effects for other tests + del os.environ["VLLM_TORCH_COMPILE_CONFIG"] diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py new file mode 100644 index 0000000000000..e3e5a7d0fc5a5 --- /dev/null +++ b/tests/compile/piecewise/test_toy_llama.py @@ -0,0 +1,346 @@ +""" +Test the piecewise compilation with a simple model, comparing the output +with and without the piecewise compilation. 
+""" +import os +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.config import CompilationConfig +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.compilation.levels import CompilationLevel +from vllm.plugins import set_compilation_config +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@dataclass +class LlamaConfig: + hidden_size: int = 128 + mlp_size: int = 256 + vocab_size: int = 128 + num_layers: int = 2 + + +class LlamaMLP(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.gate_up_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.mlp_size * 2, + bias=False, + ) + self.down_projection = nn.Linear( + in_features=config.mlp_size, + out_features=config.hidden_size, + bias=False, + ) + + self.gate_up_projection.weight.data.fill_(0.0) + self.down_projection.weight.data.fill_(0.0) + + def forward(self, x): + x = self.gate_up_projection(x) + x = x[:, :x.size(1) // 2] * torch.nn.functional.relu( + x[:, x.size(1) // 2:]) + x = self.down_projection(x) + return x + + +class LlamaAttention(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.qkv_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size * 3, + ) + + self.output_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size, + ) + + self.qkv_projection.weight.data.fill_(0.0) + self.output_projection.weight.data.fill_(0.0) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv = self.qkv_projection(hidden_states) + hidden_size = qkv.size(-1) // 3 + q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) + + q = q + positions.unsqueeze(1) + k = k + positions.unsqueeze(1) + + attn_output = torch.empty_like(q) + torch.ops.silly.attention(q, k, v, attn_output) + + output = self.output_projection(attn_output) + return output + + +class LlamaDecoderLayer(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.self_attention = LlamaAttention(config) + self.mlp = LlamaMLP(config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = hidden_states / 2 + else: + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = hidden_states / 2 + + hidden_states = self.self_attention(positions=positions, + hidden_states=hidden_states) + + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = hidden_states / 2 + hidden_states = self.mlp(hidden_states) + + return 
hidden_states, residual + + +class LlamaModel(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.embedding_tokens = nn.Embedding( + num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size, + ) + self.layers = nn.ModuleList( + [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) + + self.embedding_tokens.weight.data.fill_(0.0) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.embedding_tokens(input_ids) + residual = None + for layer in self.layers: + hidden_states, residual = layer(positions, hidden_states, residual) + return hidden_states + + +@torch.inference_mode +def run_model(llama_config, + use_compile: bool, + split_attn: bool = False) -> torch.Tensor: + + if use_compile: + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str( + CompilationLevel.PIECEWISE) + + if split_attn: + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["silly.attention"], + )) + else: + set_compilation_config(CompilationConfig(use_cudagraph=True, )) + else: + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str( + CompilationLevel.NO_COMPILATION) + set_compilation_config(None) + + cls = LlamaModel + if use_compile: + cls = support_torch_compile(LlamaModel) + model = cls(llama_config).eval().cuda() + + B = 16 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda() + + with set_compile_context([1, 2]): + model(input_ids, positions) + model(input_ids[:2], positions[:2]) + model(input_ids[:1], positions[:1]) + + input_ids[:2].zero_() + output = model(input_ids[:2], positions[:2]) + + # manual cleanup + del os.environ["VLLM_TORCH_COMPILE_LEVEL"] + set_compilation_config(None) + + return output.cpu() + + +def test_toy_llama(): + # compare output with and without piecewise compilation + + llama_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2) + + outputs = [] + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_inductor_compilations=0, + num_cudagraph_caputured=0, + ): + outputs.append(run_model(llama_config, use_compile=False)) + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=1, + num_piecewise_capturable_graphs_seen=1, + num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured= + 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append(run_model(llama_config, use_compile=True)) + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=2 * llama_config.num_layers + + 1, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=1 + + llama_config.num_layers, # 1 + num_layers + num_inductor_compilations=1 + + llama_config.num_layers, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured=2 * + (1 + llama_config.num_layers + ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append( + run_model(llama_config, use_compile=True, split_attn=True)) + + for i in range(1, len(outputs)): + assert torch.allclose(outputs[0], outputs[i]) + + +@torch.inference_mode +def benchmark(): + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + from triton.testing import do_bench + cls = support_torch_compile(LlamaModel) + + # similar to llama 
3.1-8B + llama_config = LlamaConfig(hidden_size=4096, + mlp_size=14336, + vocab_size=128 * 1024, + num_layers=32) + + # a tiny model to measure the overhead + # of piecewise cudagraph + llama_config = LlamaConfig(hidden_size=40, + mlp_size=80, + vocab_size=128, + num_layers=2) + + cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)] + + eager_time = {} + full_cudagraph_time = {} + piecewise_cudagraph_time = {} + + pool = torch.cuda.graph_pool_handle() + + for piecewise in [False, True]: + if piecewise: + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["silly.attention"], + )) + else: + set_compilation_config(None) + + model = cls(llama_config).eval().cuda().to(torch.bfloat16) + + B = 256 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda().to(torch.bfloat16) + + graphs = {} + + with set_compile_context(cudagraph_sizes): + model(input_ids, positions) + for b in cudagraph_sizes[::-1]: + if not piecewise: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=pool): + output = model(input_ids[:b], positions[:b]) + graphs[b] = (graph, output) + else: + output = model(input_ids[:b], positions[:b]) + graphs[b] = (model, output) + for b in cudagraph_sizes: + if piecewise: + # noqa is for `Function definition does not bind loop variable` + # it will be problematic if we save the created lambda function + # and use it later, because it will look up the name `b` in the + # enclosing scope, and the value of `b` will always be 256. + # it is fine here, because we only use the lambda function once. + runtime = do_bench(lambda: graphs[b][0] # noqa + (input_ids[:b], positions[:b])) # noqa + piecewise_cudagraph_time[b] = runtime + else: + runtime = do_bench(lambda: graphs[b][0].replay()) # noqa + eager_runtime = do_bench( + lambda: model(input_ids[:b], positions[:b])) # noqa + full_cudagraph_time[b] = runtime + eager_time[b] = eager_runtime + + # print in tabular format + print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") + for b in cudagraph_sizes: + print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}")) + + +if __name__ == "__main__": + benchmark() diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b6ec7413978f4..833589ba5dc9f 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,3 +1,4 @@ +import dataclasses from typing import Dict, List, Optional import pytest @@ -8,41 +9,118 @@ from ..utils import compare_all_settings +@dataclasses.dataclass +class TestSetting: + model: str + model_args: List[str] + pp_size: int + tp_size: int + attn_backend: str + method: str + fullgraph: bool + + +# representative settings for testing +test_settings = [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + 
model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="FLASHINFER", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), +] + + # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize( - "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", - [ - ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", - True), - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", - ["--quantization", "compressed-tensors" - ], 1, 1, "FLASH_ATTN", "generate", True), - ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True), - # TODO: add multi-modality test for llava - ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) - ]) -def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, - method, fullgraph): +@pytest.mark.parametrize("test_setting", test_settings) +def test_compile_correctness(test_setting: TestSetting): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. + model = test_setting.model + model_args = test_setting.model_args + pp_size = test_setting.pp_size + tp_size = test_setting.tp_size + attn_backend = test_setting.attn_backend + method = test_setting.method + fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") import os os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - if not fullgraph: - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" - all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"] - + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3 - # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case - # inductor will change the output, so we cannot compare them. - all_envs: List[Optional[Dict[str, str]]] = [{ - "VLLM_TORCH_COMPILE_LEVEL": - str(level) - } for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]] - compare_all_settings(model, all_args, all_envs, method=method) + final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ + ["-tp", str(tp_size)] + + all_envs: List[Optional[Dict[str, str]]] = [] + + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)}) + + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. 
+ compare_all_settings( + model, [final_args] * 2, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]: + all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)}) + if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: + # "DYNAMO_ONCE" will always use fullgraph + all_envs[-1][ + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + + compare_all_settings(model, [final_args] * 3, all_envs, method=method) diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py index f28f9145bb442..f00334934cb46 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize("model_info", TEST_MODELS) @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) @fork_new_process_for_each_test def test_full_graph(model_info, optimization_level): model = model_info[0] diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 5386eb0e3795d..95cad19126df6 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -5,21 +5,23 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.compilation.levels import CompilationLevel -from vllm.utils import is_hip +from vllm.platforms import current_platform TEST_MODELS = [ ("facebook/opt-125m", {}), - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - "dtype": torch.float16, - "quantization": "compressed-tensors" - }), + # TODO: add fake implementation for compressed-tensors + # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + # "dtype": torch.float16, + # "quantization": "compressed-tensors" + # }), ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", { "dtype": torch.float16, "quantization": "fp8" }), - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { - "quantization": "compressed-tensors" - }), + # TODO: add fake implementation for compressed-tensors + # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { + # "quantization": "compressed-tensors" + # }), ("meta-llama/Meta-Llama-3-8B", {}), ] @@ -55,7 +57,7 @@ "quantization": "marlin" })) -if not is_hip() and is_quant_method_supported("awq"): +if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" })) @@ -69,11 +71,11 @@ def check_full_graph_support(model, os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - # Inductor doesn't support fp8/gptq_marlin_24 yet. + # Inductor doesn't support fp8 and the base meta llama uses too + # much memory. 
quantization = model_kwargs.get("quantization") - if (quantization == "fp8" or quantization == "gptq_marlin" - or quantization == "gptq_marlin_24" - ) and optimization_level >= CompilationLevel.INDUCTOR: + if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B") + and optimization_level >= CompilationLevel.PIECEWISE): return prompts = [ diff --git a/tests/conftest.py b/tests/conftest.py index 640549b269ef7..f24e610a15847 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,3 @@ -import contextlib -import gc import json import os import sys @@ -25,19 +23,19 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import TokenizerPoolConfig +from vllm.config import TaskOption, TokenizerPoolConfig from vllm.connections import global_http_connection -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel, +from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput +from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity, is_cpu) + identity) logger = init_logger(__name__) @@ -45,10 +43,12 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] -PromptAudioInput = Union[List[Tuple[np.ndarray, int]], - List[List[Tuple[np.ndarray, int]]]] -PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]] +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] def _read_prompts(filename: str) -> List[str]: @@ -140,17 +140,7 @@ def dist_init(): ) initialize_model_parallel(1, 1) yield - cleanup() - - -def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - if not is_cpu(): - torch.cuda.empty_cache() + cleanup_dist_env_and_memory() @pytest.fixture() @@ -167,7 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool: def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield if should_do_global_cleanup_after_test: - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(autouse=True) @@ -242,19 +232,22 @@ def video_assets() -> _VideoAssets: return VIDEO_ASSETS -_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature) +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) class HfRunner: - def wrap_device(self, input: _T, device: Optional[str] = None) -> _T: + def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if device is None: - return self.wrap_device(input, "cpu" if is_cpu() else "cuda") + device = "cpu" if current_platform.is_cpu() else "cuda" - if hasattr(input, "device") and input.device.type == device: - return input + if isinstance(x, dict): + return {k: self.wrap_device(v, device) for k, v in x.items()} - return input.to(device) + if 
hasattr(x, "device") and x.device.type == device: + return x + + return x.to(device) def __init__( self, @@ -263,15 +256,16 @@ def __init__( *, model_kwargs: Optional[Dict[str, Any]] = None, is_embedding_model: bool = False, + is_sentence_transformer: bool = False, + skip_tokenizer_init: bool = False, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, - postprocess_inputs: Callable[[BatchEncoding], - BatchEncoding] = identity, + postprocess_inputs: Callable[..., BatchEncoding] = identity, ) -> None: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] self.model_name = model_name - if is_embedding_model: + if is_sentence_transformer: # Lazy init required for AMD CI from sentence_transformers import SentenceTransformer self.model = self.wrap_device( @@ -290,11 +284,12 @@ def __init__( **model_kwargs, )) - self.tokenizer = AutoTokenizer.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ) + if not skip_tokenizer_init: + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) # don't put this import at the top level # it will call torch.cuda.device_count() @@ -304,33 +299,76 @@ def __init__( torch_dtype=torch_dtype, trust_remote_code=True, ) + if skip_tokenizer_init: + self.tokenizer = self.processor.tokenizer + self.dtype = dtype self.postprocess_inputs = postprocess_inputs - def generate( + def get_inputs( self, prompts: List[str], images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, - **kwargs: Any, - ) -> List[Tuple[List[List[int]], List[str]]]: - if images: + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[BatchEncoding]: + if images is not None: assert len(prompts) == len(images) - outputs: List[Tuple[List[List[int]], List[str]]] = [] + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + + all_inputs: List[BatchEncoding] = [] for i, prompt in enumerate(prompts): processor_kwargs: Dict[str, Any] = { "text": prompt, "return_tensors": "pt", } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - if videos is not None and videos[i] is not None: - processor_kwargs["videos"] = videos[i] + if images is not None and (image := images[i]) is not None: + processor_kwargs["images"] = image + if videos is not None and (video := videos[i]) is not None: + processor_kwargs["videos"] = video + if audios is not None and (audio_tuple := audios[i]) is not None: + audio, sr = audio_tuple + processor_kwargs["audio"] = audio + processor_kwargs["sampling_rate"] = sr inputs = self.processor(**processor_kwargs) - inputs = self.postprocess_inputs(inputs) + inputs = self.postprocess_inputs(inputs, dtype=self.dtype) + + all_inputs.append(inputs) + + return all_inputs + def classify(self, prompts: List[str]) -> List[str]: + # output is final logits + all_inputs = self.get_inputs(prompts) + outputs = [] + for inputs in all_inputs: + output = self.model(**self.wrap_device(inputs)) + logits = output.logits.softmax(dim=-1)[0].tolist() + outputs.append(logits) + + return outputs + + def generate( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + **kwargs: Any, + ) -> List[Tuple[List[List[int]], List[str]]]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + outputs: 
List[Tuple[List[List[int]], List[str]]] = [] + for inputs in all_inputs: output_ids = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), use_cache=True, @@ -350,12 +388,16 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str]]: outputs = self.generate(prompts, do_sample=False, max_new_tokens=max_tokens, images=images, + videos=videos, + audios=audios, **kwargs) return [(output_ids[0], output_str[0]) @@ -387,23 +429,17 @@ def generate_greedy_logprobs( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[List[torch.Tensor]]: - all_logprobs: List[List[torch.Tensor]] = [] - for i, prompt in enumerate(prompts): - processor_kwargs: Dict[str, Any] = { - "text": prompt, - "return_tensors": "pt", - } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - if videos is not None and videos[i] is not None: - processor_kwargs["videos"] = videos[i] - - inputs = self.processor(**processor_kwargs) - inputs = self.postprocess_inputs(inputs) + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + all_logprobs: List[List[torch.Tensor]] = [] + for inputs in all_inputs: output = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), use_cache=True, @@ -472,31 +508,19 @@ def generate_greedy_logprobs_limit( num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + all_logprobs: List[List[Dict[int, float]]] = [] all_output_ids: List[List[int]] = [] all_output_strs: List[str] = [] - for i, prompt in enumerate(prompts): - processor_kwargs: Dict[str, Any] = { - "text": prompt, - "return_tensors": "pt", - } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - - if audios is not None: - audio, sr = audios[i] - processor_kwargs["audio"] = audio - processor_kwargs["sampling_rate"] = sr - - if videos is not None: - processor_kwargs["videos"] = videos[i] - inputs = self.processor(**processor_kwargs) - inputs = self.postprocess_inputs(inputs) - + for inputs in all_inputs: output = self.model.generate( **self.wrap_device(inputs, device=self.model.device.type), use_cache=True, @@ -529,6 +553,7 @@ def generate_encoder_decoder_greedy_logprobs_limit( encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, + images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: ''' @@ -539,11 +564,17 @@ def generate_encoder_decoder_greedy_logprobs_limit( all_output_ids: List[List[int]] = [] all_output_strs: List[str] = [] - for (encoder_prompt, - decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts): + for i, (encoder_prompt, decoder_prompt) in enumerate( + to_enc_dec_tuple_list(encoder_decoder_prompts)): + processor_kwargs: Dict[str, Any] = { + "text": encoder_prompt, + "return_tensors": "pt", + } + if images is not None and images[i] is not 
None: + processor_kwargs["images"] = images[i] encoder_input_ids = self.wrap_device( - self.tokenizer(encoder_prompt, return_tensors="pt").input_ids, + self.processor(**processor_kwargs).input_ids, device=self.model.device.type, ) @@ -591,7 +622,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): del self.model - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="session") @@ -604,6 +635,7 @@ class VllmRunner: def __init__( self, model_name: str, + task: TaskOption = "auto", tokenizer_name: Optional[str] = None, # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. @@ -619,6 +651,7 @@ def __init__( ) -> None: self.model = LLM( model=model_name, + task=task, tokenizer=tokenizer_name, trust_remote_code=True, dtype=dtype, @@ -632,19 +665,60 @@ def __init__( **kwargs, ) - def generate( + def get_inputs( self, prompts: List[str], - sampling_params: SamplingParams, images: Optional[PromptImageInput] = None, - ) -> List[Tuple[List[List[int]], List[str]]]: + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[TextPrompt]: if images is not None: assert len(prompts) == len(images) + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + inputs = [TextPrompt(prompt=prompt) for prompt in prompts] if images is not None: for i, image in enumerate(images): - inputs[i]["multi_modal_data"] = {"image": image} + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} + + if videos is not None: + for i, video in enumerate(videos): + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} + + if audios is not None: + for i, audio in enumerate(audios): + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} + + return inputs + + def classify(self, prompts: List[str]) -> List[str]: + req_outputs = self.model.encode(prompts) + outputs = [] + for req_output in req_outputs: + embedding = req_output.outputs.embedding + outputs.append(embedding) + return outputs + + def generate( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[List[int]], List[str]]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) req_outputs = self.model.generate(inputs, sampling_params=sampling_params) @@ -687,24 +761,10 @@ def generate_w_logprobs( videos: Optional[PromptVideoInput] = None, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: - if images is not None: - assert len(prompts) == len(images) - - if videos is not None: - assert len(prompts) == len(videos) - - inputs = [TextPrompt(prompt=prompt) for prompt in prompts] - if images is not None: - for i, image in enumerate(images): - inputs[i]["multi_modal_data"] = {"image": image} - - if audios is not None: - for i, audio in enumerate(audios): - inputs[i]["multi_modal_data"] = {"audio": audio} - - if videos is not None: - for i, video in enumerate(videos): - inputs[i]["multi_modal_data"] = {"video": video} + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) req_outputs = self.model.generate(inputs, sampling_params=sampling_params) @@ -741,9 +801,15 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, + videos: 
Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = self.generate(prompts, greedy_params, images=images) + outputs = self.generate(prompts, + greedy_params, + images=images, + videos=videos, + audios=audios) return [(output_ids[0], output_str[0]) for output_ids, output_str in outputs] @@ -809,20 +875,27 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs - def encode(self, prompts: List[str]) -> List[List[float]]: - req_outputs = self.model.encode(prompts) - outputs = [] - for req_output in req_outputs: - embedding = req_output.outputs.embedding - outputs.append(embedding) - return outputs + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.encode(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): del self.model - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="session") diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index e870597b7a011..70577ec052a2c 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -3,10 +3,9 @@ import pytest from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.utils import set_random_seed -from ....conftest import cleanup - @pytest.fixture def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, @@ -37,7 +36,7 @@ def generator_inner(): yield llm del llm - cleanup() + cleanup_dist_env_and_memory() for llm in generator_inner(): yield llm diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index b3f626714d351..86502f613b187 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -2,18 +2,11 @@ import pytest -from tests.utils import check_deprecated_block_manager_usage from vllm import SamplingParams from .conftest import get_token_ids_from_llm_generator -@pytest.fixture(scope="module", autouse=True) -def check_deprecated_block_manager(): - check_deprecated_block_manager_usage( - 'tests/core/block/e2e/test_correctness.py') - - @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -28,32 +21,32 @@ def check_deprecated_block_manager(): "num_gpu_blocks_override": 5 * (64 + 1), }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "use_v2_block_manager": False -}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "use_v2_block_manager": True, "preemption_mode": "swap" }, { - "use_v2_block_manager": True, "preemption_mode": "recompute" }]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) -def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify block manager v2 produces same outputs as block manager v1, even - when there is preemption. 
+def test_block_manager_with_preemption(baseline_llm_generator, + test_llm_generator, batch_size): + """Verify block manager produces same outputs even when there is preemption. This constructs two LLM, each with limited number of GPU blocks. The limit is decided such that as the sequences in the batch grow, sequences must be preempted and removed from cache. If the output token ids are equivalent, then we have confidence that the KV - cache is not corrupted in the v2 block manager. + cache is not corrupted. NOTE: We want a significant number of generated tokens so that any incorrect KV mapping has time to build up error. + + NOTE(Kuntai): Though we have removed block manager v1, this test is still + useful as it asserts the behavior of block manager v2 (now it is called + SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we + keep this test. """ output_len = 1024 temperature = 0.0 @@ -77,11 +70,9 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, temperature=temperature, ) - print('Getting token ids from block manager v1') baseline_token_ids = get_token_ids_from_llm_generator( baseline_llm_generator, prompts, sampling_params) - print('Getting token ids from block manager v2') test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) @@ -104,9 +95,6 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, # skip cuda graph creation for fast test. "enforce_eager": True, - - # Lookahead scheduling only supported in v2 block manager. - "use_v2_block_manager": True, }]) @pytest.mark.parametrize( "per_test_common_llm_kwargs", @@ -218,26 +206,22 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, "max_num_seqs": 10, }]) @pytest.mark.parametrize("baseline_llm_kwargs", [ - { - "use_v2_block_manager": False, - }, + {}, ]) @pytest.mark.parametrize("test_llm_kwargs", [ { - "use_v2_block_manager": True, "num_lookahead_slots": 0, }, { - "use_v2_block_manager": True, "num_lookahead_slots": 5, }, ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) -def test_chunked_prefill_block_manager_v2(baseline_llm_generator, - test_llm_generator, batch_size): - """Verify that chunked prefill works with BlockManagerV2, with and without - lookahead scheduling. +def test_chunked_prefill_block_manager(baseline_llm_generator, + test_llm_generator, batch_size): + """Verify that chunked prefill works with SelfAttnBlockSpaceManager, + with and without lookahead scheduling. 
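+
+    The baseline runs without lookahead slots; the test variants run with
+    num_lookahead_slots set to 0 and 5 (see the parametrize decorators above).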
""" output_len = 32 temperature = 0.0 @@ -258,11 +242,11 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, temperature=temperature, ) - print('Getting token ids with BlockManagerV1') + print('Getting token ids with BlockManager') baseline_token_ids = get_token_ids_from_llm_generator( baseline_llm_generator, prompts, sampling_params) - print('Getting token ids with BlockManagerV2') + print('Getting token ids with BlockManager, with lookahead slots.') test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) @@ -290,32 +274,32 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, "enable_prefix_caching": True, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "use_v2_block_manager": False -}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "use_v2_block_manager": True, "preemption_mode": "swap" }, { - "use_v2_block_manager": True, "preemption_mode": "recompute" }]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) -def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( +def test_block_manager_prefix_caching_enabled_with_preemption( baseline_llm_generator, test_llm_generator, batch_size): - """Verify block manager v2 produces same outputs as block manager v1, even - when there is preemption. + """Verify block manager produces same outputs even when there is preemption. This constructs two LLM, each with limited number of GPU blocks. The limit is decided such that as the sequences in the batch grow, sequences must be preempted and removed from cache. If the output token ids are equivalent, then we have confidence that the KV - cache is not corrupted in the v2 block manager. + cache is not corrupted. NOTE: We want a significant number of generated tokens so that any incorrect KV mapping has time to build up error. + + NOTE(Kuntai): Though we have removed block manager v1, this test is still + useful as it asserts the behavior of block manager v2 (now it is called + SelfAttnBlockSpaceManager) is the same when swapping / preemption, so we + keep this test. """ output_len = 1024 temperature = 0.0 @@ -339,11 +323,11 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( temperature=temperature, ) - print('Getting token ids from block manager v1') + print('Getting token ids from block manager') baseline_token_ids = get_token_ids_from_llm_generator( baseline_llm_generator, prompts, sampling_params) - print('Getting token ids from block manager v2') + print('Getting token ids from block manager, with preemption') test_token_ids = get_token_ids_from_llm_generator(test_llm_generator, prompts, sampling_params) @@ -366,9 +350,6 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( # Allow only 5 sequences of ~1024 tokens in worst case. 
"block_size": 16, "num_gpu_blocks_override": 5 * (64 + 1), - - # Test APC in v2 block - "use_v2_block_manager": True, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{ @@ -444,9 +425,6 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, "max_model_len": 48, "block_size": 16, "num_gpu_blocks_override": 3, - - # Test APC in v2 block - "use_v2_block_manager": True, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{ diff --git a/tests/core/block/e2e/test_correctness_sliding_window.py b/tests/core/block/e2e/test_correctness_sliding_window.py index 731131984b0eb..9320a9ef62314 100644 --- a/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/tests/core/block/e2e/test_correctness_sliding_window.py @@ -3,7 +3,6 @@ import pytest -from tests.utils import check_deprecated_block_manager_usage from vllm import LLM, SamplingParams from .conftest import get_text_from_llm_generator @@ -13,12 +12,6 @@ BLOCK_SIZE = 16 -@pytest.fixture(scope="module", autouse=True) -def check_deprecated_block_manager(): - check_deprecated_block_manager_usage( - 'tests/core/block/e2e/test_correctness_sliding_window.py') - - @pytest.mark.parametrize( "common_llm_kwargs", [{ @@ -31,10 +24,8 @@ def check_deprecated_block_manager(): "num_gpu_blocks_override": 100000 // BLOCK_SIZE, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{ - "use_v2_block_manager": False -}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, @@ -55,7 +46,6 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, prompts, answer, indices = prep_prompts(batch_size) - print('Getting token ids from block manager v1') baseline_texts = get_text_from_llm_generator(baseline_llm_generator, prompts, sampling_params, @@ -91,10 +81,7 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, "num_gpu_blocks_override": 100000 // BLOCK_SIZE, }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "use_v2_block_manager": True, - "enable_chunked_prefill": True -}]) +@pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed): diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager.py similarity index 91% rename from tests/core/block/test_block_manager_v2.py rename to tests/core/block/test_block_manager.py index e67883367879f..cfd749ad58694 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager.py @@ -2,7 +2,7 @@ from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, STR_NOT_IMPL_ENC_DEC_SWA) -from vllm.core.block_manager_v2 import BlockSpaceManagerV2 +from vllm.core.block_manager import SelfAttnBlockSpaceManager from vllm.core.interfaces import AllocStatus from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list @@ -17,7 +17,7 @@ 
@pytest.mark.parametrize("watermark", [0.0, 0.5]) def test_can_allocate_seq_group(block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float): - block_manager = BlockSpaceManagerV2( + block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=1024, @@ -63,7 +63,7 @@ def test_can_allocate_seq_group_encoder_decoder(block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float): - block_manager = BlockSpaceManagerV2( + block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=1024, @@ -117,16 +117,16 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, ''' SWA short for Sliding Window Attention. - At time of writing block manager v2 does not support SWA. + At time of writing block manager does not support SWA. - However even when SWA is implemented for block manager v2, + However even when SWA is implemented for block manager, there will still most likely be a separate workstream required to enable SWA for encoder/decoder models. Therefore this test enforces that one of the following cases hold true: - 1. Block manager v2 does not support SWA at all (true at time of writing) - 2. Block manager v2 fails with NotImplementError when SWA is enabled + 1. Block manager does not support SWA at all (true at time of writing) + 2. Block manager fails with NotImplementError when SWA is enabled AND a SequenceGroup with an encoder sequence (i.e. in support of an encoder/decoder model) is passed into can_allocate() as an argument @@ -135,7 +135,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, ''' with pytest.raises((NotImplementedError, AssertionError)) as exc_info: - block_manager = BlockSpaceManagerV2( + block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=1024, @@ -158,7 +158,7 @@ def test_can_allocate_encoder_decoder_fails_with_swa(block_size: int, block_manager.can_allocate(seq_group) # Assert that either - # 1. Block manager v2 constructor fails with assertion that sliding window + # 1. Block manager constructor fails with assertion that sliding window # is not yet supported (most likely near-term outcome at time of # writing), or # 2. can_allocate() fails with NotImplementedError due to combination of @@ -177,7 +177,7 @@ def test_can_allocate_encoder_decoder_fails_with_prefix_cache( block_size: int, num_seqs_per_group: int, num_gpu_blocks: int, watermark: float): - block_manager = BlockSpaceManagerV2( + block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=1024, @@ -217,7 +217,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, num_gpu_blocks = 1024 watermark = 0.1 - block_manager = BlockSpaceManagerV2( + block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=0, @@ -269,14 +269,15 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, """Verify blocks number on src/desc device is correct after swapping in/out sequence group (not missing or extra blocks). 
""" - block_manager = BlockSpaceManagerV2(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) + block_manager = SelfAttnBlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=enable_caching) prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) + # Emulate a forward pass by appending a single token. # The block manager then knows how many unprocessed # tokens will be written in the next forward pass. @@ -321,11 +322,11 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots, can be swapped in/out. """ num_cpu_blocks = num_gpu_blocks - block_manager = BlockSpaceManagerV2(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) + block_manager = SelfAttnBlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=enable_caching) prompt, seq_group = create_dummy_prompt( "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1) prompt.status = SequenceStatus.WAITING @@ -382,11 +383,11 @@ def test_swap_in_infeasible(num_lookahead_slots, enable_caching): block_size = 8 num_cpu_blocks = 1 num_gpu_blocks = 1 - block_manager = BlockSpaceManagerV2(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=enable_caching) + block_manager = SelfAttnBlockSpaceManager(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0, + enable_caching=enable_caching) prompt_length = block_size - 3 assert prompt_length > 0 prompt, seq_group = create_dummy_prompt("1", prompt_length=prompt_length) @@ -434,7 +435,7 @@ def test_sliding_window(block_size, prompt_len, num_slots_to_append, num_gpu_blocks = 1024 watermark = 0.1 - block_manager = BlockSpaceManagerV2( + block_manager = SelfAttnBlockSpaceManager( block_size=block_size, num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=0, @@ -474,7 +475,7 @@ def num_blocks(num_tokens): seq.data.update_num_computed_tokens(prompt_len) check_used(num_blocks(prompt_len)) - # this is how we compute it in BlockSpaceManagerV2.__init__ + # this is how we compute it in SelfAttnBlockSpaceManager.__init__ sliding_blocks = (sliding_window // block_size) + 2 # plus one block for null block sliding_blocks += 1 diff --git a/tests/core/test_block_manager.py b/tests/core/test_block_manager.py deleted file mode 100644 index 2ee9f20824f2f..0000000000000 --- a/tests/core/test_block_manager.py +++ /dev/null @@ -1,637 +0,0 @@ -import time -from collections import defaultdict -from typing import List - -import pytest - -from vllm import SamplingParams -from vllm.block import PhysicalTokenBlock -from vllm.core.block.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE, - STR_NOT_IMPL_ENC_DEC_SWA) -from vllm.core.block_manager_v1 import (BlockSpaceManagerV1, - UncachedBlockAllocator) -from vllm.core.interfaces import AllocStatus -from vllm.sequence import Logprob, Sequence, SequenceGroup, SequenceStatus -from vllm.utils import Device - -from .utils import create_dummy_prompt, create_dummy_prompt_encoder_decoder - - -def test_block_allocator_allocate(): - block_size = 4 - num_cpu_blocks = 4 - cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size, - num_cpu_blocks) - - # Allocate all available cpu blocks. 
- num_free = num_cpu_blocks - assert cpu_allocator.get_num_free_blocks() == num_free - for _ in range(num_cpu_blocks): - block = cpu_allocator.allocate() - num_free -= 1 - - assert block not in cpu_allocator.free_blocks - assert cpu_allocator.get_num_free_blocks() == num_free - - with pytest.raises(ValueError): - cpu_allocator.allocate() - - -def test_block_allocator_free(): - block_size = 4 - num_cpu_blocks = 4 - cpu_allocator = UncachedBlockAllocator(Device.CPU, block_size, - num_cpu_blocks) - - # Allocate all available cpu blocks. - blocks: List[PhysicalTokenBlock] = [] - for _ in range(num_cpu_blocks): - block = cpu_allocator.allocate() - blocks.append(block) - assert block not in cpu_allocator.free_blocks - - # Free all allocated cpu blocks. - num_free = 0 - assert cpu_allocator.get_num_free_blocks() == num_free - for block in blocks: - cpu_allocator.free(block) - num_free += 1 - assert block in cpu_allocator.free_blocks - assert cpu_allocator.get_num_free_blocks() == num_free - - with pytest.raises(ValueError): - cpu_allocator.free(block) - - -def test_allocate(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - # Allocate same sequence group to all available gpu blocks. - for i in range(num_gpu_blocks): - _, seq_group = create_dummy_prompt(str(i), block_size) - assert block_manager.can_allocate(seq_group) == AllocStatus.OK - block_manager.allocate(seq_group) - assert block_manager.can_allocate(seq_group) != AllocStatus.OK - - # Allocate same sequence group to all available gpu blocks. - # Use watermark to reserve one gpu block. - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=1 / num_gpu_blocks) - for i in range(num_gpu_blocks - 1): - _, seq_group = create_dummy_prompt(str(i), block_size) - assert block_manager.can_allocate(seq_group) == AllocStatus.OK - block_manager.allocate(seq_group) - assert block_manager.can_allocate(seq_group) != AllocStatus.OK - - -def test_allocate_encoder_decoder(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_req_per_seq_group = 2 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - # Allocate same sequence group to all available gpu blocks. - for i in range(num_gpu_blocks // block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - assert block_manager.can_allocate(seq_group) == AllocStatus.OK - block_manager.allocate(seq_group) - assert block_manager.can_allocate(seq_group) != AllocStatus.OK - - # Allocate same sequence group to all available gpu blocks. - # Use watermark to reserve one gpu block. 
- block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=1 / num_gpu_blocks) - for i in range((num_gpu_blocks - 1) // block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder( - str(i), - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - assert block_manager.can_allocate(seq_group) == AllocStatus.OK - block_manager.allocate(seq_group) - assert block_manager.can_allocate(seq_group) != AllocStatus.OK - - -def test_allocate_encoder_decoder_fails_with_swa(): - # SWA short for sliding window attention - - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - sliding_window=5) # swa - - # Allocate same sequence group to all available gpu blocks. - _, _, seq_group = create_dummy_prompt_encoder_decoder( - "0", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - - # Assert that can_allocate() fails due to SWA - with pytest.raises(NotImplementedError) as exc_info: - block_manager.can_allocate(seq_group) - - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA - - # Assert that allocate() fails due to SWA - with pytest.raises(NotImplementedError) as exc_info: - block_manager.allocate(seq_group) - - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_SWA - - -def test_allocate_encoder_decoder_fails_with_prefix_caching(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0, - enable_caching=True) # Prefix cache - - # Allocate same sequence group to all available gpu blocks. - _, _, seq_group = create_dummy_prompt_encoder_decoder( - "0", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - - # Assert that can_allocate() fails due to prefix caching - with pytest.raises(NotImplementedError) as exc_info: - block_manager.can_allocate(seq_group) - - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE - - # Assert that allocate() fails due to prefix caching - with pytest.raises(NotImplementedError) as exc_info: - block_manager.allocate(seq_group) - - assert str(exc_info.value) == STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE - - -def test_append_slot_single_seq(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - # Allocate single seq to gpu block. - prompt, seq_group = create_dummy_prompt("1", block_size) - block_manager.allocate(seq_group) - - # Nothing to append. Sequence has no new logical blocks. - assert block_manager.can_append_slots(seq_group) - before_blocks = block_manager.get_num_free_gpu_blocks() - assert not block_manager.append_slots(prompt) - after_blocks = block_manager.get_num_free_gpu_blocks() - assert before_blocks == after_blocks - - # Add block_size number of new tokens and append slot. 
- for i in range(block_size): - token_id = i + 5 - prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) - - assert block_manager.can_append_slots(seq_group) - before_blocks = block_manager.get_num_free_gpu_blocks() - assert not block_manager.append_slots(prompt) - after_blocks = block_manager.get_num_free_gpu_blocks() - assert before_blocks - after_blocks == 1 - - -def test_append_slot_cow(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size=block_size, - num_cpu_blocks=num_cpu_blocks, - num_gpu_blocks=num_gpu_blocks, - watermark=0) - - # Allocate prompt to gpu block. There is one slot left in the block. - prompt = Sequence(seq_id=1, - inputs={ - "prompt": "one two three", - "prompt_token_ids": [1, 2, 3], - }, - block_size=block_size) - - # Fork the sequence, such that a COW will be required when we append a new - # token id. - child = prompt.fork(new_seq_id=2) - - # Allocate space for the sequence group. - seq_group = SequenceGroup(request_id="1", - seqs=[prompt, child], - arrival_time=time.time(), - sampling_params=SamplingParams()) - block_manager.allocate(seq_group) - - # Fork and append a new token id. We expect a COW to be scheduled. - token_id = 4 - child.append_token_id(token_id, {token_id: Logprob(0.0)}) - block_manager.fork(prompt, child) - - assert block_manager.can_append_slots(seq_group) - before_blocks = block_manager.get_num_free_gpu_blocks() - - cows = block_manager.append_slots(child) - assert cows - dict_cows = defaultdict(list) - for src_block, dst_block in cows: - dict_cows[src_block].append(dst_block) - for src_block, dst_blocks in dict_cows.items(): - assert src_block not in dst_blocks - - after_blocks = block_manager.get_num_free_gpu_blocks() - assert before_blocks - after_blocks == 1 - - -def test_fork(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - prompt, seq_group = create_dummy_prompt("1", - block_size - 1, - block_size=block_size) - block_manager.allocate(seq_group) - - # Fork prompt and copy block tables. - child = prompt.fork(2) - block_manager.fork(prompt, child) - assert block_manager.get_block_table( - prompt) == block_manager.get_block_table(child) - token_id = 4 - # Append token to child. Block is shared so copy on write occurs. - child.append_token_id(token_id, {token_id: Logprob(0.0)}) - block_manager.append_slots(child) - assert block_manager.get_block_table( - prompt) != block_manager.get_block_table(child) - - -def test_swap(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) - prompt.status = SequenceStatus.WAITING - block_manager.allocate(seq_group) - - # Emulate a forward pass by appending a single token. - # The block manager then knows how many unprocessed - # tokens will be written in the next forward pass. - token_id = 0 - prompt.status = SequenceStatus.RUNNING - prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) - - # Swap seq group from GPU -> CPU. 
- gpu_blocks = block_manager.get_block_table(prompt) - assert block_manager.can_swap_out(seq_group) - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_out(seq_group) - assert [x[0] for x in mapping] == gpu_blocks - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) - assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks - prompt.status = SequenceStatus.SWAPPED - - # Swap seq group from CPU -> GPU. - cpu_blocks = block_manager.get_block_table(prompt) - assert block_manager.can_swap_in(seq_group) == AllocStatus.OK - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_in(seq_group) - assert [x[0] for x in mapping] == cpu_blocks - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks - assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) - - -def test_swap_encoder_decoder(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - decoder_prompt, encoder_prompt, seq_group = \ - create_dummy_prompt_encoder_decoder( - "1", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - decoder_prompt.status = SequenceStatus.WAITING - encoder_prompt.status = SequenceStatus.WAITING - block_manager.allocate(seq_group) - - # Emulate a forward pass by appending a single token. - # The block manager then knows how many unprocessed - # tokens will be written in the next forward pass. - token_id = 0 - decoder_prompt.status = SequenceStatus.RUNNING - decoder_prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) - - # Swap encoder/decoder seq group from GPU -> CPU. - decoder_gpu_blocks = block_manager.get_block_table(decoder_prompt) - cross_gpu_blocks = block_manager.get_cross_block_table(seq_group) - gpu_blocks = decoder_gpu_blocks + cross_gpu_blocks - assert block_manager.can_swap_out(seq_group) - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_out(seq_group) - assert [x[0] for x in mapping] == gpu_blocks - #assert list(mapping.keys()) == gpu_blocks - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) - assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks - decoder_prompt.status = SequenceStatus.SWAPPED - - # Swap encoder/decoder seq group from CPU -> GPU. 
- decoder_cpu_blocks = block_manager.get_block_table(decoder_prompt) - cross_cpu_blocks = block_manager.get_cross_block_table(seq_group) - cpu_blocks = decoder_cpu_blocks + cross_cpu_blocks - assert block_manager.can_swap_in(seq_group) == AllocStatus.OK - before_cpu_blocks = block_manager.get_num_free_cpu_blocks() - before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_in(seq_group) - assert [x[0] for x in mapping] == cpu_blocks - after_cpu_blocks = block_manager.get_num_free_cpu_blocks() - after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks - assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) - - -def test_free(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - prompt, seq_group = create_dummy_prompt("1", block_size) - block_manager.allocate(seq_group) - - # Free allocated seq. - prompt_blocks = len(block_manager.get_block_table(prompt)) - before_blocks = block_manager.get_num_free_gpu_blocks() - block_manager.free(prompt) - after_blocks = block_manager.get_num_free_gpu_blocks() - assert after_blocks == before_blocks + prompt_blocks - - # Block table for freed seq is deleted. - with pytest.raises(KeyError): - block_manager.get_block_table(prompt) - - -def test_free_encoder_decoder(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - decoder_prompt, encoder_prompt, seq_group = \ - create_dummy_prompt_encoder_decoder( - "1", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - block_manager.allocate(seq_group) - - # Free allocated seq. - decoder_prompt_blocks = len(block_manager.get_block_table(decoder_prompt)) - encoder_prompt_blocks = len(block_manager.get_cross_block_table(seq_group)) - prompt_blocks = decoder_prompt_blocks + encoder_prompt_blocks - before_blocks = block_manager.get_num_free_gpu_blocks() - block_manager.free(decoder_prompt) - block_manager.free_cross(seq_group) - after_blocks = block_manager.get_num_free_gpu_blocks() - assert after_blocks == before_blocks + prompt_blocks - - # Block table for freed encoder & decoder seq's are deleted. - with pytest.raises(KeyError): - block_manager.get_block_table(decoder_prompt) - - # Block table for freed encoder & decoder seq's are deleted. - with pytest.raises(KeyError): - block_manager.get_block_table(encoder_prompt) - - -def test_reset(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - # Allocate same seq group on all available gpu blocks. - original_blocks = block_manager.get_num_free_gpu_blocks() - for i in range(num_gpu_blocks): - _, seq_group = create_dummy_prompt(str(i), block_size) - block_manager.allocate(seq_group) - assert block_manager.get_num_free_gpu_blocks() == 0 - - # Resetting block manager frees all allocated blocks. - block_manager.reset() - assert block_manager.get_num_free_gpu_blocks() == original_blocks - - -def test_reset_encoder_decoder(): - block_size = 4 - num_cpu_blocks = 4 - num_gpu_blocks = 4 - block_req_per_seq_group = 2 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - watermark=0) - - # Allocate same seq group on all available gpu blocks. 
- original_blocks = block_manager.get_num_free_gpu_blocks() - for i in range(num_gpu_blocks // block_req_per_seq_group): - _, _, seq_group = create_dummy_prompt_encoder_decoder( - f"{i}", - decoder_prompt_length=block_size, - encoder_prompt_length=block_size) - block_manager.allocate(seq_group) - assert block_manager.get_num_free_gpu_blocks() == 0 - - # Resetting block manager frees all allocated blocks. - block_manager.reset() - assert block_manager.get_num_free_gpu_blocks() == original_blocks - - -def test_sliding_window_multi_seq(): - """ - Tests that memory allocation and deallocation is handled - correctly with multiple sequences that exceed the sliding - window's capacity. - """ - block_size = 1 - num_cpu_blocks = 8 - num_gpu_blocks = 8 - sliding_window = 2 - block_manager = BlockSpaceManagerV1(block_size, - num_cpu_blocks, - num_gpu_blocks, - sliding_window=sliding_window, - watermark=0) - - assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - - parent = Sequence(seq_id=1, - inputs={ - "prompt": "one two three", - "prompt_token_ids": [0, 1, 2], - }, - block_size=block_size) - seq_group = SequenceGroup(request_id="1", - seqs=[parent], - arrival_time=time.time(), - sampling_params=SamplingParams(), - lora_request=None) - block_manager.allocate(seq_group) - - # assert the number of blocks allocated is correct - # the parent seq has len 3, but since sliding_window is 2, - # we will use at most 2 blocks - assert block_manager.get_num_free_gpu_blocks( - ) == num_gpu_blocks - sliding_window - - # Fork prompt and copy block tables. - child = parent.fork(2) - block_manager.fork(parent, child) - - # assert the number of blocks allocated is correct - # forking does not increase memory consumption - assert block_manager.get_num_free_gpu_blocks( - ) == num_gpu_blocks - sliding_window - - # assert both parent and child share all blocks - assert block_manager.get_block_table( - parent) == block_manager.get_block_table(child) - - token_id = 4 - # Append token to child. Block is shared so copy on write occurs. - child.append_token_id(token_id, {token_id: Logprob(0.0)}) - block_manager.append_slots(child) - - # assert the number of blocks allocated is correct - # we will use now one block more. Each seq will use 2 blocks, - # but only one can be shared - assert block_manager.get_num_free_gpu_blocks( - ) == num_gpu_blocks - sliding_window - 1 - - token_id = 5 - parent.append_token_id(token_id, {token_id: Logprob(0.0)}) - block_manager.append_slots(parent) - - # assert the number of blocks allocated is correct - # no change, because both sequences are still just sharing one block - assert block_manager.get_num_free_gpu_blocks( - ) == num_gpu_blocks - sliding_window - 1 - - block_table_parent = block_manager.get_block_table(parent) - block_table_child = block_manager.get_block_table(child) - - assert block_table_parent != block_table_child - - # assert both blocks are sharing the second-last block - assert block_table_parent[-2] == block_table_child[-2] - - # now let's clean up... - block_manager.free(parent) - - # assert the number of blocks allocated is correct - # We have freed one seq, reducing the ref count of two blocks by one. - # One of the two was only used by the parent seq, so this is now free. 
- # The child seq still consumes sliding_window blocks - assert block_manager.get_num_free_gpu_blocks( - ) == num_gpu_blocks - sliding_window - - # free all blocks - block_manager.free(child) - - # assert all blocks are free now - assert block_manager.get_num_free_gpu_blocks() == num_gpu_blocks - - -def test_mark_blocks_as_computed_with_prefix_cache_and_chunked_prefill(): - """When prefix cache and chunked prefill are enabled, the block manager - should only mark a chunk of blocks as computed instead of all blocks. - """ - - block_size = 4 - num_cpu_blocks = 0 - num_gpu_blocks = 16 - block_manager = BlockSpaceManagerV1(block_size, - num_gpu_blocks, - num_cpu_blocks, - watermark=0, - enable_caching=True) - - # Set prompt size to have num_gpu_blocks - 1 full blocks. - prompt_length = block_size * num_gpu_blocks - 1 - - # Allocate (reserve) all blocks. - _, seq_group = create_dummy_prompt("0", - prompt_length, - block_size=block_size) - block_manager.allocate(seq_group) - assert seq_group.seqs[0].n_blocks == num_gpu_blocks - - # 1st chunk: Compute 2 and half blocks. Should mark 2 blocks as computed. - token_chunk_size = int(block_size * 2.5) - block_manager.mark_blocks_as_computed(seq_group, token_chunk_size) - computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0]) - assert len(computed_blocks) == 2 - - # Actual computed tokens. - seq_group.seqs[0].data.update_num_computed_tokens(token_chunk_size) - - # 2nd chunk: Complete 3rd block and additional 4 blocks. - token_chunk_size = int(block_size * 4.5) - block_manager.mark_blocks_as_computed(seq_group, token_chunk_size) - computed_blocks = block_manager.get_all_computed_blocks(seq_group.seqs[0]) - assert len(computed_blocks) == 7 diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index c9495fd50d7c9..acd82065ae457 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -4,11 +4,9 @@ import pytest # noqa from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler from vllm.sequence import Logprob, SequenceGroup -from ..utils import check_deprecated_block_manager_usage from .utils import create_dummy_prompt @@ -28,25 +26,17 @@ def schedule_and_update_computed_tokens(scheduler): return metas, out -@pytest.fixture(scope="module", autouse=True) -def check_deprecated_block_manager(): - check_deprecated_block_manager_usage( - 'tests/core/test_chunked_prefill_scheduler.py') - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_simple(use_v2_block_manager: bool): +def test_simple(): """Verify basic scheduling works.""" block_size = 4 num_seq_group = 4 max_model_len = 16 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig( - max_num_batched_tokens, - num_seq_group, - max_model_len, - enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + scheduler_config = SchedulerConfig("generate", + max_num_batched_tokens, + num_seq_group, + max_model_len, + enable_chunked_prefill=True) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 @@ -81,19 +71,19 @@ def test_simple(use_v2_block_manager: bool): assert len(seq_group_meta) == num_seq_group -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_chunk(use_v2_block_manager: bool): +def test_chunk(): """Verify prefills are chunked properly.""" block_size = 4 max_seqs 
= 60 max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 32 cache_config.num_gpu_blocks = 32 @@ -131,18 +121,18 @@ def test_chunk(use_v2_block_manager: bool): assert out.num_batched_tokens == 57 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_complex(use_v2_block_manager: bool): +def test_complex(): block_size = 4 max_seqs = 60 max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 64 cache_config.num_gpu_blocks = 64 @@ -201,19 +191,19 @@ def test_complex(use_v2_block_manager: bool): assert running[2].is_prefill() -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_maximal_decoding(use_v2_block_manager: bool): +def test_maximal_decoding(): """Verify decoding requests are prioritized.""" block_size = 4 max_seqs = 2 max_model_len = 8 max_num_batched_tokens = 2 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 @@ -295,19 +285,19 @@ def test_maximal_decoding(use_v2_block_manager: bool): assert out.num_batched_tokens == 2 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prompt_limit(use_v2_block_manager: bool): +def test_prompt_limit(): """Verify max_num_batched_tokens < max_model_len is possible.""" block_size = 4 max_seqs = 32 max_model_len = 64 max_num_batched_tokens = 32 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 @@ -330,13 +320,13 @@ def test_prompt_limit(use_v2_block_manager: bool): assert out.num_batched_tokens == 32 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prompt_limit_exceed(use_v2_block_manager: bool): +def test_prompt_limit_exceed(): block_size = 4 max_seqs = 64 max_model_len = 32 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig(max_num_batched_tokens, + scheduler_config = SchedulerConfig("generate", + max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True) @@ -356,171 +346,19 @@ def test_prompt_limit_exceed(use_v2_block_manager: bool): assert out.ignored_seq_groups[0] == seq_group -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_swap(use_v2_block_manager: bool): - """Verify swapping works with chunked prefill requests""" - block_size = 4 - max_seqs = 30 - max_model_len = 200 - max_num_batched_tokens = 30 - scheduler_config = SchedulerConfig( - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = 
Scheduler(scheduler_config, cache_config, None) - - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - # The request is chunked. - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - - # The last request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - # The running prefill is now swapped. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 0 - assert out.num_batched_tokens == 0 - assert out.blocks_to_swap_out != [] - assert out.blocks_to_swap_in == [] - - # Add 1 more task. Swap should be prioritized over new prefill. - _, seq_group = create_dummy_prompt("2", prompt_length=60) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in != [] - assert out.blocks_to_swap_out == [] - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_running_prefill_prioritized_over_swap(use_v2_block_manager: bool): - block_size = 4 - max_seqs = 30 - max_model_len = 200 - max_num_batched_tokens = 30 - scheduler_config = SchedulerConfig( - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 32 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - # The request is chunked. - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - - # The request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - # The running prefill is now swapped. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 0 - assert out.num_batched_tokens == 0 - assert out.blocks_to_swap_out != [] - assert out.blocks_to_swap_in == [] - - # Add 1 more task. Swap is not possible, so prefill is running. - scheduler.block_manager.can_swap_in = MagicMock() - scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER - - _, seq_group2 = create_dummy_prompt("2", - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group2) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. 
- assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in == [] - assert out.blocks_to_swap_out == [] - assert out.scheduled_seq_groups[0].seq_group == seq_group2 - - # Now although swap is possible, running prefill is prioritized. - scheduler.block_manager.can_swap_in.return_value = AllocStatus.OK - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in == [] - assert out.blocks_to_swap_out == [] - assert not seq_group2.is_prefill() - assert out.scheduled_seq_groups[0].seq_group == seq_group2 - append_new_token(seq_group2, 1) - - # Decoding is prioritized. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 1 - assert out.blocks_to_swap_in == [] - assert out.blocks_to_swap_out == [] - assert not seq_group2.is_prefill() - assert out.scheduled_seq_groups[0].seq_group == seq_group2 - append_new_token(seq_group2, 1) - - # Since we abort the sequence group, we can finally swap. - scheduler.abort_seq_group(seq_group2.request_id) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in != [] - assert out.blocks_to_swap_out == [] - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_chunked_prefill_preempt(use_v2_block_manager: bool): +def test_chunked_prefill_preempt(): """Verify preempt works with chunked prefill requests""" block_size = 4 max_seqs = 30 max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 @@ -575,18 +413,18 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_chunked_prefill_max_seqs(use_v2_block_manager: bool): +def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 128 cache_config.num_gpu_blocks = 128 @@ -629,19 +467,19 @@ def test_chunked_prefill_max_seqs(use_v2_block_manager: bool): assert not running[1].is_prefill() -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_perfix_caching(use_v2_block_manager: bool): +def test_perfix_caching(): """Verify allocating full blocks when prefix caching is enabled.""" block_size = 4 max_seqs = 10 max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index f3ec24e7bee3e..bd4accab7f37d 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ 
b/tests/core/test_num_computed_tokens_update.py @@ -31,7 +31,6 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, # Make a vllm engine runner = VllmRunner(model_name=MODEL, gpu_memory_utilization=0.7, - use_v2_block_manager=True, num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 5cdf743a4509c..5ff32be611592 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -3,32 +3,28 @@ from typing import List, Set, Tuple from unittest.mock import MagicMock -import pytest +import pytest # noqa from torch import Use # noqa from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroup, SequenceStatus +from vllm.sequence import SequenceGroup -from ..utils import check_deprecated_block_manager_usage from .utils import (append_new_token, append_new_token_seq_group, create_dummy_prompt, get_sequence_groups, schedule_and_update_computed_tokens) -@pytest.fixture(scope="module", autouse=True) -def check_deprecated_block_manager(): - check_deprecated_block_manager_usage( - "tests/core/test_chunked_prefill_scheduler.py") - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_add_seq_group(use_v2_block_manager: bool): +def test_scheduler_add_seq_group(): block_size = 4 scheduler_config = SchedulerConfig( - 100, 64, 1, use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=1, + ) cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config.num_cpu_blocks = 4 cache_config.num_gpu_blocks = 4 @@ -44,11 +40,14 @@ def test_scheduler_add_seq_group(use_v2_block_manager: bool): assert scheduler.get_num_unfinished_seq_groups() == i + 1 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_abort_seq_group(use_v2_block_manager: bool): +def test_scheduler_abort_seq_group(): block_size = 4 scheduler_config = SchedulerConfig( - 100, 64, 1, use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=1, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 4 cache_config.num_gpu_blocks = 4 @@ -68,16 +67,16 @@ def test_scheduler_abort_seq_group(use_v2_block_manager: bool): assert scheduler.get_num_unfinished_seq_groups() == 0 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_schedule_simple(use_v2_block_manager: bool): +def test_scheduler_schedule_simple(): block_size = 4 num_seq_group = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - num_seq_group, - max_model_len, - use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=64, + max_num_seqs=num_seq_group, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 @@ -112,17 +111,17 @@ def test_scheduler_schedule_simple(use_v2_block_manager: bool): append_new_token(out, 1) -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_prefill_prioritized(use_v2_block_manager: bool): +def test_scheduler_prefill_prioritized(): """Verify running batched tokens are not applied to 
prefill requests.""" block_size = 4 max_model_len = 30 max_batched_num_tokens = 30 scheduler_config = SchedulerConfig( - max_batched_num_tokens, - 2, - max_model_len, - use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=max_batched_num_tokens, + max_num_seqs=2, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 cache_config.num_gpu_blocks = 16 @@ -146,12 +145,15 @@ def test_scheduler_prefill_prioritized(use_v2_block_manager: bool): assert get_sequence_groups(out) == [seq_group_b] -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool): +def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, 2, max_model_len, use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=64, + max_num_seqs=2, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 2 cache_config.num_gpu_blocks = 2 @@ -201,17 +203,17 @@ def test_scheduler_schedule_preempt_abort(use_v2_block_manager: bool): assert scheduler.get_num_unfinished_seq_groups() == 1 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_max_seqs(use_v2_block_manager: bool): +def test_scheduler_max_seqs(): block_size = 4 num_seq_group = 4 max_seq_group = 2 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - max_seq_group, - max_model_len, - use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=64, + max_num_seqs=max_seq_group, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 @@ -249,15 +251,15 @@ def test_scheduler_max_seqs(use_v2_block_manager: bool): assert set(get_sequence_groups(out)) == set([all_seq_groups[1]]) -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_scheduler_delay_factor(use_v2_block_manager: bool): +def test_scheduler_delay_factor(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 16, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=16, delay_factor=0.5, - use_v2_block_manager=use_v2_block_manager) + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 cache_config.num_gpu_blocks = 8 @@ -294,74 +296,23 @@ def test_scheduler_delay_factor(use_v2_block_manager: bool): append_new_token(out, 1) -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_swapped_out_prioritized(use_v2_block_manager: bool): - block_size = 4 - scheduler = initialize_scheduler(max_num_seqs=6, - block_size=block_size, - use_v2_block_manager=use_v2_block_manager, - num_cpu_blocks=64, - num_gpu_blocks=64) - # best_of=2 * 3 == 6 sequences. - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 3 - append_new_token(out, 1) - - # The last request should be swapped out. 
- scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "2" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 2 - assert out.num_batched_tokens == 2 - assert out.blocks_to_swap_out != [] - assert out.blocks_to_swap_in == [] - append_new_token(out, 1) - - # Add 1 more task. Swap should be prioritized over prefill. - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - append_new_token(out, 1) - assert len(out.scheduled_seq_groups) == 3 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 3 - assert out.blocks_to_swap_in != [] - assert out.blocks_to_swap_out == [] - - def initialize_scheduler( *, max_num_seqs=1000, max_token_budget=1000, max_model_len=1000, lora_config=None, - use_v2_block_manager=False, block_size=4, num_cpu_blocks=8, num_gpu_blocks=8, ): block_size = block_size scheduler_config = SchedulerConfig( - max_token_budget, - max_num_seqs, - max_model_len, - use_v2_block_manager=use_v2_block_manager) + "generate", + max_num_batched_tokens=max_token_budget, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = num_cpu_blocks cache_config.num_gpu_blocks = num_gpu_blocks @@ -386,15 +337,12 @@ def add_token_budget(budget: SchedulingBudget, budget.add_num_seqs(mock_seq_group.request_id, num_curr_seqs) -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool): +def test_prefill_schedule_max_prompt_len(): """ Test prompt longer than max_prompt_len is aborted. """ block_size = 4 - scheduler = initialize_scheduler(max_model_len=30, - use_v2_block_manager=use_v2_block_manager, - block_size=block_size) + scheduler = initialize_scheduler(max_model_len=30, block_size=block_size) _, seq_group = create_dummy_prompt("0", prompt_length=60, block_size=block_size) @@ -409,14 +357,12 @@ def test_prefill_schedule_max_prompt_len(use_v2_block_manager: bool): assert len(remaining_waiting) == 0 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prefill_schedule_token_budget(use_v2_block_manager: bool): +def test_prefill_schedule_token_budget(): """ Test token budget respected. """ block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64) budget = create_token_budget(token_budget=0) @@ -446,8 +392,7 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool): assert len(remaining_waiting) == 1 # Test when current_batched_tokens respected. 
- scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=16, num_gpu_blocks=16) budget = create_token_budget(token_budget=60) @@ -474,14 +419,12 @@ def test_prefill_schedule_token_budget(use_v2_block_manager: bool): assert len(remaining_waiting) == 0 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prefill_schedule_max_seqs(use_v2_block_manager: bool): +def test_prefill_schedule_max_seqs(): """ Test max seq respected. """ block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64) budget = create_token_budget(max_num_seqs=2) @@ -515,15 +458,13 @@ def test_prefill_schedule_max_seqs(use_v2_block_manager: bool): assert len(remaining_waiting) == 1 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prefill_schedule_max_lora(use_v2_block_manager: bool): +def test_prefill_schedule_max_lora(): """ Test max lora is respected and prioritized. """ block_size = 4 lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config, - use_v2_block_manager=use_v2_block_manager, block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64) @@ -570,14 +511,12 @@ def test_prefill_schedule_max_lora(use_v2_block_manager: bool): assert budget.num_batched_tokens == 60 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager): +def test_prefill_schedule_no_block_manager_capacity(): """ Test sequence cannot be scheduled due to block manager has no capacity. """ block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_gpu_blocks=128, num_cpu_blocks=128) budget = create_token_budget() @@ -614,14 +553,12 @@ def test_prefill_schedule_no_block_manager_capacity(use_v2_block_manager): assert len(remaining_waiting) == 0 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_decode_schedule_preempted(use_v2_block_manager: bool): +def test_decode_schedule_preempted(): """ Test decodes cannot be scheduled and preempted. 
""" block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=64, num_gpu_blocks=64) curr_loras = None @@ -660,70 +597,12 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_decode_swap_beam_search(use_v2_block_manager: bool): - """ - Test best_of > 1 swap out blocks - """ - block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, - num_gpu_blocks=64, - num_cpu_blocks=64) - curr_loras = None - budget = create_token_budget() - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - scheduler._add_seq_group_to_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - budget.add_num_seqs(seq_group.request_id, - seq_group.get_max_num_running_seqs()) - budget.add_num_batched_tokens( - seq_group.request_id, seq_group.num_seqs(SequenceStatus.RUNNING)) - - # The last request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "2" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - scheduler.block_manager.swap_out = MagicMock() - expected_swap_mapping = [("5", "7")] - scheduler.block_manager.swap_out.return_value = expected_swap_mapping - - output = scheduler._schedule_running(budget, curr_loras) - remainig_running = scheduler.running - assert len(remainig_running) == 0 - assert len(output.decode_seq_groups) == 2 - assert len(output.prefill_seq_groups) == 0 - assert output.decode_seq_groups[0].seq_group.request_id == "0" - assert output.decode_seq_groups[1].seq_group.request_id == "1" - assert len(output.preempted) == 0 - assert len(output.swapped_out) == 1 - # Budget should refledct preempted requests. - assert budget.num_batched_tokens == 2 - # since there are 2 sequences, 2 should be subtracted. - assert budget.num_curr_seqs == 4 - # Both should be preempted, not swapped. - assert output.blocks_to_swap_out == expected_swap_mapping - # Nothing is copied. - assert output.blocks_to_copy == [] - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool): +def test_schedule_decode_blocks_to_copy_update(): """ Verify blocks_to_copy is updated. 
""" block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=4, + scheduler = initialize_scheduler(block_size=4, num_cpu_blocks=16, num_gpu_blocks=16) _, seq_group = create_dummy_prompt("1", @@ -754,117 +633,10 @@ def test_schedule_decode_blocks_to_copy_update(use_v2_block_manager: bool): assert output.blocks_to_copy == [(2, 3)] -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_swapped_simple(use_v2_block_manager: bool): - block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size) - curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] - _, seq_group = create_dummy_prompt("1", - prompt_length=4, - best_of=2, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(4, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget() - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 0 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - # swap in is the reverse of swap out - blocks_to_swap_in_reverse = [] - for swapin, swapout in output.blocks_to_swap_in: - blocks_to_swap_in_reverse.append((swapout, swapin)) - assert blocks_to_swap_out == blocks_to_swap_in_reverse - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_swapped_max_token_budget(use_v2_block_manager: bool): - block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) - curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget(token_budget=1) - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 1 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - - # Verify num_batched_tokens are respected. 
- budget = create_token_budget(token_budget=1) - add_token_budget(budget, 1, 0) - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 1 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 0 - assert len(output.decode_seq_groups) == 0 - assert len(output.prefill_seq_groups) == 0 - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_swapped_max_seqs(use_v2_block_manager: bool): - block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] - for i in range(4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=4) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget(max_num_seqs=2) - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 2 - assert budget.num_batched_tokens == 2 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 2 - assert len(output.prefill_seq_groups) == 0 - - # Verify num_curr_seqs are respected. - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 2 - assert budget.num_batched_tokens == 2 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 0 - assert len(output.prefill_seq_groups) == 0 - - -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_swapped_max_loras(use_v2_block_manager: bool): +def test_schedule_swapped_max_loras(): block_size = 4 lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) scheduler = initialize_scheduler(lora_config=lora_config, - use_v2_block_manager=use_v2_block_manager, block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32) @@ -894,11 +666,9 @@ def test_schedule_swapped_max_loras(use_v2_block_manager: bool): assert len(curr_loras) == 1 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool): +def test_schedule_swapped_cannot_swap_in(): block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None @@ -927,11 +697,9 @@ def test_schedule_swapped_cannot_swap_in(use_v2_block_manager: bool): assert len(output.prefill_seq_groups) == 0 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_infeasible_swap(use_v2_block_manager: bool): +def test_infeasible_swap(): block_size = 4 - scheduler = initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None @@ -961,11 +729,9 @@ def test_infeasible_swap(use_v2_block_manager: bool): assert len(output.prefill_seq_groups) == 0 -@pytest.mark.parametrize('use_v2_block_manager', [True, False]) -def test_schedule_swapped_blocks_to_copy(use_v2_block_manager: bool): +def test_schedule_swapped_blocks_to_copy(): block_size = 4 - scheduler = 
initialize_scheduler(use_v2_block_manager=use_v2_block_manager, - block_size=block_size, + scheduler = initialize_scheduler(block_size=block_size, num_cpu_blocks=32, num_gpu_blocks=32) curr_loras = None diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 50c047f30b80d..7cd0416d321ef 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder(): block_size = 4 num_seq_group = 4 max_model_len = 16 - scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len) + scheduler_config = SchedulerConfig( + task="generate", + max_num_batched_tokens=64, + max_num_seqs=num_seq_group, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group diff --git a/tests/core/utils.py b/tests/core/utils.py index a95a573db7cd3..cd0caa4704e11 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -4,6 +4,7 @@ from typing import Tuple from vllm import SamplingParams +from vllm.inputs import EncoderDecoderInputs, token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, Sequence, SequenceGroup @@ -27,10 +28,7 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), - inputs={ - "prompt": prompt_str, - "prompt_token_ids": prompt_tokens, - }, + inputs=token_inputs(prompt_tokens, prompt=prompt_str), block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[prompt], @@ -63,23 +61,21 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - inputs = { - "prompt": decoder_prompt_str, - "prompt_token_ids": decoder_prompt_tokens, - "encoder_prompt": encoder_prompt_str, - "encoder_prompt_token_ids": encoder_prompt_tokens, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(decoder_prompt_tokens, + prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, + prompt=encoder_prompt_str), } decoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=True) + inputs=inputs["decoder"], + block_size=block_size) encoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=False) + inputs=inputs["encoder"], + block_size=block_size) + seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams(best_of=best_of), @@ -108,7 +104,7 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - inputs={"prompt_token_ids": prompt_token_ids}, + inputs=token_inputs(prompt_token_ids), block_size=16, ) @@ -143,21 +139,19 @@ def create_seq_group_encoder_decoder( prompt_token_ids = [0] * seq_prompt_len - inputs = { - "prompt": "", - "prompt_token_ids": prompt_token_ids, - "encoder_prompt": "", - "encoder_prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(prompt_token_ids), + "encoder": token_inputs(prompt_token_ids), } seqs = [] for seq_id_offset, output_len in 
enumerate(seq_output_lens): # Construct decoder input sequences - seq = Sequence(seq_id=seq_id_start + seq_id_offset, - inputs=inputs, - block_size=16, - from_decoder_prompt=True) + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=inputs["decoder"], + block_size=16, + ) for i in range(output_len): seq.append_token_id( @@ -167,10 +161,11 @@ def create_seq_group_encoder_decoder( seqs.append(seq) # Encoder input sequence - encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs, - block_size=16, - from_decoder_prompt=False) + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs=inputs["encoder"], + block_size=16, + ) return SequenceGroup(request_id=request_id, seqs=seqs, diff --git a/tests/data/test_config.yaml b/tests/data/test_config.yaml index 42f4f6f7bb992..5090e8f357bb8 100644 --- a/tests/data/test_config.yaml +++ b/tests/data/test_config.yaml @@ -1,3 +1,5 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 88d0a4ba7f57b..1489a60891761 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -11,6 +11,7 @@ import pytest +from vllm.config import TaskOption from vllm.logger import init_logger from ..utils import compare_two_settings, fork_new_process_for_each_test @@ -27,18 +28,26 @@ class ParallelSetup(NamedTuple): chunked_prefill: bool +class PPTestOptions(NamedTuple): + multi_node_only: bool + trust_remote_code: bool + tokenizer_mode: Optional[str] + + @dataclass class PPTestSettings: parallel_setups: List[ParallelSetup] distributed_backends: List[str] - trust_remote_code: bool - tokenizer_mode: Optional[str] + task: TaskOption + test_options: PPTestOptions @staticmethod def detailed( *, tp_base: int = 1, pp_base: int = 2, + multi_node_only: bool = False, + task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -66,8 +75,10 @@ def detailed( chunked_prefill=False), ], distributed_backends=["mp", "ray"], - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, + task=task, + test_options=PPTestOptions(multi_node_only=multi_node_only, + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode), ) @staticmethod @@ -75,6 +86,8 @@ def fast( *, tp_base: int = 1, pp_base: int = 2, + task: TaskOption = "auto", + multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -86,25 +99,27 @@ def fast( chunked_prefill=False), ], distributed_backends=["mp"], - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, + task=task, + test_options=PPTestOptions(multi_node_only=multi_node_only, + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode), ) def iter_params(self, model_name: str): + opts = self.test_options + for parallel_setup in self.parallel_setups: for distributed_backend in self.distributed_backends: yield (model_name, parallel_setup, distributed_backend, - self.trust_remote_code, self.tokenizer_mode) + self.task, opts) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU # The values displayed here are only a rough indicator of the size of the model # yapf: disable -GENERATION_MODEL_SETTINGS = { - # [DETAILED TESTS] - "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), - # [FAST TESTS] +TEXT_GENERATION_MODELS = { 
+ # [Decoder-only] # Uses Llama # "BAAI/AquilaChat-7B": PPTestSettings.fast(), "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501 @@ -130,9 +145,10 @@ def iter_params(self, model_name: str): # Uses Llama # "internlm/internlm-chat-7b": PPTestSettings.fast(), "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True), - "core42/jais-13b-chat": PPTestSettings.fast(), + "inceptionai/jais-13b-chat": PPTestSettings.fast(), # TODO: Implement PP # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(), + "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True), # Uses Llama @@ -145,52 +161,53 @@ def iter_params(self, model_name: str): "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(), + "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - # FIXME: https://github.com/vllm-project/vllm/issues/8553 - # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), - "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), + "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2), - # FIXME: Cannot load tokenizer in latest transformers version + # FIXME: Cannot load tokenizer in latest transformers version. 
+ # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf` # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True), + # [Encoder-only] + # TODO: Implement PP + # "facebook/bart-base": PPTestSettings.fast(), } -EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated] - # [FAST TESTS] +EMBEDDING_MODELS = { # type: ignore[var-annotated] + # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(), "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(), "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501 } -MULTIMODAL_MODEL_SETTINGS = { - # [FAST TESTS] +MULTIMODAL_MODELS = { + # [Decoder-only] "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(), + "THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True), "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True), "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(), "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(), "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(), "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True), - # TODO: Implement PP - # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), + "allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True), "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501 "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True), + "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), "fixie-ai/ultravox-v0_3": PPTestSettings.fast(), -} - -CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated] - # [FAST TESTS] + # [Encoder-decoder] # TODO: Implement PP - # "facebook/bart-base": PPTestSettings.fast(), + # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), } # yapf: enable @@ -199,6 +216,7 @@ def iter_params(self, model_name: str): # [LANGUAGE GENERATION] "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", + "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -213,19 +231,22 @@ def _compare_tp( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available: int, *, - method: Literal["generate", "encode"] = "encode", + method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup + multi_node_only, trust_remote_code, tokenizer_mode = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") if VLLM_MULTI_NODE and distributed_backend == "mp": pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") + if multi_node_only and not VLLM_MULTI_NODE: + pytest.skip("Not in multi-node setting") common_args = [ # use half precision for speed and memory savings in CI environment @@ -240,6 +261,8 @@ def _compare_tp( common_args.append("--enable-chunked-prefill") if eager_mode: common_args.append("--enforce-eager") + if task != "auto": + common_args.extend(["--task", task]) if trust_remote_code: 
common_args.append("--trust-remote-code") if tokenizer_mode: @@ -297,10 +320,10 @@ def _compare_tp( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "trust_remote_code", "tokenizer_mode"), + ("model_name", "parallel_setup", "distributed_backend", "task", + "test_options"), [ - params for model_name, settings in GENERATION_MODEL_SETTINGS.items() + params for model_name, settings in TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_name) if model_name in TEST_MODELS ], @@ -310,24 +333,24 @@ def test_tp_language_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, - trust_remote_code, - tokenizer_mode, + task, + test_options, num_gpus_available, method="generate") @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "trust_remote_code", "tokenizer_mode"), + ("model_name", "parallel_setup", "distributed_backend", "task", + "test_options"), [ - params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items() + params for model_name, settings in EMBEDDING_MODELS.items() for params in settings.iter_params(model_name) if model_name in TEST_MODELS ], @@ -337,24 +360,24 @@ def test_tp_language_embedding( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, - trust_remote_code, - tokenizer_mode, + task, + test_options, num_gpus_available, method="encode") @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "trust_remote_code", "tokenizer_mode"), + ("model_name", "parallel_setup", "distributed_backend", "task", + "test_options"), [ - params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items() + params for model_name, settings in MULTIMODAL_MODELS.items() for params in settings.iter_params(model_name) if model_name in TEST_MODELS ], @@ -364,14 +387,14 @@ def test_tp_multimodal_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, - trust_remote_code, - tokenizer_mode, + task, + test_options, num_gpus_available, method="generate") diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 9324a737a779c..f2d7e9fd78cf3 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,12 +7,18 @@ import pytest from transformers import AutoModelForSeq2SeqLM +from vllm.attention.selector import (_Backend, + global_force_attn_backend_context_manager) +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close +LIST_ENC_DEC_SUPPORTED_BACKENDS = [ + _Backend.XFORMERS, _Backend.FLASH_ATTN, None +] + def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -29,13 +35,14 @@ def vllm_to_hf_output( @pytest.mark.parametrize("model", 
["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.skipif( - is_cpu(), + current_platform.is_cpu(), reason="CPU backend is not currently supported with encoder/decoder models" ) def test_encoder_decoder_e2e( @@ -48,51 +55,58 @@ def test_encoder_decoder_e2e( num_logprobs: int, decoder_prompt_type: DecoderPromptType, enforce_eager: bool, + attn_backend: _Backend, ) -> None: ''' - End-to-End (E2E) test for the encoder-decoder framework. + End-to-End (E2E) test for the encoder-decoder framework. This test evaluates the encoder-decoder functionality using the BART model. We compare the outputs of the Hugging Face and vLLM implementations to ensure that both implementations produce consistent and correct results. ''' - test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type] + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + test_case_prompts = example_encoder_decoder_prompts[ + decoder_prompt_type] - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + with vllm_runner(model, dtype=dtype, + enforce_eager=enforce_eager) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) + hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE + else 0) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) diff --git a/tests/engine/output_processor/test_stop_checker.py 
b/tests/engine/output_processor/test_stop_checker.py index 0d84443c51f99..cc14e8cbf75df 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -4,6 +4,7 @@ from transformers import PreTrainedTokenizer from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus @@ -15,7 +16,7 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - inputs={"prompt_token_ids": []}, + inputs=token_inputs([]), block_size=16, eos_token_id=eos_token_id, ) diff --git a/tests/engine/test_short_mm_context.py b/tests/engine/test_short_mm_context.py new file mode 100644 index 0000000000000..a6ba7a131c506 --- /dev/null +++ b/tests/engine/test_short_mm_context.py @@ -0,0 +1,29 @@ +import pytest + +from ..conftest import IMAGE_ASSETS + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "USER: \nWhat's the content of the image?\nASSISTANT:", + "cherry_blossom": + "USER: \nWhat is the season?\nASSISTANT:", +}) + +models = ["llava-hf/llava-1.5-7b-hf"] + + +@pytest.mark.parametrize("model", models) +def test_context_length_too_short(vllm_runner, image_assets, model): + images = [asset.pil_image for asset in image_assets] + + with pytest.raises(ValueError, match="too long to fit into the model"): + vllm_model = vllm_runner( + model, + max_model_len=128, # LLaVA has a feature size of 576 + enforce_eager=True, + ) + + with vllm_model: + vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], + max_tokens=1, + images=[images[0]]) diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py new file mode 100644 index 0000000000000..fc66386fd2d2a --- /dev/null +++ b/tests/entrypoints/llm/test_chat.py @@ -0,0 +1,92 @@ +from typing import List + +import pytest + +from vllm import LLM + +from ..openai.test_vision import TEST_IMAGE_URLS + + +def test_chat(): + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") + + prompt1 = "Explain the concept of entropy." + messages = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt1 + }, + ] + outputs = llm.chat(messages) + assert len(outputs) == 1 + + +def test_multi_chat(): + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") + + prompt1 = "Explain the concept of entropy." + prompt2 = "Explain what among us is." + + conversation1 = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt1 + }, + ] + + conversation2 = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt2 + }, + ] + + messages = [conversation1, conversation2] + + outputs = llm.chat(messages) + assert len(outputs) == 2 + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(image_urls: List[str]): + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + dtype="bfloat16", + max_model_len=4096, + max_num_seqs=5, + enforce_eager=True, + trust_remote_code=True, + limit_mm_per_prompt={"image": 2}, + ) + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "image_url", + "image_url": { + "url": image_url + } + } for image_url in image_urls), + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + outputs = llm.chat(messages) + assert len(outputs) >= 0 diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 1885f2e168d80..4c9f796e5ed71 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -4,8 +4,7 @@ import pytest from vllm import LLM, EmbeddingRequestOutput, PoolingParams - -from ...conftest import cleanup +from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -41,7 +40,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() def assert_outputs_equal(o1: List[EmbeddingRequestOutput], diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 6543c4bb1b58e..7d2b377752725 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -4,9 +4,7 @@ import pytest from vllm import LLM, RequestOutput, SamplingParams - -from ...conftest import cleanup -from ..openai.test_vision import TEST_IMAGE_URLS +from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "facebook/opt-125m" @@ -40,7 +38,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): @@ -104,90 +102,3 @@ def test_multiple_sampling_params(llm: LLM): # sampling_params is None, default params should be applied outputs = llm.generate(PROMPTS, sampling_params=None) assert len(PROMPTS) == len(outputs) - - -def test_chat(): - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - - prompt1 = "Explain the concept of entropy." - messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, - ] - outputs = llm.chat(messages) - assert len(outputs) == 1 - - -def test_multi_chat(): - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - - prompt1 = "Explain the concept of entropy." - prompt2 = "Explain what among us is." - - conversation1 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, - ] - - conversation2 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt2 - }, - ] - - messages = [conversation1, conversation2] - - outputs = llm.chat(messages) - assert len(outputs) == 2 - - -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: List[str]): - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - dtype="bfloat16", - max_model_len=4096, - max_num_seqs=5, - enforce_eager=True, - trust_remote_code=True, - limit_mm_per_prompt={"image": 2}, - ) - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" 
- }, - ], - }] - outputs = llm.chat(messages) - assert len(outputs) >= 0 diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 9f5727ecd0406..eb2113692e7b4 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -5,10 +5,9 @@ from huggingface_hub import snapshot_download from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from ...conftest import cleanup - MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" PROMPTS = [ @@ -39,7 +38,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="module") diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 2841dfc6bd9c2..67c79415f322a 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -5,12 +5,11 @@ import jsonschema import pytest +from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -from ...conftest import cleanup - MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -23,7 +22,7 @@ def llm(): with llm.deprecate_legacy_api(): yield weakref.proxy(llm) del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.mark.skip_global_cleanup diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py new file mode 100644 index 0000000000000..c9a4ad44fea30 --- /dev/null +++ b/tests/entrypoints/llm/test_init.py @@ -0,0 +1,22 @@ +import pytest + +from vllm import LLM + +from ...utils import error_on_warning + +MODEL_NAME = "facebook/opt-125m" + + +def test_pos_args_deprecated(): + with error_on_warning(DeprecationWarning): + LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) + + with error_on_warning(DeprecationWarning): + LLM(MODEL_NAME, tokenizer=MODEL_NAME) + + with pytest.warns(DeprecationWarning, match="'tokenizer'"): + LLM(MODEL_NAME, MODEL_NAME) + + with pytest.warns(DeprecationWarning, + match="'tokenizer', 'tokenizer_mode'"): + LLM(MODEL_NAME, MODEL_NAME, "auto") diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 39480531f5866..cbfb0cc32c1ce 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,6 +1,7 @@ import sys from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory def test_lazy_outlines(sample_regex): @@ -14,6 +15,7 @@ def test_lazy_outlines(sample_regex): ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Create an LLM without guided decoding as a baseline. llm = LLM(model="facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.3) @@ -26,10 +28,15 @@ def test_lazy_outlines(sample_regex): # make sure outlines is not imported assert 'outlines' not in sys.modules + # Destroy the LLM object and free up the GPU memory. + del llm + cleanup_dist_env_and_memory() + + # Create an LLM with guided decoding enabled. 
llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", - gpu_memory_utilization=0.3) + gpu_memory_utilization=0.6) sampling_params = SamplingParams(temperature=0.8, top_p=0.95) outputs = llm.generate( prompts=[ diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 565dfa01346cc..675a980ab3f3f 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -4,6 +4,12 @@ def test_empty_prompt(): - llm = LLM(model="gpt2") + llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) + + +def test_out_of_vocab_token(): + llm = LLM(model="gpt2", enforce_eager=True) + with pytest.raises(ValueError, match='out of vocabulary'): + llm.generate({"prompt_token_ids": [999999]}) diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index 0b6026a89c758..65699e609e4a8 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,51 +1,56 @@ """Tests for HF_HUB_OFFLINE mode""" import importlib import sys -import weakref import pytest from vllm import LLM - -from ...conftest import cleanup - -MODEL_NAME = "facebook/opt-125m" +from vllm.distributed import cleanup_dist_env_and_memory + +MODEL_CONFIGS = [ + { + "model": "facebook/opt-125m", + "enforce_eager": True, + "gpu_memory_utilization": 0.20, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + }, + { + "model": "mistralai/Mistral-7B-Instruct-v0.1", + "enforce_eager": True, + "gpu_memory_utilization": 0.95, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + "tokenizer_mode": "mistral", + }, +] @pytest.fixture(scope="module") -def llm(): - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) - - with llm.deprecate_legacy_api(): - yield weakref.proxy(llm) +def cache_models(): + # Cache model files first + for model_config in MODEL_CONFIGS: + LLM(**model_config) + cleanup_dist_env_and_memory() - del llm - - cleanup() + yield @pytest.mark.skip_global_cleanup -def test_offline_mode(llm: LLM, monkeypatch): - # we use the llm fixture to ensure the model files are in-cache - del llm - +@pytest.mark.usefixtures("cache_models") +def test_offline_mode(monkeypatch): # Set HF to offline mode and ensure we can still construct an LLM try: monkeypatch.setenv("HF_HUB_OFFLINE", "1") # Need to re-import huggingface_hub and friends to setup offline mode _re_import_modules() # Cached model files should be used in offline mode - LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) + for model_config in MODEL_CONFIGS: + LLM(**model_config) finally: # Reset the environment after the test # NB: Assuming tests are run in online mode diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py index 63beaaba29a80..a16e95f94171e 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/test_accuracy.py @@ -10,6 +10,8 @@ import lm_eval import pytest +from vllm.platforms import current_platform + from 
...utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" @@ -18,12 +20,21 @@ FILTER = "exact_match,strict-match" RTOL = 0.03 EXPECTED_VALUE = 0.58 -DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] +DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"] MORE_ARGS_LIST = [ + [], # Default ["--enable-chunked-prefill"], # Chunked ["--num-scheduler-steps", "8"], # MS ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ] +MAX_WAIT_SECONDS = None + +if current_platform.is_tpu(): + MORE_ARGS_LIST = [ + [], # Default + # ["--num-scheduler-steps", "8"], # Multi-step << currently fails + ] + MAX_WAIT_SECONDS = 600 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST) @@ -33,7 +44,9 @@ def test_lm_eval_accuracy(more_args): print(f"Running with: {args}") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer( + MODEL_NAME, args, + max_wait_seconds=MAX_WAIT_SECONDS) as remote_server: url = f"{remote_server.url_for('v1')}/completions" model_args = ( diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index df8a140283fbb..a74109e2f5120 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -68,11 +68,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -91,7 +92,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -123,11 +124,12 @@ async def test_single_chat_session_audio_base64encoded( }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -146,7 +148,7 @@ async def test_single_chat_session_audio_base64encoded( chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -178,7 +180,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -188,7 +190,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -242,7 +244,7 @@ async def test_multi_audio_input(client: 
openai.AsyncOpenAI, model_name: str, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index d3aea533b6db9..4616f363cc04a 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,7 +1,6 @@ from http import HTTPStatus from typing import List -import openai import pytest import pytest_asyncio import requests @@ -83,10 +82,8 @@ async def client(server): indirect=True, ) @pytest.mark.asyncio -async def test_show_version(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/version") +async def test_show_version(server: RemoteOpenAIServer): + response = requests.get(server.url_for("version")) response.raise_for_status() assert response.json() == {"version": VLLM_VERSION} @@ -102,9 +99,7 @@ async def test_show_version(client: openai.AsyncOpenAI): indirect=True, ) @pytest.mark.asyncio -async def test_check_health(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/health") +async def test_check_health(server: RemoteOpenAIServer): + response = requests.get(server.url_for("health")) assert response.status_code == HTTPStatus.OK diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 0fbc4cca83bd2..8d13f64dce01c 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -16,9 +16,6 @@ # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" @pytest.fixture(scope="module") @@ -68,11 +65,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=False) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=False) choice = chat_completion.choices[0] assert choice.logprobs is None @@ -93,12 +91,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=0) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=0) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -120,12 +119,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" 
}] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=5) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -152,7 +152,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, with pytest.raises((openai.BadRequestError, openai.APIError)): stream = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=21, stream=True) @@ -162,16 +162,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, with pytest.raises(openai.BadRequestError): await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=30, stream=False) # the server should still work afterwards - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - stream=False) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + stream=False) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -274,11 +275,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 @@ -297,7 +299,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -322,7 +324,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -332,7 +334,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -372,7 +374,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": False}) @@ -383,7 +385,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, # "continuous_usage_stats": False}} stream = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, stream_options={ @@ -412,7 +414,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, 
await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": None}) @@ -422,7 +424,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": True}) @@ -432,19 +434,29 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, + extra_body=dict(min_tokens=10), temperature=0.0, stream=True, stream_options={ "include_usage": True, - "continuous_usage_stats": True + "continuous_usage_stats": True, }, ) + last_completion_tokens = 0 async for chunk in stream: assert chunk.usage.prompt_tokens >= 0 - assert chunk.usage.completion_tokens >= 0 + assert last_completion_tokens == 0 or \ + chunk.usage.completion_tokens > last_completion_tokens or \ + ( + not chunk.choices and + chunk.usage.completion_tokens == last_completion_tokens + ) assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + chunk.usage.completion_tokens) + last_completion_tokens = chunk.usage.completion_tokens + + assert last_completion_tokens == 10 # NOTE: Not sure why, but when I place this after `test_guided_regex_chat` @@ -469,7 +481,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(guided_choice=sample_guided_choice, guided_decoding_backend=guided_decoding_backend)) choice1 = chat_completion.choices[0].message.content @@ -483,7 +495,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(guided_choice=sample_guided_choice, guided_decoding_backend=guided_decoding_backend)) choice2 = chat_completion.choices[0].message.content @@ -510,7 +522,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, extra_body=dict(guided_json=sample_json_schema, guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message @@ -528,7 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, extra_body=dict(guided_json=sample_json_schema, guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message @@ -556,7 +568,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=20, + max_completion_tokens=20, extra_body=dict(guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend)) ip1 = chat_completion.choices[0].message.content @@ -568,7 +580,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=20, + max_completion_tokens=20, 
extra_body=dict(guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend)) ip2 = chat_completion.choices[0].message.content @@ -616,7 +628,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5, extra_body=dict(guided_choice=sample_guided_choice, @@ -653,7 +665,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -687,7 +699,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -743,7 +755,7 @@ async def test_required_tool_use_not_yet_supported( await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -758,7 +770,7 @@ async def test_required_tool_use_not_yet_supported( await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -789,7 +801,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, with pytest.raises(openai.BadRequestError): await client.chat.completions.create(model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tool_choice={ "type": "function", "function": { @@ -802,7 +814,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -841,14 +853,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_response_format_json_schema(client: openai.AsyncOpenAI): + prompt = 'what is 1+1? The format is "result": 2' + # Check that this prompt cannot lead to a valid JSON without json_schema for _ in range(2): resp = await client.chat.completions.create( model=MODEL_NAME, messages=[{ - "role": - "user", - "content": ('what is 1+1? 
please respond with a JSON object, ' - 'the format is {"result": 2}') + "role": "user", + "content": prompt + }], + ) + content = resp.choices[0].message.content + assert content is not None + with pytest.raises((json.JSONDecodeError, AssertionError)): + loaded = json.loads(content) + assert loaded == {"result": 2}, loaded + + for _ in range(2): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": prompt }], response_format={ "type": "json_schema", diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py new file mode 100644 index 0000000000000..61d66365130c7 --- /dev/null +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -0,0 +1,126 @@ +import openai # use the official client for correctness check +import pytest +import pytest_asyncio + +from ...utils import RemoteOpenAIServer + +# any model with a chat template should work here +MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" + + +@pytest.fixture(scope="module") +def server(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + # lora config below + "--max-num-seqs", + "128", + "--enable-chunked-prefill", + "--max-num-batched-tokens", + "1000", + # large prompts create a lot of output + "--disable-log-requests", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +async def test_completion_stream_options_and_logprobs_with_long_prompts( + client: openai.AsyncOpenAI): + # Test stream with long prompt + prompt = "What is the capital of France?" * 400 + + stream = await client.completions.create( + model=MODEL_NAME, + prompt=prompt, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": True, + }, + logprobs=5, + ) + + tokens_received = 0 + finished = False + async for chunk in stream: + assert chunk.usage.prompt_tokens >= 0 + assert chunk.usage.completion_tokens >= 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + if not finished: + tokens_received += 1 + assert chunk.choices[0].text + + if chunk.choices[0].finish_reason is not None: + finished = True + + if finished: + assert chunk.usage.completion_tokens == tokens_received + + +@pytest.mark.asyncio +async def test_chat_completion_stream_options_and_logprobs_with_long_prompts( + client: openai.AsyncOpenAI): + # Test stream with long prompt + messages = [{ + "role": "system", + "content": "You are a helpful assistant." + }, { + "role": "user", + "content": "What is the capital of France?" 
* 400 + }] + stream = await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_tokens=5, + temperature=0.0, + stream=True, + stream_options={ + "include_usage": True, + "continuous_usage_stats": True, + }, + logprobs=True, + top_logprobs=5, + ) + + tokens_received = 0 + empty_chunks_received = 0 + finished = False + async for chunk in stream: + assert chunk.usage.prompt_tokens >= 0 + assert chunk.usage.completion_tokens >= 0 + assert chunk.usage.total_tokens == (chunk.usage.prompt_tokens + + chunk.usage.completion_tokens) + + if not finished: + if chunk.choices[0].delta.content == "": + # when there is no tokens generated + assert chunk.usage.completion_tokens == 0 + assert chunk.choices[0].logprobs is None + empty_chunks_received += 1 + else: + tokens_received += 1 + + if chunk.choices[0].finish_reason is not None: + finished = True + + if finished: + assert chunk.usage.completion_tokens == tokens_received + + assert empty_chunks_received <= 1 diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index cc72a49ebbbda..c81cfdbbe5cff 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -157,15 +157,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - # Added tokens should not appear in tokenized prompt - assert "vllm" not in completion.choices[0].text + with pytest.raises(openai.BadRequestError, match="out of vocabulary"): + # Added tokens should be rejected by the base model + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) @pytest.mark.asyncio @@ -340,6 +340,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, assert "".join(chunks) == single_output +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): + """Streaming for parallel sampling. + The tokens from multiple samples, are flattened into a single stream, + with an index to indicate which sample the token belongs to. + """ + + prompt = "What is an LLM?" 
+ n = 3 + max_tokens = 5 + + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=max_tokens, + n=n, + stream=True) + chunks: List[List[str]] = [[] for i in range(n)] + finish_reason_count = 0 + async for chunk in stream: + index = chunk.choices[0].index + text = chunk.choices[0].text + chunks[index].append(text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert finish_reason_count == n + for chunk in chunks: + assert len(chunk) == max_tokens + print("".join(chunk)) + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index f119c6c1201c9..9f2b77dde2a7f 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -4,14 +4,18 @@ import openai import pytest import pytest_asyncio +import requests + +from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @pytest.fixture(scope="module") -def embedding_server(): +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -19,31 +23,29 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", + "--chat-template", + DUMMY_CHAT_TEMPLATE, ] - with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @pytest_asyncio.fixture -async def embedding_client(embedding_server): - async with embedding_server.get_async_client() as async_client: +async def client(server): + async with server.get_async_client() as async_client: yield async_client @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -57,7 +59,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, # test using token IDs input_tokens = [1, 1, 1, 1, 1] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -71,18 +73,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): # test List[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -90,11 +88,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 32 + assert embeddings.usage.total_tokens == 32 # test List[List[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -108,22 +109,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_embedding(server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "user", + "content": "The cat sat on the mat.", + }, { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }] + + chat_response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }) + chat_response.raise_for_status() + chat_embeddings = chat_response.json() + + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + prompt = tokenizer.apply_chat_template( + messages, + chat_template=DUMMY_CHAT_TEMPLATE, + add_generation_prompt=True, + continue_final_message=False, + tokenize=False, + ) + completion_response = await client.embeddings.create( + model=model_name, + input=prompt, + encoding_format="float", + # To be consistent with chat + extra_body={"add_special_tokens": False}, + ) + completion_embeddings = completion_response.model_dump(mode="json") + + assert chat_embeddings.pop("id") is not None + assert completion_embeddings.pop("id") is not None + assert chat_embeddings.pop("created") <= completion_embeddings.pop( + "created") + assert chat_embeddings == completion_embeddings + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - responses_float = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="float") + responses_float = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="float") - responses_base64 = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="base64") + responses_base64 = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="base64") decoded_responses_base64_data = [] for data in responses_base64.data: @@ -137,8 +186,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, 1] # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await embedding_client.embeddings.create( - input=input_texts, model=model_name) 
+ responses_default = await client.embeddings.create(input=input_texts, + model=model_name) assert responses_float.data[0].embedding == responses_default.data[ 0].embedding @@ -147,18 +196,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) @@ -173,7 +219,7 @@ async def test_single_embedding_truncation( 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) @@ -187,18 +233,15 @@ async def test_single_embedding_truncation( @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation_invalid( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] with pytest.raises(openai.BadRequestError): - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 8193}) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6cb74eb78cbf0..6523c8b6297c6 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -70,18 +70,21 @@ async def client(server): [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), ("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], + "vllm:request_params_max_tokens": + [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS)], "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], - "vllm:generation_tokens": - [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:generation_tokens": [ + ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST) + ], "vllm:request_success": [("_total", _NUM_REQUESTS)], } @pytest.mark.asyncio -async def test_metrics_counts(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_counts(server: RemoteOpenAIServer, + client: openai.AsyncClient): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( @@ -89,7 +92,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): prompt=_TOKENIZED_PROMPT, max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) print(response.text) assert response.status_code == HTTPStatus.OK @@ -150,6 +153,9 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:request_params_n_sum", "vllm:request_params_n_bucket", "vllm:request_params_n_count", + "vllm:request_params_max_tokens_sum", + "vllm:request_params_max_tokens_bucket", + "vllm:request_params_max_tokens_count", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", @@ -170,16 +176,15 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_metrics_exist(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_exist(server: RemoteOpenAIServer, + client: openai.AsyncClient): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK for metric in EXPECTED_METRICS: diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 0a573a0066d32..1ae64ef492d5b 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -20,3 +20,38 @@ async def test_empty_prompt(): prompt="", max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_out_of_vocab_token_ids(): + model_name = "gpt2" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile('.*out of vocabulary.*')): + await client.completions.create(model=model_name, + prompt=[999999], + max_tokens=5, + temperature=0.0) + + +@pytest.mark.asyncio +async def test_reject_multistep_with_guided_decoding(): + model_name = "gpt2" + server_args = ["--enforce-eager", "--num-scheduler-steps", "8"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*')): + await client.completions.create( + model=model_name, + prompt="Hello", + max_tokens=5, + temperature=0.0, + extra_body={"response_format": { + "type": "json_object" + }}) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index ec550fe82c70f..e969d33775d86 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -22,12 +22,13 @@ class MockHFConfig: @dataclass class MockModelConfig: + task = "generate" tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" + chat_template_text_format = "string" max_model_len = 100 tokenizer_revision = None - embedding_mode = False multimodal_config = MultiModalConfig() hf_config = MockHFConfig() diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 25ab91ef69333..6fcc92022855b 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ 
b/tests/entrypoints/openai/test_shutdown.py @@ -6,7 +6,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "meta-llama/Llama-3.2-1B" @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 859a676a9c777..b1956a8cbc9dc 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,3 @@ -import openai # use the official client for correctness check import pytest import pytest_asyncio import requests @@ -55,9 +54,11 @@ async def client(server): [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_completions(client: openai.AsyncOpenAI, - model_name: str, tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_completions( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, prompt = "vllm1 This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_special_tokens": add_special, "model": model_name, @@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_chat( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_generation_prompt": add_generation, @@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_detokenize( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a test prompt. 
vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) - print(f"CALLING {base_url} FOR {model_name}") - response = requests.post(base_url + "/detokenize", + response = requests.post(server.url_for("detokenize"), json={ "model": model_name, "tokens": tokens diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 81d79601124a7..157d873a75b4d 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,6 +23,8 @@ @pytest.fixture(scope="module") def server(): args = [ + "--task", + "generate", "--dtype", "bfloat16", "--max-model-len", @@ -76,11 +78,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -99,12 +102,48 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -131,11 +170,12 @@ async def test_single_chat_session_image_base64encoded( }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -154,12 +194,47 @@ async def test_single_chat_session_image_base64encoded( chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, image_url: str, + base64_encoded_image: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -186,7 +261,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -196,7 +271,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -247,7 +322,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) @@ -264,7 +339,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) message = chat_completion.choices[0].message diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py new file mode 100644 index 0000000000000..d0c43b47bf0af --- /dev/null +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -0,0 +1,99 @@ +from typing import Dict + +import pytest +import pytest_asyncio +import requests + +from vllm.multimodal.utils import encode_image_base64, fetch_image + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" +MAXIMUM_IMAGES = 2 + +vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" +assert vlm2vec_jinja_path.exists() + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embedding", + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + "--chat-template", + str(vlm2vec_jinja_path), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_image() -> Dict[str, str]: + return { + image_url: encode_image_base64(fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) 
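A minimal, illustrative sketch of the request shape the vision chat-completion tests above now exercise (not part of the patch; the server URL, API key and model name are placeholder assumptions): max_completion_tokens replaces the older max_tokens argument, and beam search is requested through extra_body, which the OpenAI Python client forwards to the vLLM server as additional JSON fields.

    import openai

    # Placeholder endpoint/credentials for a locally running vLLM OpenAI server.
    client = openai.AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    async def beam_search_image_chat(model_name: str, messages: list):
        # n=2 beam-search candidates, capped at 10 new tokens; extra_body keys are
        # passed through verbatim as additional JSON fields (use_beam_search is a
        # vLLM-specific sampling option, mirroring the tests above).
        return await client.chat.completions.create(
            model=model_name,
            messages=messages,
            n=2,
            max_completion_tokens=10,
            logprobs=True,
            top_logprobs=5,
            extra_body=dict(use_beam_search=True),
        )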
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embeddings = response.json() + assert embeddings["id"] is not None + assert len(embeddings["data"]) == 1 + assert len(embeddings["data"][0]["embedding"]) == 3072 + assert embeddings["usage"]["completion_tokens"] == 0 + assert embeddings["usage"]["prompt_tokens"] == 762 + assert embeddings["usage"]["total_tokens"] == 762 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6ded5102c9314..5fa466f8f041f 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -8,21 +8,25 @@ from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import (parse_chat_messages, parse_chat_messages_futures) +from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def phi3v_model_config(): return ModelConfig(PHI3V_MODEL_ID, - PHI3V_MODEL_ID, + task="generate", + tokenizer=PHI3V_MODEL_ID, tokenizer_mode="auto", trust_remote_code=True, dtype="bfloat16", seed=0, + chat_template_text_format="string", limit_mm_per_prompt={ "image": 2, }) @@ -38,6 +42,30 @@ def phi3v_tokenizer(): ) +@pytest.fixture(scope="module") +def mllama_model_config(): + return ModelConfig(MLLAMA_MODEL_ID, + task="generate", + tokenizer=MLLAMA_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="bfloat16", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + +@pytest.fixture(scope="module") +def mllama_tokenizer(): + return TokenizerGroup( + MLLAMA_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="module") def image_url(): image = ImageAsset('cherry_blossom') @@ -303,6 +331,51 @@ def test_parse_chat_messages_multiple_images_across_messages( _assert_mm_data_is_image_input(mm_data, 2) +def test_parse_chat_messages_context_text_format( + phi3v_model_config, + phi3v_tokenizer, +): + phi3v_model_config.chat_template_text_format = "openai" + conversation, mm_data = parse_chat_messages( + [{ + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }] + }, { + "role": "assistant", + "content": "Some stuff." + }, { + "role": "user", + "content": "What about this one?" + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [ + { + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }] + }, + { + "role": "assistant", + "content": [{ + "type": "text", + "text": "Some stuff." + }] + }, + { + "role": "user", + "content": [{ + "type": "text", + "text": "What about this one?" 
+ }] + }, + ] + + def test_parse_chat_messages_rejects_too_many_images_in_one_message( phi3v_model_config, phi3v_tokenizer, @@ -387,3 +460,179 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( "text": "What about these two?" }] }], phi3v_model_config, phi3v_tokenizer) + + +def test_parse_chat_messages_multiple_images_uncommon_input( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" + }] + _assert_mm_data_is_image_input(mm_data, 2) + + +### Mllama currently wraps images / texts as interleaved dictionaries +def test_mllama_single_image( + mllama_model_config, + mllama_tokenizer, + image_url, +): + """Ensures that a single image is parsed correctly mllama.""" + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + "image_url": image_url + }] + }], mllama_model_config, mllama_tokenizer) + _assert_mm_data_is_image_input(mm_data, 1) + assert conversation == [{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + 'type': 'image' + }] + }] + + +def test_mllama_interleaved_images( + mllama_model_config, + mllama_tokenizer, + image_url, +): + """Ensures that multiple image are parsed as interleaved dicts.""" + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + { + "image_url": image_url + }, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + { + "image_url": image_url + }, + ] + }], mllama_model_config, mllama_tokenizer) + _assert_mm_data_is_image_input(mm_data, 2) + assert conversation == [{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'The content of the first image is:' + }, { + 'type': 'image' + }, { + 'type': 'text', + 'text': 'The content of the second image is:' + }, { + 'type': 'image' + }] + }] + + +@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID]) +def test_multimodal_image_parsing_matches_hf(model, image_url): + """Checks end to end hf alignment for multimodal [image] parsing.""" + + def get_conversation(is_hf: bool): + img_part = {"type": "image_url", "image_url": {"url": image_url}} + if is_hf: + img_part = {'type': 'image'} + return [{ + 'role': + 'user', + 'content': [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + img_part, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + img_part, + { + 'type': 'text', + 'text': 'What animal is in the first image?' 
+ }, + ] + }] + + # Build a config for the model + model_config = ModelConfig(model, + task="generate", + tokenizer=MLLAMA_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="bfloat16", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + # Build the tokenizer group and grab the underlying tokenizer + tokenizer_group = TokenizerGroup( + MLLAMA_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + tokenizer = tokenizer_group.tokenizer + + # Build and parse a conversation with {"type": "image"} using the tokenizer + hf_conversation = get_conversation(is_hf=True) + hf_result = tokenizer.apply_chat_template( + hf_conversation, + tokenize=False, + add_generation_prompt=True, + ) + + # Now parse with vLLMs chat utils & apply the template + vllm_conversation = get_conversation(is_hf=False) + conversation, _ = parse_chat_messages( + vllm_conversation, + model_config, + tokenizer_group, + ) + + vllm_result = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=None, + add_generation_prompt=True, + ) + + assert hf_result == vllm_result diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 8f6a54ff5979c..f2358940fc7b8 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -2,12 +2,13 @@ import torch -from vllm.utils import is_hip +from vllm.platforms import current_platform # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. ROCM_FP8_MAX = 224.0 -FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \ + else torch.float8_e4m3fn def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: @@ -24,8 +25,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor, qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max - qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min + qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.max + qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) @@ -66,8 +69,10 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> Tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max - fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min + fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.max + fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.min fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 9b476585fa19e..057a11746014c 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,13 +1,14 @@ +import random from typing import Type import pytest import torch from tests.kernels.utils import opcheck -from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, - NewGELU, QuickGELU, - SiluAndMul) -from vllm.utils import seed_everything +from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, + GeluAndMul, NewGELU, + QuickGELU, SiluAndMul) +from vllm.platforms import 
current_platform from .allclose_default import get_default_atol, get_default_rtol @@ -20,7 +21,8 @@ ] -@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) +@pytest.mark.parametrize("activation", + ["silu", "gelu", "gelu_tanh", "fatrelu"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -35,7 +37,7 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -47,16 +49,23 @@ def test_act_and_mul( elif activation == "gelu_tanh": layer = GeluAndMul(approximate="tanh") fn = torch.ops._C.gelu_tanh_and_mul + elif activation == "fatrelu": + threshold = random.uniform(0, 1) + layer = FatreluAndMul(threshold) + fn = torch.ops._C.fatrelu_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiLU and GELU implementations are equivalent to the native PyTorch - # implementations, so we can do exact comparison. + # The SiLU, GELU and FatReLU implementations are equivalent to the native + # PyTorch implementations, so we can do exact comparison. torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - opcheck(fn, (out, x)) + if activation == "fatrelu": + opcheck(fn, (out, x, threshold)) + else: + opcheck(fn, (out, x)) @pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast), @@ -76,7 +85,7 @@ def test_activation( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation[0]() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 52f1ecd176963..4ecd0fc1a21ad 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,12 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol -if not is_hip(): +if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask @@ -23,8 +24,9 @@ NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [torch.half, torch.bfloat16, torch.float - ] if not is_hip() else [torch.half, torch.bfloat16] +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing @@ -114,7 +116,8 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize( - "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"]) + "version", + ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -141,7 +144,7 @@ def test_paged_attention( or (version == "rocm" 
and head_size not in (64, 128))): pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -317,8 +320,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -368,7 +371,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(is_hip(), +@pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @torch.inference_mode() def test_multi_query_kv_attention( @@ -379,7 +382,7 @@ def test_multi_query_kv_attention( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use @@ -425,6 +428,6 @@ def test_multi_query_kv_attention( scale, dtype, ) - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index f471dcee938be..3fe9ca0b0450f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -19,22 +19,25 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.is_cpu", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + with patch("vllm.attention.selector.current_platform.is_cpu", + return_value=True): + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.is_hip", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + with patch("vllm.attention.selector.current_platform.is_rocm", + return_value=True): + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "ROCM_FLASH" elif device == "openvino": - with patch("vllm.attention.selector.is_openvino", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + with patch("vllm.attention.selector.current_platform.is_openvino", + return_value=True): + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + backend = which_attn_to_use(16, torch.float16, 
torch.float16, 16, False) assert backend.name == name @@ -46,37 +49,32 @@ def test_flash_attn(monkeypatch): # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(16, None, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(16, None, torch.float8_e4m3fn, None, 16, False) + backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False) + backend = which_attn_to_use(16, torch.float16, "fp8", 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(16, None, torch.float16, None, 8, False) - assert backend.name != STR_FLASH_ATTN_VAL - - # Unsupported sliding window - backend = which_attn_to_use(16, 1, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 8, False) assert backend.name != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(16, None, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(17, None, torch.float16, None, 16, False) + backend = which_attn_to_use(17, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, - True) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True) assert backend.name != STR_FLASH_ATTN_VAL @@ -84,4 +82,4 @@ def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with pytest.raises(ValueError): - which_attn_to_use(16, None, torch.float16, None, 16, False) + which_attn_to_use(16, torch.float16, None, 16, False) diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 0f0a2b24563fd..59917dd2c58ad 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -5,11 +5,10 @@ import pytest import torch +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( awq_marlin_quantize) @@ -81,7 +80,7 @@ def test_fused_marlin_moe_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids = fused_topk(a, score, topk, False) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -150,14 +149,14 @@ def test_single_marlin_moe_multiply_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe(a, - qweight, - scales, - score, - topk, - renormalize=False, - w_zeros=zp, - num_bits=num_bits) + marlin_output = torch.ops.vllm.single_marlin_moe(a, + qweight, + scales, + score, + topk, + 
renormalize=False, + w_zeros=zp, + num_bits=num_bits) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/test_awq_triton.py index e95e5bd948212..406a0c8dd8080 100644 --- a/tests/kernels/test_awq_triton.py +++ b/tests/kernels/test_awq_triton.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton) -from vllm.utils import seed_everything +from vllm.platforms import current_platform device = "cuda" @@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): zeros_cols = qweight_cols zeros_dtype = torch.int32 - seed_everything(0) + current_platform.seed_everything(0) qweight = torch.randint(0, torch.iinfo(torch.int32).max, @@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size): qzeros_rows = scales_rows qzeros_cols = qweight_cols - seed_everything(0) + current_platform.seed_everything(0) input = torch.rand((input_rows, input_cols), dtype=input_dtype, diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index f3bd8f0524264..fb601852dd523 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol @@ -172,7 +173,7 @@ def test_paged_attention( blocksparse_block_size: int, blocksparse_head_sliding_step: int, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -316,8 +317,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -383,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. 
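The kernel-test hunks in this patch share one mechanical migration: helpers from vllm.utils (is_hip, seed_everything) give way to the vllm.platforms.current_platform interface. A condensed sketch of the resulting pattern, mirroring the imports and calls used in the hunks above (illustrative only):

    import torch
    from vllm.platforms import current_platform

    # ROCm uses the e4m3fnuz FP8 variant; other platforms use e4m3fn
    # (same selection as in tests/kernels/quant_utils.py above).
    FP8_DTYPE = (torch.float8_e4m3fnuz
                 if current_platform.is_rocm() else torch.float8_e4m3fn)

    def reset_seed(seed: int) -> None:
        # One platform-aware entry point replaces vllm.utils.seed_everything.
        current_platform.seed_everything(seed)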
# As the xformers library is already tested with its own tests, we can use diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index b0e7097fdfbd4..e2b4778b94b9e 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -6,7 +6,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -56,7 +56,7 @@ def test_copy_blocks( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. @@ -132,7 +132,7 @@ def test_reshape_and_cache( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks @@ -224,7 +224,7 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. @@ -258,19 +258,20 @@ def test_reshape_and_cache_flash( del key_caches del value_caches + k_scale = key.amax().item() / 256 + v_scale = value.amax().item() / 256 + # Clone the KV caches. if kv_cache_dtype == "fp8": cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(cloned_key_cache, key_cache) + ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype) cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(cloned_value_cache, value_cache) + ops.convert_fp8(cloned_value_cache, value_cache, v_scale, + kv_cache_dtype) else: cloned_key_cache = key_cache.clone() cloned_value_cache = value_cache.clone() - # Using default kv_scale - k_scale = v_scale = 1.0 - # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, @@ -281,9 +282,15 @@ def test_reshape_and_cache_flash( if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(result_key_cache, key_cache) + ops.convert_fp8(result_key_cache, + key_cache, + k_scale, + kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(result_value_cache, value_cache) + ops.convert_fp8(result_value_cache, + value_cache, + v_scale, + kv_dtype=kv_cache_dtype) # Run the reference implementation. 
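In test_reshape_and_cache_flash above, convert_fp8 is now given an explicit scale and the KV-cache dtype instead of relying on a default scale of 1.0. A reduced sketch of that call pattern (illustrative; shapes are assumed, and the argument order follows the hunk above):

    import torch
    from vllm import _custom_ops as ops

    def dequantize_fp8_key_cache(key: torch.Tensor, key_cache: torch.Tensor,
                                 kv_cache_dtype: str) -> torch.Tensor:
        # Per-tensor scale derived the same way as in the test: amax of the
        # source tensor mapped onto the FP8 range.
        k_scale = key.amax().item() / 256
        out = torch.empty_like(key_cache, dtype=torch.float16)
        ops.convert_fp8(out, key_cache, k_scale, kv_dtype=kv_cache_dtype)
        return out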
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -339,7 +346,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) src_device = device if direction[0] == "cuda" else 'cpu' dst_device = device if direction[1] == "cuda" else 'cpu' @@ -408,7 +415,7 @@ def test_fp8_e4m3_conversion( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) low = -224.0 high = 224.0 diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py index 069020a536d0e..f9b11018288be 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -6,9 +6,10 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops # noqa: F401 +from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def causal_conv1d_ref( @@ -69,7 +70,7 @@ def causal_conv1d_update_ref(x, bias: (dim,) cache_seqlens: (batch,), dtype int32. If not None, the conv_state is treated as a circular buffer. - The conv_state will be updated by copying x to the + The conv_state will be updated by copying x to the conv_state starting at the index @cache_seqlens % state_len before performing the convolution. @@ -114,16 +115,15 @@ def causal_conv1d_update_ref(x, @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float]) @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) -def causal_conv1d_opcheck_fn( - x: torch.Tensor, - weight: torch.Tensor, - bias: Optional[torch.Tensor] = None, - cu_seq_len: Optional[torch.Tensor] = None, - cache_indices: Optional[torch.Tensor] = None, - has_initial_state: Optional[torch.Tensor] = None, - conv_states: Optional[torch.Tensor] = None, - activation: Optional[str] = "silu", -): +def causal_conv1d_opcheck_fn(x: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + cu_seq_len: Optional[torch.Tensor] = None, + cache_indices: Optional[torch.Tensor] = None, + has_initial_state: Optional[torch.Tensor] = None, + conv_states: Optional[torch.Tensor] = None, + activation: Optional[str] = "silu", + pad_slot_id: int = PAD_SLOT_ID): """ x: (batch, dim, seqlen) weight: (dim, width) @@ -141,16 +141,9 @@ def causal_conv1d_opcheck_fn( x = x.contiguous() bias = bias.contiguous() if bias is not None else None - opcheck(torch.ops._C.causal_conv1d_fwd, ( - x, - weight, - bias, - conv_states, - cu_seq_len, - cache_indices, - has_initial_state, - activation in ["silu", "swish"], - )) + opcheck(torch.ops._C.causal_conv1d_fwd, + (x, weight, bias, conv_states, cu_seq_len, cache_indices, + has_initial_state, activation in ["silu", "swish"], pad_slot_id)) @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float]) @@ -158,7 +151,7 @@ def causal_conv1d_opcheck_fn( @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) @pytest.mark.parametrize( - 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) + 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096]) @pytest.mark.parametrize('dim', [64]) @pytest.mark.parametrize('batch', [1]) def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, @@ -168,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, 
silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) x = torch.randn(batch, dim, seqlen, device=device, dtype=itype).contiguous() @@ -230,20 +223,14 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch = 2 x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) + x_ref = x.clone() conv_state = torch.randn(batch, dim, width - 1, device=device, dtype=itype) - weight = torch.randn(dim, - width, - device=device, - dtype=itype, - requires_grad=True) - if has_bias: - bias = torch.randn(dim, device=device, dtype=itype, requires_grad=True) - else: - bias = None + weight = torch.randn(dim, width, device=device, dtype=itype) + bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None conv_state_ref = conv_state.detach().clone() activation = None if not silu_activation else "silu" out = causal_conv1d_update(x, @@ -251,7 +238,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, weight, bias, activation=activation) - out_ref = causal_conv1d_update_ref(x, + out_ref = causal_conv1d_update_ref(x_ref, conv_state_ref, weight, bias, @@ -260,15 +247,9 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, assert torch.equal(conv_state, conv_state_ref) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) - opcheck(torch.ops._C.causal_conv1d_update, ( - x, - conv_state, - weight, - bias, - activation in ["silu", "swish"], - None, - None, - )) + opcheck(torch.ops._C.causal_conv1d_update, + (x, conv_state, weight, bias, activation + in ["silu", "swish"], None, None, PAD_SLOT_ID)) @pytest.mark.parametrize("itype", @@ -278,37 +259,48 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, @pytest.mark.parametrize("seqlen", [1, 4, 5]) @pytest.mark.parametrize("width", [2, 3, 4]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) -def test_causal_conv1d_update_with_batch_gather(dim, width, seqlen, has_bias, +# tests correctness in case subset of the sequences are padded +@pytest.mark.parametrize("with_padding", [True, False]) +def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width, + seqlen, has_bias, silu_activation, itype): device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 - # set )seed - seed_everything(0) - batch = 64 + # set seed + current_platform.seed_everything(0) + + batch_size = 3 + padding = 5 if with_padding else 0 + padded_batch_size = batch_size + padding + total_entries = 10 * batch_size - x = torch.randn(batch, dim, 1, device=device, dtype=itype) + x = torch.randn(padded_batch_size, dim, 1, device=device, dtype=itype) + x_ref = x.clone() - total_entries = 10 * batch + conv_state_indices = torch.randperm(total_entries)[:batch_size].to( + dtype=torch.int32, device=device) + unused_states_bool = torch.ones(total_entries, + dtype=torch.bool, + device=device) + unused_states_bool[conv_state_indices] = False + padded_state_indices = torch.concat([ + conv_state_indices, + torch.as_tensor( + [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device) + ], + dim=0) conv_state = torch.randn(total_entries, dim, width - 1, device=device, dtype=itype) - conv_state_indices = torch.randperm(total_entries)[:batch].to( - dtype=torch.int32, device=device) + 
conv_state_for_padding_test = conv_state.clone() - weight = torch.randn(dim, - width, - device=device, - dtype=itype, - requires_grad=True) - if has_bias: - bias = torch.randn(dim, device=device, dtype=itype, requires_grad=True) - else: - bias = None + weight = torch.randn(dim, width, device=device, dtype=itype) + bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None conv_state_ref = conv_state[conv_state_indices, :].detach().clone() activation = None if not silu_activation else "silu" out = causal_conv1d_update(x, @@ -316,45 +308,50 @@ def test_causal_conv1d_update_with_batch_gather(dim, width, seqlen, has_bias, weight, bias, activation=activation, - conv_state_indices=conv_state_indices) - out_ref = causal_conv1d_update_ref(x, + conv_state_indices=padded_state_indices, + pad_slot_id=PAD_SLOT_ID) + out_ref = causal_conv1d_update_ref(x_ref[:batch_size], conv_state_ref, weight, bias, activation=activation) assert torch.equal(conv_state[conv_state_indices, :], conv_state_ref) - assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) + assert torch.equal(conv_state[unused_states_bool], + conv_state_for_padding_test[unused_states_bool]) - opcheck(torch.ops._C.causal_conv1d_update, ( - x, - conv_state, - weight, - bias, - activation in ["silu", "swish"], - None, - conv_state_indices, - )) + opcheck(torch.ops._C.causal_conv1d_update, + (x, conv_state, weight, bias, activation + in ["silu", "swish"], None, padded_state_indices, PAD_SLOT_ID)) @pytest.mark.parametrize("itype", [torch.bfloat16]) @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) -@pytest.mark.parametrize('seqlen', - [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) +@pytest.mark.parametrize( + 'seqlen', [8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 2049, 4096]) @pytest.mark.parametrize('dim', [64, 4096]) -def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation, - itype): +# tests correctness in case subset of the sequences are padded +@pytest.mark.parametrize('with_padding', [True, False]) +def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, + silu_activation, itype): device = "cuda" + torch.cuda.empty_cache() rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) - batch = 1 + current_platform.seed_everything(0) seqlens = [] - nsplits = 3 + batch_size = 4 + if seqlen < 10: + batch_size = 1 + padding = 3 if with_padding else 0 + padded_batch_size = batch_size + padding + nsplits = padded_batch_size - 1 + eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values seqlens.append( torch.diff( @@ -364,10 +361,11 @@ def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation, assert sum(seqlens[-1]) == seqlen assert all(s > 0 for s in seqlens[-1]) + total_entries = batch_size * 10 cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32) cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0) - x = torch.randn(batch, 4096 + dim + 64, seqlen, device=device, + x = torch.randn(1, 4096 + dim + 64, seqlen, device=device, dtype=itype)[:, 4096:4096 + dim, :] weight = torch.randn(dim, width, device=device, dtype=itype) bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None @@ -375,7 +373,7 @@ def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, 
silu_activation, weight_ref = weight.clone() bias_ref = bias.clone() if bias is not None else None activation = None if not silu_activation else "silu" - final_states = torch.randn(nsplits + 1, + final_states = torch.randn(total_entries, dim, width - 1, device=x.device, @@ -385,18 +383,27 @@ def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation, 2, (cumsum.shape[0] - 1, ), dtype=torch.bool, device=x.device) - cache_indices = torch.randperm(cumsum.shape[0] - 1, + state_indices = torch.randperm(total_entries, dtype=torch.int32, - device=x.device) + device=x.device)[:batch_size] + padded_state_indices = torch.concat([ + state_indices, + torch.as_tensor( + [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), + ], + dim=-1) + out = causal_conv1d_fn(x.squeeze(0), weight, bias, cumsum.cuda(), - cache_indices, has_initial_states, final_states, - activation) + padded_state_indices, has_initial_states, + final_states, activation, PAD_SLOT_ID) out_ref = [] out_ref_b = [] splits = [torch.split(var, seqlens[0], dim=-1) for var in (x_ref)] for i in range(len(seqlens[0])): x_s = [v[i].unsqueeze(0) for v in splits][0] + if padded_state_indices[i] == PAD_SLOT_ID: + continue out_ref_b.append( causal_conv1d_ref( x_s, @@ -404,21 +411,20 @@ def test_causal_conv1d_varlen(dim, seqlen, width, has_bias, silu_activation, bias_ref, activation=activation, return_final_states=True, - final_states_out=final_states_ref[cache_indices[i]].unsqueeze( - 0), - initial_states=final_states_ref[cache_indices[i]].unsqueeze(0) - if has_initial_states[i] else None)) + final_states_out=final_states_ref[ + padded_state_indices[i]].unsqueeze(0), + initial_states=final_states_ref[padded_state_indices[i]]. + unsqueeze(0) if has_initial_states[i] else None)) out_ref.append(torch.cat([t[0] for t in out_ref_b], dim=2)) - out_ref = torch.cat(out_ref, dim=0) - - print(f"Output max diff: {(out - out_ref).abs().max().item()}") - print(f"Output mean diff: {(out - out_ref).abs().mean().item()}") - print("Output state max diff" - f":{(final_states - final_states_ref).abs().max()}") - print("Output state mean diff" - f":{(final_states - final_states_ref).abs().mean()}") - assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) - assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol) + out_ref_tensor = torch.cat(out_ref, dim=0) + + unpadded_out = out[:, :out_ref_tensor.shape[-1]] + assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol) + assert torch.allclose(final_states[state_indices], + final_states_ref[state_indices], + rtol=rtol, + atol=atol) + causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(), - cache_indices, has_initial_states, final_states, - activation) + padded_state_indices, has_initial_states, + final_states, activation) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 6b979d0558c46..a1dd5eeeaa398 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -16,13 +16,13 @@ from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, get_attn_backend, global_force_attn_backend_context_manager) -from vllm.utils import is_hip +from vllm.forward_context import set_forward_context +from vllm.platforms import current_platform # List of support backends for 
encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] - +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] HEAD_SIZES = [64, 256] NUM_HEADS = [1, 16] @@ -82,7 +82,7 @@ class TestResources(NamedTuple): will leverage attn_backend for the purpose of constructing backend-compatible attention metadata instances - + Attributes: * scale: 1/sqrt(d) scale factor for attn @@ -105,10 +105,10 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: Build key components for performing encoder/decoder attention test. Note that - (1) The Attention instance constructed here, automatically selects + (1) The Attention instance constructed here, automatically selects an attention backend class based on platform info & a set of canned heuristics, so - (2) The attention backend instance constructed here is thus *not + (2) The attention backend instance constructed here is thus *not the same backend instance* used by attn, but rather it is intended to be a *different instance* of the *same backend class*; therefore, @@ -145,7 +145,8 @@ class that Attention will automatically select when it is constructed. test_pt.num_heads, test_pt.head_size, test_pt.block_size, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + backend=test_pt.backend_name) return TestResources(scale, attn_backend, attn, kv_cache) @@ -156,7 +157,7 @@ def _encoder_attn_setup( ''' Set up test vectors & data structures for encoder attention test. - A triplet of synthetic query/key/value tensors are constructed. + A triplet of synthetic query/key/value tensors are constructed. Given this is an encoder attention test, the key & value sequences will have the same length as the corresponding queries. @@ -169,14 +170,14 @@ def _encoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field - + Returns: - + * PhaseTestParameters data structure comprising (1) packed query/key/value tensors, (2) the ideal output of attention computed using a naive implementation, and (3) KVCache field set to None @@ -265,7 +266,7 @@ def _decoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -275,14 +276,14 @@ def _decoder_attn_setup( * qkv: Unpacked (batch_size x padded_seq_len x num_heads x head_size) query/key/value tensors * Prefill-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) + including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. 
- * Decode-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + * Decode-phase decoder self-attention PhaseTestParameters data structure, + including (1) packed (number_of_tokens x num_heads x head_size) + query/key/value tensors along with (2) ideal attention output + computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. * max_block_idx: max physical address in decoder self-attention block-table (intended to be used as the base address for the encoder/ @@ -436,12 +437,12 @@ def _enc_dec_cross_attn_setup_reuses_query( This function also constructs the cross-attention KV cache memory mapping (slot mapping and block table), ensuring that the block table starts at - block_base_addr. + block_base_addr. Arguments: * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x - num_heads x head_size) decoder self-attention inputs; + num_heads x head_size) decoder self-attention inputs; this function relies on the query and q_seq_lens fields * encoder_test_params: PhaseTestParameters data structure which was @@ -452,7 +453,7 @@ def _enc_dec_cross_attn_setup_reuses_query( self-attention; all fields including KV cache required * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -460,16 +461,16 @@ def _enc_dec_cross_attn_setup_reuses_query( Returns: - * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed + * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data + structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase encoder/decoder cross-attention PhaseTestParameters data + * Decode-phase encoder/decoder cross-attention PhaseTestParameters data structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. ''' @@ -592,11 +593,12 @@ def _run_encoder_attention_test( attn: Attention, encoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run encoder attention. 
- attn.forward() is passed attn_type=AttentionType.ENCODER in order + attn.forward() is passed attn_type=AttentionType.ENCODER in order to configure the kernel invocation for encoder attention Requires attn_metadata.num_decode_tokens == 0 @@ -607,9 +609,11 @@ def _run_encoder_attention_test( * attn: Attention wrapper instance * encoder_test_params: encoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed {query,key,value} and @@ -619,20 +623,31 @@ def _run_encoder_attention_test( attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), + attn_metadata, + attn_type=attn_type) def _run_decoder_self_attention_test( test_rsrcs: TestResources, decoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run decoder self-attention test. @@ -646,10 +661,12 @@ def _run_decoder_self_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -660,12 +677,22 @@ def _run_decoder_self_attention_test( kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. 
+ reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + kv_cache, + attn_metadata, + attn_type=attn_type) def _run_encoder_decoder_cross_attention_test( @@ -673,6 +700,7 @@ def _run_encoder_decoder_cross_attention_test( decoder_test_params: PhaseTestParameters, cross_test_params: Optional[PhaseTestParameters], attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run encoder/decoder cross-attention test. @@ -694,13 +722,15 @@ def _run_encoder_decoder_cross_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query field * cross_test_params: encoder/decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -718,15 +748,41 @@ def _run_encoder_decoder_cross_attention_test( cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) - return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) - - -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + key, + value, + kv_cache, + attn_metadata, + attn_type=attn_type) + + +@pytest.fixture(autouse=True) +def set_reset_environment(attn_backend): + # Set the default torch datatype to bfloat16 to enable + # testing of the Flash Attention backend. Also clear the + # cached value of the backend. + default_dtype = torch.get_default_dtype() + if attn_backend.name == 'FLASH_ATTN': + torch.set_default_dtype(torch.bfloat16) + get_attn_backend.cache_clear() + yield + # Reset the torch datatype to what it was before the test + # so as not to impact the remaining tests. + torch.set_default_dtype(default_dtype) + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -755,7 +811,8 @@ def test_encoder_only( No KV cache is required for encoder-only attention. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). 
+ AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). This test globally forces an override of the usual backend auto-selection process, forcing the specific backend-under-test @@ -771,10 +828,8 @@ def test_encoder_only( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -805,13 +860,18 @@ def test_encoder_only( # PREFILL: encoder attention enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test( - test_rsrcs.attn, enc_test_params, prephase_attn_metadata)) + test_rsrcs.attn, + enc_test_params, + prephase_attn_metadata, + test_pt=test_pt)) # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -837,14 +897,14 @@ def test_e2e_enc_dec_attn( attributes for prefill-phase, and (2) an analogous attention metadata structure but for decode-phase * Test attention steps in the following order - + * Encoder attention * Prefill self-attention * Prefill cross-attention * Decode self-attention * Decode cross-attention - * Besides being reflective of realistic use-cases, this order would - exacerbate any accidental overlap in the self-/cross-attention + * Besides being reflective of realistic use-cases, this order would + exacerbate any accidental overlap in the self-/cross-attention block tables, which one hopes to avoid @@ -864,10 +924,11 @@ def test_e2e_enc_dec_attn( to be utilized. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). Note on metadata: there is a single attention metadata structure shared by - all prefill-phase attention operations (encoder, decoder, enc/dec cross), + all prefill-phase attention operations (encoder, decoder, enc/dec cross), and a single one shared by all decode-phase attention operations (decoder & enc/dec cross.) This is intended to reflect the behavior of EncoderDecoderModelRunner, which constructs a single attention metadata @@ -888,10 +949,8 @@ def test_e2e_enc_dec_attn( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -951,29 +1010,39 @@ def test_e2e_enc_dec_attn( enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, enc_test_params, - prephase_attn_metadata) + prephase_attn_metadata, + test_pt=test_pt) # - Is encoder attention result correct? 
- assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_attn_metadata, + test_pt=test_pt) # - Is prefill decoder self-attention correct? assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out) + prephase_dec_pckd_act_out, + attn_backend.name) # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, - prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_cross_test_params, + prephase_attn_metadata, + test_pt=test_pt) # - Is prefill encoder/decoder cross-attention correct? assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out) + prephase_cross_pckd_act_out, + attn_backend.name) # DECODE: build decode-phase attention metadata @@ -989,17 +1058,26 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, decphase_dec_test_params, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + decphase_attn_metadata, + test_pt=test_pt) # - Is decode-phase decoder self-attention correct? assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out) + decphase_dec_pckd_act_out, + attn_backend.name) # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + None, + decphase_attn_metadata, + test_pt=test_pt) # - Is decode-phase encoder/decoder cross-attention correct? 
assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out) + decphase_cross_pckd_act_out, + attn_backend.name) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 3e9b4d9a4f8a0..a20c73345218f 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) @@ -78,6 +78,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("sliding_window", [None, 256]) @torch.inference_mode() def test_flash_attn_with_paged_kv( kv_lens: List[int], @@ -87,15 +88,18 @@ def test_flash_attn_with_paged_kv( block_size: int, soft_cap: Optional[float], num_blocks: int, + sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] assert num_query_heads % num_kv_heads == 0 max_kv_len = max(kv_lens) scale = head_size**-0.5 + window_size = ((sliding_window - 1, 0) if sliding_window is not None else + (-1, -1)) query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) key_cache = torch.randn(num_blocks, @@ -121,18 +125,18 @@ def test_flash_attn_with_paged_kv( block_table=block_tables, cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, + window_size=window_size, ).squeeze(1) - ref_output = ref_paged_attn( - query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=[1] * num_seqs, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap, - ) + ref_output = ref_paged_attn(query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -141,7 +145,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("sliding_window", [None]) +@pytest.mark.parametrize("sliding_window", [None, 256]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @@ -157,7 +161,7 @@ def test_varlen_with_paged_kv( num_blocks: int, ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -166,8 +170,7 @@ def test_varlen_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_query_len = max(query_lens) max_kv_len = max(kv_lens) - window_size = ((sliding_window, - sliding_window) if sliding_window is not None else + window_size = ((sliding_window - 1, 0) if sliding_window is not None else (-1, -1)) scale = head_size**-0.5 diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index 80a388db6530e..a2c8f71665737 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py 
@@ -4,7 +4,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)] HEAD_SIZES = [128, 256] @@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv( soft_cap: Optional[float], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py index c18f5f468dc5a..ebaaae2321885 100644 --- a/tests/kernels/test_fp8_quant.py +++ b/tests/kernels/test_fp8_quant.py @@ -6,7 +6,7 @@ ref_dynamic_per_tensor_fp8_quant, ref_dynamic_per_token_quant) from tests.kernels.utils import opcheck -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, @@ -46,7 +46,7 @@ def opcheck_fp8_quant(output, def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 # avoid nans @@ -76,7 +76,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -95,7 +95,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() @pytest.mark.parametrize("seed", SEEDS) def test_fp8_quant_large(seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings hidden_size = 1152 # Smallest hidden_size to reproduce the error diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 1513fc196153c..893af99ba4977 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download import vllm._custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") @@ -75,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, 
@torch.inference_mode() def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") @@ -111,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 41e103e1d09f9..8db6a0d0d9fa4 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -4,7 +4,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck from vllm._custom_ops import scaled_int8_quant -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, @@ -45,7 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): @torch.inference_mode() def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -68,7 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, @@ -112,7 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -138,7 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float, azp: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 382079d472ee9..9dfa2cbe45e94 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -3,7 +3,7 @@ from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -31,7 +31,7 @@ def test_rms_norm( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) diff --git 
a/tests/kernels/test_machete_gemm.py b/tests/kernels/test_machete_gemm.py index 0fc2984a68ded..59c0a24753c3b 100644 --- a/tests/kernels/test_machete_gemm.py +++ b/tests/kernels/test_machete_gemm.py @@ -80,7 +80,7 @@ def machete_quantize_and_pack(w: torch.Tensor, w_q = w_q.t().contiguous().t() # convert to col major w_q_machete = ops.machete_prepack_B(w_q, wtype) - opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype)) + opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype.id)) return w_ref, w_q_machete, w_s, w_zp @@ -153,9 +153,10 @@ def test_machete_all_schedules(shape, atype: torch.dtype, schedule=schedule, ) - opcheck(torch.ops._C.machete_gemm, - (a, w_q_machete, wtype, w_s, maybe_convert_zeropoints( - w_zp, w_s), group_size, None, None, None, schedule)) + opcheck( + torch.ops._C.machete_gemm, + (a, w_q_machete, wtype.id, w_s, maybe_convert_zeropoints( + w_zp, w_s), group_size, None, None, None, schedule)) # Relax atol as our reduction dim becomes larger (more rounding error) # Relax atol when we have zeropoints since the way machete applies diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index 8fa55e75f6c11..ad05a97685351 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -5,9 +5,10 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops # noqa: F401 +from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def selective_state_update_ref(state, @@ -174,7 +175,8 @@ def selective_scan_opcheck_fn(u, cu_seq_len=None, cache_indices=None, has_initial_state=None, - ssm_states=None): + ssm_states=None, + pad_slot_id=PAD_SLOT_ID): """if return_last_state is True, returns (out, last_state) last_state has shape (batch, dim, dstate). """ @@ -203,7 +205,7 @@ def selective_scan_opcheck_fn(u, # a bogus error. 
opcheck(torch.ops._C.selective_scan_fwd, (u, delta, A, B, C, D, z, delta_bias, delta_softplus, cu_seq_len, - cache_indices, has_initial_state, ssm_states), + cache_indices, has_initial_state, ssm_states, pad_slot_id), test_utils=["test_schema", "test_faketensor"]) @@ -233,7 +235,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, rtolw = max(rtolw, rtol) atolw = max(atolw, atol) # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 dim = 4 dstate = 8 @@ -356,7 +358,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): if torch.version.hip: atol *= 2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) @@ -404,9 +406,12 @@ def test_selective_state_update(dim, dstate, has_z, itype): @pytest.mark.parametrize("varBC_groups", [1, 2]) @pytest.mark.parametrize("is_variable_C", [True]) @pytest.mark.parametrize("is_variable_B", [True]) -def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups, - has_D, has_z, has_delta_bias, delta_softplus, - return_last_state, seqlen, itype, wtype): +# tests correctness in case subset of the sequences are padded +@pytest.mark.parametrize("with_padding", [False, True]) +def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, + varBC_groups, has_D, has_z, has_delta_bias, + delta_softplus, return_last_state, seqlen, + itype, wtype): if varBC_groups > 1 and (not is_variable_B or not is_variable_C): pytest.skip() # This config is not applicable device = 'cuda' @@ -420,18 +425,27 @@ def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups, # set seed torch.random.manual_seed(0) seqlens = [] - nsplits = 3 + batch_size = 4 if seqlen < 10: - nsplits = 0 + batch_size = 1 + padding = 3 if with_padding else 0 + padded_batch_size = batch_size + padding + + if with_padding and seqlen < padded_batch_size: + pytest.skip() + + nsplits = padded_batch_size - 1 eos_pos = torch.randperm(seqlen - 1)[:nsplits].sort().values seqlens.append( torch.diff( torch.cat( [torch.tensor([-1]), eos_pos, torch.tensor([seqlen - 1])])).tolist()) + assert sum(seqlens[-1]) == seqlen assert all(s > 0 for s in seqlens[-1]) + total_entries = batch_size * 10 cumsum = torch.cumsum(torch.tensor(seqlens[0]), dim=0).to(torch.int32) cumsum = torch.concat([torch.tensor([0], dtype=torch.int32), cumsum], dim=0).cuda() @@ -462,22 +476,33 @@ def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups, delta_ref = delta.clone() out = None out_ref = None - prev_state_shape = (cumsum.shape[0] - 1, u.shape[0], int(A.shape[1])) + + prev_state_shape = (total_entries, u.shape[0], int(A.shape[1])) prev_state = torch.randn(prev_state_shape, device=u.device, dtype=itype, requires_grad=False) prev_state_ref = prev_state.clone() - cache_indices = torch.randperm(cumsum.shape[0] - 1, + state_indices = torch.randperm(total_entries, dtype=torch.int32, - device=u.device) + device=u.device)[:batch_size] + unused_states_bool = torch.ones(total_entries, + dtype=torch.bool, + device=device) + unused_states_bool[state_indices] = False + padded_state_indices = torch.concat([ + state_indices, + torch.as_tensor( + [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device), + ], + dim=-1) has_initial_state = torch.randint(0, 2, (cumsum.shape[0] - 1, ), dtype=torch.bool, device=u.device) out = selective_scan_fn(u, prev_state, delta, A, B, C, D, 
z, delta_bias, - delta_softplus, cumsum, cache_indices, + delta_softplus, cumsum, padded_state_indices, has_initial_state) outs_ref = [] splits = [ @@ -486,6 +511,8 @@ def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups, ] for i in range(len(seqlens[0])): u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] + if padded_state_indices[i] == PAD_SLOT_ID: + continue out_ref_s, _ = selective_scan_ref( u_s, delta_s, @@ -497,21 +524,22 @@ def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups, delta_bias=delta_bias, delta_softplus=delta_softplus, return_last_state=return_last_state, - prev_state=prev_state_ref[cache_indices[i]].unsqueeze(0) + prev_state=prev_state_ref[padded_state_indices[i]].unsqueeze(0) if has_initial_state[i] else None, - final_state_out=prev_state_ref[cache_indices[i]].unsqueeze(0)) + final_state_out=prev_state_ref[padded_state_indices[i]].unsqueeze( + 0)) outs_ref.append(out_ref_s) - out_ref = torch.cat(outs_ref, dim=-1) if len(outs_ref) > 1 else outs_ref[0] + out_ref = torch.cat(outs_ref, dim=-1)[0] - print("Output diff max", (out - out_ref[0]).max()) - print("Output diff mean", (out - out_ref[0]).mean()) + unpadded_out = out[:, :out_ref[0].shape[-1]] + print("Output diff max", (unpadded_out - out_ref).max()) + print("Output diff mean", (unpadded_out - out_ref).mean()) print("Output state diff max", (prev_state - prev_state_ref).max()) print("Output state diff mean", (prev_state - prev_state_ref).mean()) assert torch.allclose(prev_state, prev_state_ref, rtol=rtol, atol=atol) - assert torch.allclose(out, out_ref[0], rtol=rtol, atol=atol) - + assert torch.allclose(unpadded_out, out_ref, rtol=rtol, atol=atol) selective_scan_opcheck_fn(u, delta, A, B, C, D, z, delta_bias, - delta_softplus, cumsum, cache_indices, + delta_softplus, cumsum, padded_state_indices, has_initial_state, prev_state) @@ -520,31 +548,45 @@ def test_selective_scan_varlen(is_variable_B, is_variable_C, varBC_groups, @pytest.mark.parametrize("has_z", [True]) @pytest.mark.parametrize("dstate", [16, 32, 64]) @pytest.mark.parametrize("dim", [2048, 2048 + 16, 4096]) -def test_selective_state_update_with_batch_indices(dim, dstate, has_z, itype): +# tests correctness in case subset of the sequences are padded +@pytest.mark.parametrize("with_padding", [True, False]) +def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, + has_z, itype): device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) if itype == torch.bfloat16: - rtol, atol = 7e-2, 7e-2 + rtol, atol = 1e-1, 1e-1 if torch.version.hip: atol *= 2 # set seed torch.random.manual_seed(0) batch_size = 3 - + padding = 5 if with_padding else 0 + padded_batch_size = batch_size + padding total_entries = 10 * batch_size state = torch.randn(total_entries, dim, dstate, dtype=itype, device=device) state_indices = torch.randperm(total_entries)[:batch_size].to( dtype=torch.int32, device=device) - - x = torch.randn(batch_size, dim, device=device, dtype=itype) - dt = torch.randn(batch_size, dim, device=device, dtype=itype) + unused_states_bool = torch.ones(total_entries, + dtype=torch.bool, + device=device) + unused_states_bool[state_indices] = False + padded_state_indices = torch.concat([ + state_indices, + torch.as_tensor( + [PAD_SLOT_ID] * padding, dtype=torch.int32, device=device) + ], + dim=0) + x = torch.randn(padded_batch_size, dim, device=device, dtype=itype) + dt = torch.randn(padded_batch_size, dim, device=device, dtype=itype) dt_bias = torch.rand(dim, 
device=device) - 4.0 A = -torch.rand(dim, dstate, device=device) - 1.0 - B = torch.randn(batch_size, dstate, device=device) - C = torch.randn(batch_size, dstate, device=device) + B = torch.randn(padded_batch_size, dstate, device=device) + C = torch.randn(padded_batch_size, dstate, device=device) D = torch.randn(dim, device=device) z = torch.randn_like(x) if has_z else None - state_ref = state[state_indices, :].detach().clone() + state_ref = state[state_indices, :].clone() + state_before = state.clone() out = selective_state_update(state, x, dt, @@ -555,28 +597,39 @@ def test_selective_state_update_with_batch_indices(dim, dstate, has_z, itype): z=z, dt_bias=dt_bias, dt_softplus=True, - state_batch_indices=state_indices) + state_batch_indices=padded_state_indices, + pad_slot_id=PAD_SLOT_ID) out_ref = selective_state_update_ref(state_ref, - x, - dt, + x[:batch_size], + dt[:batch_size], A, - B, - C, + B[:batch_size], + C[:batch_size], D=D, - z=z, + z=z[:batch_size], dt_bias=dt_bias, dt_softplus=True) - print("Output diff max", (out - out_ref[0]).max()) - print("Output diff mean", (out - out_ref[0]).mean()) + print("Output diff max", (out[:batch_size] - out_ref).max()) + print("Output diff mean", (out[:batch_size] - out_ref).mean()) print("Output state diff max", (state[state_indices, :] - state_ref).max()) print("Output state diff mean", (state[state_indices, :] - state_ref).mean()) + # test padded entries stay the same + if with_padding: + assert torch.equal(state_before[unused_states_bool], + state[unused_states_bool]) + assert torch.equal(x[batch_size + 1:], x[batch_size + 1:]) + assert torch.equal(dt[batch_size + 1:], dt[batch_size + 1:]) + assert torch.equal(B[batch_size + 1:], B[batch_size + 1:]) + assert torch.equal(C[batch_size + 1:], C[batch_size + 1:]) + + # test "real" entries assert torch.allclose(state[state_indices, :], state_ref, rtol=rtol, atol=atol) - assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) + assert torch.allclose(out[:batch_size], out_ref, rtol=rtol, atol=atol) @pytest.mark.parametrize("itype", @@ -645,7 +698,8 @@ def test_selective_state_update_with_heads_with_batch_indices( z=z, dt_bias=dt_bias, dt_softplus=True, - state_batch_indices=state_indices) + state_batch_indices=state_indices, + pad_slot_id=PAD_SLOT_ID) out_ref = selective_state_update_ref(state_ref, x, dt, diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index a9bb72156c39e..5cfd4d6da7a86 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -225,7 +225,7 @@ def test_gptq_marlin_gemm( opcheck( torch.ops._C.gptq_marlin_gemm, (a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices, - workspace.scratch, quant_type, a_input.shape[0], b_weight.shape[1], + workspace.scratch, quant_type.id, a_input.shape[0], b_weight.shape[1], a_input.shape[1], is_k_full, False, use_fp32_reduce), test_utils=DEFAULT_OPCHECK_TEST_UTILS) @@ -254,6 +254,16 @@ def test_gptq_marlin_gemm( assert max_diff < 0.04 +# TODO: find better way to test this? 
+@torch.compile(fullgraph=True) +def marlin_24_gemm_tester(a_input, marlin_24_q_w_comp, marlin_24_meta, + marlin_24_s, scratch, quant_type, size_m, size_n, + size_k): + return ops.gptq_marlin_24_gemm(a_input, marlin_24_q_w_comp, marlin_24_meta, + marlin_24_s, scratch, quant_type, size_m, + size_n, size_k) + + @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS) @@ -282,11 +292,11 @@ def test_gptq_marlin_24_gemm(k_chunk, n_chunk, quant_type, group_size, opcheck(torch.ops._C.gptq_marlin_24_gemm, (a_input, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, - workspace_24.scratch, quant_type, a_input.shape[0], + workspace_24.scratch, quant_type.id, a_input.shape[0], b_weight.shape[1], a_input.shape[1]), test_utils=DEFAULT_OPCHECK_TEST_UTILS) - output = ops.gptq_marlin_24_gemm( + output = marlin_24_gemm_tester( a_input, marlin_24_q_w_comp, marlin_24_meta, diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index b73c45b9cd198..19c3fc1e1fe3a 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,19 +7,18 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import seed_everything @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @@ -103,6 +102,7 @@ def test_mixtral_moe(dtype: torch.dtype): @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( m: int, n: int, @@ -114,7 +114,7 @@ def test_fused_marlin_moe( num_bits: int, is_k_full: bool, ): - seed_everything(7) + current_platform.seed_everything(7) # Filter act_order if act_order: @@ -191,7 +191,7 @@ def test_fused_marlin_moe( topk, renormalize=False, ) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -240,8 +240,8 @@ def test_fused_marlin_moe( requires_grad=False) opcheck(torch.ops._moe_C.marlin_gemm_moe, (a, qweight1, sorted_token_ids, topk_weights, topk_ids, - scales1, zp, g_idx1, sort_indices1, workspace, quant_type, m, - 2 * n, k, True, e, topk, block_size_m, True, False)) + scales1, zp, g_idx1, sort_indices1, workspace, quant_type.id, + m, 2 * n, k, True, e, topk, block_size_m, True, False)) @pytest.mark.skip("This test is here for the sake of debugging, " @@ -255,6 +255,7 @@ def test_fused_marlin_moe( @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_single_marlin_moe_multiply( 
m: int, n: int, @@ -306,7 +307,7 @@ def test_single_marlin_moe_multiply( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe( + marlin_output = torch.ops.vllm.single_marlin_moe( a, qweight, scales, @@ -345,6 +346,6 @@ def test_moe_align_block_size_opcheck(): dtype=torch.int32, device=topk_ids.device) - opcheck(torch.ops._C.moe_align_block_size, + opcheck(torch.ops._moe_C.moe_align_block_size, (topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad)) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 6ca3a645c7771..5c71eef17eab4 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -6,7 +6,6 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform -from vllm.utils import seed_everything from .allclose_default import get_default_atol, get_default_rtol @@ -52,7 +51,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -106,12 +105,12 @@ def test_batched_rotary_embedding( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "type": "linear", + "rope_type": "linear", "factor": (1, ) }) rope = rope.to(dtype=dtype) @@ -168,13 +167,13 @@ def test_batched_rotary_embedding_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size scaling_factors: List[int] = [1, 2, 4] rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "type": "linear", + "rope_type": "linear", "factor": tuple(scaling_factors) }) rope = rope.to(dtype=dtype) @@ -221,10 +220,10 @@ def test_rope_module_cache(): MAX_POSITIONS = [123, 1234] BASES = [10000, 1000000] ROPE_SCALINGS = (None, { - "type": "linear", + "rope_type": "linear", "factor": (1, ) }, { - "type": "dynamic", + "rope_type": "dynamic", "factor": 1 }) settings = (HEAD_SIZES, ROTARY_DIMS, MAX_POSITIONS, BASES, IS_NEOX_STYLE, diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3181d92562399..a8a187ebaede4 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -9,7 +9,8 @@ from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, seed_everything +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -39,7 +40,7 @@ def test_contexted_kv_attention( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -234,7 +235,7 @@ def test_contexted_kv_attention_alibi( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph 
the process diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index a2d414f636e13..e7865fb2500ef 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,8 +13,8 @@ from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul -from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL, - make_tensor_with_pad) +from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, + STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. @@ -525,17 +525,22 @@ def make_backend(backend_name: str) -> AttentionBackend: if backend_name == STR_XFORMERS_ATTN_VAL: # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. from vllm.attention.backends.xformers import XFormersBackend - return XFormersBackend() + elif backend_name == STR_FLASH_ATTN_VAL: + from vllm.attention.backends.flash_attn import FlashAttentionBackend + return FlashAttentionBackend() + raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") def _make_metadata_tensors( - seq_lens: Optional[List[int]], context_lens: Optional[List[int]], - encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str] -) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]], - torch.Tensor, Optional[int]]: + seq_lens: Optional[List[int]], + context_lens: Optional[List[int]], + encoder_seq_lens: Optional[List[int]], + device: Union[torch.device, str], +) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], + torch.Tensor, torch.Tensor, Optional[int]]: ''' Build scalar & tensor values required to build attention metadata structure. @@ -553,6 +558,8 @@ def _make_metadata_tensors( * max_context_len: max(context_lens) * max_seq_len: max(seq_lens) * seq_start_loc: start idx of each sequence + * encoder_seq_lens_tensor: encoder seq_lens list, as tensor + * encoder_seq_start_loc: start idx of each encoder sequence * max_encoder_seq_len: encoder seq_lens list, as tensor ''' seq_lens_tensor = maybe_make_int_tensor(seq_lens, device) @@ -566,8 +573,26 @@ def _make_metadata_tensors( seq_start_loc = None + if seq_lens_tensor is not None: + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=seq_lens_tensor.device) + torch.cumsum(seq_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + + encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=encoder_seq_lens_tensor.device) + torch.cumsum(encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:]) + return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len, - seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len) + seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc, + max_encoder_seq_len) def make_kv_cache(num_blocks: int, @@ -575,6 +600,7 @@ def make_kv_cache(num_blocks: int, head_size: int, block_size: int, device: Union[torch.device, str], + backend: str, default_val: float = 0.0) -> torch.Tensor: ''' Create a fake KV cache. 
@@ -591,10 +617,20 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) + * for backend 'XFORMERS' + * kv_cache: 2 x num_blocks x block_size x num_heads x head_size + * for backend 'FLASH_ATTN' ''' - - kv_cache = torch.rand( - (2, num_blocks, block_size * num_heads * head_size)).to(device) + if backend == 'XFORMERS': + kv_cache = torch.rand( + (2, num_blocks, block_size * num_heads * head_size)).to(device) + elif backend == 'FLASH_ATTN': + kv_cache = torch.rand( + (2, num_blocks, block_size, num_heads, head_size)).to(device) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") if default_val is not None: kv_cache[:, :, :] = default_val return kv_cache @@ -858,8 +894,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -869,10 +906,12 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=None if seq_lens is None else max(seq_lens), max_decode_seq_len=0, context_lens_tensor=context_lens_tensor, @@ -881,6 +920,7 @@ def make_test_metadata( num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -903,8 +943,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -914,18 +955,22 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=0, max_decode_seq_len=max(seq_lens), + max_decode_query_len=1, context_lens_tensor=context_lens_tensor, block_tables=kv_mmap.block_tables, use_cuda_graph=False, num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -934,7 +979,8 @@ def make_test_metadata( def assert_actual_matches_ideal(test_params: PhaseTestParameters, - output_under_test: torch.Tensor) -> None: + output_under_test: torch.Tensor, + backend: str) -> None: ''' Assert that observed output matches the ideal output contained in the test parameters data structure. 
@@ -945,8 +991,22 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters, * output_under_test: actually observed output value ''' ideal_output = test_params.packed_qkvo.ideal_output - torch.testing.assert_close(ideal_output, - output_under_test.view_as(ideal_output)) + if backend == 'XFORMERS': + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output)) + + elif backend == 'FLASH_ATTN': + # For FlashAttention override the accuracy thresholds to non default + # values since we notice a higher difference between the ideal and + # actual output. + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output), + atol=0.01, + rtol=0.016) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") # Copied/modified from torch._refs.__init__.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 7940b589f309e..6095364ca4431 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,20 +1,16 @@ -import contextlib -import gc import tempfile from collections import OrderedDict from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest -import ray import torch import torch.nn as nn from huggingface_hub import snapshot_download import vllm from vllm.config import LoRAConfig -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel, +from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -49,17 +45,6 @@ class ContextInfo(TypedDict): }] -def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - if not current_platform.is_hpu(): - torch.cuda.empty_cache() - ray.shutdown() - - @pytest.fixture() def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. 
@@ -74,7 +59,7 @@ def should_do_global_cleanup_after_test(request) -> bool: def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield if should_do_global_cleanup_after_test: - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture @@ -90,7 +75,7 @@ def dist_init(): ) initialize_model_parallel(1, 1) yield - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture @@ -241,7 +226,7 @@ def long_context_lora_files_32k(): def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] @@ -262,14 +247,13 @@ def long_context_infos(long_context_lora_files_16k_1, @pytest.fixture def llama_2_7b_engine_extra_embeddings(): - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) get_model_old = get_model - def get_model_patched(*, model_config, device_config, **kwargs): - kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8) - return get_model_old(model_config=model_config, - device_config=device_config, - **kwargs) + def get_model_patched(**kwargs): + kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4, + max_lora_rank=8) + return get_model_old(**kwargs) if current_platform.is_hpu(): with patch("vllm.worker.hpu_model_runner.get_model", @@ -281,7 +265,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): yield engine.llm_engine del engine - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index cbc3668997817..0ba2ce3617b67 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -3,10 +3,9 @@ import pytest import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from .conftest import cleanup - MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 @@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 @@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp4 diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index f7c1d4f041c12..15ec66b0f5502 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -4,7 +4,7 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform MODEL_PATH = "google/gemma-7b" @@ -31,7 +31,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") +@pytest.mark.xfail(current_platform.is_rocm(), + reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index e3233c6b60696..eb882faf3974a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .utils import DummyLoRAManager @@ -923,7 +923,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, seq_len) -> None: dtype = torch.float16 seed = 0 - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -951,7 +951,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, lora_rope.create_lora_weights(max_loras, lora_config) linear_rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style, { - "type": "linear", + "rope_type": "linear", "factor": scaling_factors }) linear_rope = linear_rope.to(dtype=dtype) diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index ad8490353998f..e2a4f1ed0496a 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -4,10 +4,9 @@ import ray import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from .conftest import cleanup - MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -103,7 
+102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 @@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) del llm_tp4 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp4 diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 389a3ccbc17ec..eada902c891f7 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -28,9 +28,15 @@ def _create_lora_request(lora_id, long_context_infos): context_len = long_context_infos[lora_id]["context_length"] scaling_factor = context_len_to_scaling_factor[context_len] - return LoRARequest(context_len, lora_id, - long_context_infos[lora_id]["lora"], None, - 4096 * scaling_factor) + return LoRARequest( + # There are 2 LoRAs for 16K, we need to add lora_id to indicate + # they are different LoRAs. + context_len + str(lora_id), + lora_id, + long_context_infos[lora_id]["lora"], + None, + 4096 * scaling_factor, + ) def evaluate_json_response(model_response, golden_response): @@ -108,14 +114,17 @@ def lora_llm(long_context_infos): for info in long_context_infos.values() ] - llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf", - enable_lora=True, - max_num_seqs=16, - max_loras=2, - long_lora_scaling_factors=tuple(scaling_factors), - max_num_batched_tokens=4096 * 8, - tensor_parallel_size=4, - distributed_executor_backend="mp") + llm = vllm.LLM( + "meta-llama/Llama-2-13b-chat-hf", + enable_lora=True, + max_num_seqs=16, + max_loras=2, + long_lora_scaling_factors=tuple(scaling_factors), + max_num_batched_tokens=4096 * 8, + tensor_parallel_size=4, + # FIXME enable async output processor + disable_async_output_proc=True, + distributed_executor_backend="mp") yield llm del llm @@ -129,13 +138,7 @@ def test_rotary_emb_replaced(dist_init): enable_lora=True) engine_config = engine_args.create_engine_config() model_runner = ModelRunner( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, + vllm_config=engine_config, is_driver_worker=True, ) model_runner.load_model() diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 81b8188e638c9..2c45ce5141f7d 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -1,8 +1,11 @@ from typing import List +import pytest + import vllm from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -61,8 +67,8 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, + gpu_memory_utilization=0.97 # This model is pretty big for CI gpus ) - output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert 
EXPECTED_OUTPUT[i].startswith(output1[i]) diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py index 41c37a4813c68..e756544d96e98 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_sizes.py @@ -1,5 +1,5 @@ """ -This script is mainly used to tests various hidden_sizes. We have collected the +This script is mainly used to tests various hidden_sizes. We have collected the hidden_sizes included in the LoRA models currently supported by vLLM. It tests whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. @@ -15,8 +15,8 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.platforms import current_platform from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -146,7 +146,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -239,7 +239,7 @@ def test_punica_bgmv( from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -327,7 +327,7 @@ def test_punica_expand_nslices( from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index 185da6399a06a..dc0edeb10ef46 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -1,6 +1,6 @@ """ -This script is mainly used to test whether trtion kernels can run normally -under different conditions, including various batches, numbers of LoRA , and +This script is mainly used to test whether trtion kernels can run normally +under different conditions, including various batches, numbers of LoRA , and maximum ranks. 
""" from unittest.mock import patch @@ -14,8 +14,8 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.platforms import current_platform from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -61,7 +61,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -154,7 +154,7 @@ def test_punica_bgmv( from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -242,7 +242,7 @@ def test_punica_expand_nslices( from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 5636c96435024..5432fa4ad0d3a 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -6,10 +6,9 @@ import pytest import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from vllm.utils import is_hip - -from .conftest import cleanup +from vllm.platforms import current_platform @dataclass @@ -20,7 +19,7 @@ class ModelWithQuantization: MODELS: List[ModelWithQuantization] #AWQ quantization is currently not supported in ROCm. -if is_hip(): +if current_platform.is_rocm(): MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", @@ -160,7 +159,7 @@ def expect_match(output, expected_output): print("removing lora") del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.mark.parametrize("model", MODELS) @@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM( model=model.model_path, @@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..9d814f657ac43 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -4,7 +4,8 @@ from unittest.mock import patch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig) + ModelConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker @@ -12,10 +13,11 @@ @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): - worker = Worker( + vllm_config = VllmConfig( model_config=ModelConfig( "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-2-7b-hf", + task="auto", + tokenizer="meta-llama/Llama-2-7b-hf", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -27,16 +29,19 @@ def test_worker_apply_lora(sql_lora_files): load_format="dummy", ), parallel_config=ParallelConfig(1, 1, False), - scheduler_config=SchedulerConfig(32, 
32, 32), + scheduler_config=SchedulerConfig("generate", 32, 32, 32), device_config=DeviceConfig("cuda"), cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), - local_rank=0, - rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, max_loras=32), + ) + worker = Worker( + vllm_config=vllm_config, + local_rank=0, + rank=0, distributed_init_method=f"file://{tempfile.mkstemp()[1]}", ) worker.init_device() diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index f1003221ab518..4a824c7acef21 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -6,13 +6,12 @@ from prometheus_client import REGISTRY from vllm import EngineArgs, LLMEngine +from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger from vllm.sampling_params import SamplingParams -from ..conftest import cleanup - MODELS = [ "facebook/opt-125m", ] @@ -85,6 +84,45 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [128, 129]) +@pytest.mark.parametrize("disable_async_output_proc", [True, False]) +def test_metric_counter_generation_tokens_multi_step( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + disable_async_output_proc: bool, +) -> None: + num_scheduler_steps = 8 + with vllm_runner( + model, + disable_log_stats=False, + gpu_memory_utilization=0.4, + num_scheduler_steps=num_scheduler_steps, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. + vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + + # The multi-step scheduling will continue to execute forward even when + # encountering EOS, leading to slightly imprecise metrics. + assert abs(vllm_generation_count - metric_count) <\ + len(example_prompts) * num_scheduler_steps, \ + (f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( @@ -185,13 +223,14 @@ def test_metric_spec_decode( ) -> None: k = 5 - with vllm_runner(model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_model=model, - num_speculative_tokens=k, - use_v2_block_manager=True) as vllm_model: + with vllm_runner( + model, + dtype=dtype, + disable_log_stats=False, + gpu_memory_utilization=0.4, + speculative_model=model, + num_speculative_tokens=k, + ) as vllm_model: # Force log interval to be 0 to catch all metrics. 
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] @@ -242,7 +281,6 @@ def test_metric_spec_decode_interval( gpu_memory_utilization=0.4, speculative_model=model, num_speculative_tokens=k, - use_v2_block_manager=True, enforce_eager=True) engine = LLMEngine.from_engine_args(engine_args) @@ -307,7 +345,7 @@ def test_metric_spec_decode_interval( finally: del engine - cleanup() + cleanup_dist_env_and_memory() def assert_metrics(engine: LLMEngine, disable_log_stats: bool, @@ -327,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, "vllm:request_prompt_tokens", "vllm:request_generation_tokens", "vllm:request_params_n", + "vllm:request_params_max_tokens", ] for metric_name in request_histogram_metrics: metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py new file mode 100644 index 0000000000000..af267f804ffa7 --- /dev/null +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -0,0 +1,92 @@ +import os +from typing import List + +import pytest + +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.activation import (GeluAndMul, + ReLUSquaredActivation, + SiluAndMul) +from vllm.model_executor.layers.layernorm import RMSNorm + + +# Registered subclass for test +@CustomOp.register("relu3") +class Relu3(ReLUSquaredActivation): + pass + + +@pytest.mark.parametrize( + "env, torch_level, ops_enabled, default_on", + [ + # Default values based on compile level + ("", 0, [True] * 4, True), + ("", 1, [True] * 4, True), + ("", 2, [True] * 4, True), # All by default + ("", 3, [False] * 4, False), + ("", 4, [False] * 4, False), # None by default + # Explicitly enabling/disabling + # + # Default: all + # + # All but SiluAndMul + ("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True), + # Only ReLU3 + ("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False), + # All but SiluAndMul + ("all,-silu_and_mul", 1, [1, 0, 1, 1], True), + # All but ReLU3 (even if ReLU2 is on) + ("-relu3,relu2", 1, [1, 1, 1, 0], True), + # GeluAndMul and SiluAndMul + ("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False), + # All but RMSNorm + ("-rms_norm", 2, [0, 1, 1, 1], True), + # + # Default: none + # + # Only ReLU3 + ("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False), + # All but RMSNorm + ("all,-rms_norm", 4, [0, 1, 1, 1], True), + ]) +def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], + default_on: bool): + os.environ["VLLM_CUSTOM_OPS"] = env + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level) + + # Reset default_on (computed once): + CustomOp.default_on.cache_clear() + + assert CustomOp.default_on() == default_on + + ops_enabled = [bool(x) for x in ops_enabled] + + assert RMSNorm(1024).enabled() == ops_enabled[0] + assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] + + assert SiluAndMul().enabled() == ops_enabled[1] + assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] + + assert GeluAndMul().enabled() == ops_enabled[2] + assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] + + # If registered, subclasses should follow their own name + assert Relu3().enabled() == ops_enabled[3] + assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] + + # Unregistered subclass + class SiluAndMul2(SiluAndMul): + pass + + # Subclasses should not require registration + assert SiluAndMul2().enabled() == SiluAndMul().enabled() + + +@pytest.mark.parametrize( + 
"env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"]) +def test_enabled_ops_invalid(env: str): + os.environ["VLLM_CUSTOM_OPS"] = env + CustomOp.default_on.cache_clear() + + with pytest.raises(AssertionError): + RMSNorm(1024).enabled() diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index bfffd34d1142c..d14e88b4e5b26 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -2,8 +2,10 @@ import numpy as np import pytest +import pytest_asyncio from transformers import AutoModel, AutoTokenizer, BatchEncoding +from tests.utils import RemoteOpenAIServer from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE @@ -17,6 +19,13 @@ VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" HF_PLACEHOLDER = "<|audio|>" +CHUNKED_PREFILL_KWARGS = { + "enable_chunked_prefill": True, + "max_num_seqs": 2, + # Use a very small limit to exercise chunked prefill. + "max_num_batched_tokens": 16 +} + @pytest.fixture(scope="session") def audio_assets(): @@ -30,6 +39,26 @@ def audio(request): return AudioAsset(request.param) +@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS)) +def server(request, audio_assets): + args = [ + "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", + f"--limit-mm-per-prompt=audio={len(audio_assets)}" + ] + [ + f"--{key.replace('_','-')}={value}" + for key, value in request.param.items() + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + def _get_prompt(audio_count, question, placeholder): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) placeholder = f"{placeholder}\n" * audio_count @@ -68,8 +97,7 @@ def run_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): """Inference result should be the same between hf and vllm.""" torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -79,11 +107,8 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). 
- with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True, + **kwargs) as vllm_model: vllm_outputs_per_audio = [ vllm_model.generate_greedy_logprobs([vllm_prompt], max_tokens, @@ -92,7 +117,7 @@ def run_test( for vllm_prompt, _, audio in prompts_and_audios ] - def process(hf_inputs: BatchEncoding): + def process(hf_inputs: BatchEncoding, **kwargs): hf_inputs["audio_values"] = hf_inputs["audio_values"] \ .to(torch_dtype) # type: ignore return hf_inputs @@ -135,18 +160,16 @@ def run_multi_audio_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): with vllm_runner(model, dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, enforce_eager=True, limit_mm_per_prompt={ "audio": max((len(audio) for _, audio in prompts_and_audios)) - }) as vllm_model: + }, + **kwargs) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( [prompt for prompt, _ in prompts_and_audios], max_tokens, @@ -158,11 +181,13 @@ def run_multi_audio_test( assert all(tokens for tokens, *_ in vllm_outputs) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, - num_logprobs: int) -> None: + num_logprobs: int, vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) @@ -174,16 +199,18 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + max_tokens: int, num_logprobs: int, + vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(len(audio_assets), "Describe each of the audios above.", @@ -196,5 +223,37 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) + + +@pytest.mark.asyncio +async def test_online_inference(client, audio_assets): + """Exercises online inference with/without chunked prefill enabled.""" + + messages = [{ + "role": + "user", + "content": [ + *[{ + "type": "audio_url", + "audio_url": { + "url": audio.url + } + } for audio in audio_assets], + { + "type": + "text", + "text": + f"What's happening in these {len(audio_assets)} audio clips?" 
+ }, + ], + }] + + chat_completion = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_tokens=10) + + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index fcc158639748d..fcfc159e4f5a0 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -8,7 +8,7 @@ from vllm.platforms import current_platform -from ...utils import check_outputs_equal +from ...utils import check_logprobs_close, check_outputs_equal MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -21,10 +21,14 @@ ] if not current_platform.is_cpu(): - # MiniCPM requires fused_moe which is not supported by CPU - MODELS.append("openbmb/MiniCPM3-4B") + MODELS += [ + # fused_moe which not supported on CPU + "openbmb/MiniCPM3-4B", + # Head size isn't supported on CPU + "h2oai/h2o-danube3-4b-base", + ] -#TODO: remove this after CPU float16 support ready +# TODO: remove this after CPU float16 support ready target_dtype = "float" if current_platform.is_cpu() else "half" @@ -39,18 +43,40 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if model == "openbmb/MiniCPM3-4B": + # the output becomes slightly different when upgrading to + # pytorch 2.5 . Changing to logprobs checks instead of exact + # output checks. + NUM_LOG_PROBS = 8 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/decoder_only/language/test_danube3_4b.py b/tests/models/decoder_only/language/test_danube3_4b.py deleted file mode 100644 index bdd498edc293d..0000000000000 --- a/tests/models/decoder_only/language/test_danube3_4b.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This tests danube3 separately because its head size isn't supported on CPU yet. - -Run `pytest tests/models/test_danube3_4b.py`. 
-""" -import pytest - -from ...utils import check_outputs_equal - -MODELS = ["h2oai/h2o-danube3-4b-base"] - -target_dtype = "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 408d12cd5ff5c..384ec77e5455a 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,5 +1,6 @@ import pytest +from tests.utils import multi_gpu_test from vllm.sampling_params import SamplingParams from vllm.worker.model_runner import _get_graph_batch_size @@ -270,6 +271,30 @@ def test_state_cleanup( "could be related to finished_requests_ids") +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_jamba_distributed_produces_identical_generation( + vllm_runner, model: str, dtype: str, max_tokens: int, + example_prompts) -> None: + + with vllm_runner(model, dtype=dtype, tensor_parallel_size=2) as vllm_model: + vllm_outputs_tp_2 = vllm_model.generate_greedy(example_prompts, + max_tokens) + + with vllm_runner(model, dtype=dtype, tensor_parallel_size=1) as vllm_model: + vllm_outputs_tp_1 = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_tp_1, + outputs_1_lst=vllm_outputs_tp_2, + name_0="vllm_tp_1", + name_1="vllm_tp_2", + ) + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) def test_model_print( diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index c27bf6a60a4f4..2dc231c595ffa 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -10,7 +10,7 @@ from ...utils import check_outputs_equal -MODELS = ["state-spaces/mamba-130m-hf"] +MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"] # Use lower-level interfaces to create this greedy generator, as mamba will diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index 89afbcf1c03ac..c997359a2781e 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -5,7 +5,7 @@ import pytest import torch -from vllm.utils import is_cpu +from vllm.platforms import current_platform from ....utils import large_gpu_test from ...utils import 
check_logprobs_close @@ -70,7 +70,7 @@ def test_phimoe_routing_function(): assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) -@pytest.mark.skipif(condition=is_cpu(), +@pytest.mark.skipif(condition=current_platform.is_cpu(), reason="This test takes a lot time to run on CPU, " "and vllm CI's disk space is not enough for this model.") @large_gpu_test(min_gb=80) diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py new file mode 100644 index 0000000000000..128fe65afbb84 --- /dev/null +++ b/tests/models/decoder_only/language/test_qwen.py @@ -0,0 +1,34 @@ +"""Ensure that a text-only Qwen model can be run without throwing an error. +We explicitly test this because Qwen is implemented as a multimodal and +supports a visual encoder for models like Qwen-VL. +""" +from typing import List, Type + +import pytest + +from ....conftest import VllmRunner + +models = [ + "Qwen/Qwen-7B-Chat" # Has no visual encoder +] + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_text_only_qwen_model_can_be_loaded_and_run( + vllm_runner: Type[VllmRunner], + example_prompts: List[str], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, +): + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_model.generate_greedy_logprobs( + example_prompts, + max_tokens, + num_logprobs=num_logprobs, + ) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py new file mode 100644 index 0000000000000..c2d3fda6994f6 --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py @@ -0,0 +1,68 @@ +import pytest + +from vllm.inputs import InputContext + +from ....utils import build_model_context + + +@pytest.fixture() +def get_max_llava_next_image_tokens(): + from vllm.model_executor.models.llava_next import ( + get_max_llava_next_image_tokens) + return get_max_llava_next_image_tokens + + +@pytest.fixture() +def dummy_data_for_llava_next(): + from vllm.model_executor.models.llava_next import dummy_data_for_llava_next + return dummy_data_for_llava_next + + +@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ + ([[336, 336]], 1176), + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), +]) +def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, + get_max_llava_next_image_tokens): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + # and calculate the resulting max tokens + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + + actual_max_tokens = get_max_llava_next_image_tokens( + InputContext(ctx.model_config)) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize( + "gridpoints,expected_size", + [ + # One point; it has to be the largest + ([[336, 336]], (336, 336)), + # Default for most llava next models; the 2x2 tile is the largest + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], + (672, 672)), + # If two rectangular gridpoints are the same, the more 
vertical + # one has the higher feature count due to newline features + ([[336, 672], [672, 336]], (672, 336)) + ]) +def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, + gridpoints, expected_size): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + seq_len = 5000 # bigger than the max feature size for any image + + seq_data, mm_data = dummy_data_for_llava_next( + ctx, + seq_len=seq_len, + mm_counts={"image": 1}, + ) + + # The dummy data dims should match the gridpoint with the biggest feat size + assert mm_data["image"].height == expected_size[0] + assert mm_data["image"].width == expected_size[1] + assert len(seq_data.get_token_ids()) >= seq_len diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py new file mode 100644 index 0000000000000..d6a7b34fdde9f --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -0,0 +1,181 @@ +"""Tests for phi3v's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest +import torch +from transformers import AutoImageProcessor, AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["microsoft/Phi-3.5-vision-instruct"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_phi3v(): + from vllm.model_executor.models.phi3v import input_processor_for_phi3v + return input_processor_for_phi3v + + +@pytest.fixture() +def dummy_data_for_phi3v(): + from vllm.model_executor.models.phi3v import dummy_data_for_phi3v + return dummy_data_for_phi3v + + +@pytest.fixture() +def get_max_phi3v_image_tokens(): + from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens + return get_max_phi3v_image_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops", [4, 16, None]) +def test_input_mapper_override(model: str, image_assets: _ImageAssets, + num_crops: Optional[int]): + """Ensure that the [default] input mapper handles num_crops properly.""" + # We pass the processor kwargs here since for this model, we fall back to + # the default mapper; this will fall back to the HF mapper and forward + # mm_processor_kwargs to it. + mm_processor_kwargs = { + "num_crops": num_crops + } if num_crops is not None else {} + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + hf_processor = AutoImageProcessor.from_pretrained(model, + trust_remote_code=True, + **mm_processor_kwargs) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + + assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"]) + assert torch.all( + hf_result["num_img_tokens"] == vllm_result["num_img_tokens"]) + + # For pixel values, the second axis should be the num_crops + 1 + # for the rescaled original image. 
The default value in VLLM falls + # back to the HF config, which is why we compare to the processor num_crops + assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) + assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1 + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,expected_max_tokens", [ + (4, 781), + (16, 2653), +]) +def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, + num_crops: int, expected_max_tokens: int): + """Ensure get_max_phi3v_image_tokens handles num_crops properly.""" + # NOTE: mm_processor_kwargs on the context in this test is unused, since + # this is testing the mapper directly. In practice, the processor kwargs + # are wrapped in a closure when calling the max tokens func. We explicitly + # do NOT use the mm_processor_kwargs in the model context here to ensure + # that the max image tokens implementation is referencing a mix of the + # kwargs to the function and the original mm_processor_kwargs in case + # values are somehow updated and end up in a bad state. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + actual_max_tokens = get_max_phi3v_image_tokens( + InputContext(ctx.model_config), + num_crops=num_crops, + ) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [ + (4, 781, 1), + (4, 781, 2), + (16, 2653, 1), + (16, 2653, 2), +]) +def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int, + toks_per_img: int, num_imgs: int): + """Ensure dummy_data_for_phi3v handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + sequence_data, _, = dummy_data_for_phi3v( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + num_crops=num_crops, + ) + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID) + assert img_tok_count == toks_per_img * num_imgs + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [ + (4, 757, 1), + (4, 757, 2), + (16, 1921, 1), + (16, 1921, 2), +]) +def test_input_processor_override(input_processor_for_phi3v, + image_assets: _ImageAssets, model: str, + num_crops: int, expected_toks_per_img: int, + num_imgs: int): + """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. 
+ ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + ) + tokenizer = AutoTokenizer.from_pretrained(model) + # Build the image str / prompt based on the number of images we pass + img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) + prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" + images = [image_assets[0].pil_image] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_phi3v(ctx, + inputs, + num_crops=num_crops) + + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py new file mode 100644 index 0000000000000..a01651b171d60 --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -0,0 +1,144 @@ +"""Tests for Qwen's multimodal preprocessing kwargs.""" +from typing import Dict, List, Union + +import pytest +import torch +from PIL.Image import Image + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.utils import cached_get_tokenizer + +from .....conftest import IMAGE_ASSETS +from ....utils import build_model_context + +### Multimodal preprocessing tests +SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image +# These values are specific to Qwen-VL/Chat; we can get these from the model +# config also, but they are hardcoded here to keep the parameterize/fixtures +# easy to read. 
+IMG_START_ID = 151857 +IMG_END_ID = 151858 +IMG_PAD_ID = 151859 +TOKS_PER_IMG = 256 +VIS_ENC_DIM = 4096 +IMG_SIZE = 448 + + +@pytest.fixture() +def input_mapper_for_qwen(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.qwen import input_mapper_for_qwen + return input_mapper_for_qwen + + +@pytest.fixture() +def input_processor_for_qwen(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.qwen import input_processor_for_qwen + return input_processor_for_qwen + + +@pytest.fixture() +def qwen_vl_context() -> InputContext: + """Get an InputContext for Qwen-VL.""" + return build_model_context(model_name="Qwen/Qwen-VL", + trust_remote_code=True) + + +# Happy path tests for single/multi-image scenarios for the multimodal +# input processor and mapper, respectively +@pytest.mark.parametrize("num_images", [1, 2]) +def test_input_processor_valid_mm_data(input_processor_for_qwen, + qwen_vl_context: InputContext, + num_images: int): + """Happy cases for image inputs to Qwen's multimodal input processor.""" + prompt = "".join( + [f"Picture {num}: \n" for num in range(1, num_images + 1)]) + inputs = token_inputs( + prompt=prompt, + # When processing multimodal data for a multimodal model, the qwen + # input processor will overwrite the provided prompt_token_ids with + # the image prompts + prompt_token_ids=[], + multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, + ) + proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) + assert isinstance(proc_inputs, dict) + + # Each image should have one start / stop and a fixed context of 256 + proc_tokens = proc_inputs["prompt_token_ids"] + assert proc_tokens.count(IMG_START_ID) == num_images + assert proc_tokens.count(IMG_END_ID) == num_images + assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG + + +@pytest.mark.parametrize( + "img_data,expected_shape", + [ + # single / multi-image + (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)), + (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)), + # single / multi-image embeddings + (torch.rand( + (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), + (torch.rand( + (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), + (torch.rand( + (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)), + ]) +def test_input_mapper_valid_mm_data(input_mapper_for_qwen, + qwen_vl_context: InputContext, + img_data: Union[torch.Tensor, List[Image], + Image], + expected_shape: List[int]): + """Happy cases for image inputs to Qwen's multimodal input mapper.""" + mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) + # Ensure that we get the appropriately shaped pixel_values + # for images and image embeddings, respectively. 
+ assert isinstance(mapped_img_data, MultiModalInputs) + assert "pixel_values" in mapped_img_data + assert mapped_img_data["pixel_values"].shape == expected_shape + + +# Sad path tests for the multimodal input processor and mapper, respectively +@pytest.mark.parametrize("mm_data", [ + { + "image": torch.rand((5)) + }, + { + "image": torch.rand((5, 5, 5, 5, 5)) + }, +]) +def test_input_processor_invalid_mm_data(input_processor_for_qwen, + qwen_vl_context: InputContext, + mm_data: Dict[str, torch.Tensor]): + """Test sad cases validated in Qwen's multimodal input processor.""" + tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer, + trust_remote_code=True) + prompt = "Picture 1: \n" + prompt_token_ids = tokenizer.encode(prompt) + inputs = token_inputs(prompt=prompt, + prompt_token_ids=prompt_token_ids, + multi_modal_data=mm_data) + # Should fail since we have too many or too few dimensions for embeddings + with pytest.raises(ValueError): + input_processor_for_qwen(qwen_vl_context, inputs) + + +@pytest.mark.parametrize( + "img_data", + [ + # Wrong context length + torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)), + # Wrong visual encoder output size + torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)), + ]) +def test_input_mapper_invalid_mm_data( + input_mapper_for_qwen, + qwen_vl_context: InputContext, + img_data: Union[torch.Tensor, List[Image], Image], +): + """Sad cases validated in Qwen VL's multimodal input mapper.""" + with pytest.raises(ValueError): + input_mapper_for_qwen(qwen_vl_context, img_data) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py new file mode 100644 index 0000000000000..c23fbedf0c6ae --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -0,0 +1,160 @@ +from typing import Any, Dict, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +MODEL = "Qwen/Qwen2-VL-2B-Instruct" +MIN_PIXELS = "min_pixels" +MAX_PIXELS = "max_pixels" + + +# Fixtures lazy import to avoid initializing CUDA during test collection +# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple +# input mappers. 
+@pytest.fixture() +def image_input_mapper_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + image_input_mapper_for_qwen2_vl) + return image_input_mapper_for_qwen2_vl + + +@pytest.fixture() +def input_processor_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + input_processor_for_qwen2_vl) + return input_processor_for_qwen2_vl + + +@pytest.fixture() +def qwen2_vl_context() -> InputContext: + return build_model_context(model_name=MODEL) + + +@pytest.fixture() +def get_max_qwen2_vl_image_tokens(): + from vllm.model_executor.models.qwen2_vl import ( + get_max_qwen2_vl_image_tokens) + return get_max_qwen2_vl_image_tokens + + +@pytest.fixture() +def dummy_data_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl + return dummy_data_for_qwen2_vl + + +@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ + ({}, 1225), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324), +]) +def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + expected_max_tokens: int): + """Ensure that the max token calc handles min/max pixels properly.""" + actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context, + **mm_processor_kwargs) + assert actual_max_tokens == expected_max_tokens + + +@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [ + [{}, 1225, (980, 980)], + [{ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324, (504, 504)], +]) +def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + token_count: int, img_size: Tuple[int, int]): + """Ensure that the dummy data handles min/max pixels properly.""" + seq_len = 3000 + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + # NOTE: video value is required, but isn't actually used + # when making the dummy data except for error handling currently + seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, { + "image": 1, + "video": 0 + }, **mm_processor_kwargs) + + # Ensure we have the right number of placeholders for min/max pixel values + assert seq_data.get_token_ids().count(image_token_id) == token_count + + # Ensure the images were resized correctly + image = mm_data["image"] + assert isinstance(image, Image) + assert image.size == img_size + + +@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [ + ({}, 1426), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 330), +]) +def test_input_processor(input_processor_for_qwen2_vl, + qwen2_vl_context: InputContext, + image_assets: _ImageAssets, num_placeholders: int, + mm_processor_kwargs: Dict[str, Any]): + """Ensure that the image processor handles min/max pixels properly.""" + tokenizer = AutoTokenizer.from_pretrained(MODEL) + prompt = "<|vision_start|><|image_pad|><|vision_end|>" + + image = image_assets[0].pil_image + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": [image]}) + + processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs, + **mm_processor_kwargs) + assert processed_inputs["prompt_token_ids"].count( + image_token_id) == num_placeholders + assert len(processed_inputs["multi_modal_data"]["image"]) == 1 + + +@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [ + ({}, [5704, 1176]), 
+ ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, [1320, 1176]), +]) +def test_image_mapper_override(qwen2_vl_context: InputContext, + image_assets: _ImageAssets, + mm_processor_kwargs: Dict[str, Any], + pixels_shape: Tuple[int, int]): + """Ensure that the image mapper handles min/max pixels properly.""" + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config) + + image = image_assets[0].pil_image + + mapped_output = mm_registry.map_input( + qwen2_vl_context.model_config, + {"image": image}, + mm_processor_kwargs=mm_processor_kwargs, + ) + + # Dimension 0 of pixel values should match the product of image_grid_thw + actual_pixels_shape = mapped_output["pixel_values"].shape + assert list(actual_pixels_shape) == pixels_shape + assert actual_pixels_shape[0] == torch.prod( + mapped_output["image_grid_thw"]) diff --git a/tests/models/decoder_only/vision_language/test_blip2.py b/tests/models/decoder_only/vision_language/test_blip2.py deleted file mode 100644 index e1e32b96d89ac..0000000000000 --- a/tests/models/decoder_only/vision_language/test_blip2.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import List, Optional, Tuple - -import pytest -from transformers import AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "Question: What's the content of the image? Answer:", - "cherry_blossom": - "Question: What is the season? Answer:", -}) - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - _, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "\n" - - tokenizer = AutoTokenizer.from_pretrained(model) - hf_output_ids = tokenizer.encode(hf_output_str) - assert hf_output_ids[0] == tokenizer.bos_token_id - hf_output_ids = hf_output_ids[1:] - - return hf_output_ids, hf_output_str, out_logprobs - - -@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. 
- """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py deleted file mode 100644 index d01490d74bd4d..0000000000000 --- a/tests/models/decoder_only/vision_language/test_broadcast.py +++ /dev/null @@ -1,42 +0,0 @@ -import pytest - -from ....utils import multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", [ - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "facebook/chameleon-7b", -]) -def test_models(hf_runner, vllm_runner, image_assets, - distributed_executor_backend, model) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("llava-hf/llava-1.5"): - from .test_llava import models, run_test - elif model.startswith("llava-hf/llava-v1.6"): - from .test_llava_next import models, run_test # type: ignore[no-redef] - elif model.startswith("facebook/chameleon"): - from .test_chameleon import models, run_test # type: ignore[no-redef] - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - # So that LLaVA-NeXT processor may return nested list - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/tests/models/decoder_only/vision_language/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py deleted file mode 100644 index 8334451970a4f..0000000000000 --- a/tests/models/decoder_only/vision_language/test_chameleon.py +++ /dev/null @@ -1,125 +0,0 @@ -from typing import List, Optional, Type - -import pytest -from transformers import AutoModelForVision2Seq, BatchEncoding - -from vllm.multimodal.utils import rescale_image_size -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_outputs_equal - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = ["facebook/chameleon-7b"] - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: 
Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # HF Logprobs include image tokens, unlike vLLM, so we don't directly - # compare them - check_outputs_equal( - outputs_0_lst=[outputs[:2] for outputs in hf_outputs], - outputs_1_lst=[outputs[:2] for outputs in vllm_outputs], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [8]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py deleted file mode 100644 index 7827ecb19a744..0000000000000 --- a/tests/models/decoder_only/vision_language/test_fuyu.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?\n", - "cherry_blossom": - 
"What is the season?\n", -}) - -models = ["adept/fuyu-8b"] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]]): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str.lstrip() + "|ENDOFTEXT|" - - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype) as hf_model: - eos_token_id = hf_model.processor.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - eos_token_id=eos_token_id) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -target_dtype = "half" -if is_cpu(): - target_dtype = "bfloat16" - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [0.25], - # Single-scale, batched - [0.25, 0.25, 0.25], - # Multi-scale - [0.25, 0.2, 0.15], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_glm4.py 
b/tests/models/decoder_only/vision_language/test_glm4.py deleted file mode 100644 index 47922a57f680b..0000000000000 --- a/tests/models/decoder_only/vision_language/test_glm4.py +++ /dev/null @@ -1,133 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.transformers_utils.tokenizer import patch_padding_side - -from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner -from ....utils import large_gpu_test -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?", - "cherry_blossom": - "What is the season?", -}) - -models = ["THUDM/glm-4v-9b"] -target_dtype = "bfloat16" - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - stop_token_ids = [151329, 151336, 151338] - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - stop_token_ids=stop_token_ids) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - hf_processor = hf_model.processor - patch_padding_side(hf_processor) - - def processor(*args, text="", images=None, **kwargs): - if images is None: - return hf_processor(*args, **kwargs) - - return hf_processor.apply_chat_template( - [{ - "role": "user", - "image": images, - "content": text - }], - add_generation_prompt=True, - tokenize=True, - return_dict=True, - **kwargs, - ) - - hf_model.processor = processor - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.transformer.output_layer - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - ) for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - 
) diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py new file mode 100644 index 0000000000000..ad9aa3104750b --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -0,0 +1,130 @@ +from typing import Optional, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoConfig + +# Import the functions to test +from vllm.model_executor.models.h2ovl import (calculate_num_blocks, + image_to_pixel_values_wrapper) +from vllm.multimodal.utils import rescale_image_size + +models = [ + "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names + "h2oai/h2ovl-mississippi-2b", +] +target_dtype = "bfloat16" + + +def run_preprocessing_test( + image: Image, + config, + max_dynamic_patch: Optional[int] = None, +) -> Tuple[torch.Tensor, int]: + """Test the image preprocessing and calculate expected blocks.""" + + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + + width, height = image.size + use_MSAC = config.use_msac + + # Create the mapper function with the provided configuration + mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC) + pixel_values = mapper(image) + + # Calculate the expected number of blocks + if use_MSAC: + # First pass + blocks1, _, _, aspect_ratio = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + prior_aspect_ratio=None, + ) + + # Second pass + blocks2, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=aspect_ratio, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus overlapping + total_blocks = blocks1 + blocks2 - 1 + + expected_blocks = total_blocks + + else: + blocks, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=None, + ) + expected_blocks = blocks + + if config.use_thumbnail and expected_blocks > 1: + expected_blocks += 1 + + return pixel_values, expected_blocks + + +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8]) +def test_image_preprocessing(image_assets, model_name, size_factors, + max_dynamic_patch): + """Test image preprocessing pipeline with different configurations.""" + # Load the configuration from the model + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + + for asset in image_assets: + image = asset.pil_image + for factor in size_factors: + scaled_image = rescale_image_size(image, factor) + + # Test preprocessing and get expected number of blocks + pixel_values, expected_blocks = run_preprocessing_test( + scaled_image, config, max_dynamic_patch) + + # Verify output shapes and properties + actual_blocks = pixel_values.shape[0] + assert actual_blocks == expected_blocks, ( + f"Expected {expected_blocks} blocks, got {actual_blocks}") + + # Check image 
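# A minimal standalone sketch of the block-count bookkeeping asserted above,
# worked through with hypothetical per-pass counts. The real values come from
# calculate_num_blocks(); the numbers 6 and 4 below are made up.
def expected_blocks_with_msac(blocks1: int, blocks2: int,
                              use_thumbnail: bool) -> int:
    # A thumbnail tile is appended to a pass only when that pass produced
    # more than one block.
    if use_thumbnail:
        blocks1 += 1 if blocks1 > 1 else 0
        blocks2 += 1 if blocks2 > 1 else 0
    # The two passes share one overlapping block, hence the -1.
    return blocks1 + blocks2 - 1

# e.g. 6 blocks on the first pass and 4 on the second, with thumbnails:
# (6 + 1) + (4 + 1) - 1 == 11
assert expected_blocks_with_msac(6, 4, use_thumbnail=True) == 11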
dimensions + expected_size = ( + 3, # Number of channels (C, H, W) + config.vision_config.image_size, + config.vision_config.image_size, + ) + for img in pixel_values: + assert img.shape == expected_size, ( + f"Expected image size {expected_size}, got {img.shape}") diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py index 3c3b95b38baac..98f313eb9b9af 100644 --- a/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -6,7 +6,7 @@ from huggingface_hub import snapshot_download from transformers import AutoConfig, AutoModel, CLIPImageProcessor -from ....conftest import _ImageAssets, cleanup +from ....conftest import _ImageAssets # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner @@ -45,12 +45,13 @@ def run_intern_vit_test( for pixel_value in pixel_values ] + from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.models.intern_vit import InternVisionModel vllm_model = InternVisionModel(config) vllm_model.load_weights(hf_model.state_dict().items()) del hf_model - cleanup() + cleanup_dist_env_and_memory() vllm_model = vllm_model.to("cuda", dtype) vllm_outputs_per_image = [ @@ -58,7 +59,7 @@ def run_intern_vit_test( for pixel_value in pixel_values ] del vllm_model - cleanup() + cleanup_dist_env_and_memory() cos_similar = nn.CosineSimilarity(dim=-1) for vllm_output, hf_output in zip(vllm_outputs_per_image, diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py index 49cab75d8ea53..2fd1ac4bb08f7 100644 --- a/tests/models/decoder_only/vision_language/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_internvl.py @@ -1,16 +1,11 @@ -import types -from typing import List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Type import pytest import torch -from PIL.Image import Image -from transformers import AutoConfig from vllm.multimodal.utils import rescale_image_size -from vllm.utils import is_cpu -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) +from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets from ...utils import check_logprobs_close HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ @@ -19,160 +14,6 @@ "cherry_blossom": "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 }) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 - -models = [ - "OpenGVLab/InternVL2-1B", - "OpenGVLab/InternVL2-2B", - # Broken due to outdated implementation of Phi-3 - # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 - # "OpenGVLab/InternVL2-4B", -] - - -# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py -def generate( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs, -) -> torch.LongTensor: - """Generate method for InternVL2 model without fixed use_cache.""" - assert self.img_context_token_id is not None - vit_embeds = self.extract_feature(pixel_values) - input_embeds = self.language_model.get_input_embeddings()(input_ids) - B, N, C = input_embeds.shape - input_embeds = input_embeds.reshape(B * N, C) - - 
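# A toy sketch of the embedding scatter performed by the next few lines of the
# adapted generate(): positions holding the image-context token id are
# overwritten with the ViT features. The token id 9 and all shapes below are
# hypothetical.
import torch

toy_ctx_id = 9
toy_ids = torch.tensor([1, 9, 9, 2])   # flattened (B * N,) input ids
toy_embeds = torch.zeros(4, 3)         # flattened (B * N, C) text embeddings
toy_vit = torch.ones(2, 3)             # one ViT feature row per context token
mask = toy_ids == toy_ctx_id
toy_embeds[mask] = toy_vit             # rows 1 and 2 now hold image features
assert toy_embeds[1].eq(1).all()
assert toy_embeds[0].eq(0).all()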
input_ids = input_ids.reshape(B * N) - selected = (input_ids == self.img_context_token_id) - assert selected.sum() != 0 - input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) - - input_embeds = input_embeds.reshape(B, N, C) - - outputs = self.language_model.generate( - inputs_embeds=input_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - return outputs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - class InternVLProcessor: - """A simple processor for InternVL2 which misses a processor.""" - - def __init__(self, hf_runner: HfRunner): - self.num_image_token = hf_runner.model.num_image_token - self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype - - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) - self.vision_config = self.config.vision_config - self.use_thumbnail = self.config.use_thumbnail - self.min_num = self.config.min_dynamic_patch - self.max_num = self.config.max_dynamic_patch - self.image_size = self.vision_config.image_size - - def __call__(self, text: str, images: Union[Image, List[Image]], - **kwargs): - from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) - images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values(image, self.image_size, self.min_num, - self.max_num, - self.use_thumbnail).to(self.dtype) - for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] - pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) - prompt = self.tokenizer(text, return_tensors="pt") - prompt.update({"pixel_values": pixel_values}) - return prompt - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") - hf_model.model.img_context_token_id = 
img_context_token_id - hf_model.processor = InternVLProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(generate, hf_model.model) - eos_token_id = hf_model.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=hf_images, - eos_token_id=eos_token_id) - for prompts, hf_images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) def run_awq_test( @@ -243,128 +84,6 @@ def run_awq_test( ) -target_dtype = "half" -if is_cpu(): - target_dtype = "bfloat16" - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.5, 0.75, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_case = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - - run_test( - hf_runner, - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"]) -@pytest.mark.parametrize("size_factors", [[0.5, 1.0]]) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_different_num_patches(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image.resize((896, 896)) for asset in image_assets] - - inputs_batching = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - 
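# A toy sketch of how the InternVLProcessor above splices image tokens into a
# prompt. The marker strings, token counts, and the "<image>" placeholder are
# hypothetical stand-ins; the real IMG_START / IMG_CONTEXT / IMG_END values
# come from vllm.model_executor.models.internvl and the counts from the model
# config.
IMG_START, IMG_CONTEXT, IMG_END = "<img>", "<IMG_CONTEXT>", "</img>"
num_image_token = 4      # per-patch context length (hypothetical)
num_patches = 2          # patches produced for one image (hypothetical)
placeholder = "<image>"  # stand-in for the prompt's image placeholder

prompt = f"User: {placeholder} What is the season?"
image_tokens = IMG_START + IMG_CONTEXT * num_image_token * num_patches + IMG_END
expanded = prompt.replace(placeholder, image_tokens, 1)
assert expanded.count(IMG_CONTEXT) == num_image_token * num_patches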
inputs_multi_images = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - for inputs in [inputs_batching, inputs_multi_images]: - run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - @pytest.mark.parametrize( "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")]) @pytest.mark.parametrize( diff --git a/tests/models/decoder_only/vision_language/test_llava.py b/tests/models/decoder_only/vision_language/test_llava.py deleted file mode 100644 index fd28a9367b4b2..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava.py +++ /dev/null @@ -1,313 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, - BatchEncoding) - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = [ - "llava-hf/llava-1.5-7b-hf", - # TODO: Get this model to produce meaningful output in vLLM - # "TIGER-Lab/Mantis-8B-siglip-llama3", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... 
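# A toy sketch of the sanitization done by vllm_to_hf_output above, using
# made-up ids: 32000 as the image token id is hypothetical; the real id comes
# from the model's AutoConfig. Repeated image tokens in the vLLM output are
# collapsed to a single one before the comparison against HF.
image_token_id = 32000
output_ids = [32000, 32000, 32000, 1, 5, 7, 2]
hf_output_ids = [
    tok for idx, tok in enumerate(output_ids)
    if tok != image_token_id or output_ids[idx - 1] != image_token_id
]
# Only the first image token survives; the rest of the sequence is unchanged.
assert hf_output_ids == [32000, 1, 5, 7, 2]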
- - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - # NOTE: For local use; this isn't tested in CI yet (see TODO above) - if model.startswith("TIGER-Lab/Mantis"): - from mantis.models.mllava import MLlavaProcessor - - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - mantis_processor = MLlavaProcessor.from_pretrained( - model, torch_dtype=torch_dtype) - assert isinstance(mantis_processor, MLlavaProcessor) - else: - mantis_processor = None - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
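# A generic sketch of one way to sidestep the fork-after-CUDA-init problem the
# NOTE above describes: force the "spawn" start method before any CUDA work.
# This is not what the test does (it simply runs vLLM before HF); it is only
# an illustration of the underlying multiprocessing constraint.
import multiprocessing


def ensure_spawn_start_method() -> None:
    # set_start_method() raises RuntimeError once the context is already
    # fixed, so only force "spawn" while it is still undecided.
    try:
        multiprocessing.set_start_method("spawn")
    except RuntimeError:
        pass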
- - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=4096, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - if mantis_processor is not None: - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - else: - - def process(hf_inputs: BatchEncoding): - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 4 images.\nASSISTANT:", # noqa: E501 - "USER: \nWhat is the season?\nASSISTANT:", - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -def test_context_length_too_short(vllm_runner, image_assets, model): - images = [asset.pil_image for asset in image_assets] - - with pytest.raises(ValueError, match="too long to fit into the model"): - vllm_model = vllm_runner( - model, - max_model_len=128, # LLaVA has a feature size of 576 - enforce_eager=True, - ) - - 
with vllm_model: - vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], - max_tokens=1, - images=[images[0]]) diff --git a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py b/tests/models/decoder_only/vision_language/test_llava_image_embeds.py deleted file mode 100644 index 66414032509ed..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_image_embeds.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = [ - "llava-hf/llava-1.5-7b-hf", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # vLLM to load from image embeddings - vllm_images = [asset.image_embeds for asset in image_assets] - - # transformers to load from PIL images - hf_images = [asset.pil_image for asset in image_assets] - - vllm_inputs_per_image = [( - [prompt for _ in size_factors], - [image for _ in size_factors], - ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)] - - hf_inputs_per_image = [( - [prompt for _ in size_factors], - [image for _ in size_factors], - ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
- - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in vllm_inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in hf_inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py deleted file mode 100644 index f833fe0c8bbb4..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_next.py +++ /dev/null @@ -1,283 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "[INST] \nWhat's the content of the image? [/INST]", - "cherry_blossom": - "[INST] \nWhat is the season? 
[/INST]", -}) - -models = ["llava-hf/llava-v1.6-mistral-7b-hf"] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=10240, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - 
hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "sizes", - [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "[INST] \nDescribe 2 images. [/INST]", - "[INST] \nDescribe 2 images. [/INST]", - "[INST] \nDescribe 4 images. [/INST]", - "[INST] \nWhat is the season? 
[/INST]" - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py deleted file mode 100644 index 7b7b23c783e2a..0000000000000 --- a/tests/models/decoder_only/vision_language/test_llava_next_video.py +++ /dev/null @@ -1,226 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import (rescale_video_size, resize_video, - sample_frames_from_video) -from vllm.sequence import SampleLogprobs - -from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets -from ...utils import check_logprobs_close - -_PREFACE = ( - "A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's " - "questions.") - -HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ - "sample_demo_1": - f"{_PREFACE}USER: