[Misc] Small perf improvements #6520

Merged: 14 commits, Jul 19, 2024
11 changes: 7 additions & 4 deletions tests/core/block/test_block_manager_v2.py
@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,

     # Expect consumed blocks to be new blocks required to support the new slots.
     expected_consumed_blocks = len(
-        chunk_list(
-            list(
-                range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size)) - len(chunk_list(list(range(prompt_len)), block_size))
+        list(
+            chunk_list(
+                list(
+                    range(prompt_len + num_slots_to_append +
+                          num_lookahead_slots)),
+                block_size))) - len(
+                    list(chunk_list(list(range(prompt_len)), block_size)))
     assert num_consumed_blocks == expected_consumed_blocks


8 changes: 4 additions & 4 deletions tests/core/block/test_cpu_gpu_block_allocator.py
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,

     unique_token_ids = list(
         range((num_cpu_blocks + num_gpu_blocks) * block_size))
-    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
-                               block_size)
-    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
-                               block_size)
+    gpu_token_ids = list(
+        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
+    cpu_token_ids = list(
+        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))

     assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
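
Because chunk_list now returns a generator rather than a list (see the vllm/utils.py change below), the tests wrap each call in list() before indexing or comparing the chunks. A minimal standalone illustration of why the wrapping is needed; the toy generator is made up for the example:

def toy_chunks():
    yield [0, 1]
    yield [2, 3]

assert toy_chunks() != [[0, 1], [2, 3]]        # a generator object never equals a list
assert list(toy_chunks()) == [[0, 1], [2, 3]]  # materialize it first
# len(toy_chunks()) would raise TypeError: object of type 'generator' has no len()
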
19 changes: 14 additions & 5 deletions vllm/core/block/block_table.py
@@ -1,3 +1,4 @@
+import math
 from typing import List, Optional

 from vllm.core.block.common import BlockList
@@ -337,10 +338,17 @@ def get_num_blocks_touched_by_append_slots(
         This is required for the scheduler to determine whether a sequence can
         continue generation, or if it must be preempted.
         """
+        # Math below is equivalent to:
+        # all_token_ids = token_ids + [-1] * num_lookahead_slots
+        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
+        # return len(token_blocks)
+
-        all_token_ids = token_ids + [-1] * num_lookahead_slots
-        token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
-        return len(token_blocks)
+        num_token_ids = len(token_ids) + num_lookahead_slots
+        first_chunk_size = self._block_size - (self._num_full_slots %
+                                               self._block_size)
+        num_token_blocks = (1 + math.ceil(
+            (num_token_ids - first_chunk_size) / self._block_size))
+        return num_token_blocks

     def _chunk_token_blocks_for_append(
             self, token_ids: List[int]) -> List[List[int]]:
@@ -351,6 +359,7 @@ def _chunk_token_blocks_for_append(
"""
first_chunk_size = self._block_size - (self._num_full_slots %
self._block_size)
token_blocks = [token_ids[:first_chunk_size]] + chunk_list(
token_ids[first_chunk_size:], self._block_size)
token_blocks = [token_ids[:first_chunk_size]]
token_blocks.extend(
chunk_list(token_ids[first_chunk_size:], self._block_size))
return token_blocks
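
The closed-form count avoids building intermediate token lists just to measure their length, which is the perf win here. A small standalone sketch (not the vLLM class itself) that checks the arithmetic against the old chunk-and-count approach; the helper names are illustrative, and the degenerate zero-token append is excluded since the two formulations differ only there:

import math
from typing import List

def _chunk(lst: List[int], size: int) -> List[List[int]]:
    return [lst[i:i + size] for i in range(0, len(lst), size)]

def blocks_by_chunking(num_token_ids: int, num_full_slots: int, block_size: int) -> int:
    # Old approach: build the chunks, then count them.
    token_ids = [0] * num_token_ids
    first_chunk_size = block_size - (num_full_slots % block_size)
    token_blocks = [token_ids[:first_chunk_size]]
    token_blocks.extend(_chunk(token_ids[first_chunk_size:], block_size))
    return len(token_blocks)

def blocks_by_arithmetic(num_token_ids: int, num_full_slots: int, block_size: int) -> int:
    # New approach: compute the count directly, with no intermediate lists.
    first_chunk_size = block_size - (num_full_slots % block_size)
    return 1 + math.ceil((num_token_ids - first_chunk_size) / block_size)

block_size = 16
for num_full_slots in range(4 * block_size):
    for num_token_ids in range(1, 8 * block_size):  # zero-token appends excluded
        assert (blocks_by_chunking(num_token_ids, num_full_slots, block_size) ==
                blocks_by_arithmetic(num_token_ids, num_full_slots, block_size))
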
5 changes: 4 additions & 1 deletion vllm/core/block/prefix_caching_block.py
@@ -552,9 +552,12 @@ def get_common_computed_block_ids(
         # runner.

         # It returns a list of int although type annotation says list of string.
+        if len(computed_seq_block_ids) == 1:
+            return computed_seq_block_ids[0]
+
         return commonprefix([
             ids for ids in computed_seq_block_ids  # type: ignore
-            if ids != []
+            if ids
         ])

     def get_num_blocks_touched(self,
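
When the group holds a single sequence, the new early return skips the commonprefix call entirely. For the multi-sequence case, os.path.commonprefix works element-wise on any sequences, including lists of block ids. A tiny standalone illustration with made-up block ids:

from os.path import commonprefix

computed_seq_block_ids = [[0, 1, 2, 3], [0, 1, 2], [0, 1, 5]]

if len(computed_seq_block_ids) == 1:
    common = computed_seq_block_ids[0]  # fast path: nothing to intersect
else:
    common = commonprefix([ids for ids in computed_seq_block_ids if ids])

print(common)  # [0, 1]
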
14 changes: 10 additions & 4 deletions vllm/model_executor/models/__init__.py
@@ -1,3 +1,4 @@
+import functools
 import importlib
 from typing import Dict, List, Optional, Type

@@ -98,6 +99,14 @@

 class ModelRegistry:

+    @staticmethod
+    @functools.lru_cache(maxsize=128)
+    def _get_model(model_arch: str):
+        module_name, model_cls_name = _MODELS[model_arch]
+        module = importlib.import_module(
+            f"vllm.model_executor.models.{module_name}")
+        return getattr(module, model_cls_name, None)
+
     @staticmethod
     def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
         if model_arch in _OOT_MODELS:
@@ -114,10 +123,7 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
"Model architecture %s is partially supported by ROCm: %s",
model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

module_name, model_cls_name = _MODELS[model_arch]
module = importlib.import_module(
f"vllm.model_executor.models.{module_name}")
return getattr(module, model_cls_name, None)
return ModelRegistry._get_model(model_arch)

@staticmethod
def get_supported_archs() -> List[str]:
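
functools.lru_cache memoizes the module import and attribute lookup per architecture name, so repeated load_model_cls calls for the same architecture become cache hits instead of going through the import machinery again. A reduced standalone sketch of the pattern; the toy registry and module used here are made up and are not part of vLLM:

import functools
import importlib
from typing import Optional, Type

_TOY_MODELS = {"JSONDecoder": ("json.decoder", "JSONDecoder")}  # hypothetical registry

@functools.lru_cache(maxsize=128)
def get_model_cls(model_arch: str) -> Optional[Type]:
    module_name, cls_name = _TOY_MODELS[model_arch]
    module = importlib.import_module(module_name)  # runs once per architecture
    return getattr(module, cls_name, None)

first = get_model_cls("JSONDecoder")
second = get_model_cls("JSONDecoder")  # served from the cache
assert first is second
print(get_model_cls.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
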
7 changes: 4 additions & 3 deletions vllm/sequence.py
@@ -457,24 +457,25 @@ def __init__(
         self.prompt_adapter_request = prompt_adapter_request
         self.encoder_seq = encoder_seq
         self.trace_headers = trace_headers
+        self._first_seq = next(iter(self.seqs_dict.values()))

     @property
     def prompt(self) -> Optional[str]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt
+        return self._first_seq.prompt

     @property
     def prompt_token_ids(self) -> List[int]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt_token_ids
+        return self._first_seq.prompt_token_ids

     @property
     def multi_modal_data(self) -> "MultiModalDataDict":
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).multi_modal_data
+        return self._first_seq.multi_modal_data

     @property
     def lora_int_id(self) -> int:
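
Caching an arbitrary sequence once in __init__ removes an iterator construction from every prompt, prompt_token_ids, and multi_modal_data access. A rough standalone micro-benchmark of the pattern; timings vary by machine and the dictionary contents are made up:

import timeit

seqs_dict = {i: f"seq-{i}" for i in range(4)}
first_seq = next(iter(seqs_dict.values()))  # cached once, as the property accesses now rely on

rebuild = timeit.timeit(lambda: next(iter(seqs_dict.values())), number=1_000_000)
cached = timeit.timeit(lambda: first_seq, number=1_000_000)
print(f"rebuild iterator per access: {rebuild:.3f}s, cached reference: {cached:.3f}s")
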
5 changes: 3 additions & 2 deletions vllm/utils.py
@@ -415,9 +415,10 @@ def init_kmp_env():
     os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"


-def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
+def chunk_list(lst: List[T], chunk_size: int):
     """Yield successive chunk_size chunks from lst."""
-    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i:i + chunk_size]


 def cdiv(a: int, b: int) -> int:
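
As a generator, chunk_list produces chunks lazily, so callers that only iterate never pay for an intermediate list of lists; callers that need len(), indexing, or equality wrap it in list(), as the test changes above do. A short standalone sketch of the new behaviour:

from typing import Iterator, List, TypeVar

T = TypeVar("T")

def chunk_list(lst: List[T], chunk_size: int) -> Iterator[List[T]]:
    """Yield successive chunk_size chunks from lst."""
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

chunks = chunk_list(list(range(10)), 4)
print(next(chunks))   # [0, 1, 2, 3] -- produced on demand, nothing else built yet
print(list(chunks))   # [[4, 5, 6, 7], [8, 9]] -- the remaining chunks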