diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py
index d0ca09c4be0d4..d7863a9ae1ada 100644
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
     # Expect consumed blocks to be new blocks required to support the new slots.
     expected_consumed_blocks = len(
-        chunk_list(
-            list(
-                range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size)) - len(chunk_list(list(range(prompt_len)), block_size))
+        list(
+            chunk_list(
+                list(
+                    range(prompt_len + num_slots_to_append +
+                          num_lookahead_slots)),
+                block_size))) - len(
+                    list(chunk_list(list(range(prompt_len)), block_size)))
     assert num_consumed_blocks == expected_consumed_blocks
diff --git a/tests/core/block/test_cpu_gpu_block_allocator.py b/tests/core/block/test_cpu_gpu_block_allocator.py
index 15b76d9093c63..a9e38d40444a9 100644
--- a/tests/core/block/test_cpu_gpu_block_allocator.py
+++ b/tests/core/block/test_cpu_gpu_block_allocator.py
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,
     unique_token_ids = list(
         range((num_cpu_blocks + num_gpu_blocks) * block_size))
-    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
-                               block_size)
-    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
-                               block_size)
+    gpu_token_ids = list(
+        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
+    cpu_token_ids = list(
+        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))
 
     assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
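# Commentary on the two test hunks above (not part of the patch): the tests now
# wrap chunk_list(...) in list(...) because the vllm/utils.py hunk at the end of
# this diff turns chunk_list into a generator, and generators do not support
# len() or indexing. A minimal, self-contained illustration with a generic
# generator expression standing in for the new chunk_list:
chunks = (chunk for chunk in ([0, 1], [2, 3], [4]))
try:
    len(chunks)
except TypeError:
    pass  # object of type 'generator' has no len()
assert len(list(chunks)) == 3  # materializing with list() restores len()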
""" + # Math below is equivalent to: + # all_token_ids = token_ids + [-1] * num_lookahead_slots + # token_blocks = self._chunk_token_blocks_for_append(all_token_ids) + # return len(token_blocks) - all_token_ids = token_ids + [-1] * num_lookahead_slots - token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - return len(token_blocks) + num_token_ids = len(token_ids) + num_lookahead_slots + first_chunk_size = self._block_size - (self._num_full_slots % + self._block_size) + num_token_blocks = (1 + math.ceil( + (num_token_ids - first_chunk_size) / self._block_size)) + return num_token_blocks def _chunk_token_blocks_for_append( self, token_ids: List[int]) -> List[List[int]]: @@ -351,6 +359,7 @@ def _chunk_token_blocks_for_append( """ first_chunk_size = self._block_size - (self._num_full_slots % self._block_size) - token_blocks = [token_ids[:first_chunk_size]] + chunk_list( - token_ids[first_chunk_size:], self._block_size) + token_blocks = [token_ids[:first_chunk_size]] + token_blocks.extend( + chunk_list(token_ids[first_chunk_size:], self._block_size)) return token_blocks diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index f272e23ee6088..d102ad4045591 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -552,9 +552,12 @@ def get_common_computed_block_ids( # runner. # It returns a list of int although type annotation says list of string. + if len(computed_seq_block_ids) == 1: + return computed_seq_block_ids[0] + return commonprefix([ ids for ids in computed_seq_block_ids # type: ignore - if ids != [] + if ids ]) def get_num_blocks_touched(self, diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 87508a1168e0c..aa5a70757b31c 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -1,3 +1,4 @@ +import functools import importlib from typing import Dict, List, Optional, Type @@ -98,6 +99,14 @@ class ModelRegistry: + @staticmethod + @functools.lru_cache(maxsize=128) + def _get_model(model_arch: str): + module_name, model_cls_name = _MODELS[model_arch] + module = importlib.import_module( + f"vllm.model_executor.models.{module_name}") + return getattr(module, model_cls_name, None) + @staticmethod def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: if model_arch in _OOT_MODELS: @@ -114,10 +123,7 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]: "Model architecture %s is partially supported by ROCm: %s", model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) - module_name, model_cls_name = _MODELS[model_arch] - module = importlib.import_module( - f"vllm.model_executor.models.{module_name}") - return getattr(module, model_cls_name, None) + return ModelRegistry._get_model(model_arch) @staticmethod def get_supported_archs() -> List[str]: diff --git a/vllm/sequence.py b/vllm/sequence.py index 1cebf68d463db..6c12a01bd0b2b 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -457,24 +457,25 @@ def __init__( self.prompt_adapter_request = prompt_adapter_request self.encoder_seq = encoder_seq self.trace_headers = trace_headers + self._first_seq = next(iter(self.seqs_dict.values())) @property def prompt(self) -> Optional[str]: # All sequences in the group should have the same prompt. # We use the prompt of an arbitrary sequence. 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 1cebf68d463db..6c12a01bd0b2b 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -457,24 +457,25 @@ def __init__(
         self.prompt_adapter_request = prompt_adapter_request
         self.encoder_seq = encoder_seq
         self.trace_headers = trace_headers
+        self._first_seq = next(iter(self.seqs_dict.values()))
 
     @property
     def prompt(self) -> Optional[str]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt
+        return self._first_seq.prompt
 
     @property
     def prompt_token_ids(self) -> List[int]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt_token_ids
+        return self._first_seq.prompt_token_ids
 
     @property
     def multi_modal_data(self) -> "MultiModalDataDict":
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).multi_modal_data
+        return self._first_seq.multi_modal_data
 
     @property
     def lora_int_id(self) -> int:
diff --git a/vllm/utils.py b/vllm/utils.py
index f3025a68dbbf9..f906d82581233 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -415,9 +415,10 @@ def init_kmp_env():
     os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
 
 
-def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
+def chunk_list(lst: List[T], chunk_size: int):
     """Yield successive chunk_size chunks from lst."""
-    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i:i + chunk_size]
 
 
 def cdiv(a: int, b: int) -> int:
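# Commentary on the vllm/utils.py hunk above (not part of the patch): making
# chunk_list a generator means single-pass callers, such as the
# token_blocks.extend(...) call in block_table.py, work unchanged without
# building an intermediate list, while callers that need len() or multiple
# passes must materialize the result with list(), as the updated tests do.
# Minimal sketch of the single-pass behavior, using a local copy of the new
# generator-based chunk_list:
from typing import Iterator, List, TypeVar

T = TypeVar("T")


def chunk_list(lst: List[T], chunk_size: int) -> Iterator[List[T]]:
    """Yield successive chunk_size chunks from lst."""
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]


chunks = chunk_list(list(range(6)), 4)
assert list(chunks) == [[0, 1, 2, 3], [4, 5]]
assert list(chunks) == []  # the generator is exhausted after one pass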