[Misc] Small perf improvements (vllm-project#6520)
Yard1 authored and jimpang committed Jul 24, 2024
1 parent 9676fc2 commit 2660a29
Showing 7 changed files with 46 additions and 23 deletions.
11 changes: 7 additions & 4 deletions tests/core/block/test_block_manager_v2.py
@@ -249,10 +249,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,

     # Expect consumed blocks to be new blocks required to support the new slots.
     expected_consumed_blocks = len(
-        chunk_list(
-            list(
-                range(prompt_len + num_slots_to_append + num_lookahead_slots)),
-            block_size)) - len(chunk_list(list(range(prompt_len)), block_size))
+        list(
+            chunk_list(
+                list(
+                    range(prompt_len + num_slots_to_append +
+                          num_lookahead_slots)),
+                block_size))) - len(
+                    list(chunk_list(list(range(prompt_len)), block_size)))
     assert num_consumed_blocks == expected_consumed_blocks
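
Note: the extra list(...) wrappers are needed because this commit turns chunk_list into a generator (see the vllm/utils.py change at the end of the diff), so len() can no longer be taken of its result directly.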


8 changes: 4 additions & 4 deletions tests/core/block/test_cpu_gpu_block_allocator.py
@@ -58,10 +58,10 @@ def test_allocate_immutable_block(num_cpu_blocks: int, num_gpu_blocks: int,

     unique_token_ids = list(
         range((num_cpu_blocks + num_gpu_blocks) * block_size))
-    gpu_token_ids = chunk_list(unique_token_ids[:num_gpu_blocks * block_size],
-                               block_size)
-    cpu_token_ids = chunk_list(unique_token_ids[num_gpu_blocks * block_size:],
-                               block_size)
+    gpu_token_ids = list(
+        chunk_list(unique_token_ids[:num_gpu_blocks * block_size], block_size))
+    cpu_token_ids = list(
+        chunk_list(unique_token_ids[num_gpu_blocks * block_size:], block_size))

     assert allocator.get_num_free_blocks(Device.CPU) == num_cpu_blocks
     assert allocator.get_num_free_blocks(Device.GPU) == num_gpu_blocks
19 changes: 14 additions & 5 deletions vllm/core/block/block_table.py
@@ -1,3 +1,4 @@
+import math
 from typing import List, Optional

 from vllm.core.block.common import BlockList
@@ -337,10 +338,17 @@ def get_num_blocks_touched_by_append_slots(
         This is required for the scheduler to determine whether a sequence can
         continue generation, or if it must be preempted.
         """
-        all_token_ids = token_ids + [-1] * num_lookahead_slots
-        token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
-        return len(token_blocks)
+        # Math below is equivalent to:
+        # all_token_ids = token_ids + [-1] * num_lookahead_slots
+        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
+        # return len(token_blocks)
+
+        num_token_ids = len(token_ids) + num_lookahead_slots
+        first_chunk_size = self._block_size - (self._num_full_slots %
+                                               self._block_size)
+        num_token_blocks = (1 + math.ceil(
+            (num_token_ids - first_chunk_size) / self._block_size))
+        return num_token_blocks

     def _chunk_token_blocks_for_append(
             self, token_ids: List[int]) -> List[List[int]]:
@@ -351,6 +359,7 @@ def _chunk_token_blocks_for_append(
         """
         first_chunk_size = self._block_size - (self._num_full_slots %
                                                self._block_size)
-        token_blocks = [token_ids[:first_chunk_size]] + chunk_list(
-            token_ids[first_chunk_size:], self._block_size)
+        token_blocks = [token_ids[:first_chunk_size]]
+        token_blocks.extend(
+            chunk_list(token_ids[first_chunk_size:], self._block_size))
         return token_blocks
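
The new closed-form count avoids building the chunk lists just to measure their length. A quick self-contained check (the block size and ranges below are made up for illustration) that the arithmetic agrees with the old chunk-and-count approach whenever at least one token is appended:

import math

BLOCK_SIZE = 16

def blocks_touched_by_chunking(num_full_slots: int, num_new_tokens: int) -> int:
    # Old approach: materialize the chunks and count them.
    first_chunk_size = BLOCK_SIZE - (num_full_slots % BLOCK_SIZE)
    token_ids = [-1] * num_new_tokens
    chunks = [token_ids[:first_chunk_size]]
    rest = token_ids[first_chunk_size:]
    chunks.extend(rest[i:i + BLOCK_SIZE] for i in range(0, len(rest), BLOCK_SIZE))
    return len(chunks)

def blocks_touched_closed_form(num_full_slots: int, num_new_tokens: int) -> int:
    # New approach: the same count, computed arithmetically.
    first_chunk_size = BLOCK_SIZE - (num_full_slots % BLOCK_SIZE)
    return 1 + math.ceil((num_new_tokens - first_chunk_size) / BLOCK_SIZE)

for full_slots in range(64):
    for new_tokens in range(1, 64):
        assert (blocks_touched_by_chunking(full_slots, new_tokens)
                == blocks_touched_closed_form(full_slots, new_tokens))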
5 changes: 4 additions & 1 deletion vllm/core/block/prefix_caching_block.py
@@ -552,9 +552,12 @@ def get_common_computed_block_ids(
         # runner.

         # It returns a list of int although type annotation says list of string.
+        if len(computed_seq_block_ids) == 1:
+            return computed_seq_block_ids[0]
+
         return commonprefix([
             ids for ids in computed_seq_block_ids  # type: ignore
-            if ids != []
+            if ids
         ])

     def get_num_blocks_touched(self,
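
The early return skips both the filtering comprehension and the prefix scan when the group contains a single sequence, and "if ids" is a marginally cheaper truthiness test than "if ids != []". A rough sketch of the behavior, assuming commonprefix is os.path.commonprefix (which compares element-wise over lists as well as strings):

from os.path import commonprefix

def common_computed_block_ids(computed_seq_block_ids):
    # Fast path: a single sequence's computed blocks are trivially the
    # common prefix, so no scan is needed.
    if len(computed_seq_block_ids) == 1:
        return computed_seq_block_ids[0]
    return commonprefix([ids for ids in computed_seq_block_ids if ids])

assert common_computed_block_ids([[1, 2, 3]]) == [1, 2, 3]
assert common_computed_block_ids([[1, 2, 3], [1, 2, 4]]) == [1, 2]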
14 changes: 10 additions & 4 deletions vllm/model_executor/models/__init__.py
@@ -1,3 +1,4 @@
+import functools
 import importlib
 from typing import Dict, List, Optional, Type

@@ -98,6 +99,14 @@

 class ModelRegistry:

+    @staticmethod
+    @functools.lru_cache(maxsize=128)
+    def _get_model(model_arch: str):
+        module_name, model_cls_name = _MODELS[model_arch]
+        module = importlib.import_module(
+            f"vllm.model_executor.models.{module_name}")
+        return getattr(module, model_cls_name, None)
+
     @staticmethod
     def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
         if model_arch in _OOT_MODELS:
@@ -114,10 +123,7 @@ def load_model_cls(model_arch: str) -> Optional[Type[nn.Module]]:
                 "Model architecture %s is partially supported by ROCm: %s",
                 model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])

-        module_name, model_cls_name = _MODELS[model_arch]
-        module = importlib.import_module(
-            f"vllm.model_executor.models.{module_name}")
-        return getattr(module, model_cls_name, None)
+        return ModelRegistry._get_model(model_arch)

     @staticmethod
     def get_supported_archs() -> List[str]:
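
The cached helper is plain functools.lru_cache memoization: the module import and getattr run once per architecture name, and repeated load_model_cls calls return the cached class. A stand-alone sketch of the same idea (the registry entry below is a made-up stand-in, not a real vllm model):

import functools
import importlib

# Hypothetical stand-in registry: architecture name -> (module name, class name).
_MODELS = {"ExampleArch": ("json", "JSONDecoder")}

@functools.lru_cache(maxsize=128)
def _get_model(model_arch: str):
    # The import and attribute lookup run only on the first call per arch;
    # later calls are served from the cache.
    module_name, model_cls_name = _MODELS[model_arch]
    module = importlib.import_module(module_name)
    return getattr(module, model_cls_name, None)

assert _get_model("ExampleArch") is _get_model("ExampleArch")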
7 changes: 4 additions & 3 deletions vllm/sequence.py
@@ -457,24 +457,25 @@ def __init__(
         self.prompt_adapter_request = prompt_adapter_request
         self.encoder_seq = encoder_seq
         self.trace_headers = trace_headers
+        self._first_seq = next(iter(self.seqs_dict.values()))

     @property
     def prompt(self) -> Optional[str]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt
+        return self._first_seq.prompt

     @property
     def prompt_token_ids(self) -> List[int]:
         # All sequences in the group should have the same prompt.
         # We use the prompt of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).prompt_token_ids
+        return self._first_seq.prompt_token_ids

     @property
     def multi_modal_data(self) -> "MultiModalDataDict":
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
-        return next(iter(self.seqs_dict.values())).multi_modal_data
+        return self._first_seq.multi_modal_data

     @property
     def lora_int_id(self) -> int:
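
Caching an arbitrary first sequence once in __init__ avoids rebuilding a dict-values iterator on every property access on hot scheduling paths. A toy illustration of the pattern (class names are illustrative, not the real vllm SequenceGroup):

class Seq:
    def __init__(self, prompt):
        self.prompt = prompt

class Group:
    def __init__(self, seqs_dict):
        self.seqs_dict = seqs_dict
        # Cache an arbitrary (first) sequence; every sequence in the group
        # shares the same prompt, so any one of them will do.
        self._first_seq = next(iter(seqs_dict.values()))

    @property
    def prompt(self):
        return self._first_seq.prompt

group = Group({0: Seq("hello"), 1: Seq("hello")})
assert group.prompt == "hello"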
5 changes: 3 additions & 2 deletions vllm/utils.py
@@ -415,9 +415,10 @@ def init_kmp_env():
     os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"


-def chunk_list(lst: List[T], chunk_size: int) -> List[List[T]]:
+def chunk_list(lst: List[T], chunk_size: int):
     """Yield successive chunk_size chunks from lst."""
-    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+    for i in range(0, len(lst), chunk_size):
+        yield lst[i:i + chunk_size]


 def cdiv(a: int, b: int) -> int:
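
With chunk_list as a generator, iterating consumers such as token_blocks.extend(...) in block_table.py above never allocate an intermediate list of chunks; only call sites that need len() or indexing have to materialize the result. A minimal sketch using the same generator definition:

from typing import Iterator, List, TypeVar

T = TypeVar("T")

def chunk_list(lst: List[T], chunk_size: int) -> Iterator[List[T]]:
    """Yield successive chunk_size chunks from lst."""
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

token_ids = list(range(10))

# Streaming consumer: chunks are produced lazily, one at a time.
token_blocks = [token_ids[:4]]
token_blocks.extend(chunk_list(token_ids[4:], 4))
assert token_blocks == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]

# Counting consumer: must materialize first, as the updated tests do.
assert len(list(chunk_list(token_ids, 4))) == 3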
