lint

rickyyx · Nov 7, 2024 · 8d8853e
1 parent 417760a
commit 8d8853e
Show file tree

Hide file tree

Showing 16 changed files with 304 additions and 552 deletions.
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
@@ -117,7 +117,8 @@ def main(args):
     input_length_range = tuple(map(int, args.input_length_range.split(':')))
     random.seed(args.seed)
     if args.dataset_path is not None:
-        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
+        print(f"Start to sample {args.num_prompts} prompts "
+              f"from {args.dataset_path}")
         filtered_datasets = sample_requests(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,

diff --git a/tests/core/block/test_block_manager.py b/tests/core/block/test_block_manager.py
@@ -256,10 +256,9 @@ def test_can_allocate_with_prefix_cache(
     # Allocate the seq 1
     block_manager.allocate(seq_group_1)
 
-    # Mark the seq 1 as computed (This shoudl be done by the scheduler in reality)
-    block_manager.mark_blocks_as_computed(
-        seq_group=seq_group_1, token_chunk_size=len(tokens_1)
-    )
+    # Mark the seq 1 as computed (This should be done by the scheduler in reality)
+    block_manager.mark_blocks_as_computed(seq_group=seq_group_1,
+                                          token_chunk_size=len(tokens_1))
 
     # Test if allocatable of seq 2.
     seq_group_2 = create_seq_group(
@@ -399,7 +398,9 @@ def test_can_swap(block_size, num_gpu_blocks, num_lookahead_slots,
                                               watermark=0,
                                               enable_caching=enable_caching)
     prompt, seq_group = create_dummy_prompt(
-        "1", prompt_length=(num_gpu_blocks - 1) * block_size - 1, block_size=block_size
+        "1",
+        prompt_length=(num_gpu_blocks - 1) * block_size - 1,
+        block_size=block_size,
     )
     prompt.status = SequenceStatus.WAITING
     block_manager.allocate(seq_group)

diff --git a/tests/core/block/test_block_table.py b/tests/core/block/test_block_table.py
@@ -105,8 +105,7 @@ def test_allocate_prefix_caching(block_size: int, sequence_len: int):
                 block_size=block_size,
                 block_allocator=allocator,
                 enable_prefix_caching=True,
-            )
-        )
+            ))
         seq = make_sequence(alloc_i, token_ids, block_size)
         block_tables[-1].allocate(seq=seq, device=Device.GPU)
 
@@ -148,7 +147,8 @@ def test_allocate_free(block_size: int, sequence_len: int, allocator_type: str,
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     for i in range(5):
@@ -193,7 +193,8 @@ def test_append_token_ids_allocation(block_size: int, sequence_len: int,
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     num_expected_blocks_before_append = len(
@@ -250,7 +251,8 @@ def test_ensure_num_empty_slots_allocation(block_size: int, sequence_len: int,
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     num_expected_blocks_before_append = len(
@@ -308,7 +310,8 @@ def test_append_token_ids_correct_content(block_size: int, sequence_len: int,
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
     seq = make_sequence(0, token_ids, block_size)
     block_table.allocate(seq=seq, device=Device.GPU)
@@ -353,7 +356,8 @@ def test_fork(seq_len: int, block_size: int, allocator_type: str):
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     seq = make_sequence(0, token_ids, block_size)
@@ -414,7 +418,8 @@ def test_cow(block_size: int, sequence_len: int, append_len: int,
     original_block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     num_expected_non_cow_blocks = cdiv(sequence_len, block_size)
@@ -504,7 +509,8 @@ def test_cow_lookahead_simple(block_size: int, sequence_len: int,
     original_block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
 
     seq = make_sequence(0, token_ids, block_size)
@@ -590,7 +596,8 @@ def test_num_blocks_touched_by_append_slots(block_size: int, sequence_len: int,
     block_table = BlockTable(
         block_size=block_size,
         block_allocator=allocator,
-        enable_prefix_caching=True if allocator_type == "prefix_caching" else False,
+        enable_prefix_caching=True
+        if allocator_type == "prefix_caching" else False,
     )
     seq = make_sequence(0, token_ids, block_size)
     block_table.allocate(seq=seq, device=Device.GPU)

diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py
@@ -795,9 +795,8 @@ def test_get_cached_blocks():
 
         block_size = 16
         num_blocks = 5
-        allocator = PrefixCachingBlockAllocator(
-            block_size=block_size, num_blocks=num_blocks
-        )
+        allocator = PrefixCachingBlockAllocator(block_size=block_size,
+                                                num_blocks=num_blocks)
 
         # 1. Allocate a list of blocks
         block_hashes = [random.randint(1, 1000000) for _ in range(num_blocks)]
@@ -825,12 +824,11 @@ def test_get_cached_blocks():
             result = allocator.get_cached_blocks(cached_hashes)
             assert (
                 result == expected_cached_blocks
-            ), f"Expected {expected_cached_blocks}, but got {result}, with test case {cached_hashes}. blcok hashes = {block_hashes}"
+            ), f"Expected {expected_cached_blocks}, but got {result}, with test case {cached_hashes}. block hashes = {block_hashes}"
 
         # Test with some non-existent hashes
         non_existent_hash = max(block_hashes) + 1
         test_hashes = block_hashes[:3] + [non_existent_hash] + block_hashes[3:]
         result = allocator.get_cached_blocks(test_hashes)
-        assert (
-            result == block_hashes[0:3]
-        ), f"Expected {block_hashes[0:3]}, but got {result}"
+        assert (result == block_hashes[0:3]
+                ), f"Expected {block_hashes[0:3]}, but got {result}"
diff --git a/tests/core/utils.py b/tests/core/utils.py
@@ -116,7 +116,7 @@ def create_dummy_prompt_encoder_decoder(
 
 def create_seq_group(
     seq_prompt_len: int = 1024,
-    seq_output_lens: GenericSequence[int] = (128,),
+    seq_output_lens: GenericSequence[int] = (128, ),
     request_id: str = "0",
     seq_id_start: int = 0,
     sampling_params: Optional[SamplingParams] = None,

diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py
@@ -43,10 +43,10 @@ def test_mixed_requests(
 
     cached_prompt = example_prompts[cached_position]
     with vllm_runner(
-        model,
-        dtype=dtype,
-        enable_prefix_caching=True,
-        enable_chunked_prefill=enable_chunked_prefill,
+            model,
+            dtype=dtype,
+            enable_prefix_caching=True,
+            enable_chunked_prefill=enable_chunked_prefill,
     ) as vllm_model:
         # Run the first prompt so the cache is populated
         vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
@@ -55,30 +55,6 @@ def __init__(
         self._max_block_sliding_window = max_block_sliding_window
         self._num_full_slots = self._get_num_token_ids()
 
-    # @staticmethod
-    # def get_num_required_blocks(token_ids: List[int],
-    #                            block_size: int,
-    #                            num_lookahead_slots: int = 0) -> int:
-    #    """Calculates the minimum number of blocks required to store a given
-    #    sequence of token IDs along with any look-ahead slots that may be
-    #    required (like in multi-step + chunked-prefill).
-
-    #    This assumes worst-case scenario, where every block requires a new
-    #    allocation (e.g. ignoring prefix caching).
-
-    #    Args:
-    #        token_ids (List[int]): The sequence of token IDs to be stored.
-    #        block_size (int): The maximum number of tokens that can be stored in
-    #            a single block.
-    #        num_lookahead_slots (int): look-ahead slots that the sequence may
-    #            require.
-
-    #    Returns:
-    #        int: The minimum number of blocks required to store the given
-    #            sequence of token IDs along with any required look-ahead slots.
-    #    """
-    #    return cdiv(len(token_ids) + num_lookahead_slots, block_size)
-
     def allocate(
         self,
         token_ids: List[int],
@@ -100,14 +76,13 @@ def allocate(
         if not token_ids:
             return
 
-        blocks = self._allocate_blocks_for_token_ids(
-            token_ids, block_hashes, device
-        )
+        blocks = self._allocate_blocks_for_token_ids(token_ids, block_hashes,
+                                                     device)
         self.update(blocks)
         self._num_full_slots = len(token_ids)
 
     def update(self, blocks: List[Block]) -> None:
-        """Resets the table to the newly provided blocks 
+        """Resets the table to the newly provided blocks
         (with their corresponding block ids)
         """
         self._blocks.update(blocks)
@@ -164,17 +139,14 @@ def append_slots(
         # Update the blocks with the new tokens
         first_block_idx = self._num_full_slots // self._block_size
         token_blocks = self._chunk_token_blocks_for_append(token_ids)
-
-        if len(token_blocks) != len(block_hashes):
-            breakpoint()
-
         assert len(token_blocks) == len(
             block_hashes
         ), "chunked token_ids and block_hashes must have the same length"
 
         for i, token_block in enumerate(token_blocks):
             block_hash = block_hashes[i]
-            self._blocks.append_token_ids(first_block_idx + i, token_block, block_hash)
+            self._blocks.append_token_ids(first_block_idx + i, token_block,
+                                          block_hash)
 
         self._num_full_slots += len(token_ids)
 
@@ -304,19 +276,17 @@ def _allocate_blocks_for_token_ids(
                 self._allocator.allocate_immutable_blocks(
                     prev_block,
                     block_token_ids=block_token_ids,
-                    block_hashes=block_hashes[: len(block_token_ids)],
+                    block_hashes=block_hashes[:len(block_token_ids)],
                     device=device,
-                )
-            )
+                ))
             prev_block = blocks[-1]
 
         if tail_token_ids:
             assert len(tail_token_ids) == 1
             assert block_hashes[-1] is None
             cur_token_ids = tail_token_ids[0]
             block = self._allocator.allocate_mutable_block(
-                prev_block=prev_block, device=device
-            )
+                prev_block=prev_block, device=device)
             block.append_token_ids(cur_token_ids, block_hash=None)
 
             blocks.append(block)

diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
@@ -254,9 +254,8 @@ def update(self, blocks: List[Block]):
         for block in self._blocks:
             self._add_block_id(block.block_id)
 
-    def append_token_ids(
-        self, block_index: int, token_ids: List[int], block_hash: Optional[int]
-    ) -> None:
+    def append_token_ids(self, block_index: int, token_ids: List[int],
+                         block_hash: Optional[int]) -> None:
         block = self._blocks[block_index]
         prev_block_id = block.block_id
 

diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -139,7 +139,7 @@ def allocate_immutable_blocks(
         prev_block: Optional[Block],
         block_token_ids: List[List[int]],
         device: Device,
-        block_hashes: Optional[List[Optional[int]]] = None,
+        block_hashes: List[Optional[int]],
     ) -> List[Block]:
         """Allocates a new group of immutable blocks with the provided block 
         token IDs on the specified device.
@@ -156,15 +156,7 @@ def allocate_immutable_blocks(
                 containing the provided block token IDs.
         """
         return self._allocators[device].allocate_immutable_blocks(
-            prev_block, block_token_ids, block_hashes
-        )
-
-    def get_allocated_cached_blocks(
-        self,
-        block_hashes: List[int],
-        device: Device,
-    ) -> List[int]:
-        return self._allocators[device].get_allocated_cached_blocks(block_hashes)
+            prev_block, block_token_ids, block_hashes)
 
     def allocate_immutable_block(self, prev_block: Optional[Block],
                                  token_ids: List[int],
@@ -353,14 +345,12 @@ def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
         self._swap_mapping.clear()
         return list(mapping.items())
 
-    def find_cached_blocks_prefix(
-        self, block_hashes: List[int], allocated: bool
-    ) -> List[int]:
+    def find_cached_blocks_prefix(self, block_hashes: List[int],
+                                  allocated: bool) -> List[int]:
         # Prefix caching only supported on GPU.
         device = Device.GPU
         return self._allocators[device].find_cached_blocks_prefix(
-            block_hashes, allocated
-        )
+            block_hashes, allocated)
 
 
 class NullBlock(Block):
@@ -376,7 +366,9 @@ def __init__(self, proxy: Block):
         super().__init__()
         self._proxy = proxy
 
-    def append_token_ids(self, token_ids: List[BlockId]):
+    def append_token_ids(self,
+                         token_ids: List[BlockId],
+                         block_hash: Optional[int] = None) -> None:
         raise ValueError("null block should not be modified")
 
     @property
@@ -429,4 +421,5 @@ def content_hash(self):
         return self._proxy.content_hash
 
     def set_content_hash(self, content_hash: Optional[int]) -> None:
-        raise NotImplementedError("NullBlock does not support set_content_hash")
+        raise NotImplementedError(
+            "NullBlock does not support set_content_hash")