
Commit

clean up
rickyyx committed Oct 30, 2024
1 parent 2cffdf9 commit 0321fa7
Showing 13 changed files with 91 additions and 284 deletions.
24 changes: 12 additions & 12 deletions benchmarks/benchmark_prefix_caching.py
@@ -132,18 +132,18 @@ def main(args):
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts

# llm = LLM(
# model=args.model,
# tokenizer_mode="auto",
# trust_remote_code=True,
# enforce_eager=True,
# use_v2_block_manager=args.use_v2_block_manager,
# tensor_parallel_size=args.tensor_parallel_size,
# enable_prefix_caching=args.enable_prefix_caching,
# disable_log_stats=False,
# max_num_batched_tokens=4096 * 2,
# enable_chunked_prefill=True,
# )
llm = LLM(
model=args.model,
tokenizer_mode="auto",
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching,
disable_log_stats=False,
max_num_batched_tokens=4096 * 2,
enable_chunked_prefill=True,
)
engine_args = EngineArgs.from_cli_args(args)

llm = LLM(**dataclasses.asdict(engine_args))
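The cleanup above replaces the hard-coded (and commented-out) LLM construction with engine arguments parsed from the CLI. A minimal sketch of that pattern, using the same EngineArgs/LLM calls visible in the diff (the model name and flag values below are illustrative, not taken from the benchmark):

import dataclasses

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs

# Build engine arguments directly; the benchmark itself derives them
# from argparse via EngineArgs.from_cli_args(args).
engine_args = EngineArgs(
    model="facebook/opt-125m",  # illustrative model choice
    enable_prefix_caching=True,
    enable_chunked_prefill=True,
)

# Expand the dataclass into keyword arguments, exactly as the diff does.
llm = LLM(**dataclasses.asdict(engine_args))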
4 changes: 2 additions & 2 deletions tests/core/block/test_block_manager.py
@@ -235,7 +235,7 @@ def test_can_allocate_with_prefix_cache(
# Num blocks needed for 2 seqs, minus the number of blocks shared.
num_blocks_required_with_sharing = 2 * num_blocks_required_seq - num_blocks_shared

Check failure on line 236 (GitHub Actions / ruff, Python 3.8-3.12): Ruff (E501) tests/core/block/test_block_manager.py:236:81: Line too long (86 > 80)

block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
@@ -299,7 +299,7 @@ def test_can_append_with_prefix_cache(
print(f"num_gpu_blocks: {num_gpu_blocks}")

num_blocks_required_with_sharing = 2 * num_blocks_required_seq_1 - num_blocks_shared

Check failure on line 301 (GitHub Actions / ruff, Python 3.8-3.12): Ruff (E501) tests/core/block/test_block_manager.py:301:81: Line too long (88 > 80)
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
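To make the block-sharing arithmetic in these tests concrete, here is a small worked example (values are illustrative and chosen so everything divides evenly):

block_size = 16
seq_len = 48            # tokens per sequence
shared_prefix_len = 32  # tokens shared between the two sequences

num_blocks_required_seq = seq_len // block_size      # 3 blocks per sequence
num_blocks_shared = shared_prefix_len // block_size  # 2 blocks in common

# Two sequences need 2 * 3 = 6 blocks without sharing; the 2 shared prefix
# blocks are stored only once, so 6 - 2 = 4 blocks suffice.
num_blocks_required_with_sharing = (
    2 * num_blocks_required_seq - num_blocks_shared)
assert num_blocks_required_with_sharing == 4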
4 changes: 0 additions & 4 deletions vllm/config.py
@@ -1074,10 +1074,6 @@ def __init__(self,
self.policy = policy
self._verify_args()

print(
f"max_num_batched_tokens: {self.max_num_batched_tokens}, max_num_seqs: {self.max_num_seqs}"
)

def _verify_args(self) -> None:
if (self.max_num_batched_tokens < self.max_model_len
and not self.chunked_prefill_enabled):
15 changes: 7 additions & 8 deletions vllm/core/block/block_table.py
@@ -164,7 +164,9 @@ def append_slots(

for i, token_block in enumerate(token_blocks):
if self._enable_prefix_caching:
block_hash: Optional[int] = seq.get_block_hash(first_block_idx + i)
block_hash: Optional[int] = seq.update_and_get_block_hash(
first_block_idx + i
)
else:
block_hash = None
self._blocks.append_token_ids(first_block_idx + i, token_block, block_hash)
@@ -286,7 +288,7 @@ def _allocate_blocks_for_token_ids(
if len(cur_token_ids) == self._block_size:
block_token_ids.append(cur_token_ids)
if self._enable_prefix_caching:
block_hashes.append(seq.get_block_hash(block_idx))
block_hashes.append(seq.update_and_get_block_hash(block_idx))
else:
block_hashes.append(None)
else:
@@ -308,12 +310,9 @@
assert len(tail_token_ids) == 1
assert block_hashes[-1] is None
cur_token_ids = tail_token_ids[0]
try:
block = self._allocator.allocate_mutable_block(
prev_block=prev_block, device=device
)
except Exception as e:
breakpoint()
block = self._allocator.allocate_mutable_block(
prev_block=prev_block, device=device
)
block.append_token_ids(cur_token_ids, block_hash=None)

blocks.append(block)
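The switch from get_block_hash to update_and_get_block_hash suggests the sequence now computes and memoizes block hashes on demand. A self-contained sketch of that idea, borrowing the chained-hash scheme from prefix_caching_block.py later in this commit (a hypothetical helper, not vLLM's actual Sequence API):

from typing import List, Optional

class ChainedBlockHashes:
    """Hypothetical per-sequence cache of chained per-block hashes."""

    def __init__(self, block_size: int, token_ids: List[int]) -> None:
        self.block_size = block_size
        self.token_ids = token_ids
        self.hashes: List[Optional[int]] = []

    def update_and_get_block_hash(self, block_idx: int) -> Optional[int]:
        while len(self.hashes) <= block_idx:
            self.hashes.append(None)
        if self.hashes[block_idx] is not None:
            return self.hashes[block_idx]  # memoized result
        start = block_idx * self.block_size
        block = self.token_ids[start:start + self.block_size]
        if len(block) < self.block_size:
            return None  # only full blocks are hashable
        is_first = block_idx == 0
        prev_hash = (None if is_first else
                     self.update_and_get_block_hash(block_idx - 1))
        if prev_hash is None and not is_first:
            return None  # previous block has no hash yet
        self.hashes[block_idx] = hash((is_first, prev_hash, *block))
        return self.hashes[block_idx]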
12 changes: 0 additions & 12 deletions vllm/core/block/common.py
@@ -196,25 +196,13 @@ def increase_pool(self):
allocator=self._allocator,
block_id=None))

# TODO(rickyx): This should take in kwargs for flexible initialization of different types of blocks
# Right now, we explicitly update blocks with other args after initialization, e.g. block_hash
# computed for the prefix caching block.
def init_block(
self,
prev_block: Optional[Block],
token_ids: List[int],
block_size: int,
physical_block_id: Optional[int],
) -> Block:
"""Initializes a block with the given parameters.
Args:
prev_block (Optional[Block]): The previous block in the sequence.
token_ids (List[int]): The token IDs to be stored in the block.
block_size (int): The size of the block.
physical_block_id (Optional[int]): The physical block ID.
block_hash (Optional[int]): The hash of the block's content.
"""
if len(self._free_ids) == 0:
self.increase_pool()
assert len(self._free_ids) > 0
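For context, init_block hands out blocks from a pre-grown free list. A stripped-down sketch of that pooling pattern (hypothetical names; the real BlockPool recycles Block objects rather than dicts):

from typing import Any, Dict, List

class TinyBlockPool:
    """Hypothetical minimal free-list pool in the spirit of BlockPool."""

    def __init__(self) -> None:
        self._pool: List[Dict[str, Any]] = []
        self._free_ids: List[int] = []

    def increase_pool(self) -> None:
        self._free_ids.append(len(self._pool))
        self._pool.append({})  # placeholder for a pre-allocated Block

    def init_block(self, token_ids: List[int]) -> Dict[str, Any]:
        if len(self._free_ids) == 0:
            self.increase_pool()
        assert len(self._free_ids) > 0
        block = self._pool[self._free_ids.pop()]
        block["token_ids"] = list(token_ids)  # (re)initialize in place
        return block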
2 changes: 1 addition & 1 deletion vllm/core/block/interfaces.py
@@ -99,7 +99,7 @@ def content_hash(self) -> Optional[int]:
return None

@abstractmethod
def set_content_hash(self, content_hash: int) -> None:
def set_content_hash(self, content_hash: Optional[int]) -> None:
pass


6 changes: 0 additions & 6 deletions vllm/core/block/naive_block.py
@@ -64,7 +64,6 @@ def allocate_immutable_block(
self,
prev_block: Optional[Block],
token_ids: List[int],
device: Optional[Device] = None,
block_hash: Optional[int] = None,
) -> Block:
"""Allocates a new immutable block with the given token IDs, linked to
@@ -79,7 +78,6 @@
Returns:
Block: The newly allocated immutable block.
"""
assert device is None
assert block_hash is None

block = self.allocate_mutable_block(prev_block=prev_block)
@@ -91,9 +89,7 @@ def allocate_immutable_blocks(
prev_block: Optional[Block],
block_token_ids: List[List[int]],
block_hashes: Optional[List[Optional[int]]] = None,
device: Optional[Device] = None,
) -> List[Block]:
assert device is None
num_blocks = len(block_token_ids)

block_ids = []
@@ -114,7 +110,6 @@ def allocate_immutable_blocks(
def allocate_mutable_block(
self,
prev_block: Optional[Block],
device: Optional[Device] = None,
block_hash: Optional[int] = None,
) -> Block:
"""Allocates a new mutable block, linked to the previous block.
@@ -127,7 +122,6 @@
Returns:
Block: The newly allocated mutable block.
"""
assert device is None
assert block_hash is None

block_id = self._allocate_block_id()
105 changes: 4 additions & 101 deletions vllm/core/block/prefix_caching_block.py
@@ -140,7 +140,6 @@ def allocate_immutable_block(
prev_block: Optional[Block],
token_ids: List[int],
block_hash: Optional[int] = None,
device: Optional[Device] = None,
) -> Block:
"""Allocates an immutable block with the given token IDs, reusing cached
blocks if possible.
@@ -153,7 +152,6 @@
Returns:
Block: The allocated immutable block.
"""
assert device is None
assert len(token_ids) == self._block_size, "An immutable block should be full"
assert (
block_hash is not None
@@ -163,9 +161,6 @@
cached_block_id = self._cached_blocks.get(block_hash, None)
if cached_block_id is not None:
# Initialize a block that points to cached data
# print(
# f"reuse block_hash={block_hash} from cached_block_id: {cached_block_id}"
# )
block: Block = self._block_pool.init_block(
prev_block=prev_block,
token_ids=token_ids,
@@ -177,9 +172,6 @@
self._incr_refcount_cached_block(block)
return block

# print(
# f"alloc from new block(block_hash: {block_hash}), get_num_free_blocks: {self.get_num_free_blocks()}"
# )
self.metric_data.query(hit=False)

# No cached block => Allocate a new block
@@ -192,7 +184,6 @@ def allocate_immutable_blocks(
prev_block: Optional[Block],
block_token_ids: List[List[int]],
block_hashes: Optional[List[int]] = None,

Check failure on line 186 (GitHub Actions / mypy, Python 3.8-3.12): Argument 3 of "allocate_immutable_blocks" is incompatible with supertype "BlockAllocator"; supertype defines the argument type as "Optional[List[Optional[int]]]" [override]
device: Optional[Device] = None,
) -> List[Block]:
blocks = []
assert (
@@ -204,7 +195,6 @@
prev_block=prev_block,
token_ids=token_ids,
block_hash=block_hash,
device=device,
)
blocks.append(prev_block)
return blocks
@@ -224,9 +214,6 @@ def allocate_mutable_block(self,
"""
assert device is None
assert_prefix_caching_block_or_none(prev_block)
# print(
# f"Allocating mutable block: get_num_free_blocks: {self.get_num_free_blocks()}"
# )
block_id = self._allocate_block_id()
block = self._block_pool.init_block(prev_block=prev_block,
token_ids=[],
@@ -297,7 +284,6 @@ def _allocate_block_id(self) -> BlockId:
"""First tries to allocate a block id from the hashless allocator,
and if there are no blocks, then tries to evict an unused cached block.
"""
# print(f"allocating block_id: get_num_free_blocks: {self.get_num_free_blocks()}")
hashless_block_id = self._maybe_allocate_hashless_block_id()
if hashless_block_id is not None:
return hashless_block_id
@@ -418,9 +404,7 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
assert device is None
# The number of free blocks is the number of hashless free blocks
# plus the number of blocks evictor could free from its list.
return self._hashless_allocator.get_num_free_blocks() + (
self.evictor.num_blocks
)
return self._hashless_allocator.get_num_free_blocks() + self.evictor.num_blocks

def get_num_total_blocks(self) -> int:
return self._hashless_allocator.get_num_total_blocks()
@@ -511,9 +495,6 @@ def cow_block_if_not_appendable(self, block: Block) -> BlockId:
return src_block_id

self._free_block_id(block)
# print(
# f"Allocating block for COW: get_num_free_blocks: {self.get_num_free_blocks()}"
# )
trg_block_id = self._allocate_block_id()

self._cow_tracker.record_cow(src_block_id, trg_block_id)
@@ -878,38 +859,6 @@ def token_ids(self) -> List[int]:
def prev_block(self) -> Optional[Block]:
return self._prev_block

# @property
# def content_hash(self) -> Optional[int]:
# """Return the content-based hash of the current block, or None if it is
# not yet defined.

# For the content-based hash to be defined, the current block must be
# full.
# """
# # If the hash is already computed, return it.
# if self._cached_content_hash is not None:
#     return self._cached_content_hash

# # We cannot compute a hash for the current block because it is not full.
# if not self.is_full:
# return None

# is_first_block = self._prev_block is None
# prev_block_hash = (
# None if is_first_block else
# self._prev_block.content_hash # type: ignore
# )

# # Previous block exists but does not yet have a hash.
# # Return no hash in this case.
# if prev_block_hash is None and not is_first_block:
# return None

# self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
# is_first_block,
# prev_block_hash,
# cur_block_token_ids=self.token_ids)
# return self._cached_content_hash

@property
def content_hash(self) -> Optional[int]:
return self._cached_content_hash
@@ -952,7 +901,9 @@ def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int],
assert (prev_block_hash is None) == is_first_block
return hash((is_first_block, prev_block_hash, *cur_block_token_ids))


# TODO(rickyx): This is no longer used. Alternatively, it could track
# cached blocks for a sequence, decoupling the sequence from the computed
# block hash calculation.
class ComputedBlocksTracker:
"""Handles caching of per-sequence computed block ids.
When a sequence appears for the first time, it traverses all of the
@@ -989,54 +940,6 @@ def remove_seq(self, seq_id: int) -> None:
assert seq_id in self._cached_computed_seq_blocks
del self._cached_computed_seq_blocks[seq_id]

def get_cached_computed_blocks_and_update(
self, seq_id: int, block_ids: List[int]) -> List[int]:
""" Look at the class documentation for details
"""
# Ensure seq_id is already tracked
assert seq_id in self._cached_computed_seq_blocks

# Get cached data (may be empty on the first time)
prev_computed_block_ids, has_gap = self._cached_computed_seq_blocks[
seq_id]

if has_gap:
# When gap is detected, we do not add more computed blocks at this
# sequence iteration
return prev_computed_block_ids

# We do not consider the last block id for caching purposes.
num_cur_blocks = len(block_ids) - 1
assert num_cur_blocks >= 0

if len(prev_computed_block_ids) >= num_cur_blocks:
# Cache HIT
assert len(prev_computed_block_ids) == num_cur_blocks
return prev_computed_block_ids

# If here, then we may possibly add more computed blocks. As a result,
# traverse the additional blocks after prev_computed_block_ids to
# detect more computed blocks and add them.

# Incremental init for seq_id => Look only at the new blocks
computed_block_ids = self._allocator.get_computed_block_ids( # noqa: E501
prev_computed_block_ids,
block_ids,
skip_last_block_id=
True, # We skip last block id to avoid caching of full seq
)

# QQ(rickyx): why is it possible to actually have a gap?

# Detect if there is a "gap"
has_gap = len(computed_block_ids) < num_cur_blocks

# Record
self._cached_computed_seq_blocks[seq_id] = (computed_block_ids,
has_gap)

return computed_block_ids


class LastAccessBlocksTracker:
"""Manages the last access time of the tracked sequences, in order to allow
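Taken together, the surviving allocator logic in this file is a lookup-or-allocate flow keyed by the chained block hash. A self-contained sketch of that flow (a deliberate simplification, not the vLLM class; refcounting and eviction are omitted):

from typing import Dict

class TinyPrefixCache:
    """Hypothetical cache keyed by block hash, mirroring the hit/miss
    paths of allocate_immutable_block above."""

    def __init__(self) -> None:
        self._cached_blocks: Dict[int, int] = {}  # block_hash -> block_id
        self._next_block_id = 0
        self.hits = 0
        self.misses = 0

    def allocate_immutable_block(self, block_hash: int) -> int:
        cached_block_id = self._cached_blocks.get(block_hash)
        if cached_block_id is not None:
            self.hits += 1  # cache hit: reuse the cached physical block
            return cached_block_id
        self.misses += 1  # no cached block => allocate a new one
        block_id = self._next_block_id
        self._next_block_id += 1
        self._cached_blocks[block_hash] = block_id
        return block_id

# Repeating the same prefix hash hits the cache on the second request.
cache = TinyPrefixCache()
a = cache.allocate_immutable_block(block_hash=12345)
b = cache.allocate_immutable_block(block_hash=12345)
assert a == b and cache.hits == 1 and cache.misses == 1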
