
Commit

clean up
rickyyx committed Oct 30, 2024
1 parent 2cffdf9 commit 0321fa7
Showing 13 changed files with 91 additions and 284 deletions.
24 changes: 12 additions & 12 deletions benchmarks/benchmark_prefix_caching.py
@@ -132,18 +132,18 @@ def main(args):
filtered_datasets = [(PROMPT, prompt_len, args.output_len)
] * args.num_prompts

# llm = LLM(
# model=args.model,
# tokenizer_mode="auto",
# trust_remote_code=True,
# enforce_eager=True,
# use_v2_block_manager=args.use_v2_block_manager,
# tensor_parallel_size=args.tensor_parallel_size,
# enable_prefix_caching=args.enable_prefix_caching,
# disable_log_stats=False,
# max_num_batched_tokens=4096 * 2,
# enable_chunked_prefill=True,
# )
llm = LLM(
model=args.model,
tokenizer_mode="auto",
trust_remote_code=True,
enforce_eager=True,
use_v2_block_manager=args.use_v2_block_manager,
tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching,
disable_log_stats=False,
max_num_batched_tokens=4096 * 2,
enable_chunked_prefill=True,
)
engine_args = EngineArgs.from_cli_args(args)

llm = LLM(**dataclasses.asdict(engine_args))
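The cleanup above replaces the hard-coded (and commented-out) LLM construction with engine arguments parsed from the CLI. A minimal sketch of that pattern, using the same EngineArgs/LLM calls visible in the diff (the model name and flag values below are illustrative, not taken from the benchmark):

import dataclasses

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs

# Build engine arguments directly; the benchmark itself derives them
# from argparse via EngineArgs.from_cli_args(args).
engine_args = EngineArgs(
    model="facebook/opt-125m",  # illustrative model choice
    enable_prefix_caching=True,
    enable_chunked_prefill=True,
)

# Expand the dataclass into keyword arguments, exactly as the diff does.
llm = LLM(**dataclasses.asdict(engine_args))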
4 changes: 2 additions & 2 deletions tests/core/block/test_block_manager.py
@@ -235,7 +235,7 @@ def test_can_allocate_with_prefix_cache(
# Num blocks needed for 2 seqs, minus the number of blocks shared.
num_blocks_required_with_sharing = 2 * num_blocks_required_seq - num_blocks_shared

Check failure on line 236 (GitHub Actions / ruff, Python 3.8-3.12): Ruff (E501) tests/core/block/test_block_manager.py:236:81: Line too long (86 > 80)

block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
@@ -299,7 +299,7 @@ def test_can_append_with_prefix_cache(
print(f"num_gpu_blocks: {num_gpu_blocks}")

num_blocks_required_with_sharing = 2 * num_blocks_required_seq_1 - num_blocks_shared

Check failure on line 301 (GitHub Actions / ruff, Python 3.8-3.12): Ruff (E501) tests/core/block/test_block_manager.py:301:81: Line too long (88 > 80)
block_manager = BlockSpaceManagerV2(
block_manager = SelfAttnBlockSpaceManager(
block_size=block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=0,
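To make the block-sharing arithmetic in these tests concrete, here is a small worked example (values are illustrative and chosen so everything divides evenly):

block_size = 16
seq_len = 48            # tokens per sequence
shared_prefix_len = 32  # tokens shared between the two sequences

num_blocks_required_seq = seq_len // block_size      # 3 blocks per sequence
num_blocks_shared = shared_prefix_len // block_size  # 2 blocks in common

# Two sequences need 2 * 3 = 6 blocks without sharing; the 2 shared prefix
# blocks are stored only once, so 6 - 2 = 4 blocks suffice.
num_blocks_required_with_sharing = (
    2 * num_blocks_required_seq - num_blocks_shared)
assert num_blocks_required_with_sharing == 4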
4 changes: 0 additions & 4 deletions vllm/config.py
@@ -1074,10 +1074,6 @@ def __init__(self,
self.policy = policy
self._verify_args()

print(
f"max_num_batched_tokens: {self.max_num_batched_tokens}, max_num_seqs: {self.max_num_seqs}"
)

def _verify_args(self) -> None:
if (self.max_num_batched_tokens < self.max_model_len
and not self.chunked_prefill_enabled):
15 changes: 7 additions & 8 deletions vllm/core/block/block_table.py
@@ -164,7 +164,9 @@ def append_slots(

for i, token_block in enumerate(token_blocks):
if self._enable_prefix_caching:
block_hash: Optional[int] = seq.get_block_hash(first_block_idx + i)
block_hash: Optional[int] = seq.update_and_get_block_hash(
first_block_idx + i
)
else:
block_hash = None
self._blocks.append_token_ids(first_block_idx + i, token_block, block_hash)
@@ -286,7 +288,7 @@ def _allocate_blocks_for_token_ids(
if len(cur_token_ids) == self._block_size:
block_token_ids.append(cur_token_ids)
if self._enable_prefix_caching:
block_hashes.append(seq.get_block_hash(block_idx))
block_hashes.append(seq.update_and_get_block_hash(block_idx))
else:
block_hashes.append(None)
else:
@@ -308,12 +310,9 @@
assert len(tail_token_ids) == 1
assert block_hashes[-1] is None
cur_token_ids = tail_token_ids[0]
try:
block = self._allocator.allocate_mutable_block(
prev_block=prev_block, device=device
)
except Exception as e:
breakpoint()
block = self._allocator.allocate_mutable_block(
prev_block=prev_block, device=device
)
block.append_token_ids(cur_token_ids, block_hash=None)

blocks.append(block)
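The switch from get_block_hash to update_and_get_block_hash suggests the sequence now computes and memoizes block hashes on demand. A self-contained sketch of that idea, borrowing the chained-hash scheme from prefix_caching_block.py later in this commit (a hypothetical helper, not vLLM's actual Sequence API):

from typing import List, Optional

class ChainedBlockHashes:
    """Hypothetical per-sequence cache of chained per-block hashes."""

    def __init__(self, block_size: int, token_ids: List[int]) -> None:
        self.block_size = block_size
        self.token_ids = token_ids
        self.hashes: List[Optional[int]] = []

    def update_and_get_block_hash(self, block_idx: int) -> Optional[int]:
        while len(self.hashes) <= block_idx:
            self.hashes.append(None)
        if self.hashes[block_idx] is not None:
            return self.hashes[block_idx]  # memoized result
        start = block_idx * self.block_size
        block = self.token_ids[start:start + self.block_size]
        if len(block) < self.block_size:
            return None  # only full blocks are hashable
        is_first = block_idx == 0
        prev_hash = (None if is_first else
                     self.update_and_get_block_hash(block_idx - 1))
        if prev_hash is None and not is_first:
            return None  # previous block has no hash yet
        self.hashes[block_idx] = hash((is_first, prev_hash, *block))
        return self.hashes[block_idx]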
12 changes: 0 additions & 12 deletions vllm/core/block/common.py
@@ -196,25 +196,13 @@ def increase_pool(self):
allocator=self._allocator,
block_id=None))

# TODO(rickyx): This should take in kwargs for flexible initialization of different types of blocks
# Right now, we explicitly update blocks with other args after initialization, e.g. block_hash
# computed for the prefix caching block.
def init_block(
self,
prev_block: Optional[Block],
token_ids: List[int],
block_size: int,
physical_block_id: Optional[int],
) -> Block:
"""Initializes a block with the given parameters.
Args:
prev_block (Optional[Block]): The previous block in the sequence.
token_ids (List[int]): The token IDs to be stored in the block.
block_size (int): The size of the block.
physical_block_id (Optional[int]): The physical block ID.
block_hash (Optional[int]): The hash of the block's content.
"""
if len(self._free_ids) == 0:
self.increase_pool()
assert len(self._free_ids) > 0
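For context, init_block hands out blocks from a pre-grown free list. A stripped-down sketch of that pooling pattern (hypothetical names; the real BlockPool recycles Block objects rather than dicts):

from typing import Any, Dict, List

class TinyBlockPool:
    """Hypothetical minimal free-list pool in the spirit of BlockPool."""

    def __init__(self) -> None:
        self._pool: List[Dict[str, Any]] = []
        self._free_ids: List[int] = []

    def increase_pool(self) -> None:
        self._free_ids.append(len(self._pool))
        self._pool.append({})  # placeholder for a pre-allocated Block

    def init_block(self, token_ids: List[int]) -> Dict[str, Any]:
        if len(self._free_ids) == 0:
            self.increase_pool()
        assert len(self._free_ids) > 0
        block = self._pool[self._free_ids.pop()]
        block["token_ids"] = list(token_ids)  # (re)initialize in place
        return block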
2 changes: 1 addition & 1 deletion vllm/core/block/interfaces.py
@@ -99,7 +99,7 @@ def content_hash(self) -> Optional[int]:
return None

@abstractmethod
def set_content_hash(self, content_hash: int) -> None:
def set_content_hash(self, content_hash: Optional[int]) -> None:
pass


6 changes: 0 additions & 6 deletions vllm/core/block/naive_block.py
@@ -64,7 +64,6 @@ def allocate_immutable_block(
self,
prev_block: Optional[Block],
token_ids: List[int],
device: Optional[Device] = None,
block_hash: Optional[int] = None,
) -> Block:
"""Allocates a new immutable block with the given token IDs, linked to
@@ -79,7 +78,6 @@
Returns:
Block: The newly allocated immutable block.
"""
assert device is None
assert block_hash is None

block = self.allocate_mutable_block(prev_block=prev_block)
@@ -91,9 +89,7 @@ def allocate_immutable_blocks(
prev_block: Optional[Block],
block_token_ids: List[List[int]],
block_hashes: Optional[List[Optional[int]]] = None,
device: Optional[Device] = None,
) -> List[Block]:
assert device is None
num_blocks = len(block_token_ids)

block_ids = []
@@ -114,7 +110,6 @@ def allocate_immutable_blocks(
def allocate_mutable_block(
self,
prev_block: Optional[Block],
device: Optional[Device] = None,
block_hash: Optional[int] = None,
) -> Block:
"""Allocates a new mutable block, linked to the previous block.
@@ -127,7 +122,6 @@
Returns:
Block: The newly allocated mutable block.
"""
assert device is None
assert block_hash is None

block_id = self._allocate_block_id()
105 changes: 4 additions & 101 deletions vllm/core/block/prefix_caching_block.py
@@ -140,7 +140,6 @@ def allocate_immutable_block(
prev_block: Optional[Block],
token_ids: List[int],
block_hash: Optional[int] = None,
device: Optional[Device] = None,
) -> Block:
"""Allocates an immutable block with the given token IDs, reusing cached
blocks if possible.
@@ -153,7 +152,6 @@
Returns:
Block: The allocated immutable block.
"""
assert device is None
assert len(token_ids) == self._block_size, "An immutable block should be full"
assert (
block_hash is not None
@@ -163,9 +161,6 @@
cached_block_id = self._cached_blocks.get(block_hash, None)
if cached_block_id is not None:
# Initialize a block that points to cached data
# print(
# f"reuse block_hash={block_hash} from cached_block_id: {cached_block_id}"
# )
block: Block = self._block_pool.init_block(
prev_block=prev_block,
token_ids=token_ids,
@@ -177,9 +172,6 @@
self._incr_refcount_cached_block(block)
return block

# print(
# f"alloc from new block(block_hash: {block_hash}), get_num_free_blocks: {self.get_num_free_blocks()}"
# )
self.metric_data.query(hit=False)

# No cached block => Allocate a new block
@@ -192,7 +184,6 @@ def allocate_immutable_blocks(
prev_block: Optional[Block],
block_token_ids: List[List[int]],
block_hashes: Optional[List[int]] = None,

Check failure on line 186 (GitHub Actions / mypy, Python 3.8-3.12): Argument 3 of "allocate_immutable_blocks" is incompatible with supertype "BlockAllocator"; supertype defines the argument type as "Optional[List[Optional[int]]]" [override]
device: Optional[Device] = None,
) -> List[Block]:
blocks = []
assert (
@@ -204,7 +195,6 @@
prev_block=prev_block,
token_ids=token_ids,
block_hash=block_hash,
device=device,
)
blocks.append(prev_block)
return blocks
@@ -224,9 +214,6 @@ def allocate_mutable_block(self,
"""
assert device is None
assert_prefix_caching_block_or_none(prev_block)
# print(
# f"Allocating mutable block: get_num_free_blocks: {self.get_num_free_blocks()}"
# )
block_id = self._allocate_block_id()
block = self._block_pool.init_block(prev_block=prev_block,
token_ids=[],
@@ -297,7 +284,6 @@ def _allocate_block_id(self) -> BlockId:
"""First tries to allocate a block id from the hashless allocator,
and if there are no blocks, then tries to evict an unused cached block.
"""
# print(f"allocating block_id: get_num_free_blocks: {self.get_num_free_blocks()}")
hashless_block_id = self._maybe_allocate_hashless_block_id()
if hashless_block_id is not None:
return hashless_block_id
@@ -418,9 +404,7 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
assert device is None
# The number of free blocks is the number of hashless free blocks
# plus the number of blocks evictor could free from its list.
return self._hashless_allocator.get_num_free_blocks() + (
self.evictor.num_blocks
)
return self._hashless_allocator.get_num_free_blocks() + self.evictor.num_blocks

def get_num_total_blocks(self) -> int:
return self._hashless_allocator.get_num_total_blocks()
@@ -511,9 +495,6 @@ def cow_block_if_not_appendable(self, block: Block) -> BlockId:
return src_block_id

self._free_block_id(block)
# print(
# f"Allocating block for COW: get_num_free_blocks: {self.get_num_free_blocks()}"
# )
trg_block_id = self._allocate_block_id()

self._cow_tracker.record_cow(src_block_id, trg_block_id)
@@ -878,38 +859,6 @@ def token_ids(self) -> List[int]:
def prev_block(self) -> Optional[Block]:
return self._prev_block

# @property
# def content_hash(self) -> Optional[int]:
# """Return the content-based hash of the current block, or None if it is
# not yet defined.

# For the content-based hash to be defined, the current block must be
# full.
# """
# # If the hash is already computed, return it.
# if self._cached_content_hash is not None:
#     return self._cached_content_hash

# # We cannot compute a hash for the current block because it is not full.
# if not self.is_full:
# return None

# is_first_block = self._prev_block is None
# prev_block_hash = (
# None if is_first_block else
# self._prev_block.content_hash # type: ignore
# )

# # Previous block exists but does not yet have a hash.
# # Return no hash in this case.
# if prev_block_hash is None and not is_first_block:
# return None

# self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
# is_first_block,
# prev_block_hash,
# cur_block_token_ids=self.token_ids)
# return self._cached_content_hash

@property
def content_hash(self) -> Optional[int]:
return self._cached_content_hash
@@ -952,7 +901,9 @@ def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int],
assert (prev_block_hash is None) == is_first_block
return hash((is_first_block, prev_block_hash, *cur_block_token_ids))


# TODO(rickyx): This is no longer used. Alternatively, it could track
# cached blocks for a sequence, decoupling the sequence from the computed
# block hash calculation.
class ComputedBlocksTracker:
"""Handles caching of per-sequence computed block ids.
When a sequence appears for the first time, it traverses all of the
@@ -989,54 +940,6 @@ def remove_seq(self, seq_id: int) -> None:
assert seq_id in self._cached_computed_seq_blocks
del self._cached_computed_seq_blocks[seq_id]

def get_cached_computed_blocks_and_update(
self, seq_id: int, block_ids: List[int]) -> List[int]:
""" Look at the class documentation for details
"""
# Ensure seq_id is already tracked
assert seq_id in self._cached_computed_seq_blocks

# Get cached data (may be empty on the first time)
prev_computed_block_ids, has_gap = self._cached_computed_seq_blocks[
seq_id]

if has_gap:
# When gap is detected, we do not add more computed blocks at this
# sequence iteration
return prev_computed_block_ids

# We do not consider the last block id for caching purposes.
num_cur_blocks = len(block_ids) - 1
assert num_cur_blocks >= 0

if len(prev_computed_block_ids) >= num_cur_blocks:
# Cache HIT
assert len(prev_computed_block_ids) == num_cur_blocks
return prev_computed_block_ids

# If here, then we may possibly add more computed blocks. As a result,
# traverse the additional blocks after prev_computed_block_ids to
# detect more computed blocks and add them.

# Incremental init for seq_id => Look only at the new blocks
computed_block_ids = self._allocator.get_computed_block_ids( # noqa: E501
prev_computed_block_ids,
block_ids,
skip_last_block_id=
True, # We skip last block id to avoid caching of full seq
)

# QQ(rickyx): why is it possible to actually have a gap?

# Detect if there is a "gap"
has_gap = len(computed_block_ids) < num_cur_blocks

# Record
self._cached_computed_seq_blocks[seq_id] = (computed_block_ids,
has_gap)

return computed_block_ids


class LastAccessBlocksTracker:
"""Manages the last access time of the tracked sequences, in order to allow
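Taken together, the surviving allocator logic in this file is a lookup-or-allocate flow keyed by the chained block hash. A self-contained sketch of that flow (a deliberate simplification, not the vLLM class; refcounting and eviction are omitted):

from typing import Dict

class TinyPrefixCache:
    """Hypothetical cache keyed by block hash, mirroring the hit/miss
    paths of allocate_immutable_block above."""

    def __init__(self) -> None:
        self._cached_blocks: Dict[int, int] = {}  # block_hash -> block_id
        self._next_block_id = 0
        self.hits = 0
        self.misses = 0

    def allocate_immutable_block(self, block_hash: int) -> int:
        cached_block_id = self._cached_blocks.get(block_hash)
        if cached_block_id is not None:
            self.hits += 1  # cache hit: reuse the cached physical block
            return cached_block_id
        self.misses += 1  # no cached block => allocate a new one
        block_id = self._next_block_id
        self._next_block_id += 1
        self._cached_blocks[block_hash] = block_id
        return block_id

# Repeating the same prefix hash hits the cache on the second request.
cache = TinyPrefixCache()
a = cache.allocate_immutable_block(block_hash=12345)
b = cache.allocate_immutable_block(block_hash=12345)
assert a == b and cache.hits == 1 and cache.misses == 1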
