diff --git a/vllm/core/block/cpu_offloading_block_allocator.py b/vllm/core/block/cpu_offloading_block_allocator.py index 41a6e94c4e58c..93ebbd479ff00 100644 --- a/vllm/core/block/cpu_offloading_block_allocator.py +++ b/vllm/core/block/cpu_offloading_block_allocator.py @@ -109,10 +109,13 @@ def __init__(self, cpu_block_allocator: PrefixCachingBlockAllocator, """ GPU block should only be in one of the following three status: uncached: allocated blocks that didn't hit any cache - cached: allocated blocks that hit the cache, either in GPU or in CPU + cached: allocated blocks that are cached, either in GPU or in CPU free: the blocks are not allocated by block allocator - block allocator will automatically track free blocks, and we don't need - to specially handle cached blocks. So we only track uncached blocks + This implementation aims to transform uncached blocks to cached blocks + by performing a GPU-to-CPU copy when calling `get_and_reset_swaps` + + The block allocator automatically tracks free blocks, and we don't + need to specially handle cached blocks, so we only track uncached blocks """ self._uncached_blocks: Deque[Block] = deque() """