@@ -98,9 +98,9 @@ def test_prefill():
98
98
# Incomplete 1 block (6 tokens)
99
99
unique_token_ids = [3 ] * 6
100
100
req2 = make_request ("2" , common_token_ids + unique_token_ids )
101
- computed_block = manager .get_computed_blocks (req2 )
101
+ computed_blocks = manager .get_computed_blocks (req2 )
102
102
assert len (req2 .kv_block_hashes ) == 3
103
- assert [b .block_id for b in computed_block ] == [0 , 1 , 2 ]
103
+ assert [b .block_id for b in computed_blocks ] == [0 , 1 , 2 ]
104
104
num_new_tokens = 53 - 3 * 16
105
105
blocks = manager .allocate_slots (req2 , num_new_tokens , computed_blocks )
106
106
assert [b .block_id for b in blocks ] == [7 , 8 ]
@@ -500,3 +500,62 @@ def test_mm_prefix_caching():
500
500
mm_hashes = mm_hashes )
501
501
computed_blocks = manager .get_computed_blocks (req1 )
502
502
assert len (computed_blocks ) == 3
503
+
504
+
505
def test_prefill_not_enough_free_blocks_with_computed_blocks():
    """
    Check that a failed allocate_slots call leaves computed blocks untouched.

    When a request has computed (prefix-cached) blocks but cannot be
    allocated because there are not enough free blocks, allocate_slots
    returns None and the ref_cnt of the computed blocks must not change.
    """
    block_size = 16
    # Number of full blocks covered by one copy of the common prefix.
    num_common_blocks = 3
    num_common_tokens = num_common_blocks * block_size
    manager = KVCacheManager(
        block_size=block_size,
        num_gpu_blocks=10,
        max_model_len=8192,
        sliding_window=None,
        enable_caching=True,
        num_preallocate_tokens=0,
    )
    # Complete 3 blocks (48 tokens)
    # | Common-0 | Common-1 | Common-2 | ... |
    common_token_ids = [
        i for i in range(num_common_blocks) for _ in range(block_size)
    ]
    req0 = make_request("0", common_token_ids)
    computed_blocks = manager.get_computed_blocks(req0)
    assert not computed_blocks
    manager.allocate_slots(req0, num_common_tokens, computed_blocks)
    block_part0 = manager.req_to_blocks[req0.request_id]

    # Req1 is the common prefix twice: its computed blocks are the ones
    # Req0 holds, and the second copy of the tokens is newly allocated.
    # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... |
    req1 = make_request("1", common_token_ids * 2)
    computed_blocks = manager.get_computed_blocks(req1)
    assert computed_blocks == block_part0
    manager.allocate_slots(req1, num_common_tokens, computed_blocks)
    block_part1 = manager.req_to_blocks[req1.request_id]
    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
    # | Req1-5(F)| ... |
    manager.free(req1)
    # Req0 was never freed, so the shared blocks keep one reference.
    assert {block.ref_cnt for block in block_part1[:3]} == {1}
    # The blocks only Req1 referenced are released.
    assert {block.ref_cnt for block in block_part1[3:]} == {0}

    # Req2 shares no tokens with the earlier requests.
    # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) |
    # | Req1-5(F)| Req2-0   | Req2-1   | ... |
    req2 = make_request("2", [7] * block_size * 2)
    computed_blocks = manager.get_computed_blocks(req2)
    assert not computed_blocks
    manager.allocate_slots(req2, block_size * 2, computed_blocks)

    # Req3 is Req1 + 3 new blocks, so the first 6 blocks are computed,
    # but it cannot be allocated due to insufficient free blocks (2).
    # NOTE(review): presumably 2 is what would remain of the 5 free blocks
    # once the 3 freed-but-cached Req1 blocks are reused -- confirm against
    # KVCacheManager.allocate_slots.
    # In this case, the ref_cnt of the computed blocks should not be changed.
    assert manager.free_block_queue.num_free_blocks == 5
    req3 = make_request("3", common_token_ids * 3)
    computed_blocks = manager.get_computed_blocks(req3)
    assert computed_blocks == block_part1
    # Req3 cannot be allocated.
    assert manager.allocate_slots(req3, num_common_tokens,
                                  computed_blocks) is None
    # Blocks 0-2 are still used only by Req0 (Req1 was freed above).
    assert {block.ref_cnt for block in block_part1[:3]} == {1}
    # Blocks 3-5 are still free.
    assert {block.ref_cnt for block in block_part1[3:]} == {0}
0 commit comments