diff --git a/vllm/config.py b/vllm/config.py index 9ecd3e72afa9f..307cf9c8d5b2a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -917,6 +917,10 @@ def _verify_args(self) -> None: raise ValueError( "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") + if (current_platform.is_cuda() and self.block_size is not None + and self.block_size > 32): + raise ValueError("CUDA Paged Attention kernel only supports " + f"block sizes up to 32. Got {self.block_size}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 674577f23eba6..64cc4592c2861 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -424,10 +424,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 64, 128], help='Token block size for contiguous chunks of ' 'tokens. This is ignored on neuron devices and ' - 'set to max-model-len') + 'set to max-model-len. On CUDA devices, ' + 'only block sizes up to 32 are supported. ' + 'On HPU devices, block size defaults to 128.') parser.add_argument( "--enable-prefix-caching",