diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index e63be184af16a..5958489a2f736 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1040,8 +1040,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                     self.parallel_config.pipeline_parallel_size):
                 for batch_size in reversed(batch_size_capture_list):
                     if self.attn_backend.get_name() == "flashinfer":
-                        indptr_buffer = indptr_buffer[:batch_size + 1]
-                        last_page_len_buffer = last_page_len_buffer[:
+                        _indptr_buffer = indptr_buffer[:batch_size + 1]
+                        _last_page_len_buffer = last_page_len_buffer[:
                             batch_size]
 
                         num_qo_heads = (
@@ -1055,8 +1055,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                             use_tensor_cores = False
                         decode_wrapper = \
                             CUDAGraphBatchDecodeWithPagedKVCacheWrapper(
-                            decode_workspace_buffer, indptr_buffer,
-                            indices_buffer, last_page_len_buffer, "NHD",
+                            decode_workspace_buffer, _indptr_buffer,
+                            indices_buffer, _last_page_len_buffer, "NHD",
                             use_tensor_cores)
                         kv_cache_dtype = get_kv_cache_torch_dtype(
                             self.kv_cache_dtype, self.model_config.dtype)
@@ -1131,10 +1131,10 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None:
                         self.model, self.attn_backend.get_name())
 
                     if self.attn_backend.get_name() == "flashinfer":
-                        graph_runner.flashinfer_indptr_buffer = indptr_buffer
+                        graph_runner.flashinfer_indptr_buffer = _indptr_buffer
                         graph_runner.flashinfer_indices_buffer = indices_buffer
                         graph_runner.flashinfer_last_page_len_buffer = \
-                            last_page_len_buffer
+                            _last_page_len_buffer
                         graph_runner.flashinfer_decode_workspace_buffer = \
                             decode_workspace_buffer
                         graph_runner.flashinfer_decode_wrapper = \
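
For context on why the slices get new names: the hunk sits inside the loop over virtual engines, so the batch-size loop can run more than once when pipeline parallelism is enabled. Re-binding indptr_buffer / last_page_len_buffer to their own slices leaves later passes slicing an already-shrunk tensor. A minimal sketch of that aliasing pattern, with illustrative names and sizes that are not part of the patch:

import torch

# Hypothetical stand-in for the flashinfer indptr buffer, sized for the
# largest capturable batch (8 here).
full_indptr = torch.zeros(9, dtype=torch.int32)

# Buggy pattern: the full buffer's name is re-bound to its own slice.
buf = full_indptr
for _ in range(2):  # e.g. two virtual engines (pipeline parallelism)
    for batch_size in reversed([1, 2, 4, 8]):
        buf = buf[:batch_size + 1]
# After the first pass buf has shrunk to 2 elements, so the second pass
# hands out a 2-element "indptr" even for batch_size == 8.
print(buf.numel())  # 2

# Patched pattern: always slice the untouched full buffer into a new name.
for _ in range(2):
    for batch_size in reversed([1, 2, 4, 8]):
        _buf = full_indptr[:batch_size + 1]
        assert _buf.numel() == batch_size + 1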