diff --git a/tests/kernels/test_uva.py b/tests/kernels/test_uva.py
index 1f8a174a0f066..129b2e39b555c 100644
--- a/tests/kernels/test_uva.py
+++ b/tests/kernels/test_uva.py
@@ -55,7 +55,6 @@ def test_gpu_write(device):
     cuda_view[4, 5] = -1
     cuda_view.mul_(2)
 
-    torch.cuda.synchronize()
     assert cpu_tensor[0, 0] == 2
     assert cpu_tensor[2, 3] == 4
     assert cpu_tensor[4, 5] == -2
diff --git a/vllm/v1/worker/gpu_block_table.py b/vllm/v1/worker/gpu_block_table.py
index 9e0f1ac7201cc..3af85aa1e9b26 100644
--- a/vllm/v1/worker/gpu_block_table.py
+++ b/vllm/v1/worker/gpu_block_table.py
@@ -5,6 +5,9 @@
 
 from vllm import _custom_ops as ops
 from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class BlockTable:
@@ -39,6 +42,8 @@ def __init__(
         # UVA requires pinned memory.
         self.use_uva = is_uva_available() and pin_memory
         if self.use_uva:
+            logger.info("Using Unified Virtual Addressing (UVA) for block "
+                        "table transfer.")
             self.block_table_diff = torch.zeros((max_num_reqs, 2),
                                                 dtype=torch.int32,
                                                 device="cpu",
@@ -49,6 +54,10 @@ def __init__(
                 self.block_table_cpu)
             self.block_table_diff_cuda_view = get_cuda_view_from_cpu_tensor(
                 self.block_table_diff)
+        else:
+            logger.warning("Unified Virtual Addressing (UVA) is not supported "
+                           "in the current environment. This may result in "
+                           "lower performance.")
 
     def add_row(self, row_idx: int, block_ids: List[int]) -> None:
         num_blocks = len(block_ids)
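
For reference, the mechanism this patch builds on can be sketched as follows. This is a minimal illustration, not part of the patch: it assumes a CUDA-capable environment where `is_uva_available()` returns `True`, and the tensor shape and values are arbitrary. It mirrors what `test_gpu_write` exercises, and is why `gpu_block_table.py` only enables the UVA path when `pin_memory` is set.

```python
# Minimal sketch (assumed setup, not from the patch): a pinned (page-locked)
# CPU tensor is mapped into the GPU address space, so a CUDA view aliases the
# same memory and GPU writes become visible on the CPU without an explicit
# copy.
import torch

from vllm.utils import get_cuda_view_from_cpu_tensor, is_uva_available

assert is_uva_available(), "UVA needs CUDA plus pinned host memory"

# UVA requires pinned memory, hence pin_memory=True.
cpu_tensor = torch.zeros((4, 4),
                         dtype=torch.int32,
                         device="cpu",
                         pin_memory=True)

# CUDA view that aliases cpu_tensor's storage.
cuda_view = get_cuda_view_from_cpu_tensor(cpu_tensor)

# Write through the CUDA view on the GPU ...
cuda_view[1, 2] = 7
torch.cuda.synchronize()  # conservative: wait for the device write to land

# ... and observe it on the CPU without any explicit copy.
assert cpu_tensor[1, 2] == 7
```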