diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index ffe8de0ba0849..383a10e5b96b4 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -270,9 +270,6 @@ def test_with_prefix_caching(
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
 @pytest.mark.parametrize("enforce_eager", [False])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
@@ -307,9 +304,6 @@ def test_models_cpu(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index 00b72f893e78d..3d025df26a7a1 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -294,7 +294,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             prefill_block_tables = make_tensor_with_pad(
                 self.input_data.prefill_block_tables,
                 pad=0,
-                dtype=torch.int,
+                dtype=torch.int32,
                 device="cpu",
             )
             query_lens_tensor = torch.tensor(prefill_query_lens,
@@ -330,13 +330,13 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         if input_data.num_decode_tokens != 0:
             seq_lens_tensor = torch.tensor(
                 input_data.seq_lens[input_data.num_prefills:],
-                dtype=torch.int,
+                dtype=torch.int32,
                 device="cpu",
             )
             block_tables = make_tensor_with_pad(
                 self.input_data.decode_block_tables,
                 pad=0,
-                dtype=torch.int,
+                dtype=torch.int32,
                 device="cpu",
             )
         else:
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 7d566c45ac2a9..66bd844c94901 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -295,7 +295,7 @@ def _compute_prompt_input_tokens(self, data: ModelInputData,
                 slot_mapping[i] = slot
 
         data.slot_mapping.extend(slot_mapping)
 
-        # The MRPOE positions are prepared in _compute_multi_modal_input
+        # The MROPE positions are prepared in _compute_multi_modal_input
         if data.input_positions is not None:
            data.input_positions.extend(token_positions)
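
For context on the `dtype=torch.int` → `dtype=torch.int32` change in `torch_sdpa.py`: in PyTorch, `torch.int` is an alias of `torch.int32`, so the change makes the dtype explicit rather than altering behavior. A minimal standalone sketch (not part of the diff) illustrating the equivalence:

```python
import torch

# torch.int is an alias of torch.int32, so both spellings yield the same dtype.
assert torch.int == torch.int32

a = torch.tensor([1, 2, 3], dtype=torch.int)
b = torch.tensor([1, 2, 3], dtype=torch.int32)
assert a.dtype == b.dtype == torch.int32
```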