
Commit f9812e2: fix comments
Signed-off-by: jiang1.li <[email protected]>
bigPYJ1151 committed Nov 20, 2024
1 parent 883c6f5 commit f9812e2
Showing 3 changed files with 4 additions and 10 deletions.
6 changes: 0 additions & 6 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -270,9 +270,6 @@ def test_with_prefix_caching(
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
 @pytest.mark.parametrize("enforce_eager", [False])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"])
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
@@ -307,9 +304,6 @@ def test_models_cpu(
 @pytest.mark.parametrize("max_tokens", [16])
 @pytest.mark.parametrize("enforce_eager", [False])
 @pytest.mark.parametrize("chunk_size", [30, 32])
-# NOTE: Increasing this in this suite will fail CI because we currently cannot
-# reset distributed env properly. Use a value > 1 just when you test.
-@pytest.mark.parametrize("tensor_parallel_size", [1])
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.cpu_model
 @pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only")
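A note on the deleted parametrize axis: stacked @pytest.mark.parametrize decorators generate the Cartesian product of their value lists, so removing a one-element tensor_parallel_size axis leaves the generated test matrix unchanged. A minimal standalone sketch of that multiplication (the test name and parameters here are illustrative, not from this repo):

import pytest

# Stacked parametrize decorators multiply: 1 max_tokens value x 3 chunk
# sizes = 3 generated test cases; a one-element axis multiplies by 1.
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
def test_matrix_size(max_tokens, chunked_prefill_token_size):
    assert max_tokens == 32
    assert chunked_prefill_token_size in (1, 4, 16)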
6 changes: 3 additions & 3 deletions vllm/attention/backends/torch_sdpa.py
@@ -294,7 +294,7 @@ def build(self, seq_lens: List[int], query_lens: List[int],
             prefill_block_tables = make_tensor_with_pad(
                 self.input_data.prefill_block_tables,
                 pad=0,
-                dtype=torch.int,
+                dtype=torch.int32,
                 device="cpu",
             )
             query_lens_tensor = torch.tensor(prefill_query_lens,
@@ -330,13 +330,13 @@ def build(self, seq_lens: List[int], query_lens: List[int],
         if input_data.num_decode_tokens != 0:
             seq_lens_tensor = torch.tensor(
                 input_data.seq_lens[input_data.num_prefills:],
-                dtype=torch.int,
+                dtype=torch.int32,
                 device="cpu",
             )
             block_tables = make_tensor_with_pad(
                 self.input_data.decode_block_tables,
                 pad=0,
-                dtype=torch.int,
+                dtype=torch.int32,
                 device="cpu",
             )
         else:
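For context on the dtype edits above: torch.int is a documented PyTorch alias for torch.int32, so switching to torch.int32 does not change behavior; it only makes the integer width of the block-table and sequence-length tensors explicit. make_tensor_with_pad, as called above, pads the ragged per-sequence block tables with pad=0 into one rectangular CPU tensor. A quick check of the alias:

import torch

# torch.int and torch.int32 compare equal: the diff above is a
# readability change, not a behavioral one.
assert torch.int == torch.int32
t = torch.zeros(2, 3, dtype=torch.int)
print(t.dtype)  # prints: torch.int32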
2 changes: 1 addition & 1 deletion vllm/worker/cpu_model_runner.py
@@ -295,7 +295,7 @@ def _compute_prompt_input_tokens(self, data: ModelInputData,
                 slot_mapping[i] = slot
             data.slot_mapping.extend(slot_mapping)

-            # The MRPOE positions are prepared in _compute_multi_modal_input
+            # The MROPE positions are prepared in _compute_multi_modal_input
             if data.input_positions is not None:
                 data.input_positions.extend(token_positions)

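For readers unfamiliar with the acronym being corrected: MROPE refers to M-RoPE, the multimodal rotary position embedding used by models such as Qwen2-VL, where each token carries one position index per section (for example temporal, height, and width) rather than a single scalar index. A hedged illustration of that position layout, not vLLM's implementation:

import torch

# Illustration only: M-RoPE keeps one row of positions per section.
# For a text-only prompt the sections typically share the same indices.
num_tokens = 4
mrope_positions = torch.arange(num_tokens).unsqueeze(0).expand(3, -1)
print(mrope_positions)
# tensor([[0, 1, 2, 3],
#         [0, 1, 2, 3],
#         [0, 1, 2, 3]])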
