Merge remote-tracking branch 'upstream/main' into skip-lm-head
* upstream/main:
  [Build] Temporarily Disable Kernels and LoRA tests (vllm-project#6961)
  [core][misc] improve free_finished_seq_groups (vllm-project#6865)
  [Kernel] Remove scaled_fp8_quant kernel padding footgun (vllm-project#6842)
  [Bugfix] Fix tensorizer memory profiling bug during testing (vllm-project#6881)
  [OpenVINO] Updated OpenVINO requirements and build docs (vllm-project#6948)
  [Kernel] Squash a few more warnings (vllm-project#6914)
  [BugFix] Fix use of per-request seed with pipeline parallel (vllm-project#6698)
  [Doc] Super tiny fix doc typo (vllm-project#6949)
tjohnson31415 committed Jul 30, 2024
2 parents fd11ac1 + 40c27a7 commit ee83917
Showing 36 changed files with 422 additions and 284 deletions.
40 changes: 20 additions & 20 deletions .buildkite/test-pipeline.yaml
@@ -155,12 +155,12 @@ steps:
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

-- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  commands:
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+# - label: Kernels Test %N
+#   #mirror_hardwares: [amd]
+#   commands:
+#   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+#   - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 4

- label: Models Test
#mirror_hardwares: [amd]
@@ -202,20 +202,20 @@ steps:
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode

-- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
-- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_long_context.py
+# - label: LoRA Test %N
+#   #mirror_hardwares: [amd]
+#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+#   parallelism: 4
+
+# - label: LoRA Long Context (Distributed)
+#   #mirror_hardwares: [amd]
+#   num_gpus: 4
+#   # This test runs llama 13B, so it is required to run on 4 GPUs.
+#   commands:
+#   # FIXIT: find out which code initialize cuda before running the test
+#   # before the fix, we need to use spawn to test it
+#   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#   - pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
#mirror_hardwares: [amd]
4 changes: 2 additions & 2 deletions Dockerfile.openvino
@@ -1,7 +1,7 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

-FROM ubuntu:20.04 AS dev
+FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
@@ -18,7 +18,7 @@ COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
-RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
4 changes: 2 additions & 2 deletions csrc/attention/attention_kernels.cu
@@ -706,7 +706,7 @@ void paged_attention_v1_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
@@ -865,7 +865,7 @@ void paged_attention_v2_launcher(
int kv_block_stride = key_cache.stride(0);
int kv_head_stride = key_cache.stride(1);

-  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
assert(head_size % thread_group_size == 0);

// NOTE: alibi_slopes is optional.
2 changes: 0 additions & 2 deletions csrc/quantization/aqlm/gemm_kernels.cu
@@ -273,8 +273,6 @@ __global__ void Code2x8Dequant(
}
__syncthreads();

-  float res = 0;
-
int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
while (iters--) {
if (pred && a_gl_rd < a_gl_end) {
2 changes: 2 additions & 0 deletions csrc/quantization/fp8/amd/quant_utils.cuh
@@ -526,6 +526,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -536,6 +537,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

// The following macro is used to dispatch the conversion function based on
2 changes: 2 additions & 0 deletions csrc/quantization/fp8/nvidia/quant_utils.cuh
@@ -508,6 +508,7 @@ __inline__ __device__ Tout convert(const Tin& x) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
@@ -520,6 +521,7 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale) {
}
#endif
assert(false);
+  return {}; // Squash missing return statement warning
}

// The following macro is used to dispatch the conversion function based on
3 changes: 2 additions & 1 deletion csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -203,7 +203,8 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
#endif
mat.data_ptr<int>(),
#ifndef USE_ROCM
-      (half2*)mul.data<at::Half>(), (__half*)lookup_table.data_ptr<at::Half>(),
+      (half2*)mul.data_ptr<at::Half>(),
+      (__half*)lookup_table.data_ptr<at::Half>(),
#else
(float2*)mul.data_ptr<float>(),
(__half*)lookup_table.data_ptr<at::Half>(),
2 changes: 1 addition & 1 deletion docs/source/getting_started/openvino-installation.rst
@@ -57,7 +57,7 @@ Install from source

.. code-block:: console
-   $ PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
+   $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
.. _openvino_backend_performance_tips:

28 changes: 27 additions & 1 deletion requirements-openvino.txt
@@ -1,7 +1,33 @@
# Common dependencies
--r requirements-common.txt
+# -r requirements-common.txt
+# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
+cmake >= 3.21
+ninja # For faster builds.
+psutil
+sentencepiece # Required for LLaMA tokenizer.
+numpy < 2.0.0
+requests
+tqdm
+py-cpuinfo
+transformers < 4.43
+tokenizers >= 0.19.1 # Required for Llama 3.
+fastapi
+aiohttp
+openai
+uvicorn[standard]
+pydantic >= 2.0 # Required for OpenAI server.
+pillow # Required for image processing
+prometheus_client >= 0.18.0
+prometheus-fastapi-instrumentator >= 7.0.0
+tiktoken >= 0.6.0 # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
+pyzmq

# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
+optimum-intel[openvino] >= 1.18.1
2 changes: 1 addition & 1 deletion tests/quantization/test_fp8.py
@@ -123,7 +123,7 @@ def per_tensor_dequantize(tensor, inv_scale, dtype):
assert torch.allclose(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

# Padding
-    y, _ = ops.scaled_fp8_quant(x, inv_scale, batch_dim_padding=17)
+    y, _ = ops.scaled_fp8_quant(x, inv_scale, num_token_padding=17)
assert y.shape[0] == 17
assert torch.allclose(
ref_y,
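For context on the rename above, here is a minimal sketch of the new num_token_padding argument, assuming the `from vllm import _custom_ops as ops` import used by this test file and the (quantized, scale) return convention shown in the hunk; shapes and values are illustrative only:

import torch
from vllm import _custom_ops as ops

x = torch.randn(4, 16, dtype=torch.float16, device="cuda")
scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")

# The output's first (token) dimension is padded up to at least 17 rows;
# the parameter was renamed from batch_dim_padding to num_token_padding.
y, _ = ops.scaled_fp8_quant(x, scale, num_token_padding=17)
assert y.shape[0] == 17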
23 changes: 8 additions & 15 deletions tests/samplers/test_rejection_sampler.py
@@ -150,10 +150,9 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
high=vocab_size,
size=(batch_size, k),
dtype=torch.int64)
-    generators = [None] * batch_size

rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                      draft_token_ids, generators)
+                      draft_token_ids)


@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0])
@@ -185,14 +184,13 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,

results = []
for _ in range(n_rep):
-        generators = [
-            torch.Generator(
-                device=device).manual_seed(i) if seeded_mask[i] else None
-            for i in range(batch_size)
-        ]
+        seeded_seqs = {
+            i: torch.Generator(device=device).manual_seed(i)
+            for i in range(batch_size) if seeded_mask[i]
+        }
results.append(
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                              draft_token_ids, generators))
+                              draft_token_ids, seeded_seqs))

for i in range(batch_size):
if seeded_mask[i]:
@@ -242,11 +240,10 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
raise AssertionError()

oob_token_ids[0][0] = rogue_token_id
-    generators = [None] * batch_size

with pytest.raises(AssertionError):
rejection_sampler(target_probs, bonus_token_ids, draft_probs,
-                          draft_token_ids, generators)
+                          draft_token_ids)


@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@@ -417,15 +414,11 @@ def _estimate_rejection_sampling_pdf(
dtype=torch.int64,
device="cuda").repeat(num_samples, 1)

-        # unseeded
-        generators = [None]
-
# Get output tokens via rejection sampling.
output_token_ids = self.rejection_sampler(target_probs.to("cuda"),
bonus_token_ids.to("cuda"),
draft_probs.to("cuda"),
draft_token_ids.to("cuda"),
generators)
draft_token_ids.to("cuda"))

# Remove bonus tokens
output_token_ids = output_token_ids[:, :-1].flatten()
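Taken together, these test edits track an interface change in the rejection sampler: the per-sequence list of optional generators becomes an optional seeded_seqs dict keyed by batch index, and unseeded calls simply omit the argument. A hedged sketch of the new call shape, assuming the RejectionSampler import path and init_gpu_tensors setup these tests use elsewhere, and that unnormalized probabilities are acceptable outside strict mode:

import torch
from vllm.model_executor.layers.rejection_sampler import RejectionSampler

k, vocab_size, batch_size = 3, 32, 4
sampler = RejectionSampler()
sampler.init_gpu_tensors(rank=0)  # allocate acceptance-count tensors on GPU 0

target_probs = torch.rand(batch_size, k, vocab_size, device="cuda")
draft_probs = torch.rand(batch_size, k, vocab_size, device="cuda")
bonus_token_ids = torch.randint(0, vocab_size, (batch_size, 1),
                                dtype=torch.int64, device="cuda")
draft_token_ids = torch.randint(0, vocab_size, (batch_size, k),
                                dtype=torch.int64, device="cuda")

# Seed only sequence 0; every other sequence uses the default RNG stream.
seeded_seqs = {0: torch.Generator(device="cuda").manual_seed(42)}
out = sampler(target_probs, bonus_token_ids, draft_probs,
              draft_token_ids, seeded_seqs)

# An unseeded call now just drops the trailing argument.
out = sampler(target_probs, bonus_token_ids, draft_probs, draft_token_ids)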
5 changes: 4 additions & 1 deletion tests/samplers/test_sampler.py
@@ -510,13 +510,16 @@ def test_sampler_mixed(seed: int, device: str):
))
seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

+    generators: Dict[str, torch.Generator] = {}
+
def test_sampling():
sampling_metadata = SamplingMetadata.prepare(
seq_group_metadata_list,
seq_lens,
query_lens=seq_lens,
device=device,
-            pin_memory=is_pin_memory_available())
+            pin_memory=is_pin_memory_available(),
+            generators=generators)
sampler_output = sampler(logits=fake_logits,
sampling_metadata=sampling_metadata)

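The generators dict added above is keyed by request id, letting a seeded request reuse one torch.Generator across decode steps. A self-contained sketch of that pattern (the helper name and request id are hypothetical, not vLLM API):

from typing import Dict, Optional

import torch

generators: Dict[str, torch.Generator] = {}

def get_generator(request_id: str, seed: Optional[int],
                  device: str = "cpu") -> Optional[torch.Generator]:
    # Unseeded requests return None and fall back to the global RNG stream.
    if seed is None:
        return None
    # A seeded request gets one generator, created on first use and reused
    # on every later step, so its sampling stays deterministic end to end.
    if request_id not in generators:
        generators[request_id] = torch.Generator(device=device).manual_seed(seed)
    return generators[request_id]

gen = get_generator("req-0", seed=1234)
tokens = torch.multinomial(torch.ones(8), num_samples=2, generator=gen)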
54 changes: 53 additions & 1 deletion tests/spec_decode/e2e/test_mlp_correctness.py
@@ -21,7 +21,8 @@

import pytest

-from .conftest import run_greedy_equality_correctness_test
+from .conftest import (run_equality_correctness_test,
+                       run_greedy_equality_correctness_test)

# main model
MAIN_MODEL = "JackFram/llama-160m"
@@ -77,6 +78,57 @@ def test_mlp_e2e_greedy_correctness(baseline_llm_generator, test_llm_generator,
force_output_len=True)


+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True,
+
+        # Print spec metrics.
+        "disable_log_stats": False,
+
+        # Precision
+        "dtype": PRECISION,
+
+        # Main model
+        "model": MAIN_MODEL,
+
+        # Speculative model
+        "speculative_model": SPEC_MODEL,
+    }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}])
+@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}])
+@pytest.mark.parametrize("output_len", [64])
+@pytest.mark.parametrize("batch_size", [1, 32])
+@pytest.mark.parametrize("temperature", [0.1, 1.0])
+@pytest.mark.parametrize("seed", [None])
+def test_mlp_e2e_seeded_correctness(baseline_llm_generator, test_llm_generator,
+                                    batch_size: int, output_len: int,
+                                    temperature: float):
+    """Verify seeded runs produce the same output."""
+    run_equality_correctness_test(baseline_llm_generator,
+                                  test_llm_generator,
+                                  batch_size,
+                                  max_output_len=output_len,
+                                  temperature=temperature,
+                                  seeded=True,
+                                  force_output_len=True)
+
+    # Ensure this same test does fail if we _don't_ include per-request seeds
+    with pytest.raises(AssertionError):
+        run_equality_correctness_test(baseline_llm_generator,
+                                      test_llm_generator,
+                                      batch_size,
+                                      max_output_len=output_len,
+                                      temperature=temperature,
+                                      seeded=False,
+                                      force_output_len=True)
+

@pytest.mark.parametrize(
"common_llm_kwargs",
[{
2 changes: 1 addition & 1 deletion tests/spec_decode/e2e/test_seed.py
@@ -29,7 +29,7 @@
"output_len",
[
# Use smaller output len for fast test.
-        10,
+        20,
])
@pytest.mark.parametrize("seed", [None])
def test_seeded_consistency(baseline_llm_generator, test_llm_generator,
1 change: 1 addition & 0 deletions tests/spec_decode/test_batch_expansion.py
@@ -86,6 +86,7 @@ def test_create_single_target_seq_group_metadata(k: int):
input_seq_id,
target_seq_id,
token_ids,
+        input_seq_group_metadata.sampling_params,
)

assert output.request_id == input_seq_group_metadata.request_id
