Merge branch 'main' into torch-compile-addtional-test
youkaichao committed Nov 1, 2024
2 parents fac4b3f + 566cd27 commit 997f705
Showing 40 changed files with 659 additions and 192 deletions.
4 changes: 2 additions & 2 deletions .buildkite/run-amd-test.sh
@@ -31,8 +31,8 @@ cleanup_docker() {
     echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
     # Remove dangling images (those that are not tagged and not used by any container)
     docker image prune -f
-    # Remove unused volumes
-    docker volume prune -f
+    # Remove unused volumes / force the system prune for old images as well.
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
     echo "Docker images and volumes cleanup completed."
   else
     echo "Disk usage is below $threshold%. No cleanup needed."
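Note on the change above: `docker system prune --force --filter "until=72h" --all` additionally removes stopped containers, unused networks, and all unused images created more than 72 hours ago, without prompting; chaining it after `docker volume prune -f` keeps CI hosts from accumulating stale layers between runs.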
17 changes: 14 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
+# nightly(bool): run this test in nightly pipeline only
 # optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatible with command.
@@ -330,18 +331,28 @@ steps:
   commands:
     - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
 
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
   commands:
-    - pytest -v -s models/decoder_only/audio_language
+    - pytest -v -s models/decoder_only/audio_language -m core_model
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
+
+- label: Decoder-only Multi-Modal Models Test (Extended)
+  nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pytest -v -s models/decoder_only/audio_language -m 'not core_model'
     # HACK - run phi3v tests separately to sidestep this transformers bug
     # https://github.com/huggingface/transformers/issues/34307
     - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language
+    - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
 
 - label: Other Models Test # 6min
   #mirror_hardwares: [amd]
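The Standard/Extended split above keys off a pytest marker. As a rough sketch (assuming `core_model` is declared in the repo's pytest marker configuration, and with placeholder test names), a model test opts into the per-commit Standard run like this:

```python
import pytest

# Hypothetical test module: `core_model` is the marker named in the
# pipeline commands above; the test functions are placeholders.
@pytest.mark.core_model
def test_llava_smoke():  # selected on every commit via `-m core_model`
    ...

def test_llava_all_dtypes():  # unmarked: only the nightly Extended job
    ...                       # (`-m 'not core_model'`) picks this up
```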
16 changes: 16 additions & 0 deletions .github/dependabot.yml
@@ -5,3 +5,19 @@ updates:
     directory: "/"
     schedule:
       interval: "weekly"
+  - package-ecosystem: "pip"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+    labels: ["dependencies"]
+    open-pull-requests-limit: 5
+    reviewers: ["khluu", "simon-mo"]
+    allow:
+      - dependency-type: "all"
+    groups:
+      patch-update:
+        applies-to: version-updates
+        update-types: ["patch"]
+      minor-update:
+        applies-to: version-updates
+        update-types: ["minor"]
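For context: the `groups` stanza makes Dependabot batch all patch-level pip bumps into one weekly PR and all minor-level bumps into another, instead of opening one PR per package, while `open-pull-requests-limit: 5` caps how many of its PRs may be open at once.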
34 changes: 32 additions & 2 deletions csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
             typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
         }
         out += kChunkSize;
+
+        int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+        // in case the final state is separated between the last "smem_exchange"
+        // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+        // (which occurs when `final_state_position` is a non-positive index)
+        // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
+        if (final_state_position < 0 && seqlen > kWidth){
+            input_t vals_load[kNElts] = {0};
+            if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+                // chunk = n_chunks - 2, a segment of the final state sits in the last index
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[kNThreads - 1];
+                #pragma unroll
+                for (int w = 0; w < -final_state_position; ++w){
+                    conv_states[w] = vals_load[kNElts + final_state_position + w];
+                }
+            }
+            if ((chunk == n_chunks - 1) && tidx == 0){
+                // chunk = n_chunks - 1, the second segment of the final state sits in the first positions
+                reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[0];
+                for (int w = -final_state_position; w < kWidth - 1; ++w){
+                    conv_states[w] = vals_load[w + final_state_position];
+                }
+                return;
+            }
+        }
     }
     // Final state is stored in the smem_exchange last token slot,
     // in case seqlen < kWidth, we would need to take the final state from the
@@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
     }
     else {
         // in case the final state is in between the threads data
-        reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
-        reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
         const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+        if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+            // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in an
+            // illegal access error on H100.
+            // Therefore, we access last_thread + 1 only if the final state data sits there.
+            reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+        }
+        reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
         #pragma unroll
         for (int w = 0; w < kWidth - 1; ++w){
             conv_states[w] = x_vals_load[offset + w];
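To make the boundary arithmetic in the first hunk easier to follow, here is a small pure-Python sketch (not the kernel itself; `k_width` and `chunk_size` mirror `kWidth` and `kChunkSize`, and the kernel additionally guards the split path with `seqlen > kWidth`) of where the final-state elements land once `final_state_position` goes negative:

```python
def final_state_indices(seqlen: int, k_width: int, chunk_size: int):
    """Return (chunk, offset) pairs for the last k_width - 1 inputs."""
    n_chunks = (seqlen + chunk_size - 1) // chunk_size
    final_state_position = (seqlen - (k_width - 1)) - (n_chunks - 1) * chunk_size
    if final_state_position >= 0:
        # The whole final state sits inside the last chunk.
        return [("chunk n-1", final_state_position + w)
                for w in range(k_width - 1)]
    # Split case handled by the new code: the first -final_state_position
    # elements come from the tail of chunk n_chunks - 2 (thread kNThreads - 1),
    head = [("chunk n-2", chunk_size + final_state_position + w)
            for w in range(-final_state_position)]
    # and the remainder from the head of chunk n_chunks - 1 (thread 0).
    tail = [("chunk n-1", w + final_state_position)
            for w in range(-final_state_position, k_width - 1)]
    return head + tail

# seqlen=10, k_width=4, chunk_size=8 -> the state is inputs 7, 8, 9:
# [('chunk n-2', 7), ('chunk n-1', 0), ('chunk n-1', 1)]
print(final_state_indices(10, 4, 8))
```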
7 changes: 5 additions & 2 deletions docs/source/models/supported_models.rst
@@ -277,7 +277,7 @@ Text Generation
   * - :code:`QWenLMHeadModel`
     - Qwen
     - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-    -
+    - ✅︎
     - ✅︎
   * - :code:`Qwen2ForCausalLM`
     - Qwen2
@@ -516,7 +516,7 @@ Text Generation
     - Qwen-VL
     - T + I\ :sup:`E+`
     - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
-    -
+    - ✅︎
     - ✅︎
   * - :code:`Qwen2AudioForConditionalGeneration`
     - Qwen2-Audio
@@ -540,6 +540,9 @@ Text Generation
 | :sup:`E` Pre-computed embeddings can be inputted for this modality.
 | :sup:`+` Multiple items can be inputted per text prompt for this modality.
 
+.. note::
+    vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+
 .. note::
     For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
     For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
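To illustrate the new LoRA note and the ✅︎ entries above, a minimal sketch of attaching an adapter to a multimodal model's language backbone at request time; the adapter name and path are placeholders:

```python
from vllm import LLM
from vllm.lora.request import LoRARequest

# Hypothetical adapter: only the language backbone of the multimodal
# model is adapted; the vision weights stay frozen.
llm = LLM(model="Qwen/Qwen-VL-Chat", enable_lora=True, trust_remote_code=True)
outputs = llm.generate(
    "Describe the image.",
    lora_request=LoRARequest("my_text_adapter", 1, "/path/to/adapter"),
)
```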
2 changes: 1 addition & 1 deletion docs/source/serving/compatibility_matrix.rst
@@ -283,7 +283,7 @@ Feature x Feature
      - ✅
      - ✅
      - ✅
-     - `<https://github.com/vllm-project/vllm/issues/8985>`__
+     - `<https://github.com/vllm-project/vllm/issues/9893>`__
      - ?
      - ✅
      - ✅
3 changes: 1 addition & 2 deletions examples/offline_inference_vision_language.py
@@ -262,10 +262,9 @@ def run_qwen2_vl(question: str, modality: str):
 
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
 
-    # Tested on L40
     llm = LLM(
         model=model_name,
-        max_model_len=8192,
+        max_model_len=4096,
         max_num_seqs=5,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={
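The lower `max_model_len` shrinks the maximum context the engine has to budget KV-cache memory for, presumably so the example runs on smaller GPUs than the L40 the removed comment referenced.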
20 changes: 16 additions & 4 deletions tests/compile/piecewise/test_simple.py
@@ -6,18 +6,22 @@
 
 import torch
 from torch import nn
+from torch.library import Library
 
 from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.compilation.levels import CompilationLevel
+from vllm.utils import direct_register_custom_op
 
 os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)
 
 global_counter = 0
 
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
 
-@torch.library.custom_op("silly::attention", mutates_args=["out"])
 def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                     out: torch.Tensor) -> None:
     global global_counter
@@ -27,12 +31,20 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
     out[0] += 1
 
 
-@silly_attention.register_fake
-def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-      out: torch.Tensor) -> None:
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
     return
 
 
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
 @support_torch_compile
 class SillyModel(nn.Module):
 
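With the op registered via `direct_register_custom_op`, call sites resolve it through the PyTorch dispatcher rather than the decorated Python object; registering a separate fake (meta) implementation is what lets `torch.compile` trace through the op without running the real kernel. A quick sketch of invoking it, assuming the registration above has already run:

```python
import torch

# The registration above creates the dispatcher entry `torch.ops.silly.attention`.
q = torch.randn(4, 8)
k = torch.randn(4, 8)
v = torch.randn(4, 8)
out = torch.empty(4, 8)
torch.ops.silly.attention(q, k, v, out)  # mutates `out` in place; the fake
                                         # impl is a no-op used only in tracing
```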
20 changes: 16 additions & 4 deletions tests/compile/piecewise/test_toy_llama.py
@@ -8,29 +8,41 @@
 
 import torch
 from torch import nn
+from torch.library import Library
 
 from vllm.compilation.compile_context import set_compile_context
 from vllm.compilation.config import CompilationConfig
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.decorators import support_torch_compile
 from vllm.compilation.levels import CompilationLevel
 from vllm.plugins import set_compilation_config
+from vllm.utils import direct_register_custom_op
 
+# create a library to hold the custom op
+silly_lib = Library("silly", "FRAGMENT")  # noqa
+
 
-@torch.library.custom_op("silly::attention", mutates_args=["out"])
 def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                     out: torch.Tensor) -> None:
     out.copy_(q)
     out += k
     out += v
 
 
-@silly_attention.register_fake
-def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
-      out: torch.Tensor) -> None:
+def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                         out: torch.Tensor) -> None:
     return
 
 
+direct_register_custom_op(
+    op_name="attention",
+    op_func=silly_attention,
+    mutates_args=["out"],
+    fake_impl=silly_attention_fake,
+    target_lib=silly_lib,
+)
+
+
 @dataclass
 class LlamaConfig:
     hidden_size: int = 128
(Diffs for the remaining changed files are not shown.)