Merge branch 'vllm-project:main' into add_chart_helm_example
mfournioux authored Dec 8, 2024
2 parents a65aa6b + a11f326 commit 1634fec
Showing 69 changed files with 1,612 additions and 1,187 deletions.
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -237,7 +237,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -362,6 +362,7 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
@@ -377,6 +378,7 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
+ - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v1.cu
@@ -140,13 +140,10 @@ void paged_attention_v1_launcher(
blocksparse_block_size, blocksparse_head_sliding_step);

#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
+ if (is_block_sparse) { \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ } else { \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v2.cu
@@ -147,13 +147,10 @@ void paged_attention_v2_launcher(
blocksparse_head_sliding_step);

#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
- switch (is_block_sparse) { \
- case true: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
- break; \
- case false: \
- CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
- break; \
+ if (is_block_sparse) { \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ } else { \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
2 changes: 1 addition & 1 deletion csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -424,7 +424,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
// (which occurs when `final_state_position` is a non-positive index)
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
- if (final_state_position < 0 && seqlen > kWidth){
+ if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
input_t vals_load[kNElts] = {0};
if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
// chunk = n_chunks - 2, a segment of the final state sits in the last index
10 changes: 7 additions & 3 deletions docs/source/models/supported_models.rst
@@ -547,15 +547,15 @@ Text Generation
- ✅︎
-
* - :code:`InternVLChatModel`
- - InternVL2
+ - InternVL 2.5, Mono-InternVL, InternVL 2.0
- T + I\ :sup:`E+`
- - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
+ - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
-
- ✅︎
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- T + I\ :sup:`E+`
- - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
+ - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-
- ✅︎
* - :code:`LlavaNextForConditionalGeneration`
@@ -664,6 +664,10 @@ Text Generation
.. note::
vLLM currently only supports adding LoRA to the language backbone of multimodal models.

+ .. note::
+ To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
+ and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+
.. note::
The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
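As a quick pointer for the Mantis note added above: the same architecture override can be supplied programmatically through the offline API instead of the --hf_overrides CLI flag. The sketch below simply mirrors the run_mantis example introduced later in this commit (model name, max_model_len, and override value are taken from that example), so it is a sketch of that usage rather than anything new:

from vllm import LLM

# Minimal sketch mirroring the run_mantis example added in this commit.
# The override is needed because the checkpoint is published under
# LlavaForConditionalGeneration (see the table above); overriding the
# architecture makes vLLM take its Mantis-specific code path.
llm = LLM(
    model="TIGER-Lab/Mantis-8B-siglip-llama3",
    max_model_len=4096,
    hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
)

Mantis itself still has to be installed from its GitHub repo first, as the note says.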
17 changes: 17 additions & 0 deletions docs/source/serving/deploying_with_kubeai.rst
@@ -0,0 +1,17 @@
+ .. _deploying_with_kubeai:
+
+ Deploying with KubeAI
+ =====================
+
+ `KubeAI <https://github.com/substratusai/kubeai>`_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
+
+
+ Please see the Installation Guides for environment specific instructions:
+
+ * `Any Kubernetes Cluster <https://www.kubeai.org/installation/any/>`_
+ * `EKS <https://www.kubeai.org/installation/eks/>`_
+ * `GKE <https://www.kubeai.org/installation/gke/>`_
+
+ Once you have KubeAI installed, you can
+ `configure text generation models <https://www.kubeai.org/how-to/configure-text-generation-models/>`_
+ using vLLM.
1 change: 1 addition & 0 deletions docs/source/serving/integrations.rst
@@ -6,6 +6,7 @@ Integrations

run_on_sky
deploying_with_kserve
+ deploying_with_kubeai
deploying_with_triton
deploying_with_bentoml
deploying_with_cerebrium
3 changes: 3 additions & 0 deletions docs/source/usage/spec_decode.rst
@@ -8,6 +8,9 @@ Speculative decoding
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work
to optimize it is ongoing and can be followed in `this issue. <https://github.com/vllm-project/vllm/issues/4630>`_

+ .. warning::
+ Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
+
This document shows how to use `Speculative Decoding <https://x.com/karpathy/status/1697318534555336961>`_ with vLLM.
Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.

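For context on the new warning, the sketch below shows what draft-model speculative decoding looks like through the offline API. The speculative_model and num_speculative_tokens arguments and the OPT model pair follow the broader spec_decode.rst guide rather than anything in this commit, so treat the exact names as illustrative assumptions:

from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Draft-model speculative decoding: a small draft model proposes several
# tokens per step and the target model verifies them in one forward pass.
# Per the warning added above, keep pipeline_parallel_size at its default
# of 1 when enabling this.
llm = LLM(
    model="facebook/opt-6.7b",
    speculative_model="facebook/opt-125m",
    num_speculative_tokens=5,
)

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)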
19 changes: 18 additions & 1 deletion examples/offline_inference_vision_language.py
@@ -223,7 +223,7 @@ def run_internvl(question: str, modality: str):
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
- # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+ # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids
@@ -419,6 +419,22 @@ def run_aria(question: str, modality: str):
return llm, prompt, stop_token_ids


+ # Mantis
+ def run_mantis(question: str, modality: str):
+ assert modality == "image"
+
+ llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
+ prompt = llama3_template.format(f"{question}\n<image>")
+
+ llm = LLM(
+ model="TIGER-Lab/Mantis-8B-siglip-llama3",
+ max_model_len=4096,
+ hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
+ )
+ stop_token_ids = [128009]
+ return llm, prompt, stop_token_ids
+
+
model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
@@ -441,6 +457,7 @@ def run_aria(question: str, modality: str):
"glm4v": run_glm4v,
"idefics3": run_idefics3,
"aria": run_aria,
"mantis": run_mantis,
}


2 changes: 1 addition & 1 deletion examples/offline_inference_vision_language_multi_image.py
@@ -165,7 +165,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
- # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+ # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

3 changes: 0 additions & 3 deletions requirements-test.in
@@ -24,9 +24,6 @@ mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test

- # TODO: Add this after fully implementing llava(mantis)
- # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test
-
# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9
16 changes: 13 additions & 3 deletions tests/distributed/test_pipeline_parallel.py
@@ -247,9 +247,19 @@ def _compare_tp(
*,
method: Literal["generate", "encode"],
):
- tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
- multi_node_only, trust_remote_code, tokenizer_mode, \
- load_format, hf_overrides = test_options
+ (
+ tp_size,
+ pp_size,
+ eager_mode,
+ chunked_prefill,
+ ) = parallel_setup
+ (
+ multi_node_only,
+ trust_remote_code,
+ tokenizer_mode,
+ load_format,
+ hf_overrides,
+ ) = test_options

if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
8 changes: 4 additions & 4 deletions tests/engine/test_arg_utils.py
@@ -50,12 +50,12 @@ def test_compilation_config():
args = parser.parse_args(["-O=3"])
assert args.compilation_config.level == 3

- # set to json
- args = parser.parse_args(["--compilation-config", '{"level": 3}'])
+ # set to string form of a dict
+ args = parser.parse_args(["--compilation-config", "{'level': 3}"])
assert args.compilation_config.level == 3

- # set to json
- args = parser.parse_args(['--compilation-config={"level": 3}'])
+ # set to string form of a dict
+ args = parser.parse_args(["--compilation-config={'level': 3}"])
assert args.compilation_config.level == 3


39 changes: 22 additions & 17 deletions tests/kernels/test_causal_conv1d.py
@@ -149,13 +149,14 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
@pytest.mark.parametrize("itype", [torch.bfloat16, torch.float])
@pytest.mark.parametrize("silu_activation", [True])
@pytest.mark.parametrize("has_bias", [True])
@pytest.mark.parametrize("has_initial_state", [True, False])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize(
'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096])
@pytest.mark.parametrize('dim', [64])
@pytest.mark.parametrize('batch', [1])
def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
- itype):
+ has_initial_state, itype):
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3)
if itype == torch.bfloat16:
@@ -167,11 +168,18 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,

weight = torch.randn(dim, width, device=device, dtype=itype)
bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None
- initial_states = torch.randn(batch,
- dim,
- width - 1,
- device=device,
- dtype=itype)
+ if has_initial_state:
+ initial_states = torch.randn(batch,
+ dim,
+ width - 1,
+ device=device,
+ dtype=itype)
+ has_initial_state_tensor = torch.ones(batch,
+ dtype=torch.bool,
+ device=x.device)
+ else:
+ initial_states = None
+ has_initial_state_tensor = None
x_ref = x.clone()
weight_ref = weight.clone()
bias_ref = bias.clone() if bias is not None else None
@@ -183,31 +191,28 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
bias,
activation=activation,
conv_states=initial_states,
- has_initial_state=torch.ones(batch,
- dtype=torch.bool,
- device=x.device))
+ has_initial_state=has_initial_state_tensor)
out_ref, final_states_ref = causal_conv1d_ref(
x_ref,
weight_ref,
bias_ref,
initial_states=initial_states_ref,
return_final_states=True,
activation=activation)
- assert initial_states is not None and final_states_ref is not None
- assert torch.allclose(initial_states,
- final_states_ref,
- rtol=rtol,
- atol=atol)
+ if has_initial_state:
+ assert initial_states is not None and final_states_ref is not None
+ assert torch.allclose(initial_states,
+ final_states_ref,
+ rtol=rtol,
+ atol=atol)
assert torch.allclose(out, out_ref, rtol=rtol, atol=atol)

causal_conv1d_opcheck_fn(x,
weight,
bias,
activation=activation,
conv_states=initial_states,
- has_initial_state=torch.ones(batch,
- dtype=torch.bool,
- device=x.device))
+ has_initial_state=has_initial_state_tensor)


@pytest.mark.parametrize("itype", [torch.bfloat16])