Commit
Merge branch 'main' into fix-xgrammar-json-ref
mgoin committed Dec 9, 2024
2 parents 2966c19 + 25b79d9 commit f32cae8
Showing 88 changed files with 2,129 additions and 1,585 deletions.
4 changes: 3 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -237,7 +237,7 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -362,6 +362,7 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
@@ -377,6 +378,7 @@ steps:
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
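The pipeline change above works around how pytest parses exclusions: ``--ignore`` takes a single path per flag, so in the old command only the first file was excluded and the remaining paths were collected as positional test targets. A minimal sketch of the corrected invocation through pytest's Python entry point (not part of the diff; it assumes the vLLM ``tests`` directory, where ``lora/`` lives, is the working directory):

import pytest

# Old form: only the path directly after --ignore is excluded; the other two
# paths become positional arguments and are collected as test targets.
# pytest.main(["-v", "-s", "lora",
#              "--ignore", "lora/test_long_context.py",
#              "lora/test_chatglm3_tp.py", "lora/test_llama_tp.py"])

# Fixed form: one --ignore per file, mirroring the updated pipeline command.
pytest.main([
    "-v", "-s", "lora",
    "--ignore=lora/test_long_context.py",
    "--ignore=lora/test_chatglm3_tp.py",
    "--ignore=lora/test_llama_tp.py",
])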
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v1.cu
@@ -140,13 +140,10 @@ void paged_attention_v1_launcher(
blocksparse_block_size, blocksparse_head_sliding_step);

#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
switch (is_block_sparse) { \
case true: \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
break; \
case false: \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
break; \
if (is_block_sparse) { \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
} else { \
CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
11 changes: 4 additions & 7 deletions csrc/attention/paged_attention_v2.cu
@@ -147,13 +147,10 @@ void paged_attention_v2_launcher(
blocksparse_head_sliding_step);

#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
switch (is_block_sparse) { \
case true: \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
break; \
case false: \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
break; \
if (is_block_sparse) { \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
} else { \
CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
}

// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
2 changes: 1 addition & 1 deletion csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -424,7 +424,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
// and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
// (which occurs when `final_state_position` is a non-positive index)
// we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it
if (final_state_position < 0 && seqlen > kWidth){
if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){
input_t vals_load[kNElts] = {0};
if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
// chunk = n_chunks - 2, a segment of the final state sits in the last index
36 changes: 32 additions & 4 deletions docs/source/models/supported_models.rst
@@ -495,7 +495,7 @@ Text Generation
---------------

.. list-table::
:widths: 25 25 15 25 5 5
:widths: 25 25 15 20 5 5 5
:header-rows: 1

* - Architecture
@@ -504,144 +504,168 @@ Text Generation
- Example HF Models
- :ref:`LoRA <lora>`
- :ref:`PP <distributed_serving>`
- V1
* - :code:`AriaForConditionalGeneration`
- Aria
- T + I
- :code:`rhymes-ai/Aria`
-
- ✅︎
-
* - :code:`Blip2ForConditionalGeneration`
- BLIP-2
- T + I\ :sup:`E`
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
-
- ✅︎
-
* - :code:`ChameleonForConditionalGeneration`
- Chameleon
- T + I
- :code:`facebook/chameleon-7b` etc.
-
- ✅︎
-
* - :code:`FuyuForCausalLM`
- Fuyu
- T + I
- :code:`adept/fuyu-8b` etc.
-
- ✅︎
-
* - :code:`ChatGLMModel`
- GLM-4V
- T + I
- :code:`THUDM/glm-4v-9b` etc.
- ✅︎
- ✅︎
-
* - :code:`H2OVLChatModel`
- H2OVL
- T + I\ :sup:`E+`
- :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
-
- ✅︎
-
* - :code:`Idefics3ForConditionalGeneration`
- Idefics3
- T + I
- :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
- ✅︎
-
-
* - :code:`InternVLChatModel`
- InternVL2
- InternVL 2.5, Mono-InternVL, InternVL 2.0
- T + I\ :sup:`E+`
- :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
- :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc.
-
- ✅︎
- ✅︎
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- T + I\ :sup:`E+`
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-
- ✅︎
- ✅︎
* - :code:`LlavaNextForConditionalGeneration`
- LLaVA-NeXT
- T + I\ :sup:`E+`
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
- ✅︎
-
* - :code:`LlavaNextVideoForConditionalGeneration`
- LLaVA-NeXT-Video
- T + V
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-
- ✅︎
-
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- T + I\ :sup:`+` + V\ :sup:`+`
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-
- ✅︎
-
* - :code:`MiniCPMV`
- MiniCPM-V
- T + I\ :sup:`E+`
- :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc.
- ✅︎
- ✅︎
-
* - :code:`MllamaForConditionalGeneration`
- Llama 3.2
- T + I\ :sup:`+`
- :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-
-
-
* - :code:`MolmoForCausalLM`
- Molmo
- T + I
- :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc.
-
- ✅︎
- ✅︎
* - :code:`NVLM_D_Model`
- NVLM-D 1.0
- T + I\ :sup:`E+`
- :code:`nvidia/NVLM-D-72B`, etc.
-
- ✅︎
- ✅︎
* - :code:`PaliGemmaForConditionalGeneration`
- PaliGemma
- T + I\ :sup:`E`
- :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc.
-
- ✅︎
-
* - :code:`Phi3VForCausalLM`
- Phi-3-Vision, Phi-3.5-Vision
- T + I\ :sup:`E+`
- :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc.
-
- ✅︎
- ✅︎
* - :code:`PixtralForConditionalGeneration`
- Pixtral
- T + I\ :sup:`+`
- :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc.
-
- ✅︎
- ✅︎
* - :code:`QWenLMHeadModel`
- Qwen-VL
- T + I\ :sup:`E+`
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
- ✅︎
- ✅︎
-
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
- T + A\ :sup:`+`
- :code:`Qwen/Qwen2-Audio-7B-Instruct`
-
- ✅︎
-
* - :code:`Qwen2VLForConditionalGeneration`
- Qwen2-VL
- T + I\ :sup:`E+` + V\ :sup:`E+`
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
- ✅︎
- ✅︎
-
* - :code:`UltravoxModel`
- Ultravox
- T + A\ :sup:`E+`
- :code:`fixie-ai/ultravox-v0_3`
-
- ✅︎
-

| :sup:`E` Pre-computed embeddings can be inputted for this modality.
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
@@ -664,6 +688,10 @@ Text Generation
.. note::
vLLM currently only supports adding LoRA to the language backbone of multimodal models.

.. note::
To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install the package from their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`)
and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.

.. note::
The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
17 changes: 17 additions & 0 deletions docs/source/serving/deploying_with_kubeai.rst
@@ -0,0 +1,17 @@
.. _deploying_with_kubeai:

Deploying with KubeAI
=====================

`KubeAI <https://github.com/substratusai/kubeai>`_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load-based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.


Please see the Installation Guides for environment-specific instructions:

* `Any Kubernetes Cluster <https://www.kubeai.org/installation/any/>`_
* `EKS <https://www.kubeai.org/installation/eks/>`_
* `GKE <https://www.kubeai.org/installation/gke/>`_

Once you have KubeAI installed, you can
`configure text generation models <https://www.kubeai.org/how-to/configure-text-generation-models/>`_
using vLLM.
1 change: 1 addition & 0 deletions docs/source/serving/integrations.rst
@@ -6,6 +6,7 @@ Integrations

run_on_sky
deploying_with_kserve
deploying_with_kubeai
deploying_with_triton
deploying_with_bentoml
deploying_with_cerebrium
3 changes: 3 additions & 0 deletions docs/source/usage/spec_decode.rst
@@ -8,6 +8,9 @@ Speculative decoding
not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work
to optimize it is ongoing and can be followed in `this issue <https://github.com/vllm-project/vllm/issues/4630>`_.

.. warning::
Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.

This document shows how to use `Speculative Decoding <https://x.com/karpathy/status/1697318534555336961>`_ with vLLM.
Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.

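For context on the warning added above, a minimal offline speculative-decoding sketch (not part of the diff). It assumes the draft-model arguments documented elsewhere on this page (``speculative_model``, ``num_speculative_tokens``) and sticks to tensor parallelism only, since pipeline parallelism is not supported with speculative decoding:

from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# The draft model proposes up to 5 tokens per step; the target model verifies them.
# Leave pipeline_parallel_size at its default of 1 here (see the warning above).
llm = LLM(
    model="facebook/opt-6.7b",
    speculative_model="facebook/opt-125m",
    num_speculative_tokens=5,
)

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)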
19 changes: 18 additions & 1 deletion examples/offline_inference_vision_language.py
@@ -223,7 +223,7 @@ def run_internvl(question: str, modality: str):
# Stop tokens for InternVL
# model variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids
@@ -419,6 +419,22 @@ def run_aria(question: str, modality: str):
return llm, prompt, stop_token_ids


# Mantis
def run_mantis(question: str, modality: str):
assert modality == "image"

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
prompt = llama3_template.format(f"{question}\n<image>")

llm = LLM(
model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
)
stop_token_ids = [128009]
return llm, prompt, stop_token_ids


model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
@@ -441,6 +457,7 @@ def run_aria(question: str, modality: str):
"glm4v": run_glm4v,
"idefics3": run_idefics3,
"aria": run_aria,
"mantis": run_mantis,
}


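A usage sketch (not part of the diff) for the new ``run_mantis`` helper added above, assuming it is called from within this example script; the image path is hypothetical and the generation call follows vLLM's standard multi-modal prompt format:

from PIL import Image
from vllm import SamplingParams

llm, prompt, stop_token_ids = run_mantis("What is shown in this image?", "image")
sampling_params = SamplingParams(temperature=0.2,
                                 max_tokens=64,
                                 stop_token_ids=stop_token_ids)

image = Image.open("example.jpg")  # hypothetical local image file
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)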
2 changes: 1 addition & 1 deletion examples/offline_inference_vision_language_multi_image.py
@@ -165,7 +165,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
# Stop tokens for InternVL
# model variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

2 changes: 1 addition & 1 deletion requirements-common.txt
@@ -19,7 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11
outlines >= 0.0.43, < 0.1
xgrammar >= 0.1.5; platform_machine == "x86_64"
xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
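The ``xgrammar >= 0.1.6`` bump ties in with the branch name (``fix-xgrammar-json-ref``), which suggests improved handling of JSON Schema ``$ref`` in xgrammar-backed structured output. A hedged sketch of the kind of schema involved — field names are made up, and ``jsonschema`` is used here only to sanity-check the reference, not as part of vLLM:

import json

from jsonschema import validate

# A schema using local $ref/$defs — the construct the version bump appears
# to be about. Field names are illustrative, not taken from the diff.
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "address": {"$ref": "#/$defs/address"},
    },
    "required": ["name", "address"],
    "$defs": {
        "address": {
            "type": "object",
            "properties": {"city": {"type": "string"}, "zip": {"type": "string"}},
            "required": ["city"],
        }
    },
}

# Confirm the reference resolves for a sample instance.
validate(instance={"name": "Ada", "address": {"city": "London"}}, schema=schema)

# A schema like this would ultimately be handed to the guided-decoding backend
# as a JSON string.
print(json.dumps(schema, indent=2))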
3 changes: 0 additions & 3 deletions requirements-test.in
@@ -24,9 +24,6 @@ mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test

# TODO: Add this after fully implementing llava(mantis)
# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test

# quantization
bitsandbytes>=0.44.0
buildkite-test-collector==0.1.9
16 changes: 13 additions & 3 deletions tests/distributed/test_pipeline_parallel.py
@@ -247,9 +247,19 @@ def _compare_tp(
*,
method: Literal["generate", "encode"],
):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
multi_node_only, trust_remote_code, tokenizer_mode, \
load_format, hf_overrides = test_options
(
tp_size,
pp_size,
eager_mode,
chunked_prefill,
) = parallel_setup
(
multi_node_only,
trust_remote_code,
tokenizer_mode,
load_format,
hf_overrides,
) = test_options

if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")