diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index df201cdc7c554..329cc42558da6 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -31,8 +31,8 @@ cleanup_docker() { echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - # Remove unused volumes - docker volume prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 32eed1a771718..9444dc43ea97e 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -9,6 +9,7 @@ # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only +# nightly(bool): run this test in nightly pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for test. incompatbile with command. @@ -330,18 +331,28 @@ steps: commands: - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py -- label: Decoder-only Multi-Modal Models Test # 1h31min +- label: Decoder-only Multi-Modal Models Test (Standard) #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language commands: - - pytest -v -s models/decoder_only/audio_language + - pytest -v -s models/decoder_only/audio_language -m core_model + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model + +- label: Decoder-only Multi-Modal Models Test (Extended) + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + commands: + - pytest -v -s models/decoder_only/audio_language -m 'not core_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model' - label: Other Models Test # 6min #mirror_hardwares: [amd] diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 6fddca0d6e4b9..a21acd9671eeb 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,3 +5,19 @@ updates: directory: "/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + labels: ["dependencies"] + open-pull-requests-limit: 5 + reviewers: ["khluu", "simon-mo"] + allow: + - dependency-type: "all" + groups: + patch-update: + applies-to: version-updates + update-types: ["patch"] + minor-update: + applies-to: version-updates + update-types: ["minor"] diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 3a464c5f327ad..498d069c05f0d 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize); } out += kChunkSize; + + int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); + // in case the final state is separated between the last "smem_exchange" and + // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), + // (which occurs when `final_state_position` is a non-positivie index) + // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it + if (final_state_position < 0 && seqlen > kWidth){ + input_t vals_load[kNElts] = {0}; + if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){ + // chunk = n_chunks - 2, a segment of the final state sits in the last index + reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1]; + #pragma unroll + for (int w = 0; w < -final_state_position; ++w){ + conv_states[w] = vals_load[kNElts + final_state_position + w]; + } + } + if ((chunk == n_chunks - 1) && tidx == 0){ + // chunk = n_chunks - 1, the second segment of the final state first positions + reinterpret_cast(vals_load)[0] = smem_exchange[0]; + for (int w = -final_state_position; w < kWidth - 1; ++w){ + conv_states[w] = vals_load[w + final_state_position]; + } + return; + } + } } // Final state is stored in the smem_exchange last token slot, // in case seqlen < kWidth, we would need to take the final state from the @@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { } else { // in case the final state is in between the threads data - reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; - reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; const int offset = ((seqlen - (kWidth - 1)) % (kNElts)); + if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){ + // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a + // illegal access error on H100. + // Therefore, we access last_thread + 1, only if the final state data sits there + reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; + } + reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; #pragma unroll for (int w = 0; w < kWidth - 1; ++w){ conv_states[w] = x_vals_load[offset + w ]; diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ff893b613f150..3279e7a108232 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -277,7 +277,7 @@ Text Generation * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - + - ✅︎ - ✅︎ * - :code:`Qwen2ForCausalLM` - Qwen2 @@ -516,7 +516,7 @@ Text Generation - Qwen-VL - T + I\ :sup:`E+` - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - - + - ✅︎ - ✅︎ * - :code:`Qwen2AudioForConditionalGeneration` - Qwen2-Audio @@ -540,6 +540,9 @@ Text Generation | :sup:`E` Pre-computed embeddings can be inputted for this modality. | :sup:`+` Multiple items can be inputted per text prompt for this modality. +.. note:: + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + .. note:: For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index 20a81f4cad1d1..cab19e4ec5b6c 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -283,7 +283,7 @@ Feature x Feature - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ? - ✅ - ✅ diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 24162e98f5155..87a9c34f80411 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -262,10 +262,9 @@ def run_qwen2_vl(question: str, modality: str): model_name = "Qwen/Qwen2-VL-7B-Instruct" - # Tested on L40 llm = LLM( model=model_name, - max_model_len=8192, + max_model_len=4096, max_num_seqs=5, # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={ diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 58075f7023821..1ae64ef492d5b 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -35,3 +35,23 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_reject_multistep_with_guided_decoding(): + model_name = "gpt2" + server_args = ["--enforce-eager", "--num-scheduler-steps", "8"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*')): + await client.completions.create( + model=model_name, + prompt="Hello", + max_tokens=5, + temperature=0.0, + extra_body={"response_format": { + "type": "json_object" + }}) diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py index 96bfe06d74ae5..f9b11018288be 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -151,7 +151,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor, @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) @pytest.mark.parametrize( - 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) + 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096]) @pytest.mark.parametrize('dim', [64]) @pytest.mark.parametrize('batch', [1]) def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, @@ -420,7 +420,10 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, unpadded_out = out[:, :out_ref_tensor.shape[-1]] assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol) - assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol) + assert torch.allclose(final_states[state_indices], + final_states_ref[state_indices], + rtol=rtol, + atol=atol) causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(), padded_state_indices, has_initial_states, diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index bf7ff3b5c59b8..ad05a97685351 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -555,7 +555,7 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) if itype == torch.bfloat16: - rtol, atol = 7e-2, 7e-2 + rtol, atol = 1e-1, 1e-1 if torch.version.hip: atol *= 2 # set seed @@ -610,8 +610,8 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, dt_bias=dt_bias, dt_softplus=True) - print("Output diff max", (out - out_ref[0]).max()) - print("Output diff mean", (out - out_ref[0]).mean()) + print("Output diff max", (out[:batch_size] - out_ref).max()) + print("Output diff mean", (out[:batch_size] - out_ref).mean()) print("Output state diff max", (state[state_indices, :] - state_ref).max()) print("Output state diff mean", (state[state_indices, :] - state_ref).mean()) diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index ad6c2d854d1f0..b9089e75ffab8 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -158,6 +158,7 @@ def run_multi_audio_test( assert all(tokens for tokens, *_ in vllm_outputs) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -178,6 +179,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, ) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index 5c90e7f7a267c..c23fbedf0c6ae 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -17,7 +17,7 @@ # Fixtures lazy import to avoid initializing CUDA during test collection -# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple +# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple # input mappers. @pytest.fixture() def image_input_mapper_for_qwen2_vl(): diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 9370527e3cd57..d738647c91b66 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -75,6 +75,63 @@ # this is a good idea for checking your command first, since tests are slow. VLM_TEST_SETTINGS = { + #### Core tests to always run in the CI + "llava": VLMTestInfo( + models=["llava-hf/llava-1.5-7b-hf"], + test_type=( + VLMTestType.EMBEDDING, + VLMTestType.IMAGE, + VLMTestType.CUSTOM_INPUTS + ), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", + convert_assets_to_embeddings=model_utils.get_llava_embeddings, + max_model_len=4096, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, + custom_test_opts=[CustomTestOptions( + inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( + formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:" + ), + limit_mm_per_prompt={"image": 4}, + )], + marks=[pytest.mark.core_model], + ), + "paligemma": VLMTestInfo( + models=["google/paligemma-3b-mix-224"], + test_type=VLMTestType.IMAGE, + prompt_formatter=identity, + img_idx_to_prompt = lambda idx: "", + # Paligemma uses its own sample prompts because the default one fails + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "caption es", + "cherry_blossom": "What is in the picture?", + }), + auto_cls=AutoModelForVision2Seq, + postprocess_inputs=model_utils.get_key_type_post_processor( + "pixel_values" + ), + vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, + dtype="half" if current_platform.is_rocm() else ("half", "float"), + marks=[pytest.mark.core_model], + ), + "qwen2_vl": VLMTestInfo( + models=["Qwen/Qwen2-VL-2B-Instruct"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + marks=[pytest.mark.core_model], + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + ), + #### Extended model tests "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], test_type=VLMTestType.IMAGE, @@ -151,25 +208,6 @@ use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), - "llava": VLMTestInfo( - models=["llava-hf/llava-1.5-7b-hf"], - test_type=( - VLMTestType.EMBEDDING, - VLMTestType.IMAGE, - VLMTestType.CUSTOM_INPUTS - ), - prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", - convert_assets_to_embeddings=model_utils.get_llava_embeddings, - max_model_len=4096, - auto_cls=AutoModelForVision2Seq, - vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - custom_test_opts=[CustomTestOptions( - inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( - formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:" - ), - limit_mm_per_prompt={"image": 4}, - )], - ), "llava_next": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), @@ -200,12 +238,12 @@ vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, # Llava-one-vision tests fixed sizes & the default size factors image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - runner_mm_key="videos", custom_test_opts=[CustomTestOptions( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 ), limit_mm_per_prompt={"video": 4}, + runner_mm_key="videos", )], ), # FIXME @@ -218,9 +256,11 @@ auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - runner_mm_key="videos", marks=[ - pytest.mark.skip(reason="LLava next video tests currently fail.") + pytest.mark.skipif( + transformers.__version__.startswith("4.46"), + reason="Model broken with changes in transformers 4.46" + ) ], ), "minicpmv": VLMTestInfo( @@ -234,23 +274,6 @@ postprocess_inputs=model_utils.wrap_inputs_post_processor, hf_output_post_proc=model_utils.minicmpv_trunc_hf_output, ), - "paligemma": VLMTestInfo( - models=["google/paligemma-3b-mix-224"], - test_type=VLMTestType.IMAGE, - prompt_formatter=identity, - img_idx_to_prompt = lambda idx: "", - # Paligemma uses its own sample prompts because the default one fails - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "caption es", - "cherry_blossom": "What is in the picture?", - }), - auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( - "pixel_values" - ), - vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, - dtype="half" if current_platform.is_rocm() else ("half", "float"), - ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. # https://github.com/huggingface/transformers/issues/34307 diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 6856e8df81a13..e925934db0e7c 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -56,6 +56,17 @@ def qwen_vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +def qwen2_vllm_to_hf_output( + vllm_output: RunnerOutput, + model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "<|im_end|>" + + return output_ids, hf_output_str, out_logprobs + + def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: config = AutoConfig.from_pretrained(model) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py index 5a3f9e820dad0..2d3b39fe3594e 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py @@ -29,6 +29,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, num_logprobs=test_case.num_logprobs, limit_mm_per_prompt={"image": 1}, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="images", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -51,6 +52,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, num_logprobs=test_case.num_logprobs, limit_mm_per_prompt={"image": len(image_assets)}, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="images", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -74,6 +76,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo, limit_mm_per_prompt={"image": 1}, vllm_embeddings=vllm_embeddings, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="images", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -101,6 +104,7 @@ def run_video_test( num_logprobs=test_case.num_logprobs, limit_mm_per_prompt={"video": len(video_assets)}, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="videos", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -115,7 +119,11 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo, inputs = test_case.custom_test_opts.inputs limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt - assert inputs is not None and limit_mm_per_prompt is not None + runner_mm_key = test_case.custom_test_opts.runner_mm_key + # Inputs, limit_mm_per_prompt, and runner_mm_key should all be set + assert inputs is not None + assert limit_mm_per_prompt is not None + assert runner_mm_key is not None core.run_test( hf_runner=hf_runner, @@ -127,4 +135,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo, num_logprobs=test_case.num_logprobs, limit_mm_per_prompt=limit_mm_per_prompt, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key=runner_mm_key, **model_test_info.get_non_parametrized_runner_kwargs()) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index 4d18d53af30fa..fd18c7c8346f0 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -52,6 +52,8 @@ class SizeType(Enum): class CustomTestOptions(NamedTuple): inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]] limit_mm_per_prompt: Dict[str, int] + # kwarg to pass multimodal data in as to vllm/hf runner instances. + runner_mm_key: str = "images" class ImageSizeWrapper(NamedTuple): @@ -141,9 +143,6 @@ class VLMTestInfo(NamedTuple): Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]], str]] = None # noqa: E501 - # kwarg to pass multimodal data in as to vllm/hf runner instances - runner_mm_key: str = "images" - # Allows configuring a test to run with custom inputs custom_test_opts: Optional[List[CustomTestOptions]] = None @@ -168,7 +167,6 @@ def get_non_parametrized_runner_kwargs(self): "get_stop_token_ids": self.get_stop_token_ids, "model_kwargs": self.model_kwargs, "patch_hf_runner": self.patch_hf_runner, - "runner_mm_key": self.runner_mm_key, } diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index a8d0ac4fc160d..9fab5898a06ba 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -2,6 +2,7 @@ import pytest import torch.nn.functional as F +import transformers from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner @@ -85,8 +86,8 @@ def _run_test( ) -# FIXME -@pytest.mark.skip(reason="LLava next embedding tests currently fail") +@pytest.mark.skipif(transformers.__version__.startswith("4.46"), + reason="Model broken with changes in transformers 4.46") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 366b030eaa399..fd6564bbfe630 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -5,6 +5,7 @@ import pytest from tests.kernels.utils import override_backend_env_variable +from vllm import SamplingParams, TokensPrompt from ..models.utils import check_outputs_equal @@ -12,6 +13,14 @@ "facebook/opt-125m", ] +UNSTABLE_PROMPT_SEQUENCE = [ + ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1), + ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50), + ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95), + ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174), + ([0] * 588) + ([8] * 1539), +] + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @@ -57,3 +66,22 @@ def test_mixed_requests( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) +def test_unstable_prompt_sequence( + vllm_runner, + backend: str, + monkeypatch, +) -> None: + override_backend_env_variable(monkeypatch, backend) + + with vllm_runner( + "Qwen/Qwen2.5-0.5B-Instruct", + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_model_len=4096, + ) as vllm_model: + for prompt in UNSTABLE_PROMPT_SEQUENCE: + vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), + SamplingParams(max_tokens=1)) diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index d1a44f3e8bfa6..32fccd0dfb496 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -138,7 +138,6 @@ def _add_seq_group( chunked_prefill_enabled: bool): is_prompt = inter_data.is_prompt block_tables = inter_data.block_tables - computed_block_nums = inter_data.computed_block_nums for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, curr_sliding_window_block) in zip( @@ -164,10 +163,14 @@ def _add_seq_group( # NOTE: This only works for oooooooxxx style attention. block_table = [] if inter_data.prefix_cache_hit: - block_table = computed_block_nums + block_table = block_tables[seq_id] elif ((chunked_prefill_enabled or not is_prompt) and block_tables is not None): - block_table = block_tables[seq_id][-curr_sliding_window_block:] + if curr_sliding_window_block == 0: + block_table = block_tables[seq_id] + else: + block_table = block_tables[seq_id][ + -curr_sliding_window_block:] self.block_tables.append(block_table) # Compute slot mapping. diff --git a/vllm/config.py b/vllm/config.py index e9559c40dbdfb..c2a8c956b374a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -84,9 +84,6 @@ class ModelConfig: disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid. If None, the user did not specify, so default to False. - max_context_len_to_capture: Maximum context len covered by CUDA graphs. - When a sequence has context length larger than this, we fall back - to eager mode (DEPRECATED. Use max_seq_len_to_capture instead). max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. Additionally for encoder-decoder models, if the @@ -147,7 +144,6 @@ def __init__( quantization: Optional[str] = None, quantization_param_path: Optional[str] = None, enforce_eager: Optional[bool] = None, - max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: Optional[int] = None, max_logprobs: int = 20, disable_sliding_window: bool = False, @@ -181,9 +177,6 @@ def __init__( self.quantization = quantization self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager - if max_context_len_to_capture is not None: - raise ValueError("`max_context_len_to_capture` is deprecated. " - "Use `max_seq_len_to_capture` instead.") self.max_seq_len_to_capture = max_seq_len_to_capture self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index de886c98e51bd..b1f0f8b9df925 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -126,7 +126,6 @@ class EngineArgs: tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: Optional[bool] = None - max_context_len_to_capture: Optional[int] = None max_seq_len_to_capture: int = 8192 disable_custom_all_reduce: bool = False tokenizer_pool_size: int = 0 @@ -504,14 +503,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='Always use eager-mode PyTorch. If False, ' 'will use eager mode and CUDA graph in hybrid ' 'for maximal performance and flexibility.') - parser.add_argument('--max-context-len-to-capture', - type=int, - default=EngineArgs.max_context_len_to_capture, - help='Maximum context length covered by CUDA ' - 'graphs. When a sequence has context length ' - 'larger than this, we fall back to eager mode. ' - '(DEPRECATED. Use --max-seq-len-to-capture instead' - ')') parser.add_argument('--max-seq-len-to-capture', type=int, default=EngineArgs.max_seq_len_to_capture, @@ -939,7 +930,6 @@ def create_model_config(self) -> ModelConfig: quantization=self.quantization, quantization_param_path=self.quantization_param_path, enforce_eager=self.enforce_eager, - max_context_len_to_capture=self.max_context_len_to_capture, max_seq_len_to_capture=self.max_seq_len_to_capture, max_logprobs=self.max_logprobs, disable_sliding_window=self.disable_sliding_window, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fd34fadee1ca..edef1f30a9e91 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -829,6 +829,13 @@ def add_request( raise ValueError(f"Got priority {priority} but " "Priority scheduling is not enabled.") + if isinstance(params, SamplingParams) \ + and (params.guided_decoding or params.logits_processors) \ + and self.scheduler_config.num_scheduler_steps > 1: + raise ValueError( + "Guided decoding and logits processors are not supported " + "in multi-step decoding") + if arrival_time is None: arrival_time = time.time() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 083b67c2f8e7d..3d62cb3598477 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -93,9 +93,6 @@ class LLM: enforce_eager: Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid. - max_context_len_to_capture: Maximum context len covered by CUDA graphs. - When a sequence has context length larger than this, we fall back - to eager mode (DEPRECATED. Use `max_seq_len_to_capture` instead). max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs. When a sequence has context length larger than this, we fall back to eager mode. Additionally for encoder-decoder models, if the @@ -152,7 +149,6 @@ def __init__( swap_space: float = 4, cpu_offload_gb: float = 0, enforce_eager: Optional[bool] = None, - max_context_len_to_capture: Optional[int] = None, max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, @@ -193,7 +189,6 @@ def __init__( swap_space=swap_space, cpu_offload_gb=cpu_offload_gb, enforce_eager=enforce_eager, - max_context_len_to_capture=max_context_len_to_capture, max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, disable_async_output_proc=disable_async_output_proc, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 0a1b40927e9f9..998016ea28c26 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1048,7 +1048,7 @@ def get_mm_mapping(self) -> MultiModelKeys: @MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) -class QWenLMHeadModel(QWenBaseModel): +class QWenLMHeadModel(QWenBaseModel, SupportsLoRA): """ QWenLMHeadModel is not only applicable to LLM but also to VL, which is not conducive to the current integration logic of LoRA in vLLM. Therefore, it diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 633d66b4af31a..1e12c2332b65e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -126,15 +126,18 @@ def __init__( hidden_features: int = None, act_layer: Type[nn.Module] = QuickGELU, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.fc1 = ColumnParallelLinear(in_features, hidden_features, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.act = act_layer() self.fc2 = RowParallelLinear(hidden_features, in_features, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, x: torch.Tensor) -> torch.Tensor: x_parallel, _ = self.fc1(x) @@ -196,6 +199,7 @@ def __init__( num_heads: Optional[int] = None, projection_size: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() # Per attention head and per partition values. @@ -207,10 +211,12 @@ def __init__( self.qkv = ColumnParallelLinear(input_size=embed_dim, output_size=3 * projection_size, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.qkv") self.proj = RowParallelLinear(input_size=projection_size, output_size=embed_dim, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.proj") # Detect attention implementation. self.attn_backend: _Backend = get_vit_attn_backend() @@ -310,6 +316,7 @@ def __init__( act_layer: Type[nn.Module] = QuickGELU, norm_layer: Type[nn.Module] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() if norm_layer is None: @@ -321,11 +328,13 @@ def __init__( self.attn = Qwen2VisionAttention(embed_dim=dim, num_heads=num_heads, projection_size=dim, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.attn") self.mlp = Qwen2VisionMLP(dim, mlp_hidden_dim, act_layer=act_layer, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.mlp") def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor) -> torch.Tensor: @@ -374,6 +383,7 @@ def __init__( norm_layer: Type[nn.Module] = None, spatial_merge_size: int = 2, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = context_dim * (spatial_merge_size**2) @@ -384,12 +394,14 @@ def __init__( ColumnParallelLinear(self.hidden_size, self.hidden_size, bias=True, - quant_config=quant_config), + quant_config=quant_config, + prefix=f"{prefix}.mlp.0"), nn.GELU(), RowParallelLinear(self.hidden_size, d_model, bias=True, - quant_config=quant_config), + quant_config=quant_config, + prefix=f"{prefix}.mlp.2"), ]) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -440,6 +452,7 @@ def __init__( vision_config: Qwen2VLVisionConfig, norm_eps: float = 1e-6, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() @@ -467,28 +480,29 @@ def __init__( self.rotary_pos_emb = Qwen2VisionRotaryEmbedding(head_dim // 2) self.blocks = nn.ModuleList([ - Qwen2VisionBlock( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - norm_layer=norm_layer, - quant_config=quant_config, - ) for _ in range(depth) + Qwen2VisionBlock(dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config, + prefix=f"{prefix}.blocks.{layer_idx}") + for layer_idx in range(depth) ]) self.merger = Qwen2VisionPatchMerger( d_model=hidden_size, context_dim=embed_dim, norm_layer=norm_layer, quant_config=quant_config, + prefix=f"{prefix}.merger", ) @property def dtype(self) -> torch.dtype: - return self.blocks[0].mlp.fc2.weight.dtype + return self.patch_embed.proj.weight.dtype @property def device(self) -> torch.device: - return self.blocks[0].mlp.fc2.weight.device + return self.patch_embed.proj.weight.device def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: pos_ids = [] @@ -932,10 +946,8 @@ def __init__(self, self.visual = Qwen2VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), - - # NOTE: Qwen2-VL vision encoder does not support any - # quantization method now. - quant_config=None, + quant_config=quant_config, + prefix="visual", ) self.model = Qwen2Model(config, @@ -1175,7 +1187,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: - if "visual" in name and "qkv.weight" in name: + if "visual" in name and name.endswith("qkv.weight"): visual_num_heads = self.config.vision_config.num_heads visual_embed_dim = self.config.vision_config.embed_dim head_size = visual_embed_dim // visual_num_heads @@ -1184,7 +1196,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): visual_embed_dim) loaded_weight = loaded_weight.transpose(0, 1) loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) - elif "visual" in name and "qkv.bias" in name: + elif "visual" in name and name.endswith("qkv.bias"): visual_num_heads = self.config.vision_config.num_heads visual_embed_dim = self.config.vision_config.embed_dim head_size = visual_embed_dim // visual_num_heads diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5e191c6e715e0..5c6df5aaf5446 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -485,8 +485,8 @@ def __repr__(self) -> str: f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " - f"truncate_prompt_tokens={self.truncate_prompt_tokens}), " - f"guided_decoding={self.guided_decoding}") + f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " + f"guided_decoding={self.guided_decoding})") class BeamSearchParams( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 233a9e664d845..891637dafbb14 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -995,7 +995,7 @@ def __init__( # Python can be expensive. To optimize this, we cache the block table # in numpy and only copy the actual input content at every iteration. # The shape of the cached block table will be - # (max batch size to capture, max context len to capture / block size). + # (max batch size to capture, max seq len to capture / block size). self.graph_block_tables = np.zeros( (self.max_batchsize_to_capture, self.get_max_block_per_batch()), dtype=np.int32)