Skip to content

Commit

Permalink
Merge branch 'vllm-project:main' into support-idefics3
Browse files Browse the repository at this point in the history
  • Loading branch information
jeejeelee authored Nov 1, 2024
2 parents 6620b7c + 031a799 commit 367f31e
Show file tree
Hide file tree
Showing 27 changed files with 270 additions and 116 deletions.
4 changes: 2 additions & 2 deletions .buildkite/run-amd-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes
docker volume prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
Expand Down
17 changes: 14 additions & 3 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# nightly(bool): run this test in nightly pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatible with command.
Expand Down Expand Up @@ -330,18 +331,28 @@ steps:
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py

- label: Decoder-only Multi-Modal Models Test # 1h31min
- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/audio_language -m core_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model

- label: Decoder-only Multi-Modal Models Test (Extended)
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language -m 'not core_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'

- label: Other Models Test # 6min
#mirror_hardwares: [amd]
Expand Down
16 changes: 16 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,19 @@ updates:
directory: "/"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/"
schedule:
interval: "weekly"
labels: ["dependencies"]
open-pull-requests-limit: 5
reviewers: ["khluu", "simon-mo"]
allow:
- dependency-type: "all"
groups:
patch-update:
applies-to: version-updates
update-types: ["patch"]
minor-update:
applies-to: version-updates
update-types: ["minor"]
34 changes: 32 additions & 2 deletions csrc/mamba/causal_conv1d/causal_conv1d.cu
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
}
out += kChunkSize;

int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
// in case the final state is split between the last "smem_exchange" and
// the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
// (which occurs when `final_state_position` is a negative index)
// we load the correct data from smem_exchange in both chunk iterations: the last one and the one before it
if (final_state_position < 0 && seqlen > kWidth){
input_t vals_load[kNElts] = {0};
if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
// chunk = n_chunks - 2, a segment of the final state sits in the last index
reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[kNThreads - 1];
#pragma unroll
for (int w = 0; w < -final_state_position; ++w){
conv_states[w] = vals_load[kNElts + final_state_position + w];
}
}
if ((chunk == n_chunks - 1) && tidx == 0){
// chunk = n_chunks - 1, the second segment of the final state sits in the first index
reinterpret_cast<vec_t *>(vals_load)[0] = smem_exchange[0];
for (int w = -final_state_position; w < kWidth - 1; ++w){
conv_states[w] = vals_load[w + final_state_position];
}
return;
}
}
}
// Final state is stored in the smem_exchange last token slot,
// in case seqlen < kWidth, we would need to take the final state from the
Expand Down Expand Up @@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
}
else {
// in case the final state is in between the threads data
reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
// In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in an
// illegal access error on H100.
// Therefore, we access last_thread + 1 only if the final state data sits there
reinterpret_cast<vec_t *>(x_vals_load)[1] = smem_exchange[last_thread + 1];
}
reinterpret_cast<vec_t *>(x_vals_load)[0] = smem_exchange[last_thread];
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
conv_states[w] = x_vals_load[offset + w ];
Expand Down
7 changes: 5 additions & 2 deletions docs/source/models/supported_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ Text Generation
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
-
- ✅︎
- ✅︎
* - :code:`Qwen2ForCausalLM`
- Qwen2
Expand Down Expand Up @@ -516,7 +516,7 @@ Text Generation
- Qwen-VL
- T + I\ :sup:`E+`
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
-
- ✅︎
- ✅︎
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
Expand All @@ -540,6 +540,9 @@ Text Generation
| :sup:`E` Pre-computed embeddings can be inputted for this modality.
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
.. note::
vLLM currently only supports adding LoRA to the language backbone of multimodal models.

.. note::
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
Expand Down
2 changes: 1 addition & 1 deletion docs/source/serving/compatibility_matrix.rst
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ Feature x Feature
- ✅
- ✅
- ✅
- `<https://github.com/vllm-project/vllm/issues/8985>`__
- `<https://github.com/vllm-project/vllm/issues/9893>`__
- ?
- ✅
- ✅
Expand Down
3 changes: 1 addition & 2 deletions examples/offline_inference_vision_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,9 @@ def run_qwen2_vl(question: str, modality: str):

model_name = "Qwen/Qwen2-VL-7B-Instruct"

# Tested on L40
llm = LLM(
model=model_name,
max_model_len=8192,
max_model_len=4096,
max_num_seqs=5,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={
Expand Down
20 changes: 20 additions & 0 deletions tests/entrypoints/openai/test_prompt_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,23 @@ async def test_out_of_vocab_token_ids():
prompt=[999999],
max_tokens=5,
temperature=0.0)


# Checks that a completion request asking for a JSON-object response format
# is rejected with openai.BadRequestError when the server was launched with
# "--num-scheduler-steps 8", and that the error message mentions both
# "Guided decoding" and "multi-step decoding".
@pytest.mark.asyncio
async def test_reject_multistep_with_guided_decoding():
model_name = "gpt2"
# Launch flags: eager mode plus 8 scheduler steps (presumably what enables
# multi-step decoding on the server -- confirm against the CLI docs).
server_args = ["--enforce-eager", "--num-scheduler-steps", "8"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()

# The request must fail; the regex pins the wording of the error message.
with pytest.raises(openai.BadRequestError,
match=re.compile(
'.*Guided decoding .* multi-step decoding.*')):
await client.completions.create(
model=model_name,
prompt="Hello",
max_tokens=5,
temperature=0.0,
# response_format is passed through extra_body rather than as a named
# argument of completions.create.
extra_body={"response_format": {
"type": "json_object"
}})
7 changes: 5 additions & 2 deletions tests/kernels/test_causal_conv1d.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor,
@pytest.mark.parametrize("has_bias", [True])
@pytest.mark.parametrize("width", [4])
@pytest.mark.parametrize(
'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096])
'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096])
@pytest.mark.parametrize('dim', [64])
@pytest.mark.parametrize('batch', [1])
def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation,
Expand Down Expand Up @@ -420,7 +420,10 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias,

unpadded_out = out[:, :out_ref_tensor.shape[-1]]
assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol)
assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol)
assert torch.allclose(final_states[state_indices],
final_states_ref[state_indices],
rtol=rtol,
atol=atol)

causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(),
padded_state_indices, has_initial_states,
Expand Down
6 changes: 3 additions & 3 deletions tests/kernels/test_mamba_ssm.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,7 +555,7 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
device = "cuda"
rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2)
if itype == torch.bfloat16:
rtol, atol = 7e-2, 7e-2
rtol, atol = 1e-1, 1e-1
if torch.version.hip:
atol *= 2
# set seed
Expand Down Expand Up @@ -610,8 +610,8 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate,
dt_bias=dt_bias,
dt_softplus=True)

print("Output diff max", (out - out_ref[0]).max())
print("Output diff mean", (out - out_ref[0]).mean())
print("Output diff max", (out[:batch_size] - out_ref).max())
print("Output diff mean", (out[:batch_size] - out_ref).mean())
print("Output state diff max", (state[state_indices, :] - state_ref).max())
print("Output state diff mean",
(state[state_indices, :] - state_ref).mean())
Expand Down
2 changes: 2 additions & 0 deletions tests/models/decoder_only/audio_language/test_ultravox.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ def run_multi_audio_test(
assert all(tokens for tokens, *_ in vllm_outputs)


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
Expand All @@ -178,6 +179,7 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int,
)


@pytest.mark.core_model
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


# Fixtures lazy import to avoid initializing CUDA during test collection
# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple
# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple
# input mappers.
@pytest.fixture()
def image_input_mapper_for_qwen2_vl():
Expand Down
Loading

0 comments on commit 367f31e

Please sign in to comment.