[CI/Build] Split up models tests (vllm-project#10069)
Signed-off-by: DarkLight1337 <[email protected]>
DarkLight1337 authored Nov 9, 2024
1 parent b09895a commit 51c2e1f
Showing 21 changed files with 115 additions and 129 deletions.
24 changes: 14 additions & 10 deletions .buildkite/test-pipeline.yaml
@@ -305,7 +305,7 @@ steps:

##### models test #####

- label: Basic Models Test # 3min
- label: Basic Models Test # 10min
source_file_dependencies:
- vllm/
- tests/models
@@ -314,23 +314,24 @@
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py

- label: Decoder-only Language Models Test (Standard) # 35min
- label: Decoder-only Language Models Test (Standard) # 18min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language/test_models.py
- pytest -v -s models/decoder_only/language -m core_model
- pytest -v -s models/decoder_only/language -m quant_model

- label: Decoder-only Language Models Test (Extended) # 1h20min
- label: Decoder-only Language Models Test (Extended) # 46min
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'

- label: Decoder-only Multi-Modal Models Test (Standard) # 26min
- label: Decoder-only Multi-Modal Models Test (Standard) # 22min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -339,21 +340,24 @@
commands:
- pytest -v -s models/decoder_only/audio_language -m core_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
# No tests under this group for now
# - pytest -v -s models/decoder_only/audio_language -m quant_model
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model

- label: Decoder-only Multi-Modal Models Test (Extended)
- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- pytest -v -s models/decoder_only/audio_language -m 'not core_model'
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'

- label: Other Models Test # 6min
- label: Other Models Test # 20min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
1 change: 1 addition & 0 deletions pyproject.toml
@@ -95,6 +95,7 @@ markers = [
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"cpu_model: enable this model test in CPU tests",
"quant_model: run this model test under Quantized category",
"distributed_2_gpus: run this test only in distributed tests for 2 GPUs",
"skip_v1: do not run this test with v1",
]
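For context, a minimal illustration of how these markers are meant to be used (the test names below are hypothetical; the marker names and the -m selection expressions are the ones registered above and used in .buildkite/test-pipeline.yaml):

import pytest


@pytest.mark.core_model
def test_small_model():
    # Selected per-PR by the "Standard" groups via -m core_model.
    ...


@pytest.mark.quant_model
def test_quantized_model():
    # Selected by the quantized pass of the "Standard" groups via -m quant_model.
    ...


def test_remaining_model():
    # Unmarked tests are only matched by the nightly "Extended" groups,
    # which run -m 'not core_model and not quant_model'.
    ...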
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_aqlm.py
@@ -38,6 +38,7 @@
]


@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_fp8.py
@@ -15,6 +15,7 @@
os.environ["TOKENIZERS_PARALLELISM"] = "true"


@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize(
35 changes: 16 additions & 19 deletions tests/models/decoder_only/language/test_gguf.py
@@ -17,26 +17,21 @@

MAX_MODEL_LEN = 1024

# FIXME: Move this to confest
MODELS = [
("meta-llama/Llama-3.2-1B-Instruct",
hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
("meta-llama/Llama-3.2-1B-Instruct",
hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
]


@pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
("meta-llama/Llama-3.2-1B-Instruct",
"bartowski/Llama-3.2-1B-Instruct-GGUF",
"Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
("meta-llama/Llama-3.2-1B-Instruct",
"bartowski/Llama-3.2-1B-Instruct-GGUF",
"Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF",
"qwen2-1_5b-instruct-q4_k_m.gguf"),
("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
"Qwen2-1.5B-Instruct.IQ4_XS.gguf"),
])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -45,7 +40,9 @@ def test_models(
num_gpus_available,
vllm_runner,
example_prompts,
model,
original_model,
gguf_id,
gguf_path,
dtype: str,
max_tokens: int,
num_logprobs: int,
@@ -54,7 +51,7 @@
if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

original_model, gguf_model = model
gguf_model = hf_hub_download(gguf_id, filename=gguf_path)

tokenizer = AutoTokenizer.from_pretrained(original_model)
messages = [[{
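A condensed sketch of the resulting pattern (only the parametrize tuple and the hf_hub_download call are taken from the diff; everything else is elided or assumed): by parametrizing on (original_model, gguf_id, gguf_path) and downloading inside the test body, the GGUF checkpoint is only fetched when a test case is actually selected, rather than at module import/collection time as with the old module-level MODELS list.

import pytest
from huggingface_hub import hf_hub_download


@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [
    ("meta-llama/Llama-3.2-1B-Instruct",
     "bartowski/Llama-3.2-1B-Instruct-GGUF",
     "Llama-3.2-1B-Instruct-Q4_K_M.gguf"),
])
def test_models(original_model, gguf_id, gguf_path):
    # The download now happens inside the selected test, not at import time.
    gguf_model = hf_hub_download(gguf_id, filename=gguf_path)
    ...  # load original_model and gguf_model, then compare outputs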
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_gptq_marlin.py
@@ -33,6 +33,7 @@
]


@pytest.mark.quant_model
@pytest.mark.flaky(reruns=3)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="gptq_marlin is not supported on this GPU type.")
1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -38,6 +38,7 @@ class ModelPair:
]


@pytest.mark.quant_model
@pytest.mark.flaky(reruns=2)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"),
reason="Marlin24 is not supported on this GPU type.")
3 changes: 2 additions & 1 deletion tests/models/decoder_only/language/test_granite.py
@@ -7,7 +7,9 @@
from ...utils import check_logprobs_close

MODELS = [
# TODO(sang): Sliding window should be tested separately.
"ibm/PowerLM-3b",
"ibm/PowerMoE-3b",
]


@@ -24,7 +26,6 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
# TODO(sang): Sliding window should be tested separately.
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy_logprobs_limit(
example_prompts, max_tokens, num_logprobs)
39 changes: 0 additions & 39 deletions tests/models/decoder_only/language/test_granitemoe.py

This file was deleted.

1 change: 1 addition & 0 deletions tests/models/decoder_only/language/test_modelopt.py
@@ -39,6 +39,7 @@
@pytest.mark.skip(
reason=
"Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS)
4 changes: 1 addition & 3 deletions tests/models/decoder_only/language/test_models.py
@@ -1,8 +1,5 @@
"""Compare the outputs of HF and vLLM when using greedy sampling.
This test only tests small models. Big models such as 7B should be tested from
test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_models.py`.
"""
import pytest
@@ -35,6 +32,7 @@
target_dtype = "half"


@pytest.mark.core_model
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
@@ -56,11 +56,13 @@ def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next,
ctx.model_config.hf_config.image_grid_pinpoints = gridpoints
seq_len = 5000 # bigger than the max feature size for any image

seq_data, mm_data = dummy_data_for_llava_next(
dummy_data = dummy_data_for_llava_next(
ctx,
seq_len=seq_len,
mm_counts={"image": 1},
)
seq_data = dummy_data.seq_data
mm_data = dummy_data.multi_modal_data

# The dummy data dims should match the gridpoint with the biggest feat size
assert mm_data["image"].height == expected_size[0]
@@ -131,12 +131,13 @@ def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
mm_processor_kwargs=None,
)

sequence_data, _, = dummy_data_for_phi3v(
dummy_data = dummy_data_for_phi3v(
ctx=ctx,
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
mm_counts={"image": num_imgs},
num_crops=num_crops,
)
sequence_data = dummy_data.seq_data
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID)
assert img_tok_count == toks_per_img * num_imgs
@@ -86,10 +86,17 @@ def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl,

# NOTE: video value is required, but isn't actually used
# when making the dummy data except for error handling currently
seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, {
"image": 1,
"video": 0
}, **mm_processor_kwargs)
dummy_data = dummy_data_for_qwen2_vl(
ctx=qwen2_vl_context,
seq_len=seq_len,
mm_counts={
"image": 1,
"video": 0
},
**mm_processor_kwargs,
)
seq_data = dummy_data.seq_data
mm_data = dummy_data.multi_modal_data

# Ensure we have the right number of placeholders for min/max pixel values
assert seq_data.get_token_ids().count(image_token_id) == token_count
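The three hunks above all make the same call-site change: the dummy-data factories now return a single object exposing seq_data and multi_modal_data instead of a (seq_data, mm_data) tuple. A minimal stand-in sketch of that access pattern (the DummyData class and dummy_data_factory below are hypothetical placeholders, not vLLM's actual types):

from dataclasses import dataclass
from typing import Any, Dict


@dataclass
class DummyData:
    # Hypothetical stand-in for the factories' return type.
    seq_data: Any
    multi_modal_data: Dict[str, Any]


def dummy_data_factory(seq_len: int, mm_counts: Dict[str, int]) -> DummyData:
    # Toy factory; the real ones build model-specific token and image data.
    return DummyData(seq_data=list(range(seq_len)),
                     multi_modal_data={name: None for name in mm_counts})


dummy_data = dummy_data_factory(seq_len=16, mm_counts={"image": 1})
seq_data = dummy_data.seq_data              # was: seq_data, mm_data = factory(...)
mm_data = dummy_data.multi_modal_data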
@@ -1,4 +1,4 @@
from typing import List, Optional, Tuple, Type
from typing import List, Optional, Type

import pytest
import torch
@@ -19,7 +19,8 @@
def run_awq_test(
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
models: Tuple[str, str],
source_model: str,
quant_model: str,
*,
size_factors: List[float],
dtype: str,
@@ -28,8 +29,6 @@ def run_awq_test(
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
source_model, quant_model = models

images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
@@ -84,8 +83,11 @@ def run_awq_test(
)


@pytest.mark.quant_model
@pytest.mark.parametrize(
"models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")])
("source_model", "quant_model"),
[("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")],
)
@pytest.mark.parametrize(
"size_factors",
[
@@ -103,12 +105,13 @@ def run_awq_test(
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@torch.inference_mode()
def test_awq_models(vllm_runner, image_assets, models, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
def test_awq_models(vllm_runner, image_assets, source_model, quant_model,
size_factors, dtype, max_tokens, num_logprobs) -> None:
run_awq_test(
vllm_runner,
image_assets,
models,
source_model,
quant_model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,