[CI/Build] Update models tests & examples (vllm-project#8874)
Co-authored-by: Roger Wang <[email protected]>
2 people authored and MengqingCao committed Sep 30, 2024
1 parent 6a3b832 commit 7c28572
Showing 13 changed files with 239 additions and 189 deletions.
51 changes: 32 additions & 19 deletions .buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatible with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -39,7 +40,7 @@ steps:
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html

-- label: Async Engine, Inputs, Utils, Worker Test # 15min
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
source_file_dependencies:
- vllm/
@@ -81,7 +82,7 @@ steps:
commands:
- pytest -v -s core

-- label: Entrypoints Test # 20min
+- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
fast_check: true
mirror_hardwares: [amd]
@@ -151,7 +152,7 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization

-- label: Examples Test # 12min
+- label: Examples Test # 15min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
@@ -169,15 +170,15 @@ steps:
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py

-- label: Prefix Caching Test # 7min
+- label: Prefix Caching Test # 9min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching

-- label: Samplers Test # 18min
+- label: Samplers Test # 36min
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
@@ -193,7 +194,7 @@ steps:
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py

-- label: Speculative decoding tests # 22min
+- label: Speculative decoding tests # 30min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
@@ -203,30 +204,30 @@ steps:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

-- label: LoRA Test %N # 30min each
+- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4

-- label: "PyTorch Fullgraph Smoke Test"
+- label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py

-- label: "PyTorch Fullgraph Test"
+- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py

-- label: Kernels Test %N # 30min each
+- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
@@ -256,7 +257,7 @@ steps:
- pip install aiohttp
- bash run-benchmarks.sh

-- label: Quantization Test # 15min
+- label: Quantization Test # 33min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
@@ -300,15 +301,15 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py

-- label: Decoder-only Language Models Test # 1h3min
+- label: Decoder-only Language Models Test # 1h36min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language

-- label: Decoder-only Multi-Modal Models Test # 56min
+- label: Decoder-only Multi-Modal Models Test # 1h31min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -318,15 +319,25 @@ steps:
- pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/vision_language

-- label: Other Models Test # 5min
+- label: Other Models Test # 6min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/encoder_decoder/language
+- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
- pytest -v -s models/encoder_decoder/language
+- pytest -v -s models/encoder_decoder/vision_language

+- label: Custom Models Test
+#mirror_hardwares: [amd]
+optional: true
+commands:
+# PR authors can temporarily add commands below to test individual models
+# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

##### 1 GPU test #####
##### multi gpus test #####
Expand Down Expand Up @@ -359,7 +370,7 @@ steps:
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

-- label: Distributed Tests (2 GPUs) # 28min
+- label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -376,14 +387,16 @@ steps:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
-- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
+- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

-- label: Multi-step Tests (4 GPUs) # 21min
+- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -401,7 +414,7 @@ steps:
- pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py

-- label: Pipeline Parallelism Test # 23min
+- label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -427,7 +440,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py

-- label: Weight Loading Multiple GPU Test
+- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
28 changes: 19 additions & 9 deletions examples/offline_inference_vision_language.py
@@ -12,14 +12,18 @@
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser

+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.


# LLaVA-1.5
def run_llava(question, modality):
assert modality == "image"

prompt = f"USER: <image>\n{question}\nASSISTANT:"

-llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)
stop_token_ids = None
return llm, prompt, stop_token_ids

@@ -57,7 +61,7 @@ def run_llava_onevision(question, modality):
<|im_start|>assistant\n"

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
-max_model_len=32768)
+max_model_len=16384)
stop_token_ids = None
return llm, prompt, stop_token_ids

@@ -67,7 +71,7 @@ def run_fuyu(question, modality):
assert modality == "image"

prompt = f"{question}\n"
-llm = LLM(model="adept/fuyu-8b")
+llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2)
stop_token_ids = None
return llm, prompt, stop_token_ids

@@ -99,7 +103,8 @@ def run_phi3v(question, modality):
llm = LLM(
model="microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True,
-max_num_seqs=5,
+max_model_len=4096,
+max_num_seqs=2,
mm_processor_kwargs={"num_crops": 16},
)
stop_token_ids = None
@@ -122,7 +127,7 @@ def run_chameleon(question, modality):
assert modality == "image"

prompt = f"{question}<image>"
-llm = LLM(model="facebook/chameleon-7b")
+llm = LLM(model="facebook/chameleon-7b", max_model_len=4096)
stop_token_ids = None
return llm, prompt, stop_token_ids

@@ -145,6 +150,8 @@ def run_minicpmv(question, modality):
trust_remote_code=True)
llm = LLM(
model=model_name,
+max_model_len=4096,
+max_num_seqs=2,
trust_remote_code=True,
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
@@ -177,7 +184,7 @@ def run_internvl(question, modality):
llm = LLM(
model=model_name,
trust_remote_code=True,
-max_num_seqs=5,
+max_model_len=4096,
)

tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -215,7 +222,8 @@ def run_qwen_vl(question, modality):
llm = LLM(
model="Qwen/Qwen-VL",
trust_remote_code=True,
-max_num_seqs=5,
+max_model_len=1024,
+max_num_seqs=2,
)

prompt = f"{question}Picture 1: <img></img>\n"
@@ -229,8 +237,10 @@ def run_qwen2_vl(question, modality):

model_name = "Qwen/Qwen2-VL-7B-Instruct"

+# Tested on L40
llm = LLM(
model=model_name,
+max_model_len=8192,
max_num_seqs=5,
)

@@ -252,10 +262,10 @@ def run_mllama(question, modality):
# max_model_len (131072) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.

-# The configuration below has been confirmed to launch on a
-# single H100 GPU.
+# The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM(
model=model_name,
+max_model_len=4096,
max_num_seqs=16,
enforce_eager=True,
)
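The changes to this file all follow one pattern: each example now caps max_model_len and/or max_num_seqs when constructing the LLM so the script fits on a single L4 GPU. Below is a minimal sketch of that pattern, reusing the llava-hf/llava-1.5-7b-hf model and max_model_len=4096 from run_llava() above; the prompt, sampling parameters, bundled image asset, and the max_num_seqs value are illustrative assumptions, not part of this commit.

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Cap the context length (and optionally the batch size) so the KV cache
    # fits on a lower-end GPU, as the updated examples above do.
    llm = LLM(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        max_num_seqs=2,  # assumption: run_llava() itself only caps max_model_len
    )

    prompt = "USER: <image>\nWhat is shown in this image?\nASSISTANT:"
    image = ImageAsset("stop_sign").pil_image.convert("RGB")

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": image}},
        SamplingParams(temperature=0.2, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)

Lowering max_model_len shrinks the KV-cache allocation and lowering max_num_seqs bounds how many requests are batched together, which is what keeps these examples within a lower-end GPU's memory.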
13 changes: 10 additions & 3 deletions examples/offline_inference_vision_language_multi_image.py
@@ -28,12 +28,18 @@ class ModelRequestData(NamedTuple):
chat_template: Optional[str]


+# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
+# lower-end GPUs.
+# Unless specified, these settings have been tested to work on a single L4.


def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "Qwen/Qwen-VL-Chat"
llm = LLM(
model=model_name,
trust_remote_code=True,
-max_num_seqs=5,
+max_model_len=1024,
+max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
placeholders = "".join(f"Picture {i}: <img></img>\n"
@@ -83,6 +89,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True,
max_model_len=4096,
+max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"num_crops": 4},
)
@@ -106,7 +113,6 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
llm = LLM(
model=model_name,
trust_remote_code=True,
-max_num_seqs=5,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
)
@@ -148,10 +154,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:

model_name = "Qwen/Qwen2-VL-7B-Instruct"

+# Tested on L40
llm = LLM(
model=model_name,
-max_num_seqs=5,
max_model_len=32768 if process_vision_info is None else 4096,
+max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)},
)

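The multi-image examples apply the same memory caps and additionally pass limit_mm_per_prompt so a single prompt may reference several images. Below is a minimal sketch using the Phi-3.5-vision settings from load_phi3v() above; the question, sampling parameters, and the bundled test images are illustrative assumptions, not part of this commit.

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Two of vLLM's bundled test images stand in for the image URLs that the
    # example script downloads.
    images = [
        ImageAsset("cherry_blossom").pil_image.convert("RGB"),
        ImageAsset("stop_sign").pil_image.convert("RGB"),
    ]

    llm = LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(images)},  # allow two images per prompt
        mm_processor_kwargs={"num_crops": 4},
    )

    # Phi-3.5-vision expects numbered <|image_i|> placeholders in the prompt.
    placeholders = "\n".join(f"<|image_{i}|>" for i in range(1, len(images) + 1))
    prompt = f"<|user|>\n{placeholders}\nWhat do these images have in common?<|end|>\n<|assistant|>\n"

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": images}},
        SamplingParams(temperature=0.0, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)

vLLM defaults to one image per prompt, so without limit_mm_per_prompt a request that attaches more images would be rejected.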