diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c51702886f3..2c7e4ae5e27 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -5,216 +5,28 @@
 # https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
 # to generate the final pipeline yaml file.
-
 steps:
-- label: Regression Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s test_regression.py
-  working_dir: "/vllm-workspace/tests" # optional
-
-- label: AsyncEngine Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s async_engine
-
-- label: Basic Correctness Test
-  mirror_hardwares: [amd]
-  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
-- label: Core Test
-  mirror_hardwares: [amd]
-  commands:
-  - pytest -v -s core
-  - pytest -v -s distributed/test_parallel_state.py
-
-- label: Distributed Comm Ops Test
-  #mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  commands:
-  - pytest -v -s distributed/test_comm_ops.py
-  - pytest -v -s distributed/test_shm_broadcast.py
-
-- label: Distributed Tests (2 GPUs)
-  mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 2
-  commands:
-  - bash ../.buildkite/download-images.sh
-  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
-
-- label: Distributed Tests (4 GPUs)
-  #mirror_hardwares: [amd]
-  working_dir: "/vllm-workspace/tests"
-  num_gpus: 4
-  commands:
-  - pytest -v -s distributed/test_pynccl.py
-  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
-  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-
-- label: Engine Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
-
-- label: Entrypoints Test
-  mirror_hardwares: [amd]
-
-  commands:
-  - pytest -v -s entrypoints/llm
-  - pytest -v -s entrypoints/openai
-
-- label: Examples Test
-  working_dir: "/vllm-workspace/examples"
-  mirror_hardwares: [amd]
-  commands:
-  # install aws cli for llava_example.py
-  # install tensorizer for tensorize_vllm_model.py
-  - pip install awscli tensorizer
-  - python3 offline_inference.py
-  - python3 offline_inference_with_prefix.py
-  - python3 llm_engine_example.py
-  - python3 llava_example.py
-  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-
-- label: Inputs Test
-  #mirror_hardwares: [amd]
-  commands:
-  - bash ../.buildkite/download-images.sh
-  - pytest -v -s test_inputs.py
-  - pytest -v -s multimodal
-
-- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
-
-- label: Models Test
-  #mirror_hardwares: [amd]
-  commands:
-  - pytest -v -s models -m \"not vlm\"
-
-- label: Vision Language Models Test
-  mirror_hardwares: [amd]
-  commands:
-  - bash ../.buildkite/download-images.sh
-  - pytest -v -s models -m vlm
-
-- label: Prefix Caching Test
-  mirror_hardwares: [amd]
-  commands:
-  - pytest -v -s prefix_caching
-
-- label: Samplers Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s samplers
-
-- label: LogitsProcessor Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s test_logits_processor.py
-
-- label: Utils Test
-  command: pytest -v -s test_utils.py
-
-- label: Worker Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s worker
-
-- label: Speculative decoding tests
-  #mirror_hardwares: [amd]
-  commands:
-  # See https://github.com/vllm-project/vllm/issues/5152
-  - export VLLM_ATTENTION_BACKEND=XFORMERS
-  - pytest -v -s spec_decode
-
-- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
-
-- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s -x lora/test_long_context.py
-
-- label: Tensorizer Test
-  #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
-
-- label: Metrics Test
-  mirror_hardwares: [amd]
-  command: pytest -v -s metrics
-
-- label: Quantization Test
-  #mirror_hardwares: [amd]
-  command: pytest -v -s quantization
-
-- label: Tracing Test
-  commands:
-  - "pip install \
-    opentelemetry-sdk \
-    opentelemetry-api \
-    opentelemetry-exporter-otlp \
-    opentelemetry-semantic-conventions-ai"
-  - pytest -v -s tracing
-
-- label: Benchmarks
-  working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
-  commands:
-  - pip install aiohttp
-  - bash run-benchmarks.sh
 
 - label: LM Eval Small Models
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   commands:
   - pip install lm-eval
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - bash ./run-tests.sh -c configs/models-small.txt -t 1
+  - cd ~/.cache
+  - pwd
+  - ls -a
+  - cd huggingface
+  - ls -a
 
-- label: Documentation Build
-  working_dir: "/vllm-workspace/test_docs/docs"
-  no_gpu: True
-  commands:
-  - pip install -r requirements-docs.txt
-  - SPHINXOPTS=\"-W\" make html
-
-- label: Distributed Tests (A100)
+- label: LM Eval Large Models
   gpu: a100
   num_gpus: 4
-  commands:
-  # NOTE: don't test llama model here, it seems hf implementation is buggy
-  # see https://github.com/vllm-project/vllm/pull/5689 for details
-  - pytest -v -s distributed/test_custom_all_reduce.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - pytest -v -s -x lora/test_mixtral.py
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+  - cd /root/.cache
+  - ls -a
+  - cd huggingface
+  - ls -a
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
new file mode 100644
index 00000000000..cd498e296a0
--- /dev/null
+++ b/.buildkite/test-template.j2
@@ -0,0 +1,110 @@
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+{% set hf_home = "/root/.cache/huggingface" %}
+
+steps:
+  {% for step in steps %}
+  {% if step.gpu != "a100" %}
+  - label: "{{ step.label }}"
+    priority: 10000
+    agents:
+      {% if step.label == "Documentation Build" %}
+      queue: small_cpu_queue
+      {% elif step.no_gpu %}
+      queue: cpu_queue
+      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
+      queue: gpu_4_queue
+      {% else %}
+      queue: gpu_1_queue
+      {% endif %}
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 5
+        - exit_status: -10 # Agent was lost
+          limit: 5
+    plugins:
+    - docker#v5.2.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:f17f03744ebabed187634baec601ef35094ae14f
+        always-pull: true
+        propagate-environment: true
+        {% if not step.no_gpu %}
+        gpus: all
+        {% endif %}
+        {% if step.label == "Benchmarks" %}
+        mount-buildkite-agent: true
+        {% endif %}
+        command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
+        environment:
+        - VLLM_USAGE_SOURCE=ci-test
+        - HF_HOME={{ hf_home }}
+        - HF_TOKEN
+        {% if step.label == "Speculative decoding tests" %}
+        - VLLM_ATTENTION_BACKEND=XFORMERS
+        {% endif %}
+        volumes:
+        - /dev/shm:/dev/shm
+        - {{ hf_home }}:{{ hf_home }}
+  {% endif %}
+  {% endfor %}
+
+  {% for step in steps %}
+  {% if step.gpu == "a100" %}
+  - label: "{{ step.label }}"
+    priority: 10000
+    agents:
+      queue: a100-queue
+    soft_fail: {{ step.soft_fail or false }}
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 5
+        - exit_status: -10 # Agent was lost
+          limit: 5
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: ci
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:f17f03744ebabed187634baec601ef35094ae14f
+            command: ["bash"]
+            args:
+            - '-c'
+            - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
+            resources:
+              limits:
+                nvidia.com/gpu: {{ step.num_gpus or 1 }}
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: {{ hf_home }}
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: {{ hf_home }}
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: {{ hf_home }}
+              type: Directory
+  {% endif %}
+  {% endfor %}
\ No newline at end of file
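
The modified test-pipeline.yaml is not uploaded to Buildkite directly: as its header comment notes, the `steps` list is fed through a Jinja2 template (test-template-aws.j2 in vllm-project/buildkite-ci, or the test-template.j2 added in this diff) to produce the final pipeline. A minimal rendering sketch, assuming jinja2 and PyYAML are available; the script name and invocation are illustrative, not the actual CI generator:

    # render_pipeline.py: illustrative sketch only, not the real generator script.
    import jinja2
    import yaml

    # Read the step definitions from the test pipeline.
    with open(".buildkite/test-pipeline.yaml") as f:
        steps = yaml.safe_load(f)["steps"]

    # Render the Jinja template against those steps.
    with open(".buildkite/test-template.j2") as f:
        template = jinja2.Template(f.read())

    # The rendered YAML is what would be handed to Buildkite,
    # e.g. via `buildkite-agent pipeline upload`.
    print(template.render(steps=steps))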