Test HF cache #6016

Draft · wants to merge 6 commits into main
218 changes: 15 additions & 203 deletions .buildkite/test-pipeline.yaml
@@ -5,216 +5,28 @@
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.


steps:
- label: Regression Test
mirror_hardwares: [amd]
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
#mirror_hardwares: [amd]
command: pytest -v -s async_engine

- label: Basic Correctness Test
mirror_hardwares: [amd]
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
mirror_hardwares: [amd]
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py

- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py

- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
- bash ../.buildkite/download-images.sh
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Distributed Tests (4 GPUs)
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Engine Test
mirror_hardwares: [amd]
command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
mirror_hardwares: [amd]

commands:
- pytest -v -s entrypoints/llm
- pytest -v -s entrypoints/openai

- label: Examples Test
working_dir: "/vllm-workspace/examples"
mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
# install tensorizer for tensorize_vllm_model.py
- pip install awscli tensorizer
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
#mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s test_inputs.py
- pytest -v -s multimodal

- label: Kernels Test %N
#mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Models Test
#mirror_hardwares: [amd]
commands:
- pytest -v -s models -m \"not vlm\"

- label: Vision Language Models Test
mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models -m vlm

- label: Prefix Caching Test
mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching

- label: Samplers Test
#mirror_hardwares: [amd]
command: pytest -v -s samplers

- label: LogitsProcessor Test
mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py

- label: Utils Test
command: pytest -v -s test_utils.py

- label: Worker Test
mirror_hardwares: [amd]
command: pytest -v -s worker

- label: Speculative decoding tests
#mirror_hardwares: [amd]
commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode

- label: LoRA Test %N
#mirror_hardwares: [amd]
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4

- label: LoRA Long Context (Distributed)
#mirror_hardwares: [amd]
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
# FIXIT: find out which code initializes CUDA before running the test
# before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
#mirror_hardwares: [amd]
command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader

- label: Metrics Test
mirror_hardwares: [amd]
command: pytest -v -s metrics

- label: Quantization Test
#mirror_hardwares: [amd]
command: pytest -v -s quantization

- label: Tracing Test
commands:
- "pip install \
opentelemetry-sdk \
opentelemetry-api \
opentelemetry-exporter-otlp \
opentelemetry-semantic-conventions-ai"
- pytest -v -s tracing

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: LM Eval Small Models
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- cd ~/.cache
- pwd
- ls -a
- cd huggingface
- ls -a

- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html

- label: Distributed Tests (A100)
- label: LM Eval Large Models
gpu: a100
num_gpus: 4
commands:
# NOTE: don't test the llama model here; the HF implementation seems buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4
- cd /root/.cache
- ls -a
- cd huggingface
- ls -a
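
The new cache-inspection commands at the end of the LM Eval steps (cd into /root/.cache/huggingface and ls its contents) only list directory names. As a hedged alternative for checking what actually landed in the mounted cache, huggingface_hub ships a cache scanner; the snippet below is a minimal sketch, assuming huggingface_hub is installed in the test image and that HF_HOME points at /root/.cache/huggingface as in the template that follows:

    from huggingface_hub import scan_cache_dir

    # Scan the mounted HF cache and report which repos are present and their size.
    cache_info = scan_cache_dir("/root/.cache/huggingface/hub")
    print(f"total cache size: {cache_info.size_on_disk / 1e9:.2f} GB")
    for repo in sorted(cache_info.repos, key=lambda r: r.repo_id):
        print(f"{repo.repo_type:>8}  {repo.repo_id}  {repo.size_on_disk_str}")
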
110 changes: 110 additions & 0 deletions .buildkite/test-template.j2
@@ -0,0 +1,110 @@
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}
{% set hf_home = "/root/.cache/huggingface" %}

steps:
{% for step in steps %}
{% if step.gpu != "a100" %}
- label: "{{ step.label }}"
priority: 10000
agents:
{% if step.label == "Documentation Build" %}
queue: small_cpu_queue
{% elif step.no_gpu %}
queue: cpu_queue
{% elif step.num_gpus == 2 or step.num_gpus == 4 %}
queue: gpu_4_queue
{% else %}
queue: gpu_1_queue
{% endif %}
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:f17f03744ebabed187634baec601ef35094ae14f
always-pull: true
propagate-environment: true
{% if not step.no_gpu %}
gpus: all
{% endif %}
{% if step.label == "Benchmarks" %}
mount-buildkite-agent: true
{% endif %}
command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
environment:
- VLLM_USAGE_SOURCE=ci-test
- HF_HOME={{ hf_home }}
- HF_TOKEN
{% if step.label == "Speculative decoding tests" %}
- VLLM_ATTENTION_BACKEND=XFORMERS
{% endif %}
volumes:
- /dev/shm:/dev/shm
- {{ hf_home }}:{{ hf_home }}
{% endif %}
{% endfor %}

{% for step in steps %}
{% if step.gpu == "a100" %}
- label: "{{ step.label }}"
priority: 10000
agents:
queue: a100-queue
soft_fail: {{ step.soft_fail or false }}
{% if step.parallelism %}
parallelism: {{ step.parallelism }}
{% endif %}
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
- exit_status: -10 # Agent was lost
limit: 5
plugins:
- kubernetes:
podSpec:
priorityClassName: ci
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:f17f03744ebabed187634baec601ef35094ae14f
command: ["bash"]
args:
- '-c'
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
limits:
nvidia.com/gpu: {{ step.num_gpus or 1 }}
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: {{ hf_home }}
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_HOME
value: {{ hf_home }}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: {{ hf_home }}
type: Directory
{% endif %}
{% endfor %}
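
As noted in the header comment of test-pipeline.yaml, the step definitions are fed through a template like this one to generate the final Buildkite pipeline. Below is a rough local sketch of that rendering, assuming only that the steps list from test-pipeline.yaml is passed to the template as the steps variable; the actual generation script in the buildkite-ci repository may differ:

    import jinja2
    import yaml

    # Load the step definitions and render them through the new template.
    with open(".buildkite/test-pipeline.yaml") as f:
        pipeline = yaml.safe_load(f)

    with open(".buildkite/test-template.j2") as f:
        template = jinja2.Template(f.read())

    # Each step dict carries label/command(s)/num_gpus/gpu/no_gpu, which the
    # template maps to an agent queue, a Docker or Kubernetes plugin, and the
    # shared HF cache mount at /root/.cache/huggingface.
    print(template.render(steps=pipeline["steps"]))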