Merge remote-tracking branch 'origin/main' into kv_cache_dynamic_comp…

…utation_upstream
ROCm · Jan 7, 2025 · 64668c6 · 64668c6
2 parents 6ad050e + 869579a
commit 64668c6
Show file tree

Hide file tree

Showing 556 changed files with 28,215 additions and 17,528 deletions.
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
@@ -0,0 +1,24 @@
+import argparse
+import os
+
+template = """<!DOCTYPE html>
+<html>
+    <body>
+    <h1>Links for vLLM</h1/>
+        <a href="../{wheel_html_escaped}">{wheel}</a><br/>
+    </body>
+</html>
+"""
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--wheel", help="The wheel path.", required=True)
+args = parser.parse_args()
+
+filename = os.path.basename(args.wheel)
+
+with open("index.html", "w") as f:
+    print(f"Generated index.html for {args.wheel}")
+    # cloudfront requires escaping the '+' character
+    f.write(
+        template.format(wheel=filename,
+                        wheel_html_escaped=filename.replace("+", "%2B")))
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -1,5 +1,6 @@
 steps:
   - label: "Wait for container to be ready"
+    key: wait-for-container-image
     agents:
       queue: A100
     plugins:
@@ -10,12 +11,11 @@ steps:
             command:
             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
 
-  - wait
-
   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
+    depends_on: wait-for-container-image
     plugins:
     - kubernetes:
         podSpec:
@@ -49,6 +49,7 @@ steps:
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H200
+    depends_on: wait-for-container-image
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -65,15 +66,15 @@ steps:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
 
-  - block: "Run H100 Benchmark"
-    key: block-h100
-    depends_on: ~
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
 
   - label: "H100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: block-h100
+    depends_on: wait-for-container-image
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT

diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
@@ -55,3 +55,18 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+
+  - block: "Build CPU release image"
+    key: block-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish CPU release image"
+    depends_on: block-cpu-release-image-build
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+    env:
+      DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
@@ -9,31 +9,31 @@ CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
 # Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test-"$BUILDKITE_BUILD_NUMBER" -f Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() { docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE"  \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
- --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+ --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
 
   # offline inference
-  docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
     python3 examples/offline_inference.py"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pip install pytest pytest-asyncio \
       decord einops librosa peft Pillow sentence-transformers soundfile \
@@ -46,26 +46,26 @@ function cpu_tests() {
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
     tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v -k cpu_model \
     tests/basic_correctness/test_chunked_prefill.py"  
 
   # online inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10 
     export VLLM_CPU_OMP_THREADS_BIND=$1

diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
@@ -4,6 +4,9 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
+python3 use_existing_torch.py
+
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
   --target vllm-openai \

diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker image prune -f
         docker system prune -f
+        rm -rf "${HF_MOUNT:?}/*"
+        rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-       --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
+       -v "${HF_CACHE}:${HF_MOUNT}" \
+       -e "HF_HOME=${HF_MOUNT}" \
+       -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+       -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+       --name "${container_name}" \
+       ${image_name} \
+       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference_neuron.py"
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -106,14 +106,12 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
-  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s entrypoints/test_chat_utils.py
   - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -244,7 +242,7 @@ steps:
   source_file_dependencies:
   - vllm/lora
   - tests/lora
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
   parallelism: 4
 
 - label: "PyTorch Fullgraph Smoke Test" # 9min
@@ -333,8 +331,6 @@ steps:
   - vllm/
   - tests/models
   commands:
-    - pip install -e ./plugins/vllm_add_dummy_model
-    - pytest -v -s models/test_oot_registration.py # it needs a clean process
     - pytest -v -s models/test_registry.py
     - pytest -v -s models/test_initialization.py
 
@@ -360,23 +356,25 @@ steps:
     - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
     - pytest -v -s models/embedding/language -m 'not core_model'
 
-- label: Multi-Modal Models Test (Standard) # 28min
+- label: Multi-Modal Models Test (Standard) # 40min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/audio_language
   - tests/models/decoder_only/vision_language
   - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
   - tests/models/encoder_decoder/vision_language
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
     - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
     - pytest -v -s models/embedding/vision_language -m core_model
+    - pytest -v -s models/encoder_decoder/audio_language -m core_model
     - pytest -v -s models/encoder_decoder/language -m core_model
     - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-- label: Multi-Modal Models Test (Extended) 1 # 1h16m
+- label: Multi-Modal Models Test (Extended) 1 # 48m
   optional: true
   source_file_dependencies:
   - vllm/
@@ -469,11 +467,28 @@ steps:
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-  - pip install -e ./plugins/vllm_add_dummy_model
-  - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
+- label: Plugin Tests (2 GPUs) # 40min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  fast_check: true
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # other tests continue here:
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
+
 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
@@ -520,6 +535,7 @@ steps:
     # requires multi-GPU testing for validation.
     - pytest -v -s -x lora/test_chatglm3_tp.py
     - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
 
 
 - label: Weight Loading Multiple GPU Test  # 33min

diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh
@@ -23,6 +23,8 @@ wheel="$new_wheel"
 version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
 echo "Version: $version"
 
+normal_wheel="$wheel" # Save the original wheel filename
+
 # If the version contains "dev", rename it to v1.0.0.dev for consistency
 if [[ $version == *dev* ]]; then
     suffix="${version##*.}"
@@ -32,12 +34,38 @@ if [[ $version == *dev* ]]; then
         new_version="1.0.0.dev"
     fi
     new_wheel="${wheel/$version/$new_version}"
-    mv -- "$wheel" "$new_wheel"
+    # use cp to keep both files in the artifacts directory
+    cp -- "$wheel" "$new_wheel"
     wheel="$new_wheel"
     version="$new_version"
 fi
 
 # Upload the wheel to S3
+python3 .buildkite/generate_index.py --wheel "$normal_wheel"
+
+# generate index for this commit
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
+    aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+fi
+
+# generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
+aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
+
+if [[ $normal_wheel == *"cu118"* ]]; then
+    # if $normal_wheel matches cu118, do not upload the index.html
+    echo "Skipping index files for cu118 wheels"
+else
+    # only upload index.html for cu12 wheels (default wheels)
+    aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+fi
+
 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml → .github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml → .github/ISSUE_TEMPLATE/400-bug-report.yml
diff --git a/...ub/ISSUE_TEMPLATE/500-feature request.yml → ...ub/ISSUE_TEMPLATE/500-feature-request.yml b/...ub/ISSUE_TEMPLATE/500-feature request.yml → ...ub/ISSUE_TEMPLATE/500-feature-request.yml