diff --git a/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
new file mode 100644
index 0000000000000..15268395ec68b
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/DeepSeek-V2-Lite-Chat.yaml
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
+model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
+tasks:
+- name: "gsm8k"
+ metrics:
+ - name: "exact_match,strict-match"
+ value: 0.671
+ - name: "exact_match,flexible-extract"
+ value: 0.664
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
index e40f42a17c18f..374171f1f915b 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
@@ -1,11 +1,11 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 250 -f 5 -t 1
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
- value: 0.752
+ value: 0.755
- name: "exact_match,flexible-extract"
- value: 0.752
-limit: 250
+ value: 0.755
+limit: 1000
num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
index 7a89e8e0c76f2..dc36b705634f9 100644
--- a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
+++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -4,8 +4,8 @@ tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
- value: 0.756
+ value: 0.753
- name: "exact_match,flexible-extract"
- value: 0.752
-limit: 250
+ value: 0.753
+limit: 1000
num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
new file mode 100644
index 0000000000000..43ff2bc5ce35e
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
+model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
+tasks:
+- name: "gsm8k"
+ metrics:
+ - name: "exact_match,strict-match"
+ value: 0.593
+ - name: "exact_match,flexible-extract"
+ value: 0.588
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
new file mode 100644
index 0000000000000..259799ba8bfa9
--- /dev/null
+++ b/.buildkite/lm-eval-harness/configs/Qwen2-1.5B-Instruct-W8A16-compressed-tensors.yaml
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
+model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+tasks:
+- name: "gsm8k"
+ metrics:
+ - name: "exact_match,strict-match"
+ value: 0.595
+ - name: "exact_match,flexible-extract"
+ value: 0.582
+limit: 1000
+num_fewshot: 5
diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt
index 2007dd2e1cfa1..94b15a87235b9 100644
--- a/.buildkite/lm-eval-harness/configs/models-large.txt
+++ b/.buildkite/lm-eval-harness/configs/models-large.txt
@@ -1,3 +1,4 @@
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14-Instruct.yaml
+DeepSeek-V2-Lite-Chat.yaml
diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt
index 3300ca64f44b8..3d1306f6bc4f1 100644
--- a/.buildkite/lm-eval-harness/configs/models-small.txt
+++ b/.buildkite/lm-eval-harness/configs/models-small.txt
@@ -2,3 +2,4 @@ Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
+Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 933733e9c1edf..2f04cc1283df3 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -3,7 +3,7 @@
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.2
+# pip install lm-eval==0.4.3
usage() {
echo``
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done
lm_eval --model vllm \
- --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true \
+ --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
--tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
--batch_size $BATCH_SIZE
diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 4036b32a46bf7..c84e150934306 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -1,5 +1,6 @@
# vLLM benchmark suite
+
## Introduction
This directory contains the performance benchmarking CI for vllm.
diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index 2b25c954b5c5c..02c0ee534d72c 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -11,7 +11,7 @@ steps:
- sh
- .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- wait
- - label: "A100 Benchmark"
+ - label: "A100"
agents:
queue: A100
plugins:
@@ -42,21 +42,20 @@ steps:
- name: devshm
emptyDir:
medium: Memory
- # - label: "H100: NVIDIA SMI"
- # agents:
- # queue: H100
- # plugins:
- # - docker#v5.11.0:
- # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- # command:
- # - bash
- # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
- # mount-buildkite-agent: true
- # propagate-environment: true
- # propagate-uid-gid: false
- # ipc: host
- # gpus: all
- # environment:
- # - VLLM_USAGE_SOURCE
- # - HF_TOKEN
+ - label: "H100"
+ agents:
+ queue: H100
+ plugins:
+ - docker#v5.11.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ command:
+ - bash
+ - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+ mount-buildkite-agent: true
+ propagate-environment: true
+ ipc: host
+ gpus: all
+ environment:
+ - VLLM_USAGE_SOURCE
+ - HF_TOKEN
diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
deleted file mode 100755
index 15d411febcee1..0000000000000
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env bash
-
-# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
-set -euo pipefail
-
-# Install system packages
-apt update
-apt install -y curl jq
-
-# Install minijinja for templating
-curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
-source $HOME/.cargo/env
-
-# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
- PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
- if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
- echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
- else
- echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
- exit 0
- fi
-fi
-
-# Upload sample.yaml
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
new file mode 100644
index 0000000000000..c3d3cbf473968
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -0,0 +1,45 @@
+
+# Nightly benchmark
+
+The main goal of this benchmark is two-fold:
+- Performance clarity: provide clarity on which engine (vllm, tensorrt-llm, lmdeploy, or tgi) leads in performance under which workload.
+- Reproducibility: anyone can run the exact same set of benchmarking commands inside the exact same docker image by following the reproduction instructions in [reproduce.md]().
+
+
+## Docker images
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
+- vllm/vllm-openai:v0.5.0.post1
+- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+- openmmlab/lmdeploy:v0.5.0
+- ghcr.io/huggingface/text-generation-inference:2.1
+
+
+
+
+## Hardware
+
+One AWS node with 8x NVIDIA A100 GPUs.
+
+
+## Workload description
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
+
+- Input length: 500 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
+- Output length: the corresponding output length of each of these 500 prompts.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Average QPS (queries per second): 4 for the small model (llama-3 8B) and 2 for the other two models. For each QPS, the arrival time of each query is drawn from a Poisson process (with a fixed random seed); see the sketch below.
+- Evaluation metrics: throughput (the higher the better), TTFT (time to the first token, the lower the better), ITL (inter-token latency, the lower the better).
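+
+As a minimal illustrative sketch (not the benchmark's actual client code), a Poisson arrival schedule for a given average QPS can be generated as follows; the helper name `poisson_arrival_times` and the use of numpy are assumptions for illustration only:
+
+```python
+import numpy as np
+
+
+def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
+    """Return absolute send times (in seconds) for a Poisson arrival process."""
+    rng = np.random.default_rng(seed)  # fixed seed -> reproducible schedule
+    # inter-arrival gaps are exponentially distributed with mean 1/qps
+    inter_arrivals = rng.exponential(1.0 / qps, size=num_requests)
+    return np.cumsum(inter_arrivals)  # cumulative sum gives arrival times
+
+
+# e.g. 500 prompts at an average of 4 queries per second
+print(poisson_arrival_times(500, qps=4.0)[:5])
+```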
+
+
+
+## Plots
+
+In the following plots, each dot shows the mean and each error bar shows the standard error of the mean. A value of 0 means that the corresponding benchmark crashed.
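+The standard error is computed from the per-request standard deviation reported by the benchmark client: the standard deviation divided by the square root of the number of successful requests, as done in `plot-nightly-results.py`.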
+
+
+
+## Results
+
+{nightly_results_benchmarking_table}
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
new file mode 100644
index 0000000000000..6e399bb936fbc
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -0,0 +1,120 @@
+common_pod_spec: &common_pod_spec
+ priorityClassName: perf-benchmark
+ nodeSelector:
+ nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /root/.cache/huggingface
+ type: Directory
+
+common_container_settings: &common_container_settings
+ command:
+ - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+ resources:
+ limits:
+ nvidia.com/gpu: 8
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: VLLM_USAGE_SOURCE
+ value: ci-test
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: VLLM_SOURCE_CODE_LOC
+ value: /workspace/build/buildkite/vllm/performance-benchmark
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+
+steps:
+ - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
+ - label: "A100 trt benchmark"
+ priority: 100
+ agents:
+ queue: A100
+ plugins:
+ - kubernetes:
+ podSpec:
+ <<: *common_pod_spec
+ containers:
+ - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+ <<: *common_container_settings
+
+ - label: "A100 lmdeploy benchmark"
+ priority: 100
+ agents:
+ queue: A100
+ plugins:
+ - kubernetes:
+ podSpec:
+ <<: *common_pod_spec
+ containers:
+ - image: openmmlab/lmdeploy:v0.5.0
+ <<: *common_container_settings
+
+
+ - label: "A100 vllm benchmark"
+ priority: 100
+ agents:
+ queue: A100
+ plugins:
+ - kubernetes:
+ podSpec:
+ <<: *common_pod_spec
+ containers:
+ - image: vllm/vllm-openai:latest
+ <<: *common_container_settings
+
+ - label: "A100 tgi benchmark"
+ priority: 100
+ agents:
+ queue: A100
+ plugins:
+ - kubernetes:
+ podSpec:
+ <<: *common_pod_spec
+ containers:
+ - image: ghcr.io/huggingface/text-generation-inference:2.1
+ <<: *common_container_settings
+
+ - wait
+
+ - label: "Plot"
+ priority: 100
+ agents:
+ queue: A100
+ plugins:
+ - kubernetes:
+ podSpec:
+ <<: *common_pod_spec
+ containers:
+ - image: vllm/vllm-openai:v0.5.0.post1
+ command:
+ - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+ resources:
+ limits:
+ nvidia.com/gpu: 8
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ env:
+ - name: VLLM_USAGE_SOURCE
+ value: ci-test
+ - name: VLLM_SOURCE_CODE_LOC
+ value: /workspace/build/buildkite/vllm/performance-benchmark
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+
+ - wait
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
index 021473f76d0e5..04b02adf3644c 100644
--- a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh
@@ -54,7 +54,7 @@ wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
- until curl localhost:8000/v1/completions; do
+ until curl -X POST localhost:8000/v1/completions; do
sleep 1
done' && return 0 || return 1
}
@@ -73,8 +73,17 @@ kill_gpu_processes() {
echo "All GPU processes have been killed."
fi
+ # Sometimes killing by pid doesn't work properly, so we also kill all processes running python or python3,
+ # since we are inside a container anyway.
+ pkill -9 -f python
+ pkill -9 -f python3
+
# waiting for GPU processes to be fully killed
- sleep 10
+ # loop while nvidia-smi returns any processes
+ while [ -n "$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)" ]; do
+ sleep 1
+ echo "Waiting for GPU processes to be killed"
+ done
# remove vllm config file
rm -rf ~/.config/vllm
@@ -90,12 +99,19 @@ upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
- if [ ! -f /workspace/buildkite-agent ]; then
+ # Check if buildkite-agent is available in the PATH or at /workspace/buildkite-agent
+ if command -v buildkite-agent >/dev/null 2>&1; then
+ BUILDKITE_AGENT_COMMAND="buildkite-agent"
+ elif [ -f /workspace/buildkite-agent ]; then
+ BUILDKITE_AGENT_COMMAND="/workspace/buildkite-agent"
+ else
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
- /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
- /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+
+ # Use the determined command to annotate and upload artifacts
+ $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < $RESULTS_FOLDER/benchmark_results.md
+ $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
run_latency_tests() {
@@ -269,6 +285,7 @@ run_serving_tests() {
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
+ server_pid=$!
# wait until the server is alive
wait_for_server
@@ -318,6 +335,7 @@ run_serving_tests() {
done
# clean up
+ kill -9 $server_pid
kill_gpu_processes
done
}
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
new file mode 100644
index 0000000000000..627a3e6971578
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+set -o pipefail
+set -x
+
+check_gpus() {
+ # check the number of GPUs and GPU type.
+ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+ if [[ $gpu_count -gt 0 ]]; then
+ echo "GPU found."
+ else
+ echo "Need at least 1 GPU to run benchmarking."
+ exit 1
+ fi
+ declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+ # check if HF_TOKEN is available and valid
+ if [[ -z "$HF_TOKEN" ]]; then
+ echo "Error: HF_TOKEN is not set."
+ exit 1
+ elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+ echo "Error: HF_TOKEN does not start with 'hf_'."
+ exit 1
+ else
+ echo "HF_TOKEN is set and valid."
+ fi
+}
+
+main() {
+
+ check_gpus
+ check_hf_token
+
+ df -h
+
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+ (which jq) || (apt-get update && apt-get -y install jq)
+
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+
+ # run lmdeploy
+ if which lmdeploy >/dev/null; then
+ echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
+ bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+ exit 0
+ fi
+
+ # run tgi
+ if [ -e /tgi-entrypoint.sh ]; then
+ echo "tgi is available, redirect to run-tgi-nightly.sh"
+ bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+ exit 0
+ fi
+
+ # run trt
+ if which trtllm-build >/dev/null; then
+ echo "trtllm is available, redirect to run-trt-nightly.sh"
+ bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+ exit 0
+ fi
+
+ # run vllm
+ if [ -e /vllm-workspace ]; then
+ echo "vllm is available, redirect to run-vllm-nightly.sh"
+ bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+ exit 0
+ fi
+
+}
+
+main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
new file mode 100644
index 0000000000000..68ac5909e5951
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -0,0 +1,26 @@
+import argparse
+
+from transformers import AutoTokenizer
+
+
+def main(model, cachedir):
+ # Load the tokenizer and save it to the specified directory
+ tokenizer = AutoTokenizer.from_pretrained(model)
+ tokenizer.save_pretrained(cachedir)
+ print(f"Tokenizer saved to {cachedir}")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Download and save Hugging Face tokenizer")
+ parser.add_argument("--model",
+ type=str,
+ required=True,
+ help="Name of the model")
+ parser.add_argument("--cachedir",
+ type=str,
+ required=True,
+ help="Directory to save the tokenizer")
+
+ args = parser.parse_args()
+ main(args.model, args.cachedir)
diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
new file mode 100644
index 0000000000000..18bcc3a8714c4
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -0,0 +1,6 @@
+from lmdeploy.serve.openai.api_client import APIClient
+
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
new file mode 100644
index 0000000000000..f8262653a6628
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+
+server_params=$1
+common_params=$2
+
+
+
+model_path=$(echo "$common_params" | jq -r '.model')
+model_name="${model_path#*/}"
+model_type=$(echo "$server_params" | jq -r '.model_type')
+model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+model_tp_size=$(echo "$common_params" | jq -r '.tp')
+max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
+trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+cd ~
+rm -rf models
+mkdir -p models
+cd models
+models_dir=$(pwd)
+trt_model_path=${models_dir}/${model_name}-trt-ckpt
+trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+cd ~
+rm -rf tensorrt-demo
+git clone https://github.com/neuralmagic/tensorrt-demo.git
+cd tensorrt-demo
+tensorrt_demo_dir=$(pwd)
+
+# make sure the parameters inside tensorrt_demo are consistent with the environment variables
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
+
+
+cd /
+rm -rf tensorrtllm_backend
+git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+git lfs install
+cd tensorrtllm_backend
+git checkout $trt_llm_version
+tensorrtllm_backend_dir=$(pwd)
+git submodule update --init --recursive
+cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
+
+cd /tensorrtllm_backend
+cd ./tensorrt_llm/examples/${model_type}
+
+
+if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+
+ echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
+ echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
+ python ../quantization/quantize.py \
+ --model_dir ${model_path} \
+ --dtype ${model_dtype} \
+ --tp_size ${model_tp_size} \
+ --output_dir ${trt_model_path} \
+ --qformat fp8 \
+ --kv_cache_dtype fp8 \
+ --calib_size 2
+
+else
+
+ echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
+ python3 convert_checkpoint.py \
+ --model_dir ${model_path} \
+ --dtype ${model_dtype} \
+ --tp_size ${model_tp_size} \
+ --output_dir ${trt_model_path}
+
+fi
+
+
+
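+# build the TensorRT-LLM engine from the converted (or quantized) checkpoint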
+trtllm-build \
+--checkpoint_dir=${trt_model_path} \
+--gpt_attention_plugin=${model_dtype} \
+--gemm_plugin=${model_dtype} \
+--remove_input_padding=enable \
+--paged_kv_cache=enable \
+--tp_size=${model_tp_size} \
+--max_batch_size=${max_batch_size} \
+--max_input_len=${max_input_len} \
+--max_output_len=${max_output_len} \
+--max_num_tokens=${max_output_len} \
+--opt_num_tokens=${max_output_len} \
+--output_dir=${trt_engine_path}
+
+cd /tensorrtllm_backend/triton_model_repo
+rm -rf ./tensorrt_llm/1/*
+cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+cd /tensorrtllm_backend
+python3 scripts/launch_triton_server.py \
+--world_size=${model_tp_size} \
+--model_repo=/tensorrtllm_backend/triton_model_repo &
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
new file mode 100644
index 0000000000000..1168912c6e229
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+main() {
+
+ (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+ (which jq) || (apt-get update && apt-get -y install jq)
+
+ if [ ! -f /workspace/buildkite-agent ]; then
+ echo "buildkite-agent binary not found. Skip plotting the results."
+ exit 0
+ fi
+
+ # initial annotation
+ description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+ # download results
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ mkdir -p results/
+ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+ ls
+ ls results/
+
+ # generate figures
+ python3 -m pip install tabulate pandas matplotlib
+ python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+ --description $description \
+ --results-folder results/
+
+ # upload results and figures
+ /workspace/buildkite-agent artifact upload "nightly_results.png"
+ /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+ /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+ /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
+}
+
+main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
new file mode 100644
index 0000000000000..e5cfcc64a9b2a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -0,0 +1,135 @@
+import argparse
+import json
+import math
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from tabulate import tabulate
+
+
+def parse_arguments():
+ parser = argparse.ArgumentParser(
+ description=
+ 'Parse command line arguments for summary-nightly-results script.')
+ parser.add_argument('--results-folder',
+ type=str,
+ required=True,
+ help='The folder where the results are stored.')
+ parser.add_argument('--description',
+ type=str,
+ required=True,
+ help='Description of the results.')
+
+ args = parser.parse_args()
+ return args
+
+
+def main(args):
+ bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
+ results_folder = Path(args.results_folder)
+
+ results = []
+
+ # collect results
+ for test_file in results_folder.glob("*_nightly_results.json"):
+ with open(test_file, "r") as f:
+ results = results + json.loads(f.read())
+
+ # generate markdown table
+ df = pd.DataFrame.from_dict(results)
+
+ md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
+ with open(args.description, "r") as f:
+ description = f.read()
+
+ description = description.format(
+ nightly_results_benchmarking_table=md_table)
+
+ with open("nightly_results.md", "w") as f:
+ f.write(description)
+
+ plt.rcParams.update({'font.size': 20})
+
+ # plot results
+ fig, axes = plt.subplots(3, 3, figsize=(16, 14))
+ fig.subplots_adjust(hspace=1)
+ methods = ["vllm", "trt", "lmdeploy", "tgi"]
+ for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
+ for j, metric in enumerate(["TTFT", "ITL"]):
+ means, stds = [], []
+ for method in methods:
+ target = df['Test name'].str.contains(model)
+ target = target & df['Engine'].str.contains(method)
+ filtered_df = df[target]
+
+ if filtered_df.empty:
+ means.append(0.)
+ stds.append(0.)
+ else:
+ means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
+ std = filtered_df[f"Std {metric} (ms)"].values[0]
+ success = filtered_df["Successful req."].values[0]
+ stds.append(std / math.sqrt(success))
+
+ print(model, metric)
+ print(means, stds)
+
+ ax = axes[i, j + 1]
+
+ bars = ax.bar(
+ ["vllm", "trt", "lmdeploy", "tgi"],
+ means,
+ yerr=stds,
+ capsize=10,
+ )
+ for idx, bar in enumerate(bars):
+ bar.set_color(bar_colors[idx])
+ ax.set_ylim(bottom=0)
+
+ ax.set_ylabel(f"{metric} (ms)")
+ ax.set_title(f"{model} {metric}")
+ ax.grid(axis='y')
+
+ metric = "Tput"
+ j = 0
+ if True:
+ tputs = []
+ for method in methods:
+ target = df['Test name'].str.contains(model)
+ target = target & df['Engine'].str.contains(method)
+ filtered_df = df[target]
+
+ if filtered_df.empty:
+ tputs.append(0.)
+ else:
+ input_tput = filtered_df["Input Tput (tok/s)"].values[0]
+ output_tput = filtered_df["Output Tput (tok/s)"].values[0]
+ tputs.append(input_tput + output_tput)
+
+ print(model, metric)
+ print(tputs)
+
+ ax = axes[i, j]
+
+ bars = ax.bar(
+ ["vllm", "trt", "lmdeploy", "tgi"],
+ tputs,
+ )
+ for idx, bar in enumerate(bars):
+ bar.set_color(bar_colors[idx])
+
+ ax.set_ylim(bottom=0)
+
+ ax.set_ylabel("Tput (token/s)")
+ ax.set_title(f"{model} {metric}")
+ ax.grid(axis='y')
+
+ fig.tight_layout()
+ fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
+
+
+if __name__ == '__main__':
+ args = parse_arguments()
+ main(args)
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
new file mode 100644
index 0000000000000..d6f112aaa42fd
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+ # check the number of GPUs and GPU type.
+ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+ if [[ $gpu_count -gt 0 ]]; then
+ echo "GPU found."
+ else
+ echo "Need at least 1 GPU to run benchmarking."
+ exit 1
+ fi
+ declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+ pkill lmdeploy || true
+ # waiting for GPU processes to be fully killed
+ sleep 10
+ # Print the GPU memory usage
+ # so that we know if all GPU processes are killed.
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+ # The memory usage should be 0 MB.
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+ # transforms the JSON string to command line args, and '_' is replaced with '-'
+ # example:
+ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+ local json_string=$1
+ local args=$(
+ echo "$json_string" | jq -r '
+ to_entries |
+ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+ join(" ")
+ '
+ )
+ echo "$args"
+}
+
+wait_for_server() {
+ # wait for vllm server to start
+ # return 1 if vllm server crashes
+ timeout 1200 bash -c '
+ until curl -s localhost:8000/v1/completions > /dev/null; do
+ sleep 1
+ done' && return 0 || return 1
+}
+
+run_serving_tests() {
+ # run serving tests using `benchmark_serving.py`
+ # $1: a json file specifying serving test cases
+
+ local serving_test_file
+ serving_test_file=$1
+
+ # Iterate over serving tests
+ jq -c '.[]' "$serving_test_file" | while read -r params; do
+ # get the test name, and append the GPU type back to it.
+ test_name=$(echo "$params" | jq -r '.test_name')
+
+ # if TEST_SELECTOR is set, only run the test cases that match the selector
+ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+ echo "Skip test case $test_name."
+ continue
+ fi
+
+ # append lmdeploy to the test name
+ test_name=lmdeploy_$test_name
+
+ # get common parameters
+ common_params=$(echo "$params" | jq -r '.common_parameters')
+ model=$(echo "$common_params" | jq -r '.model')
+ tp=$(echo "$common_params" | jq -r '.tp')
+ dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+ dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+ port=$(echo "$common_params" | jq -r '.port')
+ num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+
+
+ # get client and server arguments
+ server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
+ client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
+ server_args=$(json2args "$server_params")
+ client_args=$(json2args "$client_params")
+ qps_list=$(echo "$params" | jq -r '.qps_list')
+ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+ echo "Running over qps list $qps_list"
+
+ # check if there is enough GPU to run the test
+ if [[ $gpu_count -lt $tp ]]; then
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+ continue
+ fi
+
+ # prepare tokenizer
+ rm -rf /tokenizer_cache
+ mkdir /tokenizer_cache
+ python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+ --model "$model" \
+ --cachedir /tokenizer_cache
+
+ server_command="lmdeploy serve api_server $model \
+ --tp $tp \
+ --server-port $port \
+ $server_args"
+
+ # run the server
+ echo "Running test case $test_name"
+ echo "Server command: $server_command"
+ bash -c "$server_command" &
+
+ # wait until the server is alive
+ wait_for_server
+ if [ $? -eq 0 ]; then
+ echo ""
+ echo "lmdeploy server is up and running."
+ else
+ echo ""
+ echo "lmdeploy failed to start within the timeout period."
+ break
+ fi
+
+ # get model name
+ model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+
+ # iterate over different QPS
+ for qps in $qps_list; do
+ # remove the surrounding single quote from qps
+ if [[ "$qps" == *"inf"* ]]; then
+ echo "qps was $qps"
+ qps="inf"
+ echo "now qps is $qps"
+ fi
+
+ new_test_name=$test_name"_qps_"$qps
+
+ client_command="python3 benchmark_serving.py \
+ --backend lmdeploy \
+ --tokenizer /tokenizer_cache \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --num-prompts $num_prompts \
+ --port $port \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ --model \"$model_name\" \
+ $client_args"
+
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
+
+ eval "$client_command"
+
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ --arg engine "lmdeploy" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu,
+ engine: $engine
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+ done
+
+ # clean up
+ kill_gpu_processes
+ rm -rf /root/.cache/huggingface/*
+ done
+}
+
+
+upload_to_buildkite() {
+ # upload the benchmarking results to buildkite
+
+ # if the agent binary is not found, skip uploading the results, exit 0
+ if [ ! -f /workspace/buildkite-agent ]; then
+ echo "buildkite-agent binary not found. Skip uploading the results."
+ return 0
+ fi
+ # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+ /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+ check_gpus
+ # enter vllm directory
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+ declare -g RESULTS_FOLDER=results/
+ mkdir -p $RESULTS_FOLDER
+ BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+ python -m pip install transformers==4.41.2
+
+ export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+ run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+ python -m pip install tabulate pandas
+ python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+ upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
new file mode 100644
index 0000000000000..fed03654f8b77
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -0,0 +1,216 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+ # check the number of GPUs and GPU type.
+ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+ if [[ $gpu_count -gt 0 ]]; then
+ echo "GPU found."
+ else
+ echo "Need at least 1 GPU to run benchmarking."
+ exit 1
+ fi
+ declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+ pkill text-generation || true
+ # waiting for GPU processes to be fully killed
+ sleep 10
+ # Print the GPU memory usage
+ # so that we know if all GPU processes are killed.
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+ # The memory usage should be 0 MB.
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+ # transforms the JSON string to command line args, and '_' is replaced with '-'
+ # example:
+ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+ local json_string=$1
+ local args=$(
+ echo "$json_string" | jq -r '
+ to_entries |
+ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+ join(" ")
+ '
+ )
+ echo "$args"
+}
+
+wait_for_server() {
+ timeout 1200 bash -c '
+ until curl -s localhost:8000/generate_stream > /dev/null; do
+ sleep 1
+ done' && return 0 || return 1
+}
+
+run_serving_tests() {
+ # run serving tests using `benchmark_serving.py`
+ # $1: a json file specifying serving test cases
+
+ local serving_test_file
+ serving_test_file=$1
+
+ # Iterate over serving tests
+ jq -c '.[]' "$serving_test_file" | while read -r params; do
+ # get the test name, and append the GPU type back to it.
+ test_name=$(echo "$params" | jq -r '.test_name')
+
+
+ # if TEST_SELECTOR is set, only run the test cases that match the selector
+ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+ echo "Skip test case $test_name."
+ continue
+ fi
+
+ # append tgi to the test name
+ test_name=tgi_$test_name
+
+ # get common parameters
+ common_params=$(echo "$params" | jq -r '.common_parameters')
+ model=$(echo "$common_params" | jq -r '.model')
+ tp=$(echo "$common_params" | jq -r '.tp')
+ dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+ dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+ port=$(echo "$common_params" | jq -r '.port')
+ num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+ # get client and server arguments
+ server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
+ client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
+ server_args=$(json2args "$server_params")
+ client_args=$(json2args "$client_params")
+ qps_list=$(echo "$params" | jq -r '.qps_list')
+ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+ echo "Running over qps list $qps_list"
+
+ # check if there is enough GPU to run the test
+ if [[ $gpu_count -lt $tp ]]; then
+ echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+ continue
+ fi
+
+ if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+ echo "Key 'fp8' exists in common params."
+ server_command="/tgi-entrypoint.sh \
+ --model-id $model \
+ --num-shard $tp \
+ --port $port \
+ --quantize fp8 \
+ $server_args"
+ else
+ echo "Key 'fp8' does not exist in common params."
+ server_command="/tgi-entrypoint.sh \
+ --model-id $model \
+ --num-shard $tp \
+ --port $port \
+ $server_args"
+ fi
+
+
+
+
+ # run the server
+ echo "Running test case $test_name"
+ echo "Server command: $server_command"
+ eval "$server_command" &
+
+ # wait until the server is alive
+ wait_for_server
+ if [ $? -eq 0 ]; then
+ echo ""
+ echo "tgi server is up and running."
+ else
+ echo ""
+ echo "tgi failed to start within the timeout period."
+ break
+ fi
+
+ # iterate over different QPS
+ for qps in $qps_list; do
+ # remove the surrounding single quote from qps
+ if [[ "$qps" == *"inf"* ]]; then
+ echo "qps was $qps"
+ qps="inf"
+ echo "now qps is $qps"
+ fi
+
+ new_test_name=$test_name"_qps_"$qps
+
+ client_command="python3 benchmark_serving.py \
+ --backend tgi \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --num-prompts $num_prompts \
+ --port $port \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ $client_args"
+
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
+
+ eval "$client_command"
+
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ --arg engine "tgi" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu,
+ engine: $engine
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+ done
+
+ # clean up
+ kill_gpu_processes
+ rm -rf /root/.cache/huggingface/*
+ done
+}
+
+
+
+upload_to_buildkite() {
+ # upload the benchmarking results to buildkite
+
+ # if the agent binary is not found, skip uploading the results, exit 0
+ if [ ! -f /workspace/buildkite-agent ]; then
+ echo "buildkite-agent binary not found. Skip uploading the results."
+ return 0
+ fi
+ # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+ /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+ check_gpus
+ # enter vllm directory
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ declare -g RESULTS_FOLDER=results/
+ mkdir -p $RESULTS_FOLDER
+ BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+ export CURRENT_LLM_SERVING_ENGINE=tgi
+ run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+ python -m pip install tabulate pandas
+ python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+ upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
new file mode 100644
index 0000000000000..4a82b9ec64d71
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -0,0 +1,214 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+ # check the number of GPUs and GPU type.
+ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+ if [[ $gpu_count -gt 0 ]]; then
+ echo "GPU found."
+ else
+ echo "Need at least 1 GPU to run benchmarking."
+ exit 1
+ fi
+ declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+ pkill tritonserver || true
+ # waiting for GPU processes to be fully killed
+ sleep 20
+ # Print the GPU memory usage
+ # so that we know if all GPU processes are killed.
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+ # The memory usage should be 0 MB.
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+ # transforms the JSON string to command line args, and '_' is replaced with '-'
+ # example:
+ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+ local json_string=$1
+ local args=$(
+ echo "$json_string" | jq -r '
+ to_entries |
+ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+ join(" ")
+ '
+ )
+ echo "$args"
+}
+
+wait_for_server() {
+ timeout 1200 bash -c '
+ until curl -s localhost:8000/generate_stream > /dev/null; do
+ sleep 1
+ done' && return 0 || return 1
+}
+
+run_serving_tests() {
+ # run serving tests using `benchmark_serving.py`
+ # $1: a json file specifying serving test cases
+
+ local serving_test_file
+ serving_test_file=$1
+
+ # Iterate over serving tests
+ jq -c '.[]' "$serving_test_file" | while read -r params; do
+ # get the test name, and append the GPU type back to it.
+ test_name=$(echo "$params" | jq -r '.test_name')
+
+ # if TEST_SELECTOR is set, only run the test cases that match the selector
+ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+ echo "Skip test case $test_name."
+ continue
+ fi
+
+ # append trt to the test name
+ test_name=trt_$test_name
+
+ # get common parameters
+ common_params=$(echo "$params" | jq -r '.common_parameters')
+ model=$(echo "$common_params" | jq -r '.model')
+ tp=$(echo "$common_params" | jq -r '.tp')
+ dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+ dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+ port=$(echo "$common_params" | jq -r '.port')
+ num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+ # get client and server arguments
+ server_params=$(echo "$params" | jq -r '.trt_server_parameters')
+ client_params=$(echo "$params" | jq -r '.trt_client_parameters')
+ client_args=$(json2args "$client_params")
+ qps_list=$(echo "$params" | jq -r '.qps_list')
+ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+ echo "Running over qps list $qps_list"
+
+ # check if there is enough GPU to run the test
+ if [[ $gpu_count -lt $tp ]]; then
+ echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+ continue
+ fi
+
+
+
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+
+ echo "Running test case $test_name"
+ bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
+
+ # wait until the server is alive
+ wait_for_server
+ if [ $? -eq 0 ]; then
+ echo ""
+ echo "trt server is up and running."
+ else
+ echo ""
+ echo "trt failed to start within the timeout period."
+ break
+ fi
+
+ # prepare tokenizer
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ rm -rf /tokenizer_cache
+ mkdir /tokenizer_cache
+ python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+ --model "$model" \
+ --cachedir /tokenizer_cache
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+
+ # iterate over different QPS
+ for qps in $qps_list; do
+ # remove the surrounding single quote from qps
+ if [[ "$qps" == *"inf"* ]]; then
+ echo "qps was $qps"
+ qps="inf"
+ echo "now qps is $qps"
+ fi
+
+ new_test_name=$test_name"_qps_"$qps
+
+ client_command="python3 benchmark_serving.py \
+ --backend tensorrt-llm \
+ --tokenizer /tokenizer_cache \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --num-prompts $num_prompts \
+ --port $port \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ $client_args"
+
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
+
+ eval "$client_command"
+
+ server_command=""
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ --arg engine "trt" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu,
+ engine: $engine
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+ done
+
+ # clean up
+ kill_gpu_processes
+ rm -rf /root/.cache/huggingface/*
+ done
+}
+
+upload_to_buildkite() {
+ # upload the benchmarking results to buildkite
+
+ # if the agent binary is not found, skip uploading the results, exit 0
+ if [ ! -f /workspace/buildkite-agent ]; then
+ echo "buildkite-agent binary not found. Skip uploading the results."
+ return 0
+ fi
+ # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+ /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+ check_gpus
+
+
+ # enter vllm directory
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+ declare -g RESULTS_FOLDER=results/
+ mkdir -p $RESULTS_FOLDER
+ BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+ # update transformers package, to make sure mixtral tokenizer is available
+ python -m pip install transformers -U
+
+ export CURRENT_LLM_SERVING_ENGINE=trt
+ run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+ python -m pip install tabulate pandas
+ python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+ upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
new file mode 100644
index 0000000000000..663045b8a9122
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -0,0 +1,221 @@
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+ # check the number of GPUs and GPU type.
+ declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+ if [[ $gpu_count -gt 0 ]]; then
+ echo "GPU found."
+ else
+ echo "Need at least 1 GPU to run benchmarking."
+ exit 1
+ fi
+ declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+ # kill all processes on GPU.
+ pkill pt_main_thread
+ sleep 10
+
+ # remove vllm config file
+ rm -rf ~/.config/vllm
+
+ # Print the GPU memory usage
+ # so that we know if all GPU processes are killed.
+ gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+ # The memory usage should be 0 MB.
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+ # transforms the JSON string to command line args, and '_' is replaced with '-'
+ # example:
+ # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+ # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+ local json_string=$1
+ local args=$(
+ echo "$json_string" | jq -r '
+ to_entries |
+ map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+ join(" ")
+ '
+ )
+ echo "$args"
+}
+
+wait_for_server() {
+ # wait for vllm server to start
+ # return 1 if vllm server crashes
+ timeout 1200 bash -c '
+ until curl -s localhost:8000/v1/completions > /dev/null; do
+ sleep 1
+ done' && return 0 || return 1
+}
+
+run_serving_tests() {
+ # run serving tests using `benchmark_serving.py`
+ # $1: a json file specifying serving test cases
+
+ local serving_test_file
+ serving_test_file=$1
+
+ # Iterate over serving tests
+ jq -c '.[]' "$serving_test_file" | while read -r params; do
+ # get the test name, and append the GPU type back to it.
+ test_name=$(echo "$params" | jq -r '.test_name')
+
+ # if TEST_SELECTOR is set, only run the test cases that match the selector
+ if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+ echo "Skip test case $test_name."
+ continue
+ fi
+
+ # append vllm to the test name
+ test_name=vllm_$test_name
+
+
+ # get common parameters
+ common_params=$(echo "$params" | jq -r '.common_parameters')
+ model=$(echo "$common_params" | jq -r '.model')
+ tp=$(echo "$common_params" | jq -r '.tp')
+ dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+ dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+ port=$(echo "$common_params" | jq -r '.port')
+ num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+ # get client and server arguments
+ server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
+ client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
+ server_args=$(json2args "$server_params")
+ client_args=$(json2args "$client_params")
+ qps_list=$(echo "$params" | jq -r '.qps_list')
+ qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+ echo "Running over qps list $qps_list"
+
+ # check if there is enough GPU to run the test
+ if [[ $gpu_count -lt $tp ]]; then
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+ continue
+ fi
+
+ if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+ echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+ model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+ server_command="python3 \
+ -m vllm.entrypoints.openai.api_server \
+ -tp $tp \
+ --model $model \
+ --port $port \
+ $server_args"
+ else
+ echo "Key 'fp8' does not exist in common params."
+ server_command="python3 \
+ -m vllm.entrypoints.openai.api_server \
+ -tp $tp \
+ --model $model \
+ --port $port \
+ $server_args"
+ fi
+
+ # run the server
+ echo "Running test case $test_name"
+ echo "Server command: $server_command"
+ eval "$server_command" &
+
+ # wait until the server is alive
+ wait_for_server
+ if [ $? -eq 0 ]; then
+ echo ""
+ echo "vllm server is up and running."
+ else
+ echo ""
+ echo "vllm failed to start within the timeout period."
+ break
+ fi
+
+ # iterate over different QPS
+ for qps in $qps_list; do
+ # remove the surrounding single quote from qps
+ if [[ "$qps" == *"inf"* ]]; then
+ echo "qps was $qps"
+ qps="inf"
+ echo "now qps is $qps"
+ fi
+
+ new_test_name=$test_name"_qps_"$qps
+
+ client_command="python3 benchmark_serving.py \
+ --backend vllm \
+ --model $model \
+ --dataset-name $dataset_name \
+ --dataset-path $dataset_path \
+ --num-prompts $num_prompts \
+ --port $port \
+ --save-result \
+ --result-dir $RESULTS_FOLDER \
+ --result-filename ${new_test_name}.json \
+ --request-rate $qps \
+ $client_args"
+
+ echo "Running test case $test_name with qps $qps"
+ echo "Client command: $client_command"
+
+ eval "$client_command"
+
+ # record the benchmarking commands
+ jq_output=$(jq -n \
+ --arg server "$server_command" \
+ --arg client "$client_command" \
+ --arg gpu "$gpu_type" \
+ --arg engine "vllm" \
+ '{
+ server_command: $server,
+ client_command: $client,
+ gpu_type: $gpu,
+ engine: $engine
+ }')
+ echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+ done
+
+ # clean up
+ kill_gpu_processes
+ rm -rf /root/.cache/huggingface/*
+ done
+}
+
+
+upload_to_buildkite() {
+ # upload the benchmarking results to buildkite
+
+ # if the agent binary is not found, skip uploading the results, exit 0
+ if [ ! -f /workspace/buildkite-agent ]; then
+ echo "buildkite-agent binary not found. Skip uploading the results."
+ return 0
+ fi
+ # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+ /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+ check_gpus
+ # enter vllm directory
+ cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ declare -g RESULTS_FOLDER=results/
+ mkdir -p $RESULTS_FOLDER
+ BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+ export CURRENT_LLM_SERVING_ENGINE=vllm
+ run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+
+ python3 -m pip install tabulate pandas
+ python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+ upload_to_buildkite
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
new file mode 100644
index 0000000000000..782d1ef9aab98
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -0,0 +1,76 @@
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+ "test_name": "Test name",
+ "gpu_type": "GPU",
+ "completed": "Successful req.",
+ "request_throughput": "Tput (req/s)",
+ "mean_ttft_ms": "Mean TTFT (ms)",
+ "std_ttft_ms": "Std TTFT (ms)",
+ "mean_itl_ms": "Mean ITL (ms)",
+ "std_itl_ms": "Std ITL (ms)",
+ "input_throughput": "Input Tput (tok/s)",
+ "output_throughput": "Output Tput (tok/s)",
+ "engine": "Engine",
+}
+
+if __name__ == "__main__":
+
+ # collect results
+ for test_file in results_folder.glob("*.json"):
+
+ with open(test_file, "r") as f:
+ raw_result = json.loads(f.read())
+
+ # attach the benchmarking command to raw_result
+ with open(test_file.with_suffix(".commands"), "r") as f:
+ command = json.loads(f.read())
+ raw_result.update(command)
+
+ # update the test name of this result
+ raw_result.update({"test_name": test_file.stem})
+
+ # add the result to serving_results
+ serving_results.append(raw_result)
+ continue
+
+ serving_results = pd.DataFrame.from_dict(serving_results)
+
+ if not serving_results.empty:
+ serving_results = serving_results[list(
+ serving_column_mapping.keys())].rename(
+ columns=serving_column_mapping)
+
+ serving_md_table_with_headers = tabulate(serving_results,
+ headers='keys',
+ tablefmt='pipe',
+ showindex=False)
+ # drop the table header (first two lines: header row and separator)
+ serving_md_table_lines = serving_md_table_with_headers.split('\n')
+ serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+
+ prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+ prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+ # document benchmarking results in markdown
+ with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+ # document results with header.
+ # for those who want to reproduce our benchmark.
+ f.write(serving_md_table_with_headers)
+ f.write('\n')
+
+ # document benchmarking results in json
+ with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+
+ results = serving_results.to_dict(orient='records')
+ f.write(json.dumps(results))
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
new file mode 100644
index 0000000000000..f250833c62710
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -0,0 +1,116 @@
+[
+ {
+ "test_name": "llama8B_tp1",
+ "qps_list": [4],
+ "common_parameters": {
+ "model": "meta-llama/Meta-Llama-3-8B",
+ "tp": 1,
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 500,
+ "port": 8000
+ },
+ "lmdeploy_server_parameters": {
+ },
+ "lmdeploy_client_parameters": {
+ },
+ "tgi_server_parameters": {
+ },
+ "tgi_client_parameters": {
+ "endpoint": "/generate_stream"
+ },
+ "trt_server_parameters": {
+ "model_type": "llama",
+ "model_dtype": "float16",
+ "max_batch_size": 256,
+ "max_input_len": 4096,
+ "max_output_len": 4096,
+ "trt_llm_version": "r24.04"
+ },
+ "trt_client_parameters": {
+ "endpoint": "/v2/models/ensemble/generate_stream"
+ },
+ "vllm_server_parameters": {
+ "disable_log_stats": "",
+ "disable_log_requests": ""
+ },
+ "vllm_client_parameters": {
+ }
+ },
+ {
+ "test_name": "llama70B_tp4",
+ "qps_list": [2],
+ "common_parameters": {
+ "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+ "tp": 4,
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 500,
+ "port": 8000
+ },
+ "lmdeploy_server_parameters": {
+ },
+ "lmdeploy_client_parameters": {
+ },
+ "tgi_server_parameters": {
+ },
+ "tgi_client_parameters": {
+ "endpoint": "/generate_stream"
+ },
+ "trt_server_parameters": {
+ "model_type": "llama",
+ "model_dtype": "float16",
+ "max_batch_size": 256,
+ "max_input_len": 4096,
+ "max_output_len": 4096,
+ "trt_llm_version": "r24.04"
+ },
+ "trt_client_parameters": {
+ "endpoint": "/v2/models/ensemble/generate_stream"
+ },
+ "vllm_server_parameters": {
+ "disable_log_stats": "",
+ "disable_log_requests": ""
+ },
+ "vllm_client_parameters": {
+ }
+ },
+ {
+ "test_name": "mixtral8x7B_tp2",
+ "qps_list": [2],
+ "common_parameters": {
+ "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+ "tp": 2,
+ "dataset_name": "sharegpt",
+ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+ "num_prompts": 500,
+ "port": 8000
+ },
+ "lmdeploy_server_parameters": {
+ },
+ "lmdeploy_client_parameters": {
+ },
+ "tgi_server_parameters": {
+ },
+ "tgi_client_parameters": {
+ "endpoint": "/generate_stream"
+ },
+ "trt_server_parameters": {
+ "model_type": "llama",
+ "model_dtype": "float16",
+ "max_batch_size": 256,
+ "max_input_len": 4096,
+ "max_output_len": 4096,
+ "trt_llm_version": "r24.04"
+ },
+ "trt_client_parameters": {
+ "endpoint": "/v2/models/ensemble/generate_stream"
+ },
+ "vllm_server_parameters": {
+ "disable_log_stats": "",
+ "disable_log_requests": ""
+ },
+ "vllm_client_parameters": {
+ }
+ }
+]
\ No newline at end of file
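
As a rough illustration of how these test entries might be consumed, the hedged sketch below loads the file and turns one engine's parameter block into CLI flags. The key-to-`--flag` conversion (and treating an empty string as a bare flag) is an assumption made for illustration; the actual `run_serving_tests` logic is not shown in this diff.

```python
import json

with open(".buildkite/nightly-benchmarks/tests/nightly-tests.json") as f:
    tests = json.load(f)


def to_flags(params: dict) -> list:
    # Assumed convention: "disable_log_stats": "" becomes a bare --disable-log-stats.
    flags = []
    for key, value in params.items():
        name = "--" + key.replace("_", "-")
        flags.extend([name] if value == "" else [name, str(value)])
    return flags


for test in tests:
    common = test["common_parameters"]
    print(test["test_name"], common["model"], "tp =", common["tp"],
          to_flags(test["vllm_server_parameters"]))
```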
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index c394f3fd7a0c5..4fa1951134eba 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -1,21 +1,17 @@
steps:
- - block: "Build wheels"
-
- - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}"
+ - label: "Build wheel - CUDA {{matrix.cuda_version}}"
agents:
queue: cpu_queue
commands:
- - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ."
+ - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host"
+ - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+ # rename the files to change linux -> manylinux1
+ - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+ - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"
- python_version:
- - "3.8"
- - "3.9"
- - "3.10"
- - "3.11"
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index bde8ab6184d3c..363bc07fc2de4 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -2,6 +2,15 @@
set -ex
# Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+ sleep 3
+ if grep -q clean /opt/amdgpu/etc/gpu_state; then
+ echo "GPUs state is \"clean\""
+ break
+ fi
+done
+
echo "--- ROCm info"
rocminfo
@@ -45,15 +54,10 @@ while true; do
fi
done
-echo "--- Building container"
-sha=$(git rev-parse --short HEAD)
-image_name=rocm_${sha}
-container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
-docker build \
- -t ${image_name} \
- -f Dockerfile.rocm \
- --progress plain \
- .
+echo "--- Pulling container"
+image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull ${image_name}
remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${image_name} || true
diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh
index 0d94b2555f166..7ac4dcc4c786d 100755
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@@ -2,16 +2,17 @@
set -euox pipefail
-if [[ $# -lt 3 ]]; then
- echo "Please provide the number of nodes and GPU per node."
+if [[ $# -lt 4 ]]; then
+ echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
exit 1
fi
-NUM_NODES=$1
-NUM_GPUS=$2
-DOCKER_IMAGE=$3
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4
-shift 3
+shift 4
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
@@ -40,13 +41,40 @@ start_nodes() {
fi
done
GPU_DEVICES+='"'
- # echo "Starting node$node with GPU devices: $GPU_DEVICES"
- docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
+
+ # start the container in detached mode
+ # things to note:
+ # 1. --shm-size=10.24gb is required. don't use --ipc=host
+ # 2. pass HF_TOKEN to the container
+ # 3. map the huggingface cache directory to the container
+ # 4. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+ # starting from 192.168.10.11)
+ docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+
+ # organize containers into a ray cluster
+ if [ $node -eq 0 ]; then
+ # start the ray head node
+ docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+ # wait for the head node to be ready
+ sleep 10
+ else
+ # start the ray worker nodes, and connect them to the head node
+ docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+ fi
done
+
+ # wait for the cluster to be ready
+ sleep 10
+
+ # print the cluster status
+ docker exec node0 /bin/bash -c "ray status"
}
run_nodes() {
- for node in $(seq 0 $(($NUM_NODES-1))); do
+ # important: iterate in reverse order to start the head node last
+ # we start the worker nodes first, in detached mode, and then start the head node
+ # in the foreground, so that the output of the head node is visible in the buildkite logs
+ for node in $(seq $(($NUM_NODES - 1)) -1 0); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
@@ -57,10 +85,10 @@ run_nodes() {
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
- if [ $node -lt $(($NUM_NODES - 1)) ]; then
- docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
+ if [ $node -ne 0 ]; then
+ docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
- docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
+ docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
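
The script above relies on `sleep` plus `ray status` to confirm the containers formed a single cluster. Below is a hedged Python sketch of an equivalent programmatic check, meant to run inside the head-node container and assuming the image ships the `ray` Python package; it is an illustration, not part of the CI script.

```python
import time

import ray

# 2 is the node count used by the 2-node CI job; the head address matches the
# 192.168.10.10:6379 endpoint started above.
EXPECTED_NODES = 2

ray.init(address="192.168.10.10:6379")
for _ in range(30):
    alive = [n for n in ray.nodes() if n["Alive"]]
    if len(alive) >= EXPECTED_NODES:
        print(f"Ray cluster ready with {len(alive)} nodes")
        break
    time.sleep(2)
else:
    raise RuntimeError("Ray cluster did not become ready in time")
```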
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
new file mode 100644
index 0000000000000..4aabd123ae234
--- /dev/null
+++ b/.buildkite/run-tpu-test.sh
@@ -0,0 +1,16 @@
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
+ python3 /workspace/vllm/examples/offline_inference_tpu.py
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8013fbb642bb8..4019cc00fa2b9 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -7,8 +7,33 @@
steps:
+- label: Async Engine, Inputs, Utils, Worker Test
+ fast_check: true
+ fast_check_only: true
+ commands:
+ - pytest -v -s async_engine # Async Engine
+ - bash ../.buildkite/download-images.sh # Inputs
+ - pytest -v -s test_inputs.py
+ - pytest -v -s multimodal
+ - pytest -v -s test_utils.py # Utils
+ - pytest -v -s worker # Worker
+
+- label: Tensorizer, Metrics, Tracing Test
+ fast_check: true
+ fast_check_only: true
+ commands:
+ - apt-get install curl libsodium23 && pytest -v -s tensorizer_loader # Tensorizer
+ - pytest -v -s metrics # Metrics
+ - "pip install \
+ opentelemetry-sdk \
+ opentelemetry-api \
+ opentelemetry-exporter-otlp \
+ opentelemetry-semantic-conventions-ai" # Tracing
+ - pytest -v -s tracing
+
- label: Regression Test
mirror_hardwares: [amd]
+ fast_check: true
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
@@ -18,15 +43,17 @@ steps:
- label: Basic Correctness Test
mirror_hardwares: [amd]
+ fast_check: true
commands:
- - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ - pytest -v -s basic_correctness/test_basic_correctness.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
mirror_hardwares: [amd]
+ fast_check: true
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py
@@ -39,6 +66,17 @@ steps:
- pytest -v -s distributed/test_comm_ops.py
- pytest -v -s distributed/test_shm_broadcast.py
+- label: 2 Node Tests (4 GPUs in total)
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ num_nodes: 2
+ commands:
+ - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+ - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+ - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+ - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+
- label: Distributed Tests (2 GPUs)
mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
@@ -66,6 +104,7 @@ steps:
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
+ fast_check: true
commands:
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
@@ -87,9 +126,13 @@ steps:
- label: Engine Test
mirror_hardwares: [amd]
- command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+ commands:
+ - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+ # OOM in the CI unless we run this separately
+ - pytest -v -s tokenization
- label: Entrypoints Test
+ fast_check: true
mirror_hardwares: [amd]
commands:
@@ -119,14 +162,14 @@ steps:
- label: Kernels Test %N
#mirror_hardwares: [amd]
commands:
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
#mirror_hardwares: [amd]
commands:
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"
- label: Vision Language Models Test
@@ -149,7 +192,9 @@ steps:
command: pytest -v -s test_logits_processor.py
- label: Utils Test
- command: pytest -v -s test_utils.py
+ commands:
+ - pytest -v -s test_utils.py
+ - pytest -v -s test_embedded_commit.py
- label: Worker Test
mirror_hardwares: [amd]
@@ -179,7 +224,10 @@ steps:
- label: Tensorizer Test
#mirror_hardwares: [amd]
- command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+ commands:
+ - apt-get install curl libsodium23
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+ - pytest -v -s tensorizer_loader
- label: Metrics Test
mirror_hardwares: [amd]
@@ -223,6 +271,7 @@ steps:
- label: Documentation Build
working_dir: "/vllm-workspace/test_docs/docs"
+ fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
@@ -237,7 +286,7 @@ steps:
- pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.7/flashinfer-0.0.7+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000000000..71f4e520135d4
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,2 @@
+github: [vllm-project]
+open_collective: [vllm]
diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
new file mode 100644
index 0000000000000..cd53b764c7200
--- /dev/null
+++ b/.github/workflows/add_label_automerge.yml
@@ -0,0 +1,21 @@
+name: Add label on auto-merge enabled
+on:
+ pull_request_target:
+ types:
+ - auto_merge_enabled
+jobs:
+ add-label-on-auto-merge:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Add label
+ uses: actions/github-script@v5
+ with:
+ script: |
+ github.rest.issues.addLabels({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ labels: ['ready']
+ })
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/add_label_ready_comment.yml b/.github/workflows/add_label_ready_comment.yml
new file mode 100644
index 0000000000000..729c1452af03d
--- /dev/null
+++ b/.github/workflows/add_label_ready_comment.yml
@@ -0,0 +1,23 @@
+name: Add Ready Label on Ready Comment
+
+on:
+ issue_comment:
+ types: [created]
+
+jobs:
+ add-ready-label:
+ runs-on: ubuntu-latest
+ if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
+ steps:
+ - name: Add label
+ uses: actions/github-script@v5
+ with:
+ script: |
+ github.rest.issues.addLabels({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ labels: ['ready']
+ })
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 62f0dbcd93eff..5780f09a646cb 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -32,20 +32,22 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
+ mypy tests --config-file pyproject.toml
+ mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
+ mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
+ mypy vllm/inputs --config-file pyproject.toml
+ mypy vllm/logging --config-file pyproject.toml
+ mypy vllm/lora --config-file pyproject.toml
+ mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
- mypy vllm/usage --config-file pyproject.toml
- mypy vllm/*.py --config-file pyproject.toml
+ mypy vllm/platforms --config-file pyproject.toml
+ mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
- mypy vllm/engine --config-file pyproject.toml
+ mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
- mypy vllm/spec_decode --config-file pyproject.toml
- mypy vllm/model_executor --config-file pyproject.toml
- mypy vllm/lora --config-file pyproject.toml
- mypy vllm/logging --config-file pyproject.toml
- mypy tests --config-file pyproject.toml
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9c35ede5f6781..15c2ec05b25db 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
- pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
+ pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
new file mode 100644
index 0000000000000..d6924a30aa406
--- /dev/null
+++ b/.github/workflows/reminder_comment.yml
@@ -0,0 +1,21 @@
+name: PR Reminder Comment Bot
+on:
+ pull_request_target:
+ types: [opened]
+
+jobs:
+ pr_reminder:
+ runs-on: ubuntu-latest
+ steps:
+ - name: Remind to run full CI on PR
+ uses: actions/github-script@v6
+ with:
+ script: |
+ github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs do not trigger a full CI run by default. Instead, they only trigger the `fastcheck` CI, which consists of only a small and essential subset of tests to quickly catch errors, with the flexibility to run extra individual tests on top (you can do this by unblocking test steps in the Buildkite run).\n\nA full CI run is still required to merge this PR, so once the PR is ready to go, please make sure to run it. If you need all test signals in between PR commits, you can trigger a full CI run as well.\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add the `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
+ })
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index e077366d1e4a1..17184b19127ca 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# vllm commit id, generated by setup.py
+vllm/commit_id.py
+
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 31f7a97386d91..ced73ca03bfbc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0")
#
diff --git a/Dockerfile b/Dockerfile
index 67198e8fd9800..89d9be0e84ddb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -88,6 +88,9 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
+ARG buildkite_commit
+ENV BUILDKITE_COMMIT=${buildkite_commit}
+
ARG USE_SCCACHE
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
@@ -101,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
&& export SCCACHE_REGION=us-west-2 \
&& export CMAKE_BUILD_TYPE=Release \
&& sccache --show-stats \
- && python3 setup.py bdist_wheel --dist-dir=dist \
+ && python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
&& sccache --show-stats; \
fi
@@ -109,7 +112,7 @@ ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" != "1" ]; then \
- python3 setup.py bdist_wheel --dist-dir=dist; \
+ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# check the size of the wheel, we cannot upload wheels larger than 100MB
@@ -169,7 +172,7 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir
RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
#################### vLLM installation IMAGE ####################
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 1b89b892bbf1c..befb0499f2e68 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -52,25 +52,25 @@ RUN pip install --upgrade pip
# Remove sccache so it doesn't interfere with ccache
# TODO: implement sccache support across components
RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.4.0 on ROCm
+# Install torch == 2.5.0 on ROCm
RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-5.7"*) \
pip uninstall -y torch torchaudio torchvision \
&& pip install --no-cache-dir --pre \
- torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
- torchvision==0.19.0.dev20240612 \
+ torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+ torchvision==0.20.0.dev20240710 \
--index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
*"rocm-6.0"*) \
pip uninstall -y torch torchaudio torchvision \
&& pip install --no-cache-dir --pre \
- torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
- torchvision==0.19.0.dev20240612 \
+ torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+ torchvision==0.20.0.dev20240710 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
*"rocm-6.1"*) \
pip uninstall -y torch torchaudio torchvision \
&& pip install --no-cache-dir --pre \
- torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \
- torchvision==0.19.0.dev20240612 \
+ torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
+ torchvision==0.20.0.dev20240710 \
--index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
*) ;; esac
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index 23bb78682da2c..6ad8e8ccfac78 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -15,9 +15,4 @@ COPY . /workspace/vllm
ENV VLLM_TARGET_DEVICE="tpu"
RUN cd /workspace/vllm && python setup.py develop
-# Re-install outlines to avoid dependency errors.
-# The outlines version must follow requirements-common.txt.
-RUN pip uninstall outlines -y
-RUN pip install "outlines>=0.0.43"
-
CMD ["/bin/bash"]
diff --git a/README.md b/README.md
index 3e0da945d9be8..4e96699fe07de 100644
--- a/README.md
+++ b/README.md
@@ -16,27 +16,12 @@ Easy, fast, and cheap LLM serving for everyone
---
-**Ray Summit CPF is Open (June 4th to June 20th)!**
-
-There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
-If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
-This will be a great chance for everyone in the community to get together and learn.
-Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
-
----
-
*Latest News* 🔥
- [2024/06] We hosted [the fourth vLLM meetup](https://lu.ma/agivllm) with Cloudflare and BentoML! Please find the meetup slides [here](https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing).
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
-- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
-- [2024/01] Added ROCm 6.0 support to vLLM.
-- [2023/12] Added ROCm 5.7 support to vLLM.
-- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
-- [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
-- [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
+- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) with IBM! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) with a16z! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
-- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
-- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
@@ -52,14 +37,16 @@ vLLM is fast with:
- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
- Optimized CUDA kernels
+**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference), and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+
vLLM is flexible and easy to use with:
- Seamless integration with popular Hugging Face models
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism support for distributed inference
+- Tensor parallelism and pipeline parallelism support for distributed inference
- Streaming outputs
- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD GPUs, Intel CPUs and GPUs
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
@@ -103,6 +90,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks
- DeepInfra
- Dropbox
+- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index fe29c67086158..fbab547d094fe 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -390,17 +390,17 @@ def remove_prefix(text: str, prefix: str) -> str:
return text
-def get_model(pretrained_model_name_or_path: str):
+def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
- else:
- from huggingface_hub import snapshot_download
-
- model_path = snapshot_download(
- model_id=pretrained_model_name_or_path,
- local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
- ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
- return model_path
+
+ model_path = snapshot_download(
+ model_id=pretrained_model_name_or_path,
+ local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+ ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+
+ return model_path
+ return pretrained_model_name_or_path
def get_tokenizer(
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 7ba977141de0f..fc0dbf77f16b9 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -2,8 +2,8 @@
On the server side, run one of the following commands:
vLLM OpenAI API server
- python -m vllm.entrypoints.openai.api_server \
- --model --swap-space 16 \
+ vllm serve \
+ --swap-space 16 \
--disable-log-requests
(TGI backend)
@@ -60,12 +60,15 @@ class BenchmarkMetrics:
output_throughput: float
mean_ttft_ms: float
median_ttft_ms: float
+ std_ttft_ms: float
p99_ttft_ms: float
mean_tpot_ms: float
median_tpot_ms: float
+ std_tpot_ms: float
p99_tpot_ms: float
mean_itl_ms: float
median_itl_ms: float
+ std_itl_ms: float
p99_itl_ms: float
@@ -200,7 +203,7 @@ def sample_random_requests(
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
- for i in range(args.num_prompts):
+ for i in range(num_prompts):
prompt = tokenizer.decode([(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append(
@@ -274,12 +277,15 @@ def calculate_metrics(
mean_ttft_ms=np.mean(ttfts or 0) *
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
+ std_ttft_ms=np.std(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
mean_tpot_ms=np.mean(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
+ std_tpot_ms=np.std(tpots or 0) * 1000,
p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
mean_itl_ms=np.mean(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
+ std_itl_ms=np.std(itls or 0) * 1000,
p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
)
@@ -396,12 +402,15 @@ async def benchmark(
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
"median_ttft_ms": metrics.median_ttft_ms,
+ "std_ttft_ms": metrics.std_ttft_ms,
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
+ "std_tpot_ms": metrics.std_tpot_ms,
"p99_tpot_ms": metrics.p99_tpot_ms,
"mean_itl_ms": metrics.mean_itl_ms,
"median_itl_ms": metrics.median_itl_ms,
+ "std_itl_ms": metrics.std_itl_ms,
"p99_itl_ms": metrics.p99_itl_ms,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
index 261f5829631ee..3da4cecd7eeff 100644
--- a/benchmarks/kernels/benchmark_marlin.py
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -5,14 +5,16 @@
from benchmark_shapes import WEIGHT_SHAPES
from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+ MarlinWorkspace, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
+ marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights)
from vllm.utils import FlexibleArgumentParser
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 81bf2d62d8f42..605166930ccc6 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -38,7 +38,13 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) {
if (cuda_device_capability >= 90) {
return CUDA_VERSION >= 12000;
} else if (cuda_device_capability >= 89) {
- return CUDA_VERSION >= 12040;
+ // CUTLASS Kernels have not been tuned for Ada Lovelace systems
+ // and are slower than torch.mm. Return false unconditionally in this case.
+ return false;
+
+ // Once the CUTLASS kernels have been optimized for Lovelace systems,
+ // use the following check:
+ // return CUDA_VERSION >= 12040;
}
#endif
@@ -98,4 +104,4 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a,
TORCH_CHECK(version_num >= 75);
cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias);
}
-}
\ No newline at end of file
+}
diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html
index cd5c4053e225f..7174431b10272 100644
--- a/docs/source/_templates/sections/header.html
+++ b/docs/source/_templates/sections/header.html
@@ -5,6 +5,7 @@
justify-content: center;
align-items: center;
font-size: 16px;
+ padding: 0 6px 0 6px;
}
.notification-bar p {
margin: 0;
diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index cd8e8b0f513c4..0182c96a8dfbf 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -13,6 +13,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Databricks
- DeepInfra
- Dropbox
+- Google Cloud
- Lambda Lab
- NVIDIA
- Replicate
diff --git a/docs/source/dev/multimodal/adding_multimodal_plugin.rst b/docs/source/dev/multimodal/adding_multimodal_plugin.rst
new file mode 100644
index 0000000000000..b726138f840a3
--- /dev/null
+++ b/docs/source/dev/multimodal/adding_multimodal_plugin.rst
@@ -0,0 +1,17 @@
+.. _adding_multimodal_plugin:
+
+Adding a Multimodal Plugin
+==========================
+
+This document teaches you how to add a new modality to vLLM.
+
+Each modality in vLLM is represented by a :class:`~vllm.multimodal.MultiModalPlugin` and registered to :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+For vLLM to recognize a new modality type, you have to create a new plugin and then pass it to :meth:`~vllm.multimodal.MultiModalRegistry.register_plugin`.
+
+The remainder of this document details how to define custom :class:`~vllm.multimodal.MultiModalPlugin` s.
+
+.. note::
+ This article is a work in progress.
+
+..
+ TODO: Add more instructions on how to add new plugins once embeddings is in.
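
Until the full instructions land, here is a heavily hedged sketch of what registering a new modality could look like. Only `MultiModalPlugin`, `MULTIMODAL_REGISTRY`, and `register_plugin` come from the text above; the hook method names are illustrative placeholders, not the confirmed abstract interface.

```python
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalPlugin


# Hypothetical plugin for an "audio" modality. The two method names below are
# placeholders -- consult the MultiModalPlugin base class for the actual
# abstract interface before implementing a real plugin.
class AudioPlugin(MultiModalPlugin):

    def get_data_key(self) -> str:  # assumed hook name
        return "audio"

    def _default_input_mapper(self, ctx, data):  # assumed hook name
        # Convert the raw audio data into model inputs here.
        raise NotImplementedError


MULTIMODAL_REGISTRY.register_plugin(AudioPlugin())
```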
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index 39daf30a3338f..6713dcf08d9f0 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -7,17 +7,21 @@ Multi-Modality
vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
-Multi-modal input can be passed alongside text and token prompts to :ref:`supported models `
+Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models `
via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
-.. note::
- ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
- the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
+by following :ref:`this guide `.
-To implement a new multi-modal model in vLLM, please follow :ref:`this guide `.
+Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `.
-..
- TODO: Add more instructions on how to add new plugins once embeddings is in.
+Guides
+++++++
+
+.. toctree::
+ :maxdepth: 1
+
+ adding_multimodal_plugin
Module Contents
+++++++++++++++
@@ -36,10 +40,14 @@ Registry
Base Classes
------------
-.. autoclass:: vllm.multimodal.MultiModalDataDict
+.. autodata:: vllm.multimodal.BatchedTensors
+
+.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
:members:
:show-inheritance:
+.. autodata:: vllm.multimodal.MultiModalDataDict
+
.. autoclass:: vllm.multimodal.MultiModalInputs
:members:
:show-inheritance:
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index a9544e8a59a3d..1c97515dbecd9 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -20,7 +20,7 @@ Requirements
* OS: Linux
* Compiler: gcc/g++>=12.3.0 (optional, recommended)
-* Instruction set architecture (ISA) requirement: AVX512 is required.
+* Instruction set architecture (ISA) requirement: AVX512 (optional, recommended)
.. _cpu_backend_quick_start_dockerfile:
diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst
index 4cd34769ecfb4..0d03fe93adc61 100644
--- a/docs/source/getting_started/debugging.rst
+++ b/docs/source/getting_started/debugging.rst
@@ -50,6 +50,8 @@ Here are some common issues that can cause hangs:
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
+ print("sanity check is successful!")
+
.. tip::
Save the script as ``test.py``.
@@ -62,4 +64,6 @@ Here are some common issues that can cause hangs:
- is reachable from all nodes
- is set before running the script.
+ If the script runs successfully, you should see the message ``sanity check is successful!``.
+
If the problem persists, feel free to `open an issue on GitHub `_, with a detailed description of the issue, your environment, and the logs.
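
For readers who want a whole script rather than the fragment visible in this hunk, here is a hedged sketch of a sanity-check script in the same spirit; it is not the exact `test.py` from the docs, just a plausible reconstruction using standard `torch.distributed` calls.

```python
# Hedged sketch of a NCCL/gloo sanity check; launch with e.g.:
#   torchrun --nproc-per-node=2 test.py
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
world_size = dist.get_world_size()
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)

# GPU all-reduce over NCCL.
gpu_data = torch.ones(1, device="cuda")
dist.all_reduce(gpu_data)
assert gpu_data.item() == world_size

# CPU all-reduce over a gloo group, mirroring the assertion shown in the hunk.
gloo_group = dist.new_group(backend="gloo")
cpu_data = torch.ones(1)
dist.all_reduce(cpu_data, group=gloo_group)
value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"

print("sanity check is successful!")
```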
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index d458b0235ecb7..a9dfac8ff5af8 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -42,6 +42,20 @@ You can install vLLM using pip:
Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
+.. note::
+
+ vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command:
+
+ .. code-block:: console
+
+ $ export VLLM_VERSION=0.5.2 # vLLM's main branch version is currently set to latest released tag
+ $ export PYTHON_VERSION=310
+ $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+ $ # You can also access a specific commit
+ $ # export VLLM_COMMIT=...
+ $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl
+
+
.. _build_from_source:
Build from source
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst
index 4f0d2da25b8e8..a0118e20c49db 100644
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
@@ -40,12 +40,13 @@ Quick start using Dockerfile
Build from source
-----------------
-- First, install required driver and intel OneAPI 2024.1.
+- First, install required driver and intel OneAPI 2024.1 or later.
- Second, install Python packages for vLLM XPU backend building:
.. code-block:: console
+ $ source /opt/intel/oneapi/setvars.sh
$ pip install --upgrade pip
$ pip install -v -r requirements-xpu.txt
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 67c039f25e98d..2691805ed97a4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,7 +38,7 @@ vLLM is flexible and easy to use with:
* Seamless integration with popular HuggingFace models
* High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-* Tensor parallelism support for distributed inference
+* Tensor parallelism and pipeline parallelism support for distributed inference
* Streaming outputs
* OpenAI-compatible API server
* Support NVIDIA GPUs and AMD GPUs
diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst
index 9fb62397b9aaf..87a52360c0841 100644
--- a/docs/source/models/spec_decode.rst
+++ b/docs/source/models/spec_decode.rst
@@ -73,5 +73,5 @@ Resources for vLLM contributors
-------------------------------
* `A Hacker's Guide to Speculative Decoding in vLLM `_
* `What is Lookahead Scheduling in vLLM? `_
-* `Information on batch expansion. `_
+* `Information on batch expansion `_
* `Dynamic speculative decoding `_
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index f56679c3c6d00..50cae1041bc8f 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -137,6 +137,10 @@ Decoder-only Language Models
- Phi-3-Small
- :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc.
-
+ * - :code:`PersimmonForCausalLM`
+ - Persimmon
+ - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc.
+ -
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
@@ -178,6 +182,10 @@ Vision Language Models
- Models
- Example HuggingFace Models
- :ref:`LoRA `
+ * - :code:`FuyuForCausalLM`
+ - Fuyu
+ - :code:`adept/fuyu-8b` etc.
+ -
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index 3c58ed295fba6..2dfb83f168b5d 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -1,5 +1,21 @@
.. _distributed_serving:
+How to decide the distributed inference strategy?
+=================================================
+
+Before going into the details of distributed inference and serving, let's first make it clear when to use distributed inference and what strategies are available. The common practice is:
+
+- **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need distributed inference. Just use the single GPU to run inference.
+- **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4.
+- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallelism together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs across 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2.
+
+In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes.
+
+After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like ``# GPU blocks: 790``. Multiply that number by ``16`` (the block size) to get a rough estimate of the maximum number of tokens that can be served with the current configuration. If this number is not satisfactory, e.g. you want higher throughput, you can further increase the number of GPUs or nodes until the number of blocks is enough.
+
+.. note::
+ There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
+
Distributed Inference and Serving
=================================
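
A tiny worked example of the capacity estimate described in the new section above, using the ``# GPU blocks: 790`` log line it quotes and the default block size of 16:

```python
# Capacity estimate: GPU block count (from the vLLM startup log) times the
# default block size of 16 tokens per block.
num_gpu_blocks = 790
block_size = 16
print(num_gpu_blocks * block_size)  # 12640 tokens fit in the KV cache at once
```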
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 6248d84683753..092c3c6cb9a3d 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -109,7 +109,7 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
```{argparse}
:module: vllm.entrypoints.openai.cli_args
-:func: make_arg_parser
+:func: create_parser_for_docs
:prog: -m vllm.entrypoints.openai.api_server
```
diff --git a/examples/fuyu_example.py b/examples/fuyu_example.py
new file mode 100644
index 0000000000000..c92b8fb4bc286
--- /dev/null
+++ b/examples/fuyu_example.py
@@ -0,0 +1,31 @@
+import requests
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+
+
+def run_fuyu():
+ llm = LLM(model="adept/fuyu-8b", max_model_len=4096)
+
+ # single-image prompt
+ prompt = "What is the highest life expectancy at of male?\n"
+ url = "https://huggingface.co/adept/fuyu-8b/resolve/main/chart.png"
+ image = Image.open(requests.get(url, stream=True).raw)
+ sampling_params = SamplingParams(temperature=0, max_tokens=64)
+
+ outputs = llm.generate(
+ {
+ "prompt": prompt,
+ "multi_modal_data": {
+ "image": image
+ },
+ },
+ sampling_params=sampling_params)
+
+ for o in outputs:
+ generated_text = o.outputs[0].text
+ print(generated_text)
+
+
+if __name__ == "__main__":
+ run_fuyu()
diff --git a/examples/offline_inference_tpu.py b/examples/offline_inference_tpu.py
new file mode 100644
index 0000000000000..251629b8027ce
--- /dev/null
+++ b/examples/offline_inference_tpu.py
@@ -0,0 +1,28 @@
+from vllm import LLM, SamplingParams
+
+prompts = [
+ "A robot may not injure a human being",
+ "It is only with the heart that one can see rightly;",
+ "The greatest glory in living lies not in never falling,",
+]
+answers = [
+ " or, through inaction, allow a human being to come to harm.",
+ " what is essential is invisible to the eye.",
+ " but in rising every time we fall.",
+]
+N = 1
+# Currently, top-p sampling is disabled. `top_p` should be 1.0.
+sampling_params = SamplingParams(temperature=0.7,
+ top_p=1.0,
+ n=N,
+ max_tokens=16)
+
+# Set `enforce_eager=True` to avoid ahead-of-time compilation.
+# In real workloads, `enforce_eager` should be `False`.
+llm = LLM(model="google/gemma-2b", enforce_eager=True)
+outputs = llm.generate(prompts, sampling_params)
+for output, answer in zip(outputs, answers):
+ prompt = output.prompt
+ generated_text = output.outputs[0].text
+ print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+ assert generated_text.startswith(answer)
diff --git a/format.sh b/format.sh
index 5edc868f9f70c..5ad6d6f2938bb 100755
--- a/format.sh
+++ b/format.sh
@@ -96,23 +96,23 @@ echo 'vLLM yapf: Done'
# Run mypy
echo 'vLLM mypy:'
+mypy tests --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
mypy vllm/attention --config-file pyproject.toml
mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/engine --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml
+mypy vllm/logging --config-file pyproject.toml
+mypy vllm/lora --config-file pyproject.toml
+mypy vllm/model_executor --config-file pyproject.toml
mypy vllm/multimodal --config-file pyproject.toml
-mypy vllm/usage --config-file pyproject.toml
-mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/prompt_adapter --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/transformers_utils --config-file pyproject.toml
-mypy vllm/engine --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml
-mypy vllm/spec_decode --config-file pyproject.toml
-mypy vllm/model_executor --config-file pyproject.toml
-mypy vllm/lora --config-file pyproject.toml
-mypy vllm/logging --config-file pyproject.toml
-mypy vllm/prompt_adapter --config-file pyproject.toml
-mypy tests --config-file pyproject.toml
# If git diff returns a file that is in the skip list, the file may be checked anyway:
diff --git a/pyproject.toml b/pyproject.toml
index 790e013620286..1ba1eacd90084 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
- "torch == 2.3.0",
+ "torch == 2.3.1",
"wheel",
]
build-backend = "setuptools.build_meta"
diff --git a/requirements-build.txt b/requirements-build.txt
index 1a07a94e82e04..b05f38a0ed919 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
ninja
packaging
setuptools>=49.4.0
-torch==2.3.0
+torch==2.3.1
wheel
diff --git a/requirements-common.txt b/requirements-common.txt
index e874c4af49d66..29643cfce161b 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
requests
tqdm
py-cpuinfo
-transformers >= 4.42.0 # Required for Gemma 2 and for additional chat template parameters.
+transformers >= 4.42.4 # Required for Gemma 2 and for additional chat template parameters.
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
@@ -17,8 +17,8 @@ pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
-lm-format-enforcer == 0.10.1
-outlines >= 0.0.43 # Requires torch >= 2.1.0
+lm-format-enforcer == 0.10.3
+outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 10596ed85d600..3eb91212e976e 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,8 +4,8 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
-torch == 2.3.0
+torch == 2.3.1
# These must be updated alongside torch
-torchvision == 0.18.0 # Required for phi3v processor, also see https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
-xformers == 0.0.26.post1 # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.9 # Requires PyTorch 2.3.0
+torchvision == 0.18.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+xformers == 0.0.27 # Requires PyTorch 2.3.1
+vllm-flash-attn == 2.5.9.post1 # Requires PyTorch 2.3.1
diff --git a/requirements-openvino.txt b/requirements-openvino.txt
index e555d52572541..e32c76fb0db21 100644
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -4,6 +4,6 @@
# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
-optimum-intel[openvino] >= 1.17.2
+optimum-intel[openvino] >= 1.18.1
triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
diff --git a/setup.py b/setup.py
index 067ad13fed71b..72ef26f15e405 100644
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,7 @@
import re
import subprocess
import sys
+import warnings
from shutil import which
from typing import Dict, List
@@ -26,6 +27,34 @@ def load_module_from_path(module_name, path):
ROOT_DIR = os.path.dirname(__file__)
logger = logging.getLogger(__name__)
+
+def embed_commit_hash():
+ try:
+ if "BUILDKITE_COMMIT" in os.environ:
+ # ci build
+ commit_id = os.environ["BUILDKITE_COMMIT"]
+ else:
+ commit_id = subprocess.check_output(["git", "rev-parse", "HEAD"],
+ encoding="utf-8").strip()
+
+ commit_contents = f'__commit__ = "{commit_id}"\n'
+
+ version_file = os.path.join(ROOT_DIR, "vllm", "commit_id.py")
+ with open(version_file, "w", encoding="utf-8") as f:
+ f.write(commit_contents)
+
+ except subprocess.CalledProcessError as e:
+ warnings.warn(f"Failed to get commit hash:\n{e}",
+ RuntimeWarning,
+ stacklevel=2)
+ except Exception as e:
+ warnings.warn(f"Failed to embed commit hash:\n{e}",
+ RuntimeWarning,
+ stacklevel=2)
+
+
+embed_commit_hash()
+
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
@@ -459,4 +488,9 @@ def _read_requirements(filename: str) -> List[str]:
},
cmdclass={"build_ext": cmake_build_ext} if _build_custom_ops() else {},
package_data=package_data,
+ entry_points={
+ "console_scripts": [
+ "vllm=vllm.scripts:main",
+ ],
+ },
)
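
Since `embed_commit_hash()` writes `vllm/commit_id.py` containing a single `__commit__` string, consuming it is straightforward. A minimal sketch follows; the real `tests/test_embedded_commit.py` referenced in the CI config may check more than this.

```python
# Minimal sketch of consuming the generated vllm/commit_id.py.
from vllm.commit_id import __commit__

# BUILDKITE_COMMIT / `git rev-parse HEAD` both yield a full 40-character SHA.
assert len(__commit__) == 40, "expected a full git SHA"
print(f"vLLM was built from commit {__commit__}")
```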
diff --git a/tests/async_engine/test_openapi_server_ray.py b/tests/async_engine/test_openapi_server_ray.py
index cc05d79e56874..575f8f19b8ebe 100644
--- a/tests/async_engine/test_openapi_server_ray.py
+++ b/tests/async_engine/test_openapi_server_ray.py
@@ -1,35 +1,26 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
- return RemoteOpenAIServer([
- "--model",
- MODEL_NAME,
- # use half precision for speed and memory savings in CI environment
- "--dtype",
- "float16",
- "--max-model-len",
- "2048",
- "--enforce-eager",
- "--engine-use-ray"
- ])
+def server():
+ with RemoteOpenAIServer([
+ "--model",
+ MODEL_NAME,
+ # use half precision for speed and memory savings in CI environment
+ "--dtype",
+ "float16",
+ "--max-model-len",
+ "2048",
+ "--enforce-eager",
+ "--engine-use-ray"
+ ]) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index a7b0fef533ccb..ec7c2ba3e3ce0 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -2,11 +2,13 @@
Run `pytest tests/basic_correctness/test_basic_correctness.py`.
"""
+import os
import weakref
import pytest
from vllm import LLM
+from vllm.utils import is_hip
from ..models.utils import check_outputs_equal
@@ -27,6 +29,7 @@ def test_vllm_gc_ed():
@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
@@ -35,10 +38,17 @@ def test_models(
vllm_runner,
example_prompts,
model: str,
+ backend: str,
dtype: str,
max_tokens: int,
enforce_eager: bool,
) -> None:
+
+ if backend == "FLASHINFER" and is_hip():
+ pytest.skip("Flashinfer does not support ROCm/HIP.")
+
+ os.environ["VLLM_ATTENTION_BACKEND"] = backend
+
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
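The backend parametrization above selects the attention implementation through the VLLM_ATTENTION_BACKEND environment variable, set before the engine is built. A minimal standalone sketch of the same mechanism (the model name is only an example, and reading the variable at engine construction time is an assumption carried over from the test):

# Sketch only: pick the attention backend via the environment variable
# used by the test above. Set it before creating the engine.
import os

os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"  # or FLASH_ATTN / FLASHINFER

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enforce_eager=True)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=5))
print(outputs[0].outputs[0].text)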
diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 6072a2dd71800..2d9f63795189d 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -2,11 +2,8 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
# downloading lora to test lora requests
@@ -21,14 +18,7 @@
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
+def server():
args = [
"--model",
MODEL_NAME,
@@ -50,7 +40,8 @@ def server(ray_ctx):
args += [
"--enforce-eager",
]
- return RemoteOpenAIServer(args, num_gpus=PP_SIZE * TP_SIZE)
+ with RemoteOpenAIServer(args) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
diff --git a/tests/distributed/test_same_node.py b/tests/distributed/test_same_node.py
index 2d886eb566d5d..07e84d0ad54cd 100644
--- a/tests/distributed/test_same_node.py
+++ b/tests/distributed/test_same_node.py
@@ -10,3 +10,4 @@
expected = os.environ.get("VLLM_TEST_SAME_HOST", "1") == "1"
assert test_result == expected, f"Expected {expected}, got {test_result}"
+print("Same node test passed!")
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
index 9ff11b0d27b11..a51a9909f6f41 100644
--- a/tests/distributed/test_utils.py
+++ b/tests/distributed/test_utils.py
@@ -1,7 +1,7 @@
import ray
import vllm.envs as envs
-from vllm.utils import (cuda_device_count_stateless, is_hip,
+from vllm.utils import (cuda_device_count_stateless,
update_environment_variables)
@@ -22,11 +22,6 @@ def get_cuda_visible_devices(self):
def test_cuda_device_count_stateless():
"""Test that cuda_device_count_stateless changes return value if
CUDA_VISIBLE_DEVICES is changed."""
- if is_hip():
- # Set HIP_VISIBLE_DEVICES == CUDA_VISIBLE_DEVICES. Conversion
- # is handled by `update_environment_variables`
- update_environment_variables(
- {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore
num_gpus=2).remote()
assert sorted(ray.get(
diff --git a/tests/entrypoints/openai/conftest.py b/tests/entrypoints/openai/conftest.py
new file mode 100644
index 0000000000000..0837644f26bde
--- /dev/null
+++ b/tests/entrypoints/openai/conftest.py
@@ -0,0 +1,69 @@
+import pytest
+
+
+@pytest.fixture
+def sample_regex():
+ return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+ r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+
+
+@pytest.fixture
+def sample_json_schema():
+ return {
+ "type": "object",
+ "properties": {
+ "name": {
+ "type": "string"
+ },
+ "age": {
+ "type": "integer"
+ },
+ "skills": {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "maxLength": 10
+ },
+ "minItems": 3
+ },
+ "work_history": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "company": {
+ "type": "string"
+ },
+ "duration": {
+ "type": "number"
+ },
+ "position": {
+ "type": "string"
+ }
+ },
+ "required": ["company", "position"]
+ }
+ }
+ },
+ "required": ["name", "age", "skills", "work_history"]
+ }
+
+
+@pytest.fixture
+def sample_guided_choice():
+ return [
+ "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript",
+ "Ruby", "Swift", "Kotlin"
+ ]
+
+
+@pytest.fixture
+def sample_sql_statements():
+ return ("""
+start: select_statement
+select_statement: "SELECT" column "from" table "where" condition
+column: "col_1" | "col_2"
+table: "table_1" | "table_2"
+condition: column "=" number
+number: "1" | "2"
+""")
\ No newline at end of file
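The fixtures above are shared by the guided-decoding tests that follow. As a quick illustration of what sample_regex constrains (a standalone sketch, not part of the test suite), the pattern accepts dotted-quad IPv4 addresses with octets 0-255 and rejects out-of-range octets:

# Sketch only: sanity-checking the IPv4 regex used by the sample_regex fixture.
import re

ipv4_regex = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
              r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")

assert re.fullmatch(ipv4_regex, "192.168.0.1") is not None
assert re.fullmatch(ipv4_regex, "999.1.1.1") is None  # octet out of range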
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index 3e80214f24dc5..d370c63c0c7ba 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -6,15 +6,12 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
import torch
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -22,53 +19,6 @@
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
-TEST_SCHEMA = {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "age": {
- "type": "integer"
- },
- "skills": {
- "type": "array",
- "items": {
- "type": "string",
- "maxLength": 10
- },
- "minItems": 3
- },
- "work history": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "company": {
- "type": "string"
- },
- "duration": {
- "type": "string"
- },
- "position": {
- "type": "string"
- }
- },
- "required": ["company", "position"]
- }
- }
- },
- "required": ["name", "age", "skills", "work history"]
-}
-
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
- r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-TEST_CHOICE = [
- "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
- "Swift", "Kotlin"
-]
-
@pytest.fixture(scope="module")
def zephyr_lora_files():
@@ -76,35 +26,29 @@ def zephyr_lora_files():
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
- return RemoteOpenAIServer([
- "--model",
- MODEL_NAME,
- # use half precision for speed and memory savings in CI environment
- "--dtype",
- "bfloat16",
- "--max-model-len",
- "8192",
- "--enforce-eager",
- # lora config below
- "--enable-lora",
- "--lora-modules",
- f"zephyr-lora={zephyr_lora_files}",
- f"zephyr-lora2={zephyr_lora_files}",
- "--max-lora-rank",
- "64",
- "--max-cpu-loras",
- "2",
- "--max-num-seqs",
- "128",
- ])
+def server(zephyr_lora_files):
+ with RemoteOpenAIServer([
+ "--model",
+ MODEL_NAME,
+ # use half precision for speed and memory savings in CI environment
+ "--dtype",
+ "bfloat16",
+ "--max-model-len",
+ "8192",
+ "--enforce-eager",
+ # lora config below
+ "--enable-lora",
+ "--lora-modules",
+ f"zephyr-lora={zephyr_lora_files}",
+ f"zephyr-lora2={zephyr_lora_files}",
+ "--max-lora-rank",
+ "64",
+ "--max-cpu-loras",
+ "2",
+ "--max-num-seqs",
+ "128",
+ ]) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
@@ -408,7 +352,8 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_guided_choice):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -422,10 +367,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME,
messages=messages,
max_tokens=10,
- extra_body=dict(guided_choice=TEST_CHOICE,
+ extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content
- assert choice1 in TEST_CHOICE
+ assert choice1 in sample_guided_choice
messages.append({"role": "assistant", "content": choice1})
messages.append({
@@ -436,10 +381,10 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME,
messages=messages,
max_tokens=10,
- extra_body=dict(guided_choice=TEST_CHOICE,
+ extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content
- assert choice2 in TEST_CHOICE
+ assert choice2 in sample_guided_choice
assert choice1 != choice2
@@ -447,7 +392,8 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_json_chat(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -456,18 +402,18 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
"user",
"content":
f"Give an example JSON for an employee profile that "
- f"fits this schema: {TEST_SCHEMA}"
+ f"fits this schema: {sample_json_schema}"
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
- extra_body=dict(guided_json=TEST_SCHEMA,
+ extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
assert message.content is not None
json1 = json.loads(message.content)
- jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+ jsonschema.validate(instance=json1, schema=sample_json_schema)
messages.append({"role": "assistant", "content": message.content})
messages.append({
@@ -480,12 +426,12 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME,
messages=messages,
max_tokens=1000,
- extra_body=dict(guided_json=TEST_SCHEMA,
+ extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
assert message.content is not None
json2 = json.loads(message.content)
- jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+ jsonschema.validate(instance=json2, schema=sample_json_schema)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
@@ -494,7 +440,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_chat(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str, sample_regex):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -502,17 +448,17 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
"role":
"user",
"content":
- f"Give an example IP address with this regex: {TEST_REGEX}"
+ f"Give an example IP address with this regex: {sample_regex}"
}]
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
max_tokens=20,
- extra_body=dict(guided_regex=TEST_REGEX,
+ extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
ip1 = chat_completion.choices[0].message.content
assert ip1 is not None
- assert re.fullmatch(TEST_REGEX, ip1) is not None
+ assert re.fullmatch(sample_regex, ip1) is not None
messages.append({"role": "assistant", "content": ip1})
messages.append({"role": "user", "content": "Give me a different one"})
@@ -520,11 +466,11 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI,
model=MODEL_NAME,
messages=messages,
max_tokens=20,
- extra_body=dict(guided_regex=TEST_REGEX,
+ extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
ip2 = chat_completion.choices[0].message.content
assert ip2 is not None
- assert re.fullmatch(TEST_REGEX, ip2) is not None
+ assert re.fullmatch(sample_regex, ip2) is not None
assert ip1 != ip2
@@ -553,7 +499,8 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_guided_choice):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -569,7 +516,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
max_tokens=10,
logprobs=True,
top_logprobs=5,
- extra_body=dict(guided_choice=TEST_CHOICE,
+ extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
assert chat_completion.choices[0].logprobs is not None
@@ -585,7 +532,8 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_named_tool_use(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -594,7 +542,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"user",
"content":
f"Give an example JSON for an employee profile that "
- f"fits this schema: {TEST_SCHEMA}"
+ f"fits this schema: {sample_json_schema}"
}]
# non-streaming
@@ -608,7 +556,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
- "parameters": TEST_SCHEMA
+ "parameters": sample_json_schema
}
}],
tool_choice={
@@ -621,7 +569,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
assert len(message.content) == 0
json_string = message.tool_calls[0].function.arguments
json1 = json.loads(json_string)
- jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+ jsonschema.validate(instance=json1, schema=sample_json_schema)
messages.append({"role": "assistant", "content": json_string})
messages.append({
@@ -642,7 +590,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
- "parameters": TEST_SCHEMA
+ "parameters": sample_json_schema
}
}],
tool_choice={
@@ -667,7 +615,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
# finish reason should only return in last block
assert finish_reason_count == 1
json2 = json.loads("".join(output))
- jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+ jsonschema.validate(instance=json2, schema=sample_json_schema)
assert json1["name"] != json2["name"]
assert json1["age"] != json2["age"]
@@ -675,7 +623,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
async def test_required_tool_use_not_yet_supported(
- client: openai.AsyncOpenAI, guided_decoding_backend: str):
+ client: openai.AsyncOpenAI, guided_decoding_backend: str,
+ sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -684,7 +633,7 @@ async def test_required_tool_use_not_yet_supported(
"user",
"content":
f"Give an example JSON for an employee profile that "
- f"fits this schema: {TEST_SCHEMA}"
+ f"fits this schema: {sample_json_schema}"
}]
with pytest.raises(openai.BadRequestError):
@@ -697,7 +646,7 @@ async def test_required_tool_use_not_yet_supported(
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
- "parameters": TEST_SCHEMA
+ "parameters": sample_json_schema
}
}],
tool_choice="required")
@@ -712,7 +661,7 @@ async def test_required_tool_use_not_yet_supported(
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
- "parameters": TEST_SCHEMA
+ "parameters": sample_json_schema
}
}],
tool_choice="auto")
@@ -720,8 +669,9 @@ async def test_required_tool_use_not_yet_supported(
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
-async def test_inconsistent_tool_choice_and_tools(
- client: openai.AsyncOpenAI, guided_decoding_backend: str):
+async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
+ guided_decoding_backend: str,
+ sample_json_schema):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -730,7 +680,7 @@ async def test_inconsistent_tool_choice_and_tools(
"user",
"content":
f"Give an example JSON for an employee profile that "
- f"fits this schema: {TEST_SCHEMA}"
+ f"fits this schema: {sample_json_schema}"
}]
with pytest.raises(openai.BadRequestError):
@@ -755,7 +705,7 @@ async def test_inconsistent_tool_choice_and_tools(
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
- "parameters": TEST_SCHEMA
+ "parameters": sample_json_schema
}
}],
tool_choice={
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 52a848b7831d5..6e5fdebe786e1 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -6,9 +6,6 @@
import jsonschema
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
import requests
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
@@ -16,7 +13,7 @@
from vllm.transformers_utils.tokenizer import get_tokenizer
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -24,53 +21,6 @@
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
-TEST_SCHEMA = {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "age": {
- "type": "integer"
- },
- "skills": {
- "type": "array",
- "items": {
- "type": "string",
- "maxLength": 10
- },
- "minItems": 3
- },
- "work history": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "company": {
- "type": "string"
- },
- "duration": {
- "type": "string"
- },
- "position": {
- "type": "string"
- }
- },
- "required": ["company", "position"]
- }
- }
- },
- "required": ["name", "age", "skills", "work history"]
-}
-
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
- r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-TEST_CHOICE = [
- "Python", "Java", "JavaScript", "C++", "C#", "PHP", "TypeScript", "Ruby",
- "Swift", "Kotlin"
-]
-
@pytest.fixture(scope="module")
def zephyr_lora_files():
@@ -78,35 +28,29 @@ def zephyr_lora_files():
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
- return RemoteOpenAIServer([
- "--model",
- MODEL_NAME,
- # use half precision for speed and memory savings in CI environment
- "--dtype",
- "bfloat16",
- "--max-model-len",
- "8192",
- "--enforce-eager",
- # lora config below
- "--enable-lora",
- "--lora-modules",
- f"zephyr-lora={zephyr_lora_files}",
- f"zephyr-lora2={zephyr_lora_files}",
- "--max-lora-rank",
- "64",
- "--max-cpu-loras",
- "2",
- "--max-num-seqs",
- "128",
- ])
+def server(zephyr_lora_files):
+ with RemoteOpenAIServer([
+ "--model",
+ MODEL_NAME,
+ # use half precision for speed and memory savings in CI environment
+ "--dtype",
+ "bfloat16",
+ "--max-model-len",
+ "8192",
+ "--enforce-eager",
+ # lora config below
+ "--enable-lora",
+ "--lora-modules",
+ f"zephyr-lora={zephyr_lora_files}",
+ f"zephyr-lora2={zephyr_lora_files}",
+ "--max-lora-rank",
+ "64",
+ "--max-cpu-loras",
+ "2",
+ "--max-num-seqs",
+ "128",
+ ]) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
@@ -529,77 +473,71 @@ async def test_logits_bias(client: openai.AsyncOpenAI):
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_json_schema):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example JSON for an employee profile "
- f"that fits this schema: {TEST_SCHEMA}",
+ f"that fits this schema: {sample_json_schema}",
n=3,
temperature=1.0,
max_tokens=500,
- extra_body=dict(guided_json=TEST_SCHEMA,
+ extra_body=dict(guided_json=sample_json_schema,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 3
for i in range(3):
output_json = json.loads(completion.choices[i].text)
- jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
+ jsonschema.validate(instance=output_json, schema=sample_json_schema)
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_regex):
completion = await client.completions.create(
model=MODEL_NAME,
- prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
+ prompt=f"Give an example IPv4 address with this regex: {sample_regex}",
n=3,
temperature=1.0,
max_tokens=20,
- extra_body=dict(guided_regex=TEST_REGEX,
+ extra_body=dict(guided_regex=sample_regex,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 3
for i in range(3):
- assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
+ assert re.fullmatch(sample_regex,
+ completion.choices[i].text) is not None
@pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_guided_choice):
completion = await client.completions.create(
model=MODEL_NAME,
prompt="The best language for type-safe systems programming is ",
n=2,
temperature=1.0,
max_tokens=10,
- extra_body=dict(guided_choice=TEST_CHOICE,
+ extra_body=dict(guided_choice=sample_guided_choice,
guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert len(completion.choices) == 2
for i in range(2):
- assert completion.choices[i].text in TEST_CHOICE
+ assert completion.choices[i].text in sample_guided_choice
@pytest.mark.asyncio
-async def test_guided_grammar(client: openai.AsyncOpenAI):
- simple_sql_grammar = """
-start: select_statement
-
-select_statement: "SELECT" column "from" table "where" condition
-
-column: "col_1" | "col_2"
-table: "table_1" | "table_2"
-condition: column "=" number
-
-number: "1" | "2"
-"""
+async def test_guided_grammar(client: openai.AsyncOpenAI,
+ sample_sql_statements):
completion = await client.completions.create(
model=MODEL_NAME,
@@ -607,13 +545,13 @@ async def test_guided_grammar(client: openai.AsyncOpenAI):
"table_1 where it is equals to 1"),
temperature=1.0,
max_tokens=500,
- extra_body=dict(guided_grammar=simple_sql_grammar))
+ extra_body=dict(guided_grammar=sample_sql_statements))
content = completion.choices[0].text
# use Lark to parse the output, and make sure it's a valid parse tree
from lark import Lark
- parser = Lark(simple_sql_grammar)
+ parser = Lark(sample_sql_statements)
parser.parse(content)
# remove spaces for comparison b/c we removed them in the grammar
@@ -661,7 +599,8 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("guided_decoding_backend",
["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
- guided_decoding_backend: str):
+ guided_decoding_backend: str,
+ sample_json_schema, sample_regex):
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
@@ -673,7 +612,8 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example string that fits this regex",
- extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
+ extra_body=dict(guided_regex=sample_regex,
+ guided_json=sample_json_schema))
@pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py
index f8aa1c9143a3b..4a32aadc8c3ae 100644
--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -3,33 +3,26 @@
import numpy as np
import openai
import pytest
-import ray
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def embedding_server(ray_ctx):
- return RemoteOpenAIServer([
- "--model",
- EMBEDDING_MODEL_NAME,
- # use half precision for speed and memory savings in CI environment
- "--dtype",
- "bfloat16",
- "--enforce-eager",
- "--max-model-len",
- "8192",
- "--enforce-eager",
- ])
+def embedding_server():
+ with RemoteOpenAIServer([
+ "--model",
+ EMBEDDING_MODEL_NAME,
+ # use half precision for speed and memory savings in CI environment
+ "--dtype",
+ "bfloat16",
+ "--enforce-eager",
+ "--max-model-len",
+ "8192",
+ "--enforce-eager",
+ ]) as remote_server:
+ yield remote_server
@pytest.mark.asyncio
diff --git a/tests/entrypoints/openai/test_guided_processors.py b/tests/entrypoints/openai/test_guided_processors.py
index 27568d3e7c26c..85cb4d52200c3 100644
--- a/tests/entrypoints/openai/test_guided_processors.py
+++ b/tests/entrypoints/openai/test_guided_processors.py
@@ -10,59 +10,17 @@
from vllm.model_executor.guided_decoding.outlines_logits_processors import (
JSONLogitsProcessor, RegexLogitsProcessor)
-TEST_SCHEMA = {
- "type": "object",
- "properties": {
- "name": {
- "type": "string"
- },
- "age": {
- "type": "integer"
- },
- "skills": {
- "type": "array",
- "items": {
- "type": "string",
- "maxLength": 10
- },
- "minItems": 3
- },
- "work history": {
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "company": {
- "type": "string"
- },
- "duration": {
- "type": "string"
- },
- "position": {
- "type": "string"
- }
- },
- "required": ["company", "position"]
- }
- }
- },
- "required": ["name", "age", "skills", "work history"]
-}
-TEST_REGEX = (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
- r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
-
-
-def test_guided_logits_processors():
+def test_guided_logits_processors(sample_regex, sample_json_schema):
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
- regex_LP = RegexLogitsProcessor(TEST_REGEX, tokenizer)
- json_LP = JSONLogitsProcessor(TEST_SCHEMA,
+ regex_LP = RegexLogitsProcessor(sample_regex, tokenizer)
+ json_LP = JSONLogitsProcessor(sample_json_schema,
tokenizer,
whitespace_pattern=None)
token_ids = tokenizer.encode(
- f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+ f"Give an example IPv4 address with this regex: {sample_regex}")
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
regex_LP(token_ids, tensor)
@@ -70,7 +28,8 @@ def test_guided_logits_processors():
assert not torch.allclose(tensor, original_tensor)
token_ids = tokenizer.encode(
- f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+ f"Give an employee profile that fits this schema: {sample_json_schema}"
+ )
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
json_LP(token_ids, tensor)
@@ -80,13 +39,14 @@ def test_guided_logits_processors():
@pytest.mark.asyncio
@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
-async def test_guided_logits_processor_black_box(backend: str):
+async def test_guided_logits_processor_black_box(backend: str, sample_regex,
+ sample_json_schema):
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
token_ids = tokenizer.encode(
- f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+ f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = CompletionRequest(model='test',
prompt=token_ids,
- guided_regex=TEST_REGEX)
+ guided_regex=sample_regex)
regex_lp = await get_guided_decoding_logits_processor(
backend, regex_request, tokenizer)
assert regex_lp is not None
@@ -97,10 +57,11 @@ async def test_guided_logits_processor_black_box(backend: str):
assert not torch.allclose(tensor, original_tensor)
token_ids = tokenizer.encode(
- f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+ f"Give an employee profile that fits this schema: {sample_json_schema}"
+ )
json_request = CompletionRequest(model='test',
prompt=token_ids,
- guided_json=TEST_SCHEMA)
+ guided_json=sample_json_schema)
json_lp = await get_guided_decoding_logits_processor(
backend, json_request, tokenizer)
assert json_lp is not None
diff --git a/tests/entrypoints/openai/test_models.py b/tests/entrypoints/openai/test_models.py
index 914ef6e19e109..bf63f9a813f2c 100644
--- a/tests/entrypoints/openai/test_models.py
+++ b/tests/entrypoints/openai/test_models.py
@@ -1,12 +1,9 @@
import openai # use the official client for correctness check
import pytest
-# using Ray for overall ease of process management, parallel requests,
-# and debugging.
-import ray
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
@@ -21,35 +18,29 @@ def zephyr_lora_files():
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(zephyr_lora_files, ray_ctx):
- return RemoteOpenAIServer([
- "--model",
- MODEL_NAME,
- # use half precision for speed and memory savings in CI environment
- "--dtype",
- "bfloat16",
- "--max-model-len",
- "8192",
- "--enforce-eager",
- # lora config below
- "--enable-lora",
- "--lora-modules",
- f"zephyr-lora={zephyr_lora_files}",
- f"zephyr-lora2={zephyr_lora_files}",
- "--max-lora-rank",
- "64",
- "--max-cpu-loras",
- "2",
- "--max-num-seqs",
- "128",
- ])
+def server(zephyr_lora_files):
+ with RemoteOpenAIServer([
+ "--model",
+ MODEL_NAME,
+ # use half precision for speed and memory savings in CI environment
+ "--dtype",
+ "bfloat16",
+ "--max-model-len",
+ "8192",
+ "--enforce-eager",
+ # lora config below
+ "--enable-lora",
+ "--lora-modules",
+ f"zephyr-lora={zephyr_lora_files}",
+ f"zephyr-lora2={zephyr_lora_files}",
+ "--max-lora-rank",
+ "64",
+ "--max-cpu-loras",
+ "2",
+ "--max-num-seqs",
+ "128",
+ ]) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index 5de28513ca391..b25e2a26e2d82 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -6,7 +6,8 @@
# ruff: noqa: E501
INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
-{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-3", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NonExistModel", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py
index b869717608d0f..563b68566bd2c 100644
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -3,7 +3,6 @@
import openai
import pytest
import pytest_asyncio
-import ray
from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
@@ -23,25 +22,19 @@
@pytest.fixture(scope="module")
-def ray_ctx():
- ray.init(runtime_env={"working_dir": VLLM_PATH})
- yield
- ray.shutdown()
-
-
-@pytest.fixture(scope="module")
-def server(ray_ctx):
- return RemoteOpenAIServer([
- "--model",
- MODEL_NAME,
- "--dtype",
- "bfloat16",
- "--max-model-len",
- "4096",
- "--enforce-eager",
- "--chat-template",
- str(LLAVA_CHAT_TEMPLATE),
- ])
+def server():
+ with RemoteOpenAIServer([
+ "--model",
+ MODEL_NAME,
+ "--dtype",
+ "bfloat16",
+ "--max-model-len",
+ "4096",
+ "--enforce-eager",
+ "--chat-template",
+ str(LLAVA_CHAT_TEMPLATE),
+ ]) as remote_server:
+ yield remote_server
@pytest.fixture(scope="module")
diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py
index 92ddcb209b690..3bd6680cf8134 100644
--- a/tests/kernels/test_marlin_gemm.py
+++ b/tests/kernels/test_marlin_gemm.py
@@ -5,19 +5,21 @@
import pytest
import torch
+from tests.quantization.utils import is_quant_method_supported
from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.gptq_marlin import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
- GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS,
- marlin_permute_scales)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
-from vllm.model_executor.layers.quantization.utils.marlin_perms import (
- marlin_perm)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- MarlinWorkspace, compute_max_diff, is_marlin_supported, marlin_24_quantize,
- marlin_quantize, marlin_weights, pack_fp8_to_int32)
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS,
+ marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
+ pack_fp8_to_int32)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+ MarlinWorkspace, get_weight_perm, marlin_quantize, marlin_weights)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
+ marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, quantize_weights, sort_weights)
@@ -42,11 +44,16 @@
DTYPES = [torch.float16, torch.bfloat16]
+def compute_max_diff(output, output_ref):
+ return torch.mean(torch.abs(output - output_ref)) / torch.mean(
+ torch.abs(output_ref))
+
+
def rand_data(shape, dtype=torch.float16):
return torch.randn(shape, dtype=dtype, device="cuda")
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -93,8 +100,8 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
# Pack to Marlin format
- marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits,
- marlin_perm[num_bits])
+ weight_perm = get_weight_perm(num_bits)
+ marlin_q_w_1 = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
# Run Marlin repack GPU kernel
marlin_q_w_2 = ops.gptq_marlin_repack(
@@ -109,7 +116,7 @@ def test_marlin_repack(k_chunk, n_chunk, num_bits, group_size, act_order,
assert torch.allclose(marlin_q_w_1, marlin_q_w_2)
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -174,7 +181,7 @@ def test_marlin_gemm(
assert max_diff < 0.04
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_24_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_24_N_CHUNKS)
@@ -222,7 +229,7 @@ def test_marlin_24_gemm(k_chunk, n_chunk, num_bits, group_size, mnk_factors):
assert max_diff < 0.04
-@pytest.mark.skipif(not is_marlin_supported(),
+@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="Marlin is not supported on this GPU type.")
@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS)
@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS)
@@ -268,13 +275,10 @@ def test_fp8_marlin_gemm(
# expand it to channelwise
scales = weight_scale.repeat(1, size_n).to(a_input.dtype).to("cuda")
# Permute scales
- marlin_scales = marlin_permute_scales(
- s=scales,
- size_k=size_k,
- size_n=size_n,
- group_size=-1,
- num_bits=8,
- )
+ marlin_scales = marlin_permute_scales(s=scales,
+ size_k=size_k,
+ size_n=size_n,
+ group_size=-1)
workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
GPTQ_MARLIN_MAX_PARALLEL)
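The compute_max_diff helper introduced above is a mean relative error, mean(|output - output_ref|) / mean(|output_ref|), which the GEMM tests bound by 0.04. A tiny worked sketch with made-up tensors:

# Sketch only: the relative-error metric used by the Marlin GEMM tests above.
import torch

output_ref = torch.tensor([1.0, 2.0, 4.0])
output = torch.tensor([1.1, 1.9, 4.2])

max_diff = torch.mean(torch.abs(output - output_ref)) / torch.mean(
    torch.abs(output_ref))
# mean abs diff = 0.4 / 3, mean abs ref = 7 / 3, so max_diff ≈ 0.057
assert max_diff.item() < 0.06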
diff --git a/tests/models/test_compressed_tensors.py b/tests/models/test_compressed_tensors.py
index 9a0054c5aff53..da47d5f3f3d23 100644
--- a/tests/models/test_compressed_tensors.py
+++ b/tests/models/test_compressed_tensors.py
@@ -12,7 +12,10 @@
from .utils import check_logprobs_close
MODELS = [
+ # No bias
"nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test",
+ # Bias
+ "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
]
MAX_TOKENS = 32
diff --git a/tests/models/test_fuyu.py b/tests/models/test_fuyu.py
new file mode 100644
index 0000000000000..672470acb77e6
--- /dev/null
+++ b/tests/models/test_fuyu.py
@@ -0,0 +1,142 @@
+from typing import List, Optional, Tuple, Type
+
+import pytest
+
+from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
+from vllm.utils import is_cpu
+
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from .utils import check_logprobs_close
+
+pytestmark = pytest.mark.vlm
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+ "stop_sign": "What's the content of the image?\n", # noqa: E501
+ "cherry_blossom": "What is the season?\n",
+ "boardwalk": "What's in this image?\n",
+})
+
+models = ["adept/fuyu-8b"]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+ Optional[SampleLogprobs]]):
+ """Sanitize vllm output to be comparable with hf output."""
+ output_ids, output_str, out_logprobs = vllm_output
+
+ hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
+
+ return output_ids, hf_output_str, out_logprobs
+
+
+def run_test(
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ image_assets: _ImageAssets,
+ model: str,
+ *,
+ size_factors: List[float],
+ dtype: str,
+ max_tokens: int,
+ num_logprobs: int,
+ tensor_parallel_size: int,
+ distributed_executor_backend: Optional[str] = None,
+):
+ """Inference result should be the same between hf and vllm.
+
+ All the image fixtures for the test are under tests/images.
+ For the huggingface runner, we provide the PIL images as input.
+ For the vllm runner, we provide MultiModalDataDict objects
+ and the corresponding vision language config as input.
+ Note that the text input is also adjusted to abide by the vllm contract.
+ The text output is sanitized to be able to compare with hf.
+ """
+ images = [asset.pil_image for asset in image_assets]
+
+ inputs_per_image = [(
+ [prompt for _ in size_factors],
+ [rescale_image_size(image, factor) for factor in size_factors],
+ ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
+
+ # NOTE: take care of the order: run vLLM first, and then run HF.
+ # vLLM needs a fresh new process without CUDA initialization;
+ # if we run HF first, CUDA is initialized and it interferes with the
+ # multiprocessing backend using the fork method (the default method).
+
+ # max_model_len should be greater than image_feature_size
+ with vllm_runner(model,
+ max_model_len=2560,
+ max_num_seqs=1,
+ dtype=dtype,
+ tensor_parallel_size=tensor_parallel_size,
+ distributed_executor_backend=distributed_executor_backend,
+ enforce_eager=True) as vllm_model:
+ vllm_outputs_per_image = [
+ vllm_model.generate_greedy_logprobs(prompts,
+ max_tokens,
+ num_logprobs=num_logprobs,
+ images=vllm_images)
+ for prompts, vllm_images in inputs_per_image
+ ]
+
+ with hf_runner(model, dtype=dtype) as hf_model:
+ hf_model.model.get_output_embeddings = lambda: \
+ hf_model.model.language_model.get_output_embeddings()
+ eos_token_id = hf_model.processor.tokenizer.eos_token_id
+ hf_outputs_per_image = [
+ hf_model.generate_greedy_logprobs_limit(prompts,
+ max_tokens,
+ num_logprobs=num_logprobs,
+ images=hf_images,
+ eos_token_id=eos_token_id)
+ for prompts, hf_images in inputs_per_image
+ ]
+
+ for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+ vllm_outputs_per_image):
+ check_logprobs_close(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=[
+ vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
+ ],
+ name_0="hf",
+ name_1="vllm",
+ )
+
+
+target_dtype = "half"
+if is_cpu():
+ target_dtype = "bfloat16"
+
+
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+ "size_factors",
+ [
+ # No image
+ [],
+ # Single-scale
+ [0.25],
+ # Single-scale, batched
+ [0.25, 0.25, 0.25],
+ # Multi-scale
+ [0.25, 0.2, 0.15],
+ ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [10])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+ dtype: str, max_tokens: int, num_logprobs: int) -> None:
+ run_test(
+ hf_runner,
+ vllm_runner,
+ image_assets,
+ model,
+ size_factors=size_factors,
+ dtype=dtype,
+ max_tokens=max_tokens,
+ num_logprobs=num_logprobs,
+ tensor_parallel_size=1,
+ )
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 581cbcf9068fe..163741a5719c2 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,8 +1,10 @@
from typing import List, Optional, Tuple
import pytest
-from transformers import AutoTokenizer
+from transformers import AutoConfig, AutoTokenizer
+from vllm.model_executor.models.llava_next import (
+ get_llava_next_image_feature_size)
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
@@ -120,3 +122,13 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
name_0="hf",
name_1="vllm",
)
+
+
+@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
+ (183, 488, 776)])
+def test_image_feature_size(height_and_width_and_result):
+ height, width, result = height_and_width_and_result
+ config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+ assert get_llava_next_image_feature_size(config,
+ input_height=height,
+ input_width=width) == result
diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py
index 2b1d3c5b43b44..b0e7264e89118 100644
--- a/tests/models/test_paligemma.py
+++ b/tests/models/test_paligemma.py
@@ -129,7 +129,7 @@ def run_test(
[0.25, 0.5, 1.0],
],
)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["float", "half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 96223a247657b..888e20e51a842 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -6,7 +6,6 @@
import pytest
import torch
-from vllm import SamplingParams
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
@@ -57,12 +56,14 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
assert qkv_proj.weight_scale.dtype is torch.float32
assert qkv_proj.input_scale.dtype is torch.float32
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ assert output
+
def test_compressed_tensors_no_enforce_eager(vllm_runner):
model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
with vllm_runner(model_path) as llm:
- sampling_params = SamplingParams()
- output = llm.generate("Hello world!", sampling_params=sampling_params)
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output
@@ -84,13 +85,16 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
assert qkv_proj.scheme.strategy == strategy
assert qkv_proj.weight.dtype is torch.int8
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ assert output
+
@pytest.mark.parametrize(
"wNa16_args",
[("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
-def test_compressed_tensors_w4a16(vllm_runner, wNa16_args):
+def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
model, strategy, group, pack_factor = wNa16_args
with vllm_runner(model) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
@@ -101,12 +105,15 @@ def test_compressed_tensors_w4a16(vllm_runner, wNa16_args):
assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
assert qkv_proj.scheme.strategy == strategy
- assert qkv_proj.scheme.group_size == group
+ assert qkv_proj.scheme.group_size == (-1 if group is None else group)
assert qkv_proj.weight_packed.dtype is torch.int32
assert qkv_proj.weight_scale.dtype is torch.float16
assert qkv_proj.weight_packed.pack_factor == pack_factor
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
+ assert output
+
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
@@ -120,8 +127,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24)
assert qkv_proj.weight_packed.dtype is torch.int32
- sampling_params = SamplingParams()
- output = llm.generate("Hello world!", sampling_params=sampling_params)
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output
@@ -142,6 +148,5 @@ def test_compressed_tensors_fp8(vllm_runner):
assert len(qkv_proj.input_scale.shape) == 0
assert len(qkv_proj.weight_scale.shape) == 0
- sampling_params = SamplingParams()
- output = llm.generate("Hello world!", sampling_params=sampling_params)
+ output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 02a953da04659..f7bcd4c855799 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -11,7 +11,8 @@
@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype",
+ ["float"]) # needed for comparing logprobs with HF
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
@pytest.mark.parametrize("num_top_logprobs", [6]) # 32000 == vocab_size
@pytest.mark.parametrize("detokenize", [True, False])
diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index 29ed96999cb4c..1f3219593f96b 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -70,14 +70,17 @@ def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int,
if queue_size < disable_by_batch_size:
# Should raise exception when executing the mocked draft model.
with pytest.raises(ValueError, match=exception_secret):
- proposer.get_spec_proposals(execute_model_req=ExecuteModelRequest(
- seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=k), )
+ proposer.get_spec_proposals(
+ execute_model_req=ExecuteModelRequest(
+ seq_group_metadata_list=seq_group_metadata_list,
+ num_lookahead_slots=k),
+ seq_ids_with_bonus_token_in_last_step=set())
else:
# Should not execute the draft model because spec decode is disabled
# for all requests. Accordingly, the proposal length should be 0.
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=k), )
+ num_lookahead_slots=k),
+ seq_ids_with_bonus_token_in_last_step=set())
assert proposals.proposal_lens.tolist() == [0] * batch_size
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index 7744b2640fe94..9832d4f267e8a 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -118,7 +118,8 @@ def test_same_output_for_single_step():
actual_output, _ = multi_step_worker.sampler_output(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=multi_step_seq_group),
- sample_len=num_steps)
+ sample_len=num_steps,
+ seq_ids_with_bonus_token_in_last_step=set())
assert len(actual_output) == num_steps
actual_output = actual_output[0]
@@ -210,7 +211,8 @@ def test_same_output_for_multi_step():
multi_step_output, _ = multi_step_worker.sampler_output(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list),
- sample_len=num_steps)
+ sample_len=num_steps,
+ seq_ids_with_bonus_token_in_last_step=set())
# Run single-step repeatedly.
zero_kv_cache(worker.cache_engine)
@@ -277,6 +279,203 @@ def test_same_output_for_multi_step():
single_step_logprobs)
+@torch.inference_mode()
+def test_multi_step_with_batch_expansion_correct_output():
+ """
+ Verify that the MultiStepWorker handles bonus tokens correctly: if a
+ sequence has a bonus token, the worker expands the batch by adding new
+ sequences corresponding to the sequences with bonus tokens, and the
+ expanded batch is then used for predicting the next tokens.
+ """
+ seed = 100
+ model_name = 'JackFram/llama-68m'
+
+ block_size = 16
+ num_gpu_blocks = 2048 // block_size
+ batch_size = 128
+ multi_step_worker = create_worker(
+ MultiStepWorker,
+ model_name,
+ block_size,
+ num_gpu_blocks,
+ seed,
+ model_runner_cls=TP1DraftModelRunner,
+ )
+ worker = create_worker(
+ Worker,
+ model_name,
+ block_size,
+ num_gpu_blocks,
+ seed,
+ )
+ random.seed(seed)
+ prompts = [[0] for _ in range(batch_size)]
+ num_steps = 2
+ final_prompt_lens = [(num_steps + 1) for prompt in prompts]
+ rand_seeds = list(random.randint(0, 100) for _ in range(num_steps))
+ multi_step_worker.execute_model = patch_execute_model_with_seeds(
+ multi_step_worker, rand_seeds)
+ worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds)
+ # Create the test continuations
+ continuations = [[random.randint(0, 1000)] for _ in prompts]
+ seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+ prompts,
+ num_gpu_blocks,
+ block_size,
+ continuations=continuations,
+ final_prompt_lens=final_prompt_lens)
+
+ # Run single-step twice to generate 2 tokens. This
+ # will simulate the bonus token case with the second token
+ # being the bonus token.
+ zero_kv_cache(worker.cache_engine)
+ single_step_output: List[SamplerOutput] = []
+ set_random_seed(seed)
+ for _ in range(num_steps):
+ seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+ prompts,
+ num_gpu_blocks,
+ block_size,
+ continuations=continuations,
+ final_prompt_lens=final_prompt_lens)
+ single_step_output.extend(
+ worker.execute_model(execute_model_req=ExecuteModelRequest(
+ seq_group_metadata_list=seq_group_metadata_list)))
+ # Append output tokens to new sequence data.
+ for i, seq_group_output in enumerate(single_step_output[-1]):
+ continuations[i].append(seq_group_output.samples[0].output_token)
+
+ # Create continuations for the MultiStepWorker. The continuations have
+ # 2 tokens in order to simulate the bonus token case.
+ multi_step_continuations = []
+ for continuation in continuations:
+ multi_step_continuations.append(continuation[:2])
+ seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+ prompts,
+ num_gpu_blocks,
+ block_size,
+ continuations=multi_step_continuations,
+ final_prompt_lens=final_prompt_lens)
+
+ # Run multi-step and verify that the third token prediction is accurate
+ # for all sequences.
+ zero_kv_cache(multi_step_worker.cache_engine)
+ all_seq_ids = {i for i in range(batch_size)}
+ multi_step_output, _ = multi_step_worker.sampler_output(
+ execute_model_req=ExecuteModelRequest(
+ seq_group_metadata_list=seq_group_metadata_list),
+ sample_len=1,
+ seq_ids_with_bonus_token_in_last_step=all_seq_ids)
+ for index, output in enumerate(multi_step_output[-1].outputs):
+ assert (continuations[index][-1] == output.samples[0].output_token)
+
+
+@torch.inference_mode()
+def test_multi_step_with_batch_expansion_incorrect_output():
+ """
+ Test the MultiStepWorker's handling of batch expansion with bonus tokens
+ in the negative case. The MultiStepWorker is given a batch containing
+ sequences with bonus tokens, but the set of sequence IDs with bonus tokens
+ is specified incorrectly. Verify that the worker generates correct tokens
+ for the sequences whose IDs are specified correctly and incorrect tokens
+ for those whose IDs are specified incorrectly.
+ """
+ seed = 100
+ model_name = 'JackFram/llama-68m'
+
+ block_size = 16
+ num_gpu_blocks = 2048 // block_size
+ batch_size = 128
+ multi_step_worker = create_worker(
+ MultiStepWorker,
+ model_name,
+ block_size,
+ num_gpu_blocks,
+ seed,
+ model_runner_cls=TP1DraftModelRunner,
+ )
+ worker = create_worker(
+ Worker,
+ model_name,
+ block_size,
+ num_gpu_blocks,
+ seed,
+ )
+ random.seed(seed)
+ prompts = [[0] for _ in range(batch_size)]
+ num_steps = 2
+ final_prompt_lens = [(num_steps + 1) for prompt in prompts]
+ rand_seeds = list(random.randint(0, 100) for _ in range(num_steps))
+ multi_step_worker.execute_model = patch_execute_model_with_seeds(
+ multi_step_worker, rand_seeds)
+ worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds)
+ # Create the test continuations
+ continuations = [[random.randint(0, 1000)] for _ in prompts]
+ seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+ prompts,
+ num_gpu_blocks,
+ block_size,
+ continuations=continuations,
+ final_prompt_lens=final_prompt_lens)
+ # Run single-step twice to generate 2 tokens. This
+ # will simulate the bonus token case with the second token
+ # being the bonus token.
+ zero_kv_cache(worker.cache_engine)
+ single_step_output: List[SamplerOutput] = []
+ set_random_seed(seed)
+ for _ in range(num_steps):
+ seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+ prompts,
+ num_gpu_blocks,
+ block_size,
+ continuations=continuations,
+ final_prompt_lens=final_prompt_lens)
+ single_step_output.extend(
+ worker.execute_model(execute_model_req=ExecuteModelRequest(
+ seq_group_metadata_list=seq_group_metadata_list)))
+ # Append output tokens to new sequence data.
+ for i, seq_group_output in enumerate(single_step_output[-1]):
+ continuations[i].append(seq_group_output.samples[0].output_token)
+
+ # Create continuations for the MultiStepWorker. The continuations have
+ # 2 tokens in order to simulate the bonus token case.
+ multi_step_continuations = []
+ for continuation in continuations:
+ multi_step_continuations.append(continuation[:2])
+ seq_group_metadata_list = create_seq_group_metadata_from_prompts(
+ prompts,
+ num_gpu_blocks,
+ block_size,
+ continuations=multi_step_continuations,
+ final_prompt_lens=final_prompt_lens)
+
+ # Run multi-step. In this run, INCORRECTLY specify that only the
+ # odd-numbered sequences have bonus tokens. Verify that with this setting
+ # the third-token prediction is accurate for the odd-numbered sequences
+ # and wrong for at least some of the even-numbered sequences.
+ zero_kv_cache(multi_step_worker.cache_engine)
+ set_random_seed(seed)
+ odd_seq_ids = {i for i in range(batch_size) if i % 2 != 0}
+ multi_step_output, _ = multi_step_worker.sampler_output(
+ execute_model_req=ExecuteModelRequest(
+ seq_group_metadata_list=seq_group_metadata_list),
+ sample_len=1,
+ seq_ids_with_bonus_token_in_last_step=odd_seq_ids)
+ num_mismatch = 0
+ for index, output in enumerate(multi_step_output[-1].outputs):
+ if (index % 2) != 0:
+ assert (continuations[index][-1] == output.samples[0].output_token)
+ elif (continuations[index][-1] != output.samples[0].output_token):
+ num_mismatch += 1
+ # The prediction is accurate for some of the sequences even without proper
+ # handling of the bonus tokens. Hence verify that the number of sequences
+ # for which there is a mismatch is > 0.
+ assert (num_mismatch > 0)
+
+
@torch.inference_mode()
def test_draft_proposals_full_speculation_len():
"""Verify Top1Proposer correctly handles case where all sequences
@@ -318,7 +517,8 @@ def test_draft_proposals_full_speculation_len():
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=k), )
+ num_lookahead_slots=k),
+ seq_ids_with_bonus_token_in_last_step=set())
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
@@ -356,7 +556,8 @@ def test_draft_proposals_no_speculations():
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=k), )
+ num_lookahead_slots=k),
+ seq_ids_with_bonus_token_in_last_step=set())
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
@@ -428,7 +629,8 @@ def test_draft_proposals_mixed_k():
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=k), )
+ num_lookahead_slots=k),
+ seq_ids_with_bonus_token_in_last_step=set())
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index b1537884f896e..3995f87898afb 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -53,7 +53,8 @@ def test_ngram_algo_correctness_for_single_no_match():
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=proposal_len), )
+ num_lookahead_slots=proposal_len),
+ seq_ids_with_bonus_token_in_last_step=None)
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
@@ -121,7 +122,8 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=proposal_len), )
+ num_lookahead_slots=proposal_len),
+ seq_ids_with_bonus_token_in_last_step=None)
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
@@ -193,7 +195,8 @@ def test_ngram_algo_correctness_for_batches_match_all():
proposals = proposer.get_spec_proposals(
execute_model_req=ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
- num_lookahead_slots=proposal_len), )
+ num_lookahead_slots=proposal_len),
+ seq_ids_with_bonus_token_in_last_step=None)
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 527e7eddd7e33..0baac32042ef9 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,6 +1,7 @@
import random
+from collections import defaultdict
from types import SimpleNamespace
-from typing import Dict, List
+from typing import Dict, List, Set
from unittest.mock import MagicMock
import pytest
@@ -377,8 +378,10 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool,
set_random_seed(1)
- worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
- metrics_collector)
+ worker = SpecDecodeWorker(draft_worker,
+ target_worker,
+ spec_decode_sampler,
+ metrics_collector=metrics_collector)
worker.init_device()
proposal_token_ids = torch.randint(low=0,
@@ -554,7 +557,6 @@ def test_init_device(acceptance_sampler_method: str):
worker = SpecDecodeWorker(draft_worker, target_worker, spec_decode_sampler,
metrics_collector)
-
worker.init_device()
draft_worker.init_device.assert_called_once()
@@ -645,3 +647,140 @@ def test_split_num_cache_blocks_evenly(available_gpu_blocks: int,
assert (num_blocks * target_cache_block_size_bytes) + (
num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks *
target_cache_block_size_bytes)
+
+
+@torch.inference_mode()
+def test_populate_seq_ids_with_bonus_tokens():
+ """
+ Verify that a call to _create_output_sampler_list correctly updates
+ seq_with_bonus_token_in_last_step.
+
+ seq_with_bonus_token_in_last_step is an internal data structure in
+ SpecDecodeWorker that tracks the sequence IDs which are assigned bonus
+ tokens by the target model in their last forward pass. This state is
+ maintained only for models relying on the KV cache, such as those using
+ the MultiStepWorker.
+ """
+ batch_size = 10
+ k = 5
+ vocab_size = 10000
+ num_sequences_with_bonus_tokens = 5
+ target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
+ metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+ target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+ target_worker.device = 'cuda'
+
+ set_random_seed(1)
+ draft_worker = mock_worker(cls=MultiStepWorker)
+ draft_worker.device = 'cuda'
+ # The sequence_ids attached to each sequence in the batch.
+ # The sequence at index i has seq_id assigned_seq_ids[i]
+ assigned_seq_ids = list(range(batch_size))
+ seq_group_metadata_list, _, _ = create_batch(batch_size,
+ k,
+ seq_ids=assigned_seq_ids,
+ prev_output_token_len=10)
+ target_token_logprobs = torch.rand(batch_size, (k + 1),
+ vocab_size,
+ dtype=torch.float32,
+ device='cuda')
+ accepted_token_ids = torch.randint(low=0,
+ high=vocab_size,
+ size=(batch_size, (k + 1)),
+ dtype=torch.int64,
+ device='cuda')
+ expected_request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set)
+ for seq_group_metadata in seq_group_metadata_list:
+ for seq_id in seq_group_metadata.seq_data:
+ expected_request_id_seq_ids_mapping[
+ seq_group_metadata.request_id].add(seq_id)
+ # Generate a random sample of sequence indexes with bonus tokens
+ seq_indexes_with_bonus_tokens = random.sample(
+ range(batch_size), num_sequences_with_bonus_tokens)
+ # Create a mask that is False for indices in seq_indexes_with_bonus_tokens
+ # and True everywhere else.
+ mask = torch.ones(batch_size, dtype=torch.bool, device='cuda')
+ mask[seq_indexes_with_bonus_tokens] = False
+ # Set the last token ID to -1 for all indices not in
+ # seq_indexes_with_bonus_tokens to indicate the lack of bonus token in
+ # those indices.
+ accepted_token_ids[mask, -1:] = -1
+ worker = SpecDecodeWorker(draft_worker,
+ target_worker,
+ mock_spec_decode_sampler("rejection_sampler"),
+ metrics_collector=metrics_collector)
+ # Initialize _seq_with_bonus_token_in_last_step with a set of sequence IDs.
+ # This set includes all sequence IDs in the batch as well as an additional
+ # `num_extra_sequence_ids` sequence IDs. Note that the sequence IDs are in
+ # the range [0, batch_size + num_extra_sequence_ids).
+ num_extra_sequence_ids = 10
+ worker._seq_with_bonus_token_in_last_step = set(
+ range(batch_size + num_extra_sequence_ids))
+ worker._create_output_sampler_list(
+ seq_group_metadata_list=seq_group_metadata_list,
+ accepted_token_ids=accepted_token_ids,
+ target_logprobs=target_token_logprobs,
+ k=k)
+ # Verify that _seq_with_bonus_token_in_last_step contains the following:
+ # 1. Sequence IDs that were already present in
+ # _seq_with_bonus_token_in_last_step but were not part of the current
+ # batch are retained.
+ # 2. Of the sequence IDs present in the current batch, only those with a
+ # bonus token are retained in _seq_with_bonus_token_in_last_step.
+ # Sequence IDs that are present in the current batch but do not have
+ # bonus tokens are removed from _seq_with_bonus_token_in_last_step.
+ expected_seq_ids_with_bonus_tokens = \
+ set([assigned_seq_ids[i] for i in seq_indexes_with_bonus_tokens])
+ additional_sequence_ids = \
+ set(range(batch_size, batch_size + num_extra_sequence_ids))
+ assert worker._seq_with_bonus_token_in_last_step == \
+ expected_seq_ids_with_bonus_tokens.union(additional_sequence_ids)
+ assert worker._request_id_seq_id_mapping == \
+ expected_request_id_seq_ids_mapping
+
+
+@torch.inference_mode()
+def test_handle_finished_requests():
+ """
+ Test to verify that finished request IDs are appropriately processed to
+ update the internal state of the SpecDecodeWorker.
+
+ This test initializes the SpecDecodeWorker with mock data, marks certain
+ requests as finished, and ensures that the corresponding sequence IDs are
+ correctly removed from the internal mappings.
+ """
+ batch_size = 32
+ k = 3
+ draft_worker = mock_worker(cls=MultiStepWorker)
+ target_worker = mock_worker()
+ metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+ worker = SpecDecodeWorker(draft_worker, target_worker,
+ mock_spec_decode_sampler("rejection_sampler"),
+ metrics_collector)
+ # Initialize the _request_id_seq_id_mapping dict with a few fake
+ # request ids and corresponding sequence ids.
+ worker._request_id_seq_id_mapping = \
+ {'request-1': {1,2,3}, 'request-2': {4,5,6,7},
+ 'request-3': {8,9}, 'request-4': {10,11}}
+ # Initialize seq_with_bonus_token_in_last_step with a few fake
+ # sequence ids.
+ worker._seq_with_bonus_token_in_last_step = {1, 4, 5, 8, 9, 10}
+ exception_secret = 'artificial stop'
+ draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret)
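+ # The draft worker raises immediately so that execute_model stops right
+ # after the finished-request bookkeeping, leaving the worker's internal
+ # state available for inspection.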
+
+ seq_group_metadata_list, _, _ = create_batch(batch_size, k)
+ # Mark requests with ids request-1 and request-3 as finished.
+ execute_model_req = ExecuteModelRequest(
+ seq_group_metadata_list=seq_group_metadata_list,
+ num_lookahead_slots=k,
+ finished_requests_ids=['request-1', 'request-3'])
+
+ with pytest.raises(ValueError, match=exception_secret):
+ worker.execute_model(execute_model_req=execute_model_req)
+ # Verify that request-1 and request-3 are removed from
+ # request_id_seq_id_mapping
+ assert worker._request_id_seq_id_mapping == \
+ {'request-2': {4,5,6,7}, 'request-4': {10,11}}
+ # Verify that all sequence ids corresponding to 'request-1'
+ # and 'request-3' are removed from seq_with_bonus_token_in_last_step.
+ assert worker._seq_with_bonus_token_in_last_step == \
+ {4,5,10}
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index b2ebcc15cd0fc..a43f9132585b5 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -6,7 +6,6 @@
import openai
import pytest
-import ray
import torch
from tensorizer import EncryptionParams
@@ -22,7 +21,7 @@
tensorize_vllm_model)
from ..conftest import VllmRunner, cleanup
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer
# yapf conflicts with isort for this docstring
@@ -220,23 +219,21 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
json.dumps(model_loader_extra_config),
]
- ray.init(runtime_env={"working_dir": VLLM_PATH})
+ with RemoteOpenAIServer(openai_args) as server:
+ print("Server ready.")
- server = RemoteOpenAIServer(openai_args)
- print("Server ready.")
+ client = server.get_client()
+ completion = client.completions.create(model=model_ref,
+ prompt="Hello, my name is",
+ max_tokens=5,
+ temperature=0.0)
- client = server.get_client()
- completion = client.completions.create(model=model_ref,
- prompt="Hello, my name is",
- max_tokens=5,
- temperature=0.0)
-
- assert completion.id is not None
- assert len(completion.choices) == 1
- assert len(completion.choices[0].text) >= 5
- assert completion.choices[0].finish_reason == "length"
- assert completion.usage == openai.types.CompletionUsage(
- completion_tokens=5, prompt_tokens=6, total_tokens=11)
+ assert completion.id is not None
+ assert len(completion.choices) == 1
+ assert len(completion.choices[0].text) >= 5
+ assert completion.choices[0].finish_reason == "length"
+ assert completion.usage == openai.types.CompletionUsage(
+ completion_tokens=5, prompt_tokens=6, total_tokens=11)
def test_raise_value_error_on_invalid_load_format(vllm_runner):
@@ -282,7 +279,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
base_model.model.llm_engine.model_executor.shutdown()
del base_model
cleanup()
- ray.shutdown()
# load model with two shards and serialize with encryption
model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -305,7 +301,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner,
assert os.path.isfile(model_path % 0), "Serialization subprocess failed"
assert os.path.isfile(model_path % 1), "Serialization subprocess failed"
cleanup()
- ray.shutdown()
loaded_vllm_model = vllm_runner(
model_ref,
diff --git a/tests/test_embedded_commit.py b/tests/test_embedded_commit.py
new file mode 100644
index 0000000000000..17b01651e39af
--- /dev/null
+++ b/tests/test_embedded_commit.py
@@ -0,0 +1,7 @@
+import vllm
+
+
+def test_embedded_commit_defined():
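+ # The packaging step is expected to replace this placeholder with the
+ # actual git commit hash.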
+ assert vllm.__commit__ != "COMMIT_HASH_PLACEHOLDER"
+ # 7 characters is the length of a short commit hash
+ assert len(vllm.__commit__) >= 7
diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py
index 12e5ae85adea6..f4551ed42efb8 100644
--- a/tests/tokenization/test_detokenize.py
+++ b/tests/tokenization/test_detokenize.py
@@ -1,4 +1,4 @@
-from typing import Dict, List
+from typing import Any, Dict, List, Optional
import pytest
from transformers import AutoTokenizer
@@ -139,6 +139,15 @@ def create_dummy_logprobs(
} for token_id in complete_sequence_token_ids]
+def create_dummy_prompt_logprobs(
+ complete_sequence_token_ids: List[int]
+) -> List[Optional[Dict[int, Any]]]:
+ # logprob for the first prompt token is None.
+ logprobs: List[Optional[Dict[int, Any]]] = [None]
+ logprobs.extend(create_dummy_logprobs(complete_sequence_token_ids)[1:])
+ return logprobs
+
+
@pytest.mark.parametrize("complete_sequence", TRUTH)
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
@pytest.mark.parametrize("skip_special_tokens", [True, False])
@@ -177,13 +186,10 @@ def test_decode_sequence_logprobs(complete_sequence: str,
@pytest.mark.parametrize("complete_sequence", TRUTH)
@pytest.mark.parametrize("tokenizer_name", TOKENIZERS)
-@pytest.mark.parametrize("skip_special_tokens", [True])
-def test_decode_prompt_logprobs(complete_sequence: str,
- complete_sequence_token_ids: List[int],
- detokenizer: Detokenizer,
- skip_special_tokens: bool):
+def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int],
+ detokenizer: Detokenizer):
"""Verify Detokenizer decodes prompt logprobs correctly."""
- sampling_params = SamplingParams(skip_special_tokens=skip_special_tokens,
+ sampling_params = SamplingParams(skip_special_tokens=True,
prompt_logprobs=1)
# Run sequentially.
@@ -192,19 +198,78 @@ def test_decode_prompt_logprobs(complete_sequence: str,
seqs=[seq],
sampling_params=sampling_params,
arrival_time=0.0)
- dummy_logprobs = create_dummy_logprobs(complete_sequence_token_ids)
- detokenizer.decode_prompt_logprobs_inplace(seq_group, dummy_logprobs)
- decoded_prompt_logprobs = dummy_logprobs
+ dummy_logprobs = create_dummy_prompt_logprobs(complete_sequence_token_ids)
+ detokenizer.decode_prompt_logprobs_inplace(seq_group,
+ dummy_logprobs,
+ position_offset=0)
+ # First logprob is None.
+ decoded_prompt_logprobs: List[Dict[int, Any]] = dummy_logprobs[
+ 1:] # type: ignore
- if skip_special_tokens:
- # Text for logprobs for the chosen token should be the same as the
- # prompt text. Note that this will only be true if we skip
- # special tokens.
- assert complete_sequence == "".join([
- logprobs[token_id].decoded_token for token_id, logprobs in zip(
- complete_sequence_token_ids, decoded_prompt_logprobs)
- ])
- assert complete_sequence != "".join([
- logprobs[token_id + 1].decoded_token for token_id, logprobs in zip(
- complete_sequence_token_ids, decoded_prompt_logprobs)
- ])
+ # decoded_prompt_logprobs doesn't contain the first token.
+ token_ids = complete_sequence_token_ids
+ tokenizer = detokenizer.get_tokenizer_for_seq(seq)
+ text_full = tokenizer.decode(token_ids, skip_special_tokens=True)
+ text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True)
+ text = text_full[len(text_first):]
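+ # `text` is the full prompt text with the first token's text stripped,
+ # mirroring decoded_prompt_logprobs, which starts at the second token.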
+
+ # Text for logprobs for the chosen token should be the same as the
+ # prompt text. Note that the first logprob is None.
+ assert text == "".join([
+ logprobs[token_id].decoded_token
+ for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
+ ])
+ assert text != "".join([
+ logprobs[token_id + 1].decoded_token
+ for token_id, logprobs in zip(token_ids[1:], decoded_prompt_logprobs)
+ ])
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 7, 16, -1])
+def test_decode_prompt_logprobs_chunked_prefill(
+ vllm_runner,
+ model,
+ chunked_prefill_token_size: int,
+ example_prompts,
+):
+ max_num_seqs = 256
+ enable_chunked_prefill = False
+ max_num_batched_tokens = None
+ if chunked_prefill_token_size != -1:
+ enable_chunked_prefill = True
+ max_num_seqs = min(chunked_prefill_token_size, max_num_seqs)
+ max_num_batched_tokens = chunked_prefill_token_size
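+ # A small token budget forces each prompt's prefill to be split across
+ # multiple scheduler steps, exercising the chunked-prefill path.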
+
+ with vllm_runner(model,
+ dtype="half",
+ max_logprobs=5,
+ gpu_memory_utilization=0.5,
+ enable_chunked_prefill=enable_chunked_prefill,
+ max_num_batched_tokens=max_num_batched_tokens,
+ max_num_seqs=max_num_seqs) as vllm_model:
+
+ vllm_sampling_params = SamplingParams(max_tokens=10,
+ logprobs=5,
+ prompt_logprobs=5,
+ temperature=0.0)
+ vllm_results = vllm_model.model.generate(
+ example_prompts, sampling_params=vllm_sampling_params)
+
+ for idx, result in enumerate(vllm_results):
+ assert result.prompt_logprobs is not None
+ assert result.prompt_logprobs[0] is None
+
+ # Compare the detokenized prompt token ids to the original prompt.
+ generated_string = ""
+ for (prompt_token,
+ prompt_logprobs) in zip(result.prompt_token_ids[1:],
+ result.prompt_logprobs[1:]):
+ # prompt_logprobs is a dict mapping token_id -> logprob. Select the
+ # entry for the actual prompt token and append its decoded text to
+ # rebuild the prompt string.
+ generated_string += prompt_logprobs[prompt_token].decoded_token
+
+ assert generated_string == example_prompts[idx], (
+ "Detokenized prompt logprobs do not match original prompt")
diff --git a/tests/utils.py b/tests/utils.py
index ad4d097b0e8ed..8780d45a31b29 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -14,7 +14,7 @@
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.entrypoints.openai.cli_args import make_arg_parser
-from vllm.utils import get_open_port, is_hip
+from vllm.utils import FlexibleArgumentParser, get_open_port, is_hip
if is_hip():
from amdsmi import (amdsmi_get_gpu_vram_usage,
@@ -49,53 +49,7 @@ class RemoteOpenAIServer:
DUMMY_API_KEY = "token-abc123" # vLLM's OpenAI server does not need API key
MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds
- class _RemoteRunner:
-
- def __init__(self, cli_args: List[str], *, wait_url: str,
- wait_timeout: float) -> None:
- env = os.environ.copy()
- env["PYTHONUNBUFFERED"] = "1"
- self.proc = subprocess.Popen(
- [
- sys.executable, "-m", "vllm.entrypoints.openai.api_server",
- *cli_args
- ],
- env=env,
- stdout=sys.stdout,
- stderr=sys.stderr,
- )
-
- self._wait_for_server(url=wait_url, timeout=wait_timeout)
-
- def ready(self):
- return True
-
- def _wait_for_server(self, *, url: str, timeout: float):
- # run health check
- start = time.time()
- while True:
- try:
- if requests.get(url).status_code == 200:
- break
- except Exception as err:
- if self.proc.poll() is not None:
- raise RuntimeError(
- "Server exited unexpectedly.") from err
-
- time.sleep(0.5)
- if time.time() - start > timeout:
- raise RuntimeError(
- "Server failed to start in time.") from err
-
- def __del__(self):
- if hasattr(self, "proc"):
- self.proc.terminate()
-
- def __init__(self,
- cli_args: List[str],
- *,
- auto_port: bool = True,
- num_gpus: int = 1) -> None:
+ def __init__(self, cli_args: List[str], *, auto_port: bool = True) -> None:
if auto_port:
if "-p" in cli_args or "--port" in cli_args:
raise ValueError("You have manually specified the port"
@@ -103,18 +57,48 @@ def __init__(self,
cli_args = cli_args + ["--port", str(get_open_port())]
- parser = make_arg_parser()
+ parser = FlexibleArgumentParser(
+ description="vLLM's remote OpenAI server.")
+ parser = make_arg_parser(parser)
args = parser.parse_args(cli_args)
self.host = str(args.host or 'localhost')
self.port = int(args.port)
- self._runner = ray.remote(num_gpus=num_gpus)(
- self._RemoteRunner).remote(
- cli_args,
- wait_url=self.url_for("health"),
- wait_timeout=self.MAX_SERVER_START_WAIT_S)
-
- self._wait_until_ready()
+ env = os.environ.copy()
+ # the current process might initialize cuda,
+ # to be safe, we should use spawn method
+ env['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
+ self.proc = subprocess.Popen(
+ [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
+ cli_args,
+ env=env,
+ stdout=sys.stdout,
+ stderr=sys.stderr)
+ self._wait_for_server(url=self.url_for("health"),
+ timeout=self.MAX_SERVER_START_WAIT_S)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ self.proc.terminate()
+
+ def _wait_for_server(self, *, url: str, timeout: float):
+ # run health check
+ start = time.time()
+ while True:
+ try:
+ if requests.get(url).status_code == 200:
+ break
+ except Exception as err:
+ result = self.proc.poll()
+ if result is not None and result != 0:
+ raise RuntimeError("Server exited unexpectedly.") from err
+
+ time.sleep(0.5)
+ if time.time() - start > timeout:
+ raise RuntimeError(
+ "Server failed to start in time.") from err
@property
def url_root(self) -> str:
@@ -123,9 +107,6 @@ def url_root(self) -> str:
def url_for(self, *parts: str) -> str:
return self.url_root + "/" + "/".join(parts)
- def _wait_until_ready(self) -> None:
- ray.get(self._runner.ready.remote())
-
def get_client(self):
return openai.OpenAI(
base_url=self.url_for("v1"),
diff --git a/vllm/__init__.py b/vllm/__init__.py
index e217059873bf5..318f078fdbee7 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -12,9 +12,10 @@
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
-from .version import __version__
+from .version import __commit__, __version__
__all__ = [
+ "__commit__",
"__version__",
"LLM",
"ModelRegistry",
diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py
index 7a6954ceb6d6a..c45f7b28b2afb 100644
--- a/vllm/attention/backends/pallas.py
+++ b/vllm/attention/backends/pallas.py
@@ -116,7 +116,7 @@ def __init__(
self.megacore_mode = None
tpu_type = torch_xla.tpu.get_tpu_env()["TYPE"].lower()
- if not tpu_type.endswith("lite"):
+ if "lite" not in tpu_type:
if self.num_kv_heads % 2 == 0:
self.megacore_mode = "kv_head"
else:
diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py
index b1808970d7939..78d752230d6e7 100644
--- a/vllm/attention/ops/blocksparse_attention/utils.py
+++ b/vllm/attention/ops/blocksparse_attention/utils.py
@@ -4,16 +4,35 @@
from functools import lru_cache
+import numpy as np
import torch
import triton
-try:
- from scipy import sparse
-except ImportError as err:
- raise ImportError("Please install scipy via "
- "`pip install scipy` to use "
- "BlockSparseAttention in "
- "models such as Phi-3.") from err
+
+class csr_matrix:
+ """Simple implementation of CSR matrix conversion without scipy.
+ This replaced scipy.sparse.csr_matrix() previously used."""
+
+ def __init__(self, input_array):
+ if not isinstance(input_array, np.ndarray):
+ raise ValueError("Input must be a NumPy array")
+
+ self.shape = input_array.shape
+ rows, cols = self.shape
+ data = []
+ indices = []
+ indptr = [0]
+
+ for i in range(rows):
+ for j in range(cols):
+ if input_array[i, j]:
+ data.append(input_array[i, j])
+ indices.append(j)
+ indptr.append(len(indices))
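+ # After row i is processed, indptr[i + 1] holds the cumulative count
+ # of non-zero entries, matching scipy's CSR indptr layout.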
+
+ self.data = np.array(data)
+ self.indices = np.array(indices)
+ self.indptr = np.array(indptr)
def dense_to_crow_col(x: torch.Tensor):
@@ -26,7 +45,7 @@ def dense_to_crow_col(x: torch.Tensor):
assert x.dim() in (2, 3)
if x.dim() == 2:
x = x[None]
- x = [sparse.csr_matrix(xi.bool().cpu().numpy()) for xi in x]
+ x = [csr_matrix(xi.bool().cpu().numpy()) for xi in x]
crows = torch.vstack([torch.from_numpy(xi.indptr) for xi in x])
cols = [torch.from_numpy(xi.indices) for xi in x]
max_cols = max(len(xi) for xi in cols)
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 4cd4976ade729..4577d84db18ac 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -687,6 +687,12 @@ def context_attention_fwd(q,
cap = current_platform.get_device_capability()
BLOCK = 128 if cap[0] >= 8 else 64
+
+ # Need to reduce the Triton block size when using fp32
+ # due to increased use of GPU shared memory.
+ if q.dtype is torch.float32:
+ BLOCK = BLOCK // 2
+
# shape constraints
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
assert Lq == Lk and Lk == Lv
@@ -718,7 +724,7 @@ def context_attention_fwd(q,
b_ctx_len,
alibi_slopes,
v_cache.shape[3],
- 8,
+ k_cache.shape[4],
o,
b_loc.stride(0),
b_loc.stride(1),
@@ -768,7 +774,7 @@ def context_attention_fwd(q,
b_seq_len,
b_ctx_len,
v_cache.shape[3],
- 8,
+ k_cache.shape[4],
o,
b_loc.stride(0),
b_loc.stride(1),
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index ae63eb1d48f8d..084100f6c1135 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -77,9 +77,6 @@ def get_attn_backend(
return IpexAttnBackend
elif backend == _Backend.FLASHINFER:
logger.info("Using Flashinfer backend.")
- logger.warning(("Flashinfer will be stuck on llama-2-7b,"
- " please avoid using Flashinfer as the "
- "backend when running on llama-2-7b."))
from vllm.attention.backends.flashinfer import FlashInferBackend
return FlashInferBackend
elif backend == _Backend.PALLAS:
diff --git a/vllm/config.py b/vllm/config.py
index 68ca81a2ec4fe..de7bb3943a45f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -6,7 +6,6 @@
import torch
from transformers import PretrainedConfig
-import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.model_executor.models import ModelRegistry
@@ -14,7 +13,7 @@
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
is_hip, is_neuron, is_openvino, is_tpu, is_xpu,
- print_warning_once, update_environment_variables)
+ print_warning_once)
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
@@ -138,12 +137,10 @@ def __init__(
self.quantization = quantization
self.quantization_param_path = quantization_param_path
self.enforce_eager = enforce_eager
- self.max_context_len_to_capture = max_context_len_to_capture
- if self.max_context_len_to_capture is not None:
+ if max_context_len_to_capture is not None:
raise ValueError("`max_context_len_to_capture` is deprecated. "
"Use `max_seq_len_to_capture` instead.")
- self.max_seq_len_to_capture = (max_seq_len_to_capture
- or max_context_len_to_capture)
+ self.max_seq_len_to_capture = max_seq_len_to_capture
self.max_logprobs = max_logprobs
self.disable_sliding_window = disable_sliding_window
self.skip_tokenizer_init = skip_tokenizer_init
@@ -697,12 +694,6 @@ def __init__(
self.distributed_executor_backend = backend
logger.info("Defaulting to use %s for distributed inference",
backend)
- # If CUDA_VISIBLE_DEVICES is set on ROCm prior to vLLM init,
- # propagate changes to HIP_VISIBLE_DEVICES (conversion handled by
- # the update_environment_variables function)
- if is_hip() and envs.CUDA_VISIBLE_DEVICES:
- update_environment_variables(
- {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
self._verify_args()
self.rank = 0
diff --git a/vllm/distributed/device_communicators/cuda_wrapper.py b/vllm/distributed/device_communicators/cuda_wrapper.py
index 24308235c4a48..5cac3c1d57bca 100644
--- a/vllm/distributed/device_communicators/cuda_wrapper.py
+++ b/vllm/distributed/device_communicators/cuda_wrapper.py
@@ -4,6 +4,9 @@
"""
import ctypes
+import glob
+import os
+import sys
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@@ -33,6 +36,26 @@ class Function:
argtypes: List[Any]
+def get_pytorch_default_cudart_library_path() -> str:
+ # code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa
+ lib_folder = "cuda_runtime"
+ lib_name = "libcudart.so.*[0-9]"
+ lib_path = None
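+ # Search each sys.path entry for the libcudart shipped under
+ # nvidia/cuda_runtime (installed as a PyTorch dependency).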
+ for path in sys.path:
+ nvidia_path = os.path.join(path, "nvidia")
+ if not os.path.exists(nvidia_path):
+ continue
+ candidate_lib_paths = glob.glob(
+ os.path.join(nvidia_path, lib_folder, "lib", lib_name))
+ if candidate_lib_paths and not lib_path:
+ lib_path = candidate_lib_paths[0]
+ if lib_path:
+ break
+ if not lib_path:
+ raise ValueError(f"{lib_name} not found in the system path {sys.path}")
+ return lib_path
+
+
class CudaRTLibrary:
exported_functions = [
# cudaError_t cudaSetDevice ( int device )
@@ -77,9 +100,7 @@ class CudaRTLibrary:
def __init__(self, so_file: Optional[str] = None):
if so_file is None:
- assert torch.version.cuda is not None
- major_version = torch.version.cuda.split(".")[0]
- so_file = f"libcudart.so.{major_version}"
+ so_file = get_pytorch_default_cudart_library_path()
if so_file not in CudaRTLibrary.path_to_library_cache:
lib = ctypes.CDLL(so_file)
CudaRTLibrary.path_to_library_cache[so_file] = lib
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 9b4ef48b0e47e..93bf8793dae33 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -225,11 +225,11 @@ async def step_async(
"""
seq_group_metadata_list, scheduler_outputs = self.scheduler[
virtual_engine].schedule()
- finished_requests_ids = self.scheduler[
- virtual_engine].get_and_reset_finished_requests_ids()
if not scheduler_outputs.is_empty():
# Execute the model.
+ finished_requests_ids = self.scheduler[
+ virtual_engine].get_and_reset_finished_requests_ids()
execute_model_req = ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
@@ -553,11 +553,13 @@ async def engine_step(self, virtual_engine: int) -> bool:
request_outputs = await self.engine.step_async(virtual_engine)
# Put the outputs into the corresponding streams.
+ finished = True
for request_output in request_outputs:
self._request_tracker.process_request_output(
request_output, verbose=self.log_requests)
+ finished = finished and request_output.finished
- return len(request_outputs) > 0
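+ # Keep the background loop running as long as at least one request in
+ # this step is still unfinished.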
+ return not finished
async def _engine_abort(self, request_ids: Iterable[str]):
if self.engine_use_ray:
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index b476594fc73f6..622221d2dd13e 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -284,7 +284,7 @@ def __init__(
"quantization":
model_config.quantization,
"kv_cache_dtype":
- cache_config.cache_dtype,
+ str(cache_config.cache_dtype),
# Feature flags
"enable_lora":
@@ -871,10 +871,10 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
"as performance will be severely degraded otherwise.")
seq_group_metadata_list, scheduler_outputs = self.scheduler[
0].schedule()
- finished_requests_ids = self.scheduler[
- 0].get_and_reset_finished_requests_ids()
if not scheduler_outputs.is_empty():
+ finished_requests_ids = self.scheduler[
+ 0].get_and_reset_finished_requests_ids()
execute_model_req = ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index fa672e1feda92..4851897ddef19 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -60,14 +60,23 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
assert len(outputs) == 1, ("Single step should only have 1 output.")
output = outputs[0]
prompt_logprobs = output.prompt_logprobs
+
+ # If this is the first (or only) "chunk" of the prefill, we need
+ # to prepend None to the list of prompt logprobs. The reason for this
+ # is that for N prompt tokens, the Sampler will generate N-1 total
+ # prompt logprobs during prefill since the token at idx 0 will not
+ # have a logprob associated with it.
if prompt_logprobs is not None:
+ if not seq_group.prompt_logprobs:
+ prompt_logprobs = [None] + prompt_logprobs
+ seq_group.prompt_logprobs = []
+
if seq_group.sampling_params.detokenize and self.detokenizer:
self.detokenizer.decode_prompt_logprobs_inplace(
- seq_group, prompt_logprobs)
- if not seq_group.prompt_logprobs:
- # The first prompt token's logprob is None because it doesn't
- # have tokens that are precedent.
- seq_group.prompt_logprobs = [None]
+ seq_group,
+ prompt_logprobs,
+ position_offset=len(seq_group.prompt_logprobs))
+
seq_group.prompt_logprobs.extend(prompt_logprobs)
def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 6cba356c47063..45c634b4a2991 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -8,7 +8,7 @@
import fastapi
import uvicorn
-from fastapi import Request
+from fastapi import APIRouter, Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, Response, StreamingResponse
@@ -35,10 +35,14 @@
from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext
+from vllm.utils import FlexibleArgumentParser
from vllm.version import __version__ as VLLM_VERSION
TIMEOUT_KEEP_ALIVE = 5 # seconds
+logger = init_logger(__name__)
+engine: AsyncLLMEngine
+engine_args: AsyncEngineArgs
openai_serving_chat: OpenAIServingChat
openai_serving_completion: OpenAIServingCompletion
openai_serving_embedding: OpenAIServingEmbedding
@@ -64,35 +68,23 @@ async def _force_log():
yield
-app = fastapi.FastAPI(lifespan=lifespan)
-
-
-def parse_args():
- parser = make_arg_parser()
- return parser.parse_args()
-
+router = APIRouter()
# Add prometheus asgi middleware to route /metrics requests
route = Mount("/metrics", make_asgi_app())
# Workaround for 307 Redirect for /metrics
route.path_regex = re.compile('^/metrics(?P<path>.*)$')
-app.routes.append(route)
-
-
-@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(_, exc):
- err = openai_serving_chat.create_error_response(message=str(exc))
- return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
+router.routes.append(route)
-@app.get("/health")
+@router.get("/health")
async def health() -> Response:
"""Health check."""
await openai_serving_chat.engine.check_health()
return Response(status_code=200)
-@app.post("/tokenize")
+@router.post("/tokenize")
async def tokenize(request: TokenizeRequest):
generator = await openai_serving_completion.create_tokenize(request)
if isinstance(generator, ErrorResponse):
@@ -103,7 +95,7 @@ async def tokenize(request: TokenizeRequest):
return JSONResponse(content=generator.model_dump())
-@app.post("/detokenize")
+@router.post("/detokenize")
async def detokenize(request: DetokenizeRequest):
generator = await openai_serving_completion.create_detokenize(request)
if isinstance(generator, ErrorResponse):
@@ -114,19 +106,19 @@ async def detokenize(request: DetokenizeRequest):
return JSONResponse(content=generator.model_dump())
-@app.get("/v1/models")
+@router.get("/v1/models")
async def show_available_models():
models = await openai_serving_completion.show_available_models()
return JSONResponse(content=models.model_dump())
-@app.get("/version")
+@router.get("/version")
async def show_version():
ver = {"version": VLLM_VERSION}
return JSONResponse(content=ver)
-@app.post("/v1/chat/completions")
+@router.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
generator = await openai_serving_chat.create_chat_completion(
@@ -142,7 +134,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
return JSONResponse(content=generator.model_dump())
-@app.post("/v1/completions")
+@router.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
generator = await openai_serving_completion.create_completion(
request, raw_request)
@@ -156,7 +148,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump())
-@app.post("/v1/embeddings")
+@router.post("/v1/embeddings")
async def create_embedding(request: EmbeddingRequest, raw_request: Request):
generator = await openai_serving_embedding.create_embedding(
request, raw_request)
@@ -167,8 +159,10 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
return JSONResponse(content=generator.model_dump())
-if __name__ == "__main__":
- args = parse_args()
+def build_app(args):
+ app = fastapi.FastAPI(lifespan=lifespan)
+ app.include_router(router)
+ app.root_path = args.root_path
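+ # root_path allows the server to be mounted behind a reverse proxy
+ # that strips a URL prefix.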
app.add_middleware(
CORSMiddleware,
@@ -178,6 +172,12 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
allow_headers=args.allowed_headers,
)
+ @app.exception_handler(RequestValidationError)
+ async def validation_exception_handler(_, exc):
+ err = openai_serving_chat.create_error_response(message=str(exc))
+ return JSONResponse(err.model_dump(),
+ status_code=HTTPStatus.BAD_REQUEST)
+
if token := envs.VLLM_API_KEY or args.api_key:
@app.middleware("http")
@@ -203,6 +203,12 @@ async def authentication(request: Request, call_next):
raise ValueError(f"Invalid middleware {middleware}. "
f"Must be a function or a class.")
+ return app
+
+
+def run_server(args, llm_engine=None):
+ app = build_app(args)
+
logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args)
@@ -211,10 +217,12 @@ async def authentication(request: Request, call_next):
else:
served_model_names = [args.model]
- engine_args = AsyncEngineArgs.from_cli_args(args)
+ global engine, engine_args
- engine = AsyncLLMEngine.from_engine_args(
- engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
+ engine_args = AsyncEngineArgs.from_cli_args(args)
+ engine = (llm_engine
+ if llm_engine is not None else AsyncLLMEngine.from_engine_args(
+ engine_args, usage_context=UsageContext.OPENAI_API_SERVER))
event_loop: Optional[asyncio.AbstractEventLoop]
try:
@@ -230,6 +238,10 @@ async def authentication(request: Request, call_next):
# When using single vLLM without engine_use_ray
model_config = asyncio.run(engine.get_model_config())
+ global openai_serving_chat
+ global openai_serving_completion
+ global openai_serving_embedding
+
openai_serving_chat = OpenAIServingChat(engine, model_config,
served_model_names,
args.response_role,
@@ -258,3 +270,13 @@ async def authentication(request: Request, call_next):
ssl_certfile=args.ssl_certfile,
ssl_ca_certs=args.ssl_ca_certs,
ssl_cert_reqs=args.ssl_cert_reqs)
+
+
+if __name__ == "__main__":
+ # NOTE(simon):
+ # This section should be in sync with vllm/scripts.py for CLI entrypoints.
+ parser = FlexibleArgumentParser(
+ description="vLLM OpenAI-Compatible RESTful API server.")
+ parser = make_arg_parser(parser)
+ args = parser.parse_args()
+ run_server(args)
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 81c474ecc808a..f841633b572a9 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -34,9 +34,7 @@ def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, adapter_list)
-def make_arg_parser():
- parser = FlexibleArgumentParser(
- description="vLLM OpenAI-Compatible RESTful API server.")
+def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--host",
type=nullable_str,
default=None,
@@ -133,3 +131,9 @@ def make_arg_parser():
parser = AsyncEngineArgs.add_cli_args(parser)
return parser
+
+
+def create_parser_for_docs() -> FlexibleArgumentParser:
+ parser_for_docs = FlexibleArgumentParser(
+ prog="-m vllm.entrypoints.openai.api_server")
+ return make_arg_parser(parser_for_docs)
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 881e2675cd005..b3f0aae6d002d 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -716,7 +716,7 @@ class BatchResponseData(OpenAIBaseModel):
request_id: str
# The body of the response.
- body: Union[ChatCompletionResponse, ]
+ body: Optional[ChatCompletionResponse] = None
class BatchRequestOutput(OpenAIBaseModel):
diff --git a/vllm/envs.py b/vllm/envs.py
index c624510c7ea1a..85d60f3242526 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -41,6 +41,7 @@
NVCC_THREADS: Optional[str] = None
VLLM_USE_PRECOMPILED: bool = False
VLLM_INSTALL_PUNICA_KERNELS: bool = False
+ VLLM_NO_DEPRECATION_WARNING: bool = False
CMAKE_BUILD_TYPE: Optional[str] = None
VERBOSE: bool = False
@@ -205,6 +206,7 @@
# - "FLASH_ATTN": use FlashAttention
# - "XFORMERS": use XFormers
# - "ROCM_FLASH": use ROCmFlashAttention
+ # - "FLASHINFER": use flashinfer
"VLLM_ATTENTION_BACKEND":
lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
@@ -251,6 +253,10 @@
lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
"VLLM_FUSED_MOE_CHUNK_SIZE":
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
+
+ # If set, vllm will skip the deprecation warnings.
+ "VLLM_NO_DEPRECATION_WARNING":
+ lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
}
# end-env-vars-definition
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index dcde27973f8ef..a0e248b2e1992 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,6 +9,7 @@
ResultHandler, WorkerMonitor)
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.triton_utils import maybe_set_triton_cache_manager
from vllm.utils import (cuda_device_count_stateless,
error_on_invalid_device_count_status,
get_distributed_init_method, get_open_port,
@@ -42,6 +43,10 @@ def _init_executor(self) -> None:
if "OMP_NUM_THREADS" not in os.environ:
os.environ["OMP_NUM_THREADS"] = "1"
+ # workaround for https://github.com/vllm-project/vllm/issues/6103
+ if world_size > 1:
+ maybe_set_triton_cache_manager()
+
assert world_size <= cuda_device_count_stateless(), (
"please set tensor_parallel_size to less than max local gpu count")
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 53107dada9962..6b2cb3e2403f2 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -70,6 +70,22 @@ def pin_lora(self, lora_id: int) -> bool:
def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
+ def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the Neuron backend.")
+
+ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the Neuron backend.")
+
+ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the Neuron backend.")
+
+ def list_prompt_adapters(self) -> Set[int]:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the Neuron backend.")
+
def check_health(self) -> None:
# NeuronExecutor will always be healthy as long as
# it's running.
diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py
index 697d698b4edf7..1ef37785b6d59 100644
--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
@@ -90,6 +90,22 @@ def pin_lora(self, lora_id: int) -> bool:
def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
+ def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the OPENVINO backend.")
+
+ def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the OPENVINO backend.")
+
+ def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the OPENVINO backend.")
+
+ def list_prompt_adapters(self) -> Set[int]:
+ raise NotImplementedError(
+ "Soft prompt is currently not supported by the OPENVINO backend.")
+
def check_health(self) -> None:
# OpenVINOExecutor will always be healthy as long as
# it's running.
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 6e13264aba233..388f934ef75a6 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -224,16 +224,13 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
# broadcasted to.
self.non_driver_workers: List[RayWorkerWrapper] = []
- for pp_rank in range(self.parallel_config.pipeline_parallel_size):
- for tp_rank in range(self.parallel_config.tensor_parallel_size):
- rank = (pp_rank *
- self.parallel_config.tensor_parallel_size) + tp_rank
- if rank == 0:
- pass
- elif rank % self.parallel_config.tensor_parallel_size == 0:
- self.tp_driver_workers.append(self.workers[rank - 1])
- else:
- self.non_driver_workers.append(self.workers[rank - 1])
+ for idx, rank in enumerate(worker_ranks[1:]):
+ # Skip the driver worker (worker_ranks[0], which is always rank 0).
+ if rank % self.parallel_config.tensor_parallel_size == 0:
+ self.tp_driver_workers.append(self.workers[idx])
+ else:
+ self.non_driver_workers.append(self.workers[idx])
def _driver_execute_model(
self, execute_model_req: Optional[ExecuteModelRequest]
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index a29622b7d25c3..413c0b6d0924e 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -394,14 +394,16 @@ def fused_topk(
# This is used by the Deepseek-V2 model
-def grouped_topk(
- hidden_states: torch.Tensor,
- gating_output: torch.Tensor,
- topk: int,
- renormalize: bool,
- num_expert_group: int = 0,
- topk_group: int = 0,
-):
+def grouped_topk(hidden_states: torch.Tensor,
+ gating_output: torch.Tensor,
+ topk: int,
+ renormalize: bool,
+ num_expert_group: int = 0,
+ topk_group: int = 0):
+
+ assert hidden_states.shape[0] == gating_output.shape[0], (
+ "Number of tokens mismatch")
+
scores = torch.softmax(gating_output, dim=-1)
num_token = scores.shape[0]
group_scores = scores.view(num_token, num_expert_group,
@@ -492,12 +494,14 @@ def fused_experts(hidden_states: torch.Tensor,
if tokens_in_chunk == 0:
break
- if tokens_in_chunk < CHUNK_SIZE:
- # will only happen in the last chunk
+ if tokens_in_chunk < CHUNK_SIZE and chunk > 0:
+ # Adjust the intermediate cache size and config for the last
+ # chunk. Note that in most cases we only have one chunk
+ # so the cache size and config are already set correctly and
+ # do not need to be adjusted.
intermediate_cache1 = intermediate_cache1[:tokens_in_chunk]
intermediate_cache2 = intermediate_cache2[:tokens_in_chunk]
intermediate_cache3 = intermediate_cache3[:tokens_in_chunk]
- # reload config to get better performance on the last chunk
config = get_config_func(tokens_in_chunk)
curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx]
@@ -555,6 +559,9 @@ def fused_moe(
renormalize: bool,
inplace: bool = False,
override_config: Optional[Dict[str, Any]] = None,
+ use_grouped_topk: bool = False,
+ num_expert_group: Optional[int] = None,
+ topk_group: Optional[int] = None,
use_fp8: bool = False,
w1_scale: Optional[torch.Tensor] = None,
w2_scale: Optional[torch.Tensor] = None,
@@ -577,6 +584,10 @@ def fused_moe(
Defaults to False.
- override_config (Optional[Dict[str, Any]]): Optional override
for the kernel configuration.
+ - num_expert_group: Optional[int]: additional parameter for grouped_topk
+ - topk_group: Optional[int]: additional parameter for grouped_topk
+ - use_grouped_topk: If True, use grouped_topk instead of fused_topk
+ note: the DeepSeek-V2 model uses grouped_topk
- use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
products for w1 and w2. Defaults to False.
- w1_scale (Optional[torch.Tensor]): Optional scale to be used for
@@ -590,8 +601,15 @@ def fused_moe(
# Check constraints.
assert gating_output.shape[1] == w1.shape[0], "Number of experts mismatch"
- topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
- renormalize)
+ if use_grouped_topk:
+ assert num_expert_group is not None and topk_group is not None
+ topk_weights, topk_ids = grouped_topk(hidden_states, gating_output,
+ topk, renormalize,
+ num_expert_group, topk_group)
+ else:
+ topk_weights, topk_ids = fused_topk(hidden_states, gating_output, topk,
+ renormalize)
+
return fused_experts(hidden_states,
w1,
w2,
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 73cfcd7fc85f2..7f0668601fac3 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,5 +1,5 @@
from abc import abstractmethod
-from typing import Optional
+from typing import List, Optional, Tuple
import torch
@@ -7,7 +7,7 @@
get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce)
from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe
+from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.utils import set_weight_attrs
@@ -29,11 +29,14 @@ def apply(self,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
- renormalize: bool = True) -> torch.Tensor:
+ renormalize: bool = True,
+ use_grouped_topk: bool = False,
+ num_expert_group: Optional[int] = None,
+ topk_group: Optional[int] = None) -> torch.Tensor:
raise NotImplementedError
-class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
+class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
"""MoE method without quantization."""
def create_weights(self, layer: torch.nn.Module, num_experts: int,
@@ -58,20 +61,48 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
layer.register_parameter("w2_weight", w2_weight)
set_weight_attrs(w2_weight, extra_weight_attrs)
- def apply(self,
- layer: torch.nn.Module,
- x: torch.Tensor,
- router_logits: torch.Tensor,
- top_k: int,
- renormalize: bool = True) -> torch.Tensor:
+ def apply(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ router_logits: torch.Tensor,
+ top_k: int,
+ renormalize: bool = True,
+ use_grouped_topk: bool = False,
+ num_expert_group: Optional[int] = None,
+ topk_group: Optional[int] = None,
+ ) -> torch.Tensor:
+ return self.forward(x, layer.w13_weight, layer.w2_weight,
+ router_logits, top_k, renormalize,
+ use_grouped_topk, num_expert_group, topk_group)
+ def forward_cuda(
+ self,
+ x: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ router_logits: torch.Tensor,
+ top_k: int,
+ renormalize: bool,
+ use_grouped_topk: bool,
+ num_expert_group: Optional[int],
+ topk_group: Optional[int],
+ ) -> torch.Tensor:
+ from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe
return fused_moe(x,
- layer.w13_weight,
- layer.w2_weight,
+ w1,
+ w2,
router_logits,
top_k,
renormalize=renormalize,
- inplace=True)
+ inplace=True,
+ use_grouped_topk=use_grouped_topk,
+ num_expert_group=num_expert_group,
+ topk_group=topk_group)
+
+ def forward_cpu(self, *args, **kwargs):
+ raise NotImplementedError(
+ "The CPU backend currently does not support MoE.")
class FusedMoE(torch.nn.Module):
@@ -104,6 +135,9 @@ def __init__(
params_dtype: Optional[torch.dtype] = None,
reduce_results: bool = False,
renormalize: bool = True,
+ use_grouped_topk: bool = False,
+ num_expert_group: Optional[int] = None,
+ topk_group: Optional[int] = None,
quant_config: Optional[QuantizationConfig] = None,
tp_size: Optional[int] = None,
):
@@ -119,6 +153,11 @@ def __init__(
self.intermediate_size_per_partition = intermediate_size // self.tp_size
self.reduce_results = reduce_results
self.renormalize = renormalize
+ self.use_grouped_topk = use_grouped_topk
+ if self.use_grouped_topk:
+ assert num_expert_group is not None and topk_group is not None
+ self.num_expert_group = num_expert_group
+ self.topk_group = topk_group
if quant_config is None:
self.quant_method: Optional[QuantizeMethodBase] = (
@@ -140,9 +179,8 @@ def weight_loader(self, param: torch.nn.Parameter,
shard_id: int, expert_id: int):
param_data = param.data
- # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral.
- # Follow up PR to enable fp8 for other MoE models.
- if "input_scale" in weight_name or "w2.weight_scale" in weight_name:
+ # Input scales can be loaded directly and should be equal.
+ if "input_scale" in weight_name:
if param_data[expert_id] != 1 and (param_data[expert_id] -
loaded_weight).abs() > 1e-5:
raise ValueError(
@@ -150,14 +188,21 @@ def weight_loader(self, param: torch.nn.Parameter,
f"must be equal. But got {param_data[expert_id]} "
f"vs. {loaded_weight}")
param_data[expert_id] = loaded_weight
- # FIXME(robertgshaw2-neuralmagic): Overfit to Mixtral.
- # Follow up PR to enable fp8 for other MoE models.
+ # Weight scales
elif "weight_scale" in weight_name:
- # We have to keep the weight scales of w1 and w3 because
- # we need to re-quantize w1/w3 weights after weight loading.
- assert "w1" in weight_name or "w3" in weight_name
- shard_id = 0 if "w1" in weight_name else 1
- param_data[expert_id][shard_id] = loaded_weight
+ # If we are in merged column case (gate_up_proj)
+ # shard_id 0 == gate_proj / w1
+ # shard_id 2 == up_proj / w3
+ if shard_id == 0 or shard_id == 2:
+ # We have to keep the weight scales of w1 and w3 because
+ # we need to re-quantize w1/w3 weights after weight loading.
+ idx = 0 if shard_id == 0 else 1
+ param_data[expert_id][idx] = loaded_weight
+ # If we are in the row parallel case (down_proj)
+ # shard_id 1 == down_proj / w2
+ else:
+ param_data[expert_id] = loaded_weight
+ # Weights
else:
tp_rank = get_tensor_model_parallel_rank()
shard_size = self.intermediate_size_per_partition
@@ -188,10 +233,50 @@ def forward(self, hidden_states: torch.Tensor,
x=hidden_states,
router_logits=router_logits,
top_k=self.top_k,
- renormalize=self.renormalize)
+ renormalize=self.renormalize,
+ use_grouped_topk=self.use_grouped_topk,
+ num_expert_group=self.num_expert_group,
+ topk_group=self.topk_group)
if self.reduce_results and self.tp_size > 1:
final_hidden_states = tensor_model_parallel_all_reduce(
final_hidden_states)
return final_hidden_states
+
+ @classmethod
+ def make_expert_params_mapping(
+ cls, ckpt_gate_proj_name: str, ckpt_down_proj_name: str,
+ ckpt_up_proj_name: str,
+ num_experts: int) -> List[Tuple[str, str, int, int]]:
+
+ gate_up = [ckpt_gate_proj_name, ckpt_up_proj_name]
+ gate_down_up = [
+ ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name
+ ]
+
+ return [
+ # These are the weight scales for the experts
+ # (param_name, weight_name, expert_id, shard_id)
+ ("experts.w13_scale"
+ if weight_name in gate_up else "experts.w2_scale",
+ f"experts.{expert_id}.{weight_name}.weight_scale", expert_id,
+ shard_id) for expert_id in range(num_experts)
+ for shard_id, weight_name in enumerate(gate_down_up)
+ ] + [
+ # These are the weights for the experts
+ # (param_name, weight_name, expert_id, shard_id)
+ ("experts.w13_weight"
+ if weight_name in gate_up else "experts.w2_weight",
+ f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
+ for expert_id in range(num_experts)
+ for shard_id, weight_name in enumerate(gate_down_up)
+ ] + [
+ # These are the activation (input) scales for the experts
+ # (param_name, weight_name, expert_id, shard_id)
+ ("experts.a13_scale"
+ if weight_name in gate_up else "experts.a2_scale",
+ f"experts.{expert_id}.{weight_name}.input_scale", expert_id,
+ shard_id) for expert_id in range(num_experts)
+ for shard_id, weight_name in enumerate(gate_down_up)
+ ]
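The mapping above yields (param_name, weight_name, expert_id, shard_id) tuples that a model's weight loader can match against checkpoint tensor names. A minimal sketch of how it might be consumed follows; the import path and the checkpoint projection names are assumptions for illustration, not part of this change.

    # Illustrative sketch; assumes vllm is installed and FusedMoE is exported
    # from the fused_moe package touched in this diff.
    from vllm.model_executor.layers.fused_moe import FusedMoE

    expert_params_mapping = FusedMoE.make_expert_params_mapping(
        ckpt_gate_proj_name="gate_proj",   # hypothetical checkpoint names
        ckpt_down_proj_name="down_proj",
        ckpt_up_proj_name="up_proj",
        num_experts=8)

    for param_name, weight_name, expert_id, shard_id in expert_params_mapping:
        # e.g. ("experts.w13_weight", "experts.0.gate_proj.weight", 0, 0)
        print(param_name, weight_name, expert_id, shard_id)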
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 1dda5d3740a8b..bc07d2b831862 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -99,15 +99,7 @@ def apply(self,
class UnquantizedLinearMethod(LinearMethodBase):
- """Linear method without quantization.
-
- Args:
- separate_bias_add: If true, add bias separately after matrix
- multiplication.
- """
-
- def __init__(self, separate_bias_add: bool = False):
- self.separate_bias_add = separate_bias_add
+ """Linear method without quantization."""
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
@@ -126,12 +118,8 @@ def apply(self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:
- weight = layer.weight
- if self.separate_bias_add:
- if bias is not None:
- return F.linear(x, weight) + bias
- return F.linear(x, weight)
- return F.linear(x, weight, bias)
+
+ return F.linear(x, layer.weight, bias)
class LinearBase(torch.nn.Module):
@@ -743,7 +731,6 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
param_data.copy_(loaded_weight)
def forward(self, input_):
- # Set up backprop all-reduce.
if self.input_is_parallel:
input_parallel = input_
else:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index c711fd14c668c..524b4c894b9b5 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -267,10 +267,7 @@ def apply(self,
"""
- if bias is not None:
- raise ValueError("bias is not supported for this linear method")
-
scheme = layer.scheme
if scheme is None:
raise ValueError("A scheme must be defined for each layer")
- return scheme.apply_weights(layer, x)
+ return scheme.apply_weights(layer, x, bias=bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
index 119f6cd91bb0c..3aa9130782039 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
+from typing import Optional
import torch
@@ -20,14 +21,16 @@ def create_weights(self, *args, **kwargs):
raise NotImplementedError
@abstractmethod
- def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]):
"""
Run the forward pass for the particular scheme. This is where
scheme-specific dequant/quant steps/kernels should be applied.
- :param layer: toch.nn.Module with the registered weights and
+ :param layer: torch.nn.Module with the registered weights and
other parameters relevant to the particular scheme.
:param x: input to the layer
+ :param bias: bias parameter
"""
raise NotImplementedError
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
index f5911bc3dabb5..2c7fe3e0e4114 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py
@@ -1,4 +1,4 @@
-from typing import Callable, List
+from typing import Callable, List, Optional
import torch
import torch.nn.functional as F
@@ -37,6 +37,7 @@ def create_weights(self, layer: torch.nn.Module,
layer.register_parameter("weight", weight)
set_weight_attrs(weight, {"weight_loader": weight_loader})
- def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
- weight = layer.weight
- return F.linear(x, weight)
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+
+ return F.linear(x, layer.weight, bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
index 3c07d6b6fe5c1..54bf85c096f2e 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py
@@ -118,7 +118,9 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
requires_grad=False)
layer.workspace = workspace
- def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+
qweight = layer.weight_packed
meta = layer.meta
scales = layer.scale_packed
@@ -135,4 +137,8 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
size_n, size_k)
output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+
+ if bias is not None:
+ output.add_(bias) # In-place add
+
return output
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index e70504ec51cb3..6fec5d01056d8 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,4 +1,4 @@
-from typing import Callable, List
+from typing import Callable, List, Optional
import torch
from torch.nn import Parameter
@@ -78,8 +78,11 @@ def create_weights(self, layer: torch.nn.Module,
**layer_kwargs)
layer.register_parameter("input_scale", scale)
- def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+
return apply_int8_linear(input=x,
weight=layer.weight,
weight_scale=layer.weight_scale,
- input_scale=layer.input_scale)
+ input_scale=layer.input_scale,
+ bias=bias)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index 2243260053ef5..187a3f9877ccf 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -6,9 +6,10 @@
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.gptq_marlin import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, GPTQMarlinState,
- marlin_permute_scales)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+ apply_marlin_linear, marlin_make_empty_g_idx, marlin_make_workspace,
+ marlin_permute_scales, replace_tensor, verify_marlin_supported,
+ verify_marlin_supports_shape)
from vllm.model_executor.utils import set_weight_attrs
__all__ = ["CompressedTensorsWNA16"]
@@ -22,29 +23,40 @@ def __init__(self,
num_bits: int,
group_size: Optional[int] = None):
self.num_bits = num_bits
+ self.pack_factor = 32 // self.num_bits
self.strategy = strategy
- self.group_size = group_size
- if self.strategy == "group" and self.group_size is None:
- raise ValueError(
- "group_size must be given when using strategy group")
+ self.group_size: int
+ if group_size is None:
+ if self.strategy != "channel":
+ raise ValueError(
+ "Marlin kernels require group quantization or "
+ "channelwise quantization, but found no group "
+ "size and strategy is not channelwise.")
+ self.group_size = -1
+ else:
+ self.group_size = group_size
- def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
- pass
+ # Verify supported on platform.
+ verify_marlin_supported(num_bits=self.num_bits,
+ group_size=self.group_size,
+ is_sym=True)
def create_weights(self, layer: torch.nn.Module, input_size: int,
output_partition_sizes: List[int],
input_size_per_partition: int,
params_dtype: torch.dtype, weight_loader: Callable,
**kwargs):
-
- pack_factor = 32 // self.num_bits
output_size_per_partition = sum(output_partition_sizes)
- if self.group_size is not None:
- group_size = self.group_size
- else:
- group_size = input_size
+ # If group_size is -1, we are in channelwise case.
+ group_size = input_size if self.group_size == -1 else self.group_size
+
+ verify_marlin_supports_shape(
+ output_size_per_partition=output_size_per_partition,
+ input_size_per_partition=input_size_per_partition,
+ input_size=input_size,
+ group_size=group_size)
weight_scale_dim = None
scales_and_zp_size = input_size // group_size
@@ -57,7 +69,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
weight = Parameter(
torch.empty(
output_size_per_partition,
- input_size_per_partition // pack_factor,
+ input_size_per_partition // self.pack_factor,
dtype=torch.int32,
),
requires_grad=False,
@@ -68,7 +80,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
"input_dim": 1,
"output_dim": 0,
"packed_dim": 1,
- "pack_factor": pack_factor,
+ "pack_factor": self.pack_factor,
"weight_loader": weight_loader
})
layer.register_parameter("weight_packed", weight)
@@ -103,73 +115,51 @@ def create_weights(self, layer: torch.nn.Module, input_size: int,
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
-
layer.input_size = input_size
- layer.marlin_state = GPTQMarlinState.REPACK
- layer.is_k_full = True
layer.group_size = group_size
- max_workspace_size = (
- output_size_per_partition //
- GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
-
- workspace = torch.zeros(max_workspace_size,
- dtype=torch.int,
- requires_grad=False)
- layer.workspace = workspace
-
- def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
- reshaped_x = x.reshape(-1, x.shape[-1])
-
- size_m = reshaped_x.shape[0]
- part_size_n = layer.output_size_per_partition
- part_size_k = layer.input_size_per_partition
-
- out_shape = x.shape[:-1] + (part_size_n, )
-
- if layer.marlin_state == GPTQMarlinState.REPACK:
- layer.marlin_state = GPTQMarlinState.READY
-
- # Newly generated tensors need to replace existing tensors that are
- # already registered as parameters by vLLM (and won't be freed)
- def replace_tensor(name, new_t):
- # It is important to use resize_() here since it ensures
- # the same buffer is reused
- getattr(layer, name).resize_(new_t.shape)
- getattr(layer, name).copy_(new_t)
- del new_t
-
- cur_device = layer.weight_packed.device
-
- # Reset g_idx related tensors
- layer.g_idx = Parameter(torch.empty(0,
- dtype=torch.int,
- device=cur_device),
- requires_grad=False)
- layer.g_idx_sort_indices = Parameter(torch.empty(
- 0, dtype=torch.int, device=cur_device),
- requires_grad=False)
-
- # Repack weights
- marlin_qweight = ops.gptq_marlin_repack(
- layer.weight_packed.t().contiguous(), layer.g_idx_sort_indices,
- part_size_k, part_size_n, self.num_bits)
-
- replace_tensor("weight_packed", marlin_qweight)
-
- # Permute scales
- scales_size_k = part_size_k
- scales_size_n = part_size_n
-
- marlin_scales = marlin_permute_scales(
- layer.weight_scale.squeeze().t().contiguous(), scales_size_k,
- scales_size_n, layer.group_size, self.num_bits)
- replace_tensor("weight_scale", marlin_scales)
-
- output = ops.gptq_marlin_gemm(reshaped_x, layer.weight_packed,
- layer.weight_scale, layer.g_idx,
- layer.g_idx_sort_indices,
- layer.workspace, self.num_bits, size_m,
- part_size_n, part_size_k,
- layer.is_k_full)
- return output.reshape(out_shape)
+ # Checkpoints are serialized in compressed-tensors format, which is
+ # different from marlin format. Handle repacking here.
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ device = layer.weight_packed.device
+
+ # Allocate marlin workspace.
+ layer.workspace = marlin_make_workspace(
+ layer.output_size_per_partition, device)
+
+ # Act-order not supported in compressed-tensors yet, so set to empty.
+ layer.g_idx = marlin_make_empty_g_idx(device)
+ layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+ # Repack weights from compressed-tensors format to marlin format.
+ marlin_qweight = ops.gptq_marlin_repack(
+ layer.weight_packed.t().contiguous(),
+ perm=layer.g_idx_sort_indices,
+ size_k=layer.input_size_per_partition,
+ size_n=layer.output_size_per_partition,
+ num_bits=self.num_bits)
+ replace_tensor(layer, "weight_packed", marlin_qweight)
+
+ # Permute scales from compressed-tensors format to marlin format.
+ marlin_scales = marlin_permute_scales(
+ layer.weight_scale.squeeze().t().contiguous(),
+ size_k=layer.input_size_per_partition,
+ size_n=layer.output_size_per_partition,
+ group_size=layer.group_size)
+ replace_tensor(layer, "weight_scale", marlin_scales)
+
+ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+
+ return apply_marlin_linear(
+ input=x,
+ weight=layer.weight_packed,
+ weight_scale=layer.weight_scale,
+ g_idx=layer.g_idx,
+ g_idx_sort_indices=layer.g_idx_sort_indices,
+ workspace=layer.workspace,
+ num_bits=self.num_bits,
+ output_size_per_partition=layer.output_size_per_partition,
+ input_size_per_partition=layer.input_size_per_partition,
+ is_k_full=True,
+ bias=bias)
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 8dba9019f94cf..5c916c9b4d7e4 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -11,7 +11,7 @@
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
all_close_1d, apply_fp8_linear, create_per_tensor_scale_param,
@@ -377,7 +377,10 @@ def apply(self,
x: torch.Tensor,
router_logits: torch.Tensor,
top_k: int,
- renormalize: bool = True) -> torch.Tensor:
+ renormalize: bool = True,
+ use_grouped_topk: bool = False,
+ num_expert_group: Optional[int] = None,
+ topk_group: Optional[int] = None) -> torch.Tensor:
return fused_moe(x,
layer.w13_weight,
@@ -390,7 +393,10 @@ def apply(self,
w1_scale=layer.w13_scale,
w2_scale=layer.w2_scale,
a1_scale=layer.a13_scale,
- a2_scale=layer.a2_scale)
+ a2_scale=layer.a2_scale,
+ use_grouped_topk=use_grouped_topk,
+ num_expert_group=num_expert_group,
+ topk_group=topk_group)
class Fp8KVCacheMethod(QuantizeMethodBase):
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 6b971f73d45bf..07a73d06e0596 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -1,5 +1,3 @@
-import enum
-from enum import Enum
from typing import Any, Dict, List, Optional
import torch
@@ -12,46 +10,15 @@
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
- GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_K,
- GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_SUPPORTED_GROUP_SIZES,
- GPTQ_MARLIN_SUPPORTED_NUM_BITS, GPTQ_MARLIN_SUPPORTED_SYM,
- GPTQ_MARLIN_TILE)
+ apply_marlin_linear, check_marlin_supported, marlin_is_k_full,
+ marlin_make_empty_g_idx, marlin_make_workspace, marlin_permute_scales,
+ marlin_repeat_scales_on_all_ranks, marlin_sort_g_idx, replace_tensor,
+ verify_marlin_supported, verify_marlin_supports_shape)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
-from vllm.platforms import current_platform
logger = init_logger(__name__)
-# Permutations for Marlin scale shuffling
-def get_scale_perms(num_bits: int):
- scale_perm: List[int] = []
- for i in range(8):
- scale_perm.extend([i + 8 * j for j in range(8)])
- scale_perm_single: List[int] = []
- for i in range(4):
- scale_perm_single.extend(
- [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
- return scale_perm, scale_perm_single
-
-
-def get_pack_factor(num_bits: int):
- assert (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
- ), f"Unsupported num_bits = {num_bits}"
- return 32 // num_bits
-
-
-def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
- group_size: int, num_bits: int):
- scale_perm, scale_perm_single = get_scale_perms(num_bits)
- if group_size < size_k and group_size != -1:
- s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
- else:
- s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
- s = s.reshape((-1, size_n)).contiguous()
-
- return s
-
-
class GPTQMarlinConfig(QuantizationConfig):
"""Config class for GPTQ Marlin"""
@@ -63,33 +30,16 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool,
desc_act = False
self.weight_bits = weight_bits
+ self.pack_factor = 32 // self.weight_bits # packed into int32
self.group_size = group_size
self.desc_act = desc_act
self.is_sym = is_sym
self.lm_head_quantized = lm_head_quantized
- # Verify
- if self.weight_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
- raise ValueError(
- f"Marlin does not support weight_bits = {self.weight_bits}. "
- f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
- "are supported.")
- if self.group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
- raise ValueError(
- f"Marlin does not support group_size = {self.group_size}. "
- f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
- "are supported.")
- if self.is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
- raise ValueError(
- f"Marlin does not support is_sym = {self.is_sym}. "
- f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
-
- # Init
- self.pack_factor = get_pack_factor(weight_bits)
- self.tile_size = GPTQ_MARLIN_TILE
- self.min_thread_n = GPTQ_MARLIN_MIN_THREAD_N
- self.min_thread_k = GPTQ_MARLIN_MIN_THREAD_K
- self.max_parallel = GPTQ_MARLIN_MAX_PARALLEL
+ # Verify supported on platform.
+ verify_marlin_supported(num_bits=self.weight_bits,
+ group_size=self.group_size,
+ is_sym=self.is_sym)
def __repr__(self) -> str:
return (f"GPTQMarlinConfig(weight_bits={self.weight_bits}, "
@@ -168,21 +118,10 @@ def is_marlin_compatible(cls, quant_config: Dict[str, Any]):
or desc_act is None):
return False
- # If the capability of the device is too low, cannot convert.
- major, minor = current_platform.get_device_capability()
- device_capability = major * 10 + minor
- if device_capability < cls.get_min_capability():
- return False
-
- # Otherwise, can convert if model satisfies marlin constraints.
- return (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
- and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
- and sym in GPTQ_MARLIN_SUPPORTED_SYM)
-
-
-class GPTQMarlinState(Enum):
- REPACK = enum.auto()
- READY = enum.auto()
+ return check_marlin_supported(num_bits=num_bits,
+ group_size=group_size,
+ is_sym=sym,
+ min_capability=cls.get_min_capability())
class GPTQMarlinLinearMethod(LinearMethodBase):
@@ -206,6 +145,8 @@ def create_weights(
**extra_weight_attrs,
) -> None:
del output_size
+ output_size_per_partition = sum(output_partition_sizes)
+ is_row_parallel = input_size != input_size_per_partition
# Normalize group_size
if self.quant_config.group_size != -1:
@@ -213,58 +154,25 @@ def create_weights(
else:
group_size = input_size
- # Validate dtype
- if params_dtype not in [torch.float16, torch.bfloat16]:
- raise ValueError(f"The params dtype must be float16 "
- f"or bfloat16, but got {params_dtype}")
-
- # Validate output_size_per_partition
- output_size_per_partition = sum(output_partition_sizes)
- if output_size_per_partition % self.quant_config.min_thread_n != 0:
- raise ValueError(
- f"Weight output_size_per_partition = "
- f"{output_size_per_partition} is not divisible by "
- f" min_thread_n = {self.quant_config.min_thread_n}.")
-
- # Validate input_size_per_partition
- if input_size_per_partition % self.quant_config.min_thread_k != 0:
- raise ValueError(
- f"Weight input_size_per_partition = "
- f"{input_size_per_partition} is not divisible "
- f"by min_thread_k = {self.quant_config.min_thread_k}.")
-
- if (group_size < input_size
- and input_size_per_partition % group_size != 0):
- raise ValueError(
- f"Weight input_size_per_partition = {input_size_per_partition}"
- f" is not divisible by group_size = {group_size}.")
-
- # Detect sharding of scales/zp
-
- # By default, no sharding over "input dim"
- scales_and_zp_size = input_size // group_size
- scales_and_zp_input_dim = None
-
- if self.quant_config.desc_act:
- # Act-order case
- assert self.quant_config.group_size != -1
-
- is_k_full = input_size_per_partition == input_size
-
+ verify_marlin_supports_shape(
+ output_size_per_partition=output_size_per_partition,
+ input_size_per_partition=input_size_per_partition,
+ input_size=input_size,
+ group_size=group_size)
+
+ # Determine sharding
+ if marlin_repeat_scales_on_all_ranks(self.quant_config.desc_act,
+ self.quant_config.group_size,
+ is_row_parallel):
+ # By setting scale_dim == None, weight_loader will
+ # repeat the scales on each GPU in TP>1 case.
+ scales_and_zp_input_dim = None
+ scales_and_zp_size = input_size // group_size
else:
- # No act-order case
-
- # K is always full due to full alignment with
- # group-size and shard of scales/zp
- is_k_full = True
-
- # If this is a row-parallel case, then shard scales/zp
- if (input_size != input_size_per_partition
- and self.quant_config.group_size != -1):
- scales_and_zp_size = input_size_per_partition // group_size
- scales_and_zp_input_dim = 0
-
- # Init buffers
+ # By setting scale_dim == 0, weight_loader will
+ # shard the scales in TP>1 case.
+ scales_and_zp_input_dim = 0
+ scales_and_zp_size = input_size_per_partition // group_size
# Quantized weights
qweight = Parameter(
@@ -303,11 +211,6 @@ def create_weights(
},
)
- g_idx_sort_indices = torch.empty(
- g_idx.shape,
- dtype=torch.int32,
- )
-
# Scales
scales = Parameter(
torch.empty(
@@ -347,25 +250,52 @@ def create_weights(
},
)
- # Allocate marlin workspace
- max_workspace_size = (
- output_size_per_partition //
- self.quant_config.min_thread_n) * self.quant_config.max_parallel
- workspace = torch.zeros(max_workspace_size,
- dtype=torch.int,
- requires_grad=False)
-
layer.register_parameter("qweight", qweight)
layer.register_parameter("g_idx", g_idx)
layer.register_parameter("scales", scales)
layer.register_parameter("qzeros", qzeros)
- layer.g_idx_sort_indices = g_idx_sort_indices
- layer.workspace = workspace
layer.input_size_per_partition = input_size_per_partition
layer.output_size_per_partition = output_size_per_partition
layer.input_size = input_size
- layer.is_k_full = is_k_full
- layer.marlin_state = GPTQMarlinState.REPACK
+ layer.is_k_full = marlin_is_k_full(self.quant_config.desc_act,
+ is_row_parallel)
+
+ # Checkpoints are serialized in AutoGPTQ format, which is different from the
+ # marlin format. This function is called after the weights are loaded.
+ # Here, we handle the repacking, including the activation reordering case.
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ device = layer.qweight.device
+
+ # Allocate marlin workspace
+ layer.workspace = marlin_make_workspace(
+ layer.output_size_per_partition, device)
+
+ # Handle sorting for activation reordering if needed.
+ if self.quant_config.desc_act:
+ g_idx, g_idx_sort_indices = marlin_sort_g_idx(layer.g_idx)
+ layer.g_idx_sort_indices = g_idx_sort_indices
+ replace_tensor(layer, "g_idx", g_idx)
+ else:
+ layer.g_idx = marlin_make_empty_g_idx(device)
+ layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+
+ # Repack weights from autogptq format to marlin format.
+ marlin_qweight = ops.gptq_marlin_repack(
+ layer.qweight,
+ perm=layer.g_idx_sort_indices,
+ size_k=layer.input_size_per_partition,
+ size_n=layer.output_size_per_partition,
+ num_bits=self.quant_config.weight_bits)
+ replace_tensor(layer, "qweight", marlin_qweight)
+
+ # Permute scales from autogptq format to marlin format.
+ marlin_scales = marlin_permute_scales(
+ layer.scales,
+ size_k=(layer.input_size if self.quant_config.desc_act else
+ layer.input_size_per_partition),
+ size_n=layer.output_size_per_partition,
+ group_size=self.quant_config.group_size)
+ replace_tensor(layer, "scales", marlin_scales)
def apply(
self,
@@ -373,90 +303,15 @@ def apply(
x: torch.Tensor,
bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
- reshaped_x = x.reshape(-1, x.shape[-1])
-
- size_m = reshaped_x.shape[0]
- part_size_n = layer.output_size_per_partition
- part_size_k = layer.input_size_per_partition
- full_size_k = layer.input_size
-
- out_shape = x.shape[:-1] + (part_size_n, )
-
- if layer.marlin_state == GPTQMarlinState.REPACK:
- layer.marlin_state = GPTQMarlinState.READY
-
- # Newly generated tensors need to replace existing tensors that are
- # already registered as parameters by vLLM (and won't be freed)
- def replace_tensor(name, new_t):
- # It is important to use resize_() here since it ensures
- # the same buffer is reused
- getattr(layer, name).resize_(new_t.shape)
- getattr(layer, name).copy_(new_t)
- del new_t
-
- cur_device = layer.qweight.device
-
- # Process act_order
- if self.quant_config.desc_act:
- # Get sorting based on g_idx
- g_idx_sort_indices = torch.argsort(layer.g_idx).to(torch.int)
-
- sorted_g_idx = layer.g_idx[g_idx_sort_indices]
-
- replace_tensor("g_idx", sorted_g_idx)
- replace_tensor("g_idx_sort_indices", g_idx_sort_indices)
-
- else:
- # Reset g_idx related tensors
- layer.g_idx = Parameter(
- torch.empty(0, dtype=torch.int, device=cur_device),
- requires_grad=False,
- )
- layer.g_idx_sort_indices = Parameter(
- torch.empty(0, dtype=torch.int, device=cur_device),
- requires_grad=False,
- )
-
- # Repack weights
- marlin_qweight = ops.gptq_marlin_repack(
- layer.qweight,
- layer.g_idx_sort_indices,
- part_size_k,
- part_size_n,
- self.quant_config.weight_bits,
- )
- replace_tensor("qweight", marlin_qweight)
-
- # Permute scales
- scales_size_k = part_size_k
- scales_size_n = part_size_n
- if self.quant_config.desc_act:
- scales_size_k = full_size_k
-
- marlin_scales = marlin_permute_scales(
- layer.scales,
- scales_size_k,
- scales_size_n,
- self.quant_config.group_size,
- self.quant_config.weight_bits,
- )
- replace_tensor("scales", marlin_scales)
-
- output = ops.gptq_marlin_gemm(
- reshaped_x,
- layer.qweight,
- layer.scales,
- layer.g_idx,
- layer.g_idx_sort_indices,
- layer.workspace,
- self.quant_config.weight_bits,
- size_m,
- part_size_n,
- part_size_k,
- layer.is_k_full,
- )
-
- if bias is not None:
- output.add_(bias) # In-place add
-
- return output.reshape(out_shape)
+ return apply_marlin_linear(
+ input=x,
+ weight=layer.qweight,
+ weight_scale=layer.scales,
+ g_idx=layer.g_idx,
+ g_idx_sort_indices=layer.g_idx_sort_indices,
+ workspace=layer.workspace,
+ num_bits=self.quant_config.weight_bits,
+ output_size_per_partition=layer.output_size_per_partition,
+ input_size_per_partition=layer.input_size_per_partition,
+ is_k_full=layer.is_k_full,
+ bias=bias)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
deleted file mode 100644
index 93f65a20d4e4a..0000000000000
--- a/vllm/model_executor/layers/quantization/utils/marlin_24_perms.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""This file is used for /tests and /benchmarks"""
-from typing import Dict, List
-
-import numpy
-import torch
-
-
-# Precompute permutations for Marlin24 weight and scale shuffling # noqa: E501
-#
-# Marlin works on [16*2,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
-# with the tensor-core format that is described here:
-# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
-#
-# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
-# (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms_24(num_bits: int):
- perm_list: List[int] = []
- for i in range(32):
- perm1: List[int] = []
- col = i // 4
- col_o = col // 2
- for block in [0, 1]:
- for row in [
- 2 * (i % 4),
- 2 * (i % 4) + 1,
- 2 * (i % 4 + 4),
- 2 * (i % 4 + 4) + 1,
- ]:
- perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
- 4 * block)
- for j in range(4):
- perm_list.extend([p + 1 * j for p in perm1])
- perm = numpy.array(perm_list)
-
- if num_bits == 4:
- interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
- elif num_bits == 8:
- interleave = numpy.array([0, 2, 1, 3])
- else:
- raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
-
- perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
- perm = torch.from_numpy(perm)
- scale_perm: List[int] = []
- for i in range(8):
- scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
- scale_perm_single: List[int] = []
- for i in range(8):
- scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
- return perm, scale_perm, scale_perm_single
-
-
-marlin_24_perm: Dict[int, torch.Tensor] = {}
-marlin_24_scale_perm: Dict[int, List[int]] = {}
-marlin_24_scale_perm_single: Dict[int, List[int]] = {}
-for num_bits in [4, 8]:
- perm_24, scale_perm_24, scale_perm_single_24 = get_perms_24(num_bits)
- marlin_24_perm[num_bits] = perm_24
- marlin_24_scale_perm[num_bits] = scale_perm_24
- marlin_24_scale_perm_single[num_bits] = scale_perm_single_24
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_perms.py b/vllm/model_executor/layers/quantization/utils/marlin_perms.py
deleted file mode 100644
index db5e6857a8846..0000000000000
--- a/vllm/model_executor/layers/quantization/utils/marlin_perms.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""This file is used for /tests and /benchmarks"""
-from typing import Dict, List
-
-import numpy
-import torch
-
-
-# Precompute permutations for Marlin weight and scale shuffling # noqa: E501
-#
-# Marlin works on [16,64] tiles. The goal of the permutations is to reorder the weight data so that it is compatible noqa: # noqa: E501
-# with the tensor-core format that is described here:
-# https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#matrix-fragments-for-mma-m16n8k16-with-floating-point-type # noqa: E501
-#
-# As a result of this reordering, the vector loads inside the kernel will get the data as it is needed for tensor-core # noqa: E501
-# (without the need to use ldmatrix instructions) # noqa: E501
-def get_perms(num_bits: int):
- perm_list: List[int] = []
- for i in range(32):
- perm1: List[int] = []
- col = i // 4
- for block in [0, 1]:
- for row in [
- 2 * (i % 4),
- 2 * (i % 4) + 1,
- 2 * (i % 4 + 4),
- 2 * (i % 4 + 4) + 1,
- ]:
- perm1.append(16 * row + col + 8 * block)
- for j in range(4):
- perm_list.extend([p + 256 * j for p in perm1])
-
- perm = numpy.array(perm_list)
-
- if num_bits == 4:
- interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
- elif num_bits == 8:
- interleave = numpy.array([0, 2, 1, 3])
- else:
- raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
-
- perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
- perm = torch.from_numpy(perm)
- scale_perm: List[int] = []
- for i in range(8):
- scale_perm.extend([i + 8 * j for j in range(8)])
- scale_perm_single: List[int] = []
- for i in range(4):
- scale_perm_single.extend(
- [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
- return perm, scale_perm, scale_perm_single
-
-
-marlin_perm: Dict[int, torch.Tensor] = {}
-marlin_scale_perm: Dict[int, List[int]] = {}
-marlin_scale_perm_single: Dict[int, List[int]] = {}
-for num_bits in [4, 8]:
- perm, scale_perm, scale_perm_single = get_perms(num_bits)
- marlin_perm[num_bits] = perm
- marlin_scale_perm[num_bits] = scale_perm
- marlin_scale_perm_single[num_bits] = scale_perm_single
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils.py b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
index 9886245269ad3..764f0a6f3b71c 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -1,21 +1,9 @@
-"""This file is used for /tests and /benchmarks"""
-import random
-from typing import Optional
+from typing import List, Optional, Tuple
-import numpy
import torch
from vllm import _custom_ops as ops
-from vllm.model_executor.layers.quantization.utils.format_24 import (
- mask_creator, sparse_semi_structured_from_dense_cutlass)
-from vllm.model_executor.layers.quantization.utils.marlin_24_perms import (
- marlin_24_perm, marlin_24_scale_perm, marlin_24_scale_perm_single)
-from vllm.model_executor.layers.quantization.utils.marlin_perms import (
- marlin_perm, marlin_scale_perm, marlin_scale_perm_single)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
- get_pack_factor, quantize_weights, sort_weights)
from vllm.platforms import current_platform
-from vllm.utils import print_warning_once
GPTQ_MARLIN_TILE = 16
GPTQ_MARLIN_MIN_THREAD_N = 64
@@ -25,135 +13,122 @@
GPTQ_MARLIN_SUPPORTED_NUM_BITS = [4, 8]
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES = [-1, 32, 64, 128]
GPTQ_MARLIN_SUPPORTED_SYM = [True]
-
-
-def is_marlin_supported():
- capability = current_platform.get_device_capability()
- return capability[0] >= 8
-
-
-def apply_fp8_marlin_linear(
- input: torch.Tensor,
- weight: torch.Tensor,
- weight_scale: torch.Tensor,
- workspace: torch.Tensor,
- size_n: int,
- size_k: int,
- bias: Optional[torch.Tensor],
-) -> torch.Tensor:
- # For GPUs that lack FP8 hardware support, we can leverage the
- # Marlin kernel for fast weight-only FP8 quantization
-
- reshaped_x = input.reshape(-1, input.shape[-1])
- out_shape = input.shape[:-1] + (size_n, )
-
- output = ops.fp8_marlin_gemm(
- a=reshaped_x,
- b_q_weight=weight,
- b_scales=weight_scale,
- workspace=workspace,
- num_bits=8,
- size_m=reshaped_x.shape[0],
- size_n=size_n,
- size_k=size_k,
- )
-
- if bias is not None:
- output.add_(bias) # In-place add
-
- return output.reshape(out_shape)
-
-
-def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
- print_warning_once(
- "Your GPU does not have native support for FP8 computation but "
- "FP8 quantization is being used. Weight-only FP8 compression will "
- "be used leveraging the Marlin kernel. This may degrade "
- "performance for compute-heavy workloads.")
-
- part_size_n = layer.output_size_per_partition
- part_size_k = layer.input_size_per_partition
-
- device = layer.weight.device
-
- # WEIGHTS
- # Repack weights to gptq format (packed int32 elements)
- packed_gptq_qweight = pack_fp8_to_int32(layer.weight)
-
- # Repack weights to marlin format
- marlin_qweight = ops.gptq_marlin_repack(
- b_q_weight=packed_gptq_qweight,
- perm=torch.empty(0, dtype=torch.int, device=device),
- size_k=part_size_k,
- size_n=part_size_n,
- num_bits=8,
- )
- layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
-
- # WEIGHT SCALES
- # Currently Marlin doesn't support per-tensor scales, so we
- # expand it to channelwise
- scales = layer.weight_scale.repeat(1, part_size_n).to(
- layer.orig_dtype).to(device)
- # Permute scales
- num_bits = 8
- marlin_scales = marlin_permute_scales(
- s=scales,
- size_k=part_size_k,
- size_n=part_size_n,
- group_size=-1,
- scale_perm=marlin_scale_perm[num_bits],
- scale_perm_single=marlin_scale_perm_single[num_bits])
- layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
-
- # Allocate marlin workspace
- max_workspace_size = (part_size_n //
+GTPQ_MARLIN_UNSUPPORTED_GROUP_SIZE_ACT_ORDER = [-1]
+
+
+def check_marlin_supported(num_bits: int, group_size: int, is_sym: bool,
+ min_capability: int) -> bool:
+
+ # If the capability of the device is too low, cannot convert.
+ major, minor = current_platform.get_device_capability()
+ device_capability = major * 10 + minor
+ if device_capability < min_capability:
+ return False
+
+ return (num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS
+ and group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES
+ and is_sym in GPTQ_MARLIN_SUPPORTED_SYM)
+
+
+def verify_marlin_supported(num_bits: int, group_size: Optional[int],
+ is_sym: bool) -> None:
+
+ if num_bits not in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+ raise ValueError(
+ f"Marlin does not support weight_bits = {num_bits}. "
+ f"Only weight_bits = {GPTQ_MARLIN_SUPPORTED_NUM_BITS} "
+ "are supported.")
+ if (group_size is None
+ or group_size not in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES):
+ raise ValueError(
+ f"Marlin does not support group_size = {group_size}. "
+ f"Only group_sizes = {GPTQ_MARLIN_SUPPORTED_GROUP_SIZES} "
+ "are supported.")
+ if is_sym not in GPTQ_MARLIN_SUPPORTED_SYM:
+ raise ValueError(
+ f"Marlin does not support is_sym = is_sym. "
+ f"Only sym = {GPTQ_MARLIN_SUPPORTED_SYM} are supported.")
+
+
+def verify_marlin_supports_shape(output_size_per_partition: int,
+ input_size_per_partition: int,
+ input_size: int, group_size: int) -> None:
+
+ # Validate output_size_per_partition
+ if output_size_per_partition % GPTQ_MARLIN_MIN_THREAD_N != 0:
+ raise ValueError(f"Weight output_size_per_partition = "
+ f"{output_size_per_partition} is not divisible by "
+ f" min_thread_n = {GPTQ_MARLIN_MIN_THREAD_N}. "
+ "Consider reducing tensor_parallel_size or running "
+ "with --quantization gptq.")
+
+ # Validate input_size_per_partition
+ if input_size_per_partition % GPTQ_MARLIN_MIN_THREAD_K != 0:
+ raise ValueError(f"Weight input_size_per_partition = "
+ f"{input_size_per_partition} is not divisible "
+ f"by min_thread_k = {GPTQ_MARLIN_MIN_THREAD_K}. "
+ "Consider reducing tensor_parallel_size or running "
+ "with --quantization gptq.")
+
+ if (group_size < input_size
+ and input_size_per_partition % group_size != 0):
+ raise ValueError(
+ f"Weight input_size_per_partition = {input_size_per_partition}"
+ f" is not divisible by group_size = {group_size}."
+ "Consider reducing tensor_parallel_size or running "
+ "with --quantization gptq.")
+
+
+def marlin_make_workspace(output_size_per_partition: int,
+ device: torch.device) -> torch.Tensor:
+ max_workspace_size = (output_size_per_partition //
GPTQ_MARLIN_MIN_THREAD_N) * GPTQ_MARLIN_MAX_PARALLEL
- workspace = torch.zeros(max_workspace_size,
- dtype=torch.int,
- device=device,
- requires_grad=False)
-
- layer.workspace = workspace
+ return torch.zeros(max_workspace_size,
+ dtype=torch.int,
+ device=device,
+ requires_grad=False)
-def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
- assert q_w.shape == (size_k, size_n)
- assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
- assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
- # Permute weights to 16x64 marlin tiles
- q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
- q_w = q_w.permute((0, 2, 1, 3))
- q_w = q_w.reshape((size_k // tile, size_n * tile))
+def marlin_is_k_full(act_order: bool, is_row_parallel: bool) -> bool:
+ return (not act_order) or (act_order and not is_row_parallel)
- q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
- return q_w
+def marlin_repeat_scales_on_all_ranks(act_order: bool, group_size: int,
+ is_row_parallel: bool) -> bool:
+ # Scales must be repeated on every rank if act_order is enabled, or if the
+ # quantization is channelwise and the layer is row-parallel.
+ is_channelwise = group_size == -1
+ return act_order or (is_channelwise and is_row_parallel)
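A small pure-Python check of the two tensor-parallel rules above (a sketch assuming the helpers just defined): K is only partial when act_order meets a row-parallel shard, and scales are replicated for act_order or for channelwise row-parallel layers.

    assert marlin_is_k_full(act_order=False, is_row_parallel=True) is True
    assert marlin_is_k_full(act_order=True, is_row_parallel=True) is False
    assert marlin_repeat_scales_on_all_ranks(
        act_order=False, group_size=-1, is_row_parallel=True) is True
    assert marlin_repeat_scales_on_all_ranks(
        act_order=False, group_size=128, is_row_parallel=True) is False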
-def marlin_weights(q_w, size_k, size_n, num_bits, perm):
- # Permute
- q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
+def marlin_make_empty_g_idx(device: torch.device) -> torch.Tensor:
+ return torch.nn.Parameter(torch.empty(0, dtype=torch.int, device=device),
+ requires_grad=False)
- # Pack
- pack_factor = get_pack_factor(num_bits)
- orig_device = q_w.device
- q_w = q_w.cpu().numpy().astype(numpy.uint32)
+def marlin_sort_g_idx(
+ g_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ g_idx_sort_indices = torch.argsort(g_idx).to(torch.int)
+ return g_idx[g_idx_sort_indices], g_idx_sort_indices
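For example (an illustrative, CPU-only sketch assuming the helper above), sorting a small g_idx groups the channels by ascending group id and returns the permutation that was applied:

    import torch

    g_idx = torch.tensor([2, 0, 1, 0, 2, 1], dtype=torch.int32)
    sorted_g_idx, sort_indices = marlin_sort_g_idx(g_idx)
    # sorted_g_idx -> tensor([0, 0, 1, 1, 2, 2], dtype=torch.int32)
    # sort_indices is the int32 permutation later passed to the repack kernel.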
- q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
- dtype=numpy.uint32)
- for i in range(pack_factor):
- q_packed |= q_w[:, i::pack_factor] << num_bits * i
- q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)
+def get_scale_perms():
+ scale_perm: List[int] = []
+ for i in range(8):
+ scale_perm.extend([i + 8 * j for j in range(8)])
+ scale_perm_single: List[int] = []
+ for i in range(4):
+ scale_perm_single.extend(
+ [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
+ return scale_perm, scale_perm_single
- return q_packed
+def marlin_permute_scales(s: torch.Tensor, size_k: int, size_n: int,
+ group_size: int) -> torch.Tensor:
-def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
- scale_perm_single):
+ scale_perm, scale_perm_single = get_scale_perms()
if group_size < size_k and group_size != -1:
s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
else:
@@ -163,180 +138,44 @@ def marlin_permute_scales(s, size_k, size_n, group_size, scale_perm,
return s
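As a sanity sketch (assuming the two helpers above): the grouped path uses a 64-entry permutation and the channelwise/single-group path a 32-entry one, and each is a true permutation of its index range.

    scale_perm, scale_perm_single = get_scale_perms()
    assert len(scale_perm) == 64         # used when group_size < size_k
    assert len(scale_perm_single) == 32  # used for channelwise (group_size == -1)
    assert sorted(scale_perm) == list(range(64))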
-def marlin_quantize(
- w: torch.Tensor,
- num_bits: int,
- group_size: int,
- act_order: bool,
-):
- size_k, size_n = w.shape
-
- # Normalize group_size
- if group_size == -1:
- group_size = size_k
- assert group_size <= size_k
-
- # Quantize (and apply act_order if provided)
- w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
- act_order)
-
- # For act_order, sort the "weights" and "g_idx" so that group ids are
- # increasing
- sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
- if act_order:
- q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
-
- # Reformat to marlin
- marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits,
- marlin_perm[num_bits])
- marlin_s = marlin_permute_scales(s, size_k, size_n, group_size,
- marlin_scale_perm[num_bits],
- marlin_scale_perm_single[num_bits])
-
- # Create result
- res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
- for i in range(len(res_list)):
- res_list[i] = res_list[i].to(w.device)
-
- return res_list
-
-
-def inject_24(w, size_k, size_n):
- assert w.shape == (size_k, size_n)
-
- mask = mask_creator(w.t()).t().cuda().bool()
-
- return (mask * w).contiguous(), mask.contiguous()
-
-
-def check_24(w, num_rows_to_sample=50, _verbose=False):
- BLOCK_SIZE = 4
- MAX_NON_ZEROS = 2
-
- w = w.t().contiguous()
-
- print("check_24: w.shape = {}".format(w.shape))
-
- num_rows, num_cols = w.shape
- sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
- if _verbose:
- print(f"Sampled row idxs = {sampled_row_idxs}")
-
- total_segments = 0
- non_24_segments = 0
- for i in sampled_row_idxs:
- for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
- total_segments += 1
- block = w[i, j:j + BLOCK_SIZE]
- num_nonzero = torch.count_nonzero(block)
- if num_nonzero > MAX_NON_ZEROS:
- print("i = {} j = {} block = {}".format(i, j, block))
- non_24_segments += 1
-
- print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")
-
-
-def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
- assert q_24.shape == (size_k, size_n)
-
- # Remove zp to normalize over 0
- max_q_val = (1 << num_bits) - 1
- zp = (max_q_val + 1) // 2
- q_24_no_zp = q_24 - zp
-
- # Compress
- q_24_no_zp = q_24_no_zp.t().contiguous()
- q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
- q_24_no_zp)
- q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()
-
- # Restore zp
- q_24_comp = q_24_no_zp_comp + zp
-
- # Resize meta to its actual shape (without moving any data)
- meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)
-
- return q_24_comp, meta
-
-
-def marlin_24_quantize(
- w: torch.Tensor,
- num_bits: int,
- group_size: int,
-):
- size_k, size_n = w.shape
-
- # Normalize group_size
- if group_size == -1:
- group_size = size_k
- assert group_size <= size_k
-
- # Inject 2:4 sparsity
- w_24, mask_24 = inject_24(w, size_k, size_n)
-
- # Quantize
- w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
- num_bits,
- group_size,
- act_order=False)
-
- # Compress quantized weight
- q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
- num_bits)
- size_k_comp = size_k // 2
-
- # Reformat to marlin
- marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
- num_bits, marlin_24_perm[num_bits])
- marlin_24_s = marlin_permute_scales(s, size_k, size_n, group_size,
- marlin_24_scale_perm[num_bits],
- marlin_24_scale_perm_single[num_bits])
-
- # Create result
- res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
- for i in range(len(res_list)):
- res_list[i] = res_list[i].to(w.device)
-
- return res_list
-
-
-def compute_max_diff(output, output_ref):
- return torch.mean(torch.abs(output - output_ref)) / torch.mean(
- torch.abs(output_ref))
-
-
-class MarlinWorkspace:
-
- def __init__(self, out_features, min_thread_n, max_parallel):
- assert (out_features % min_thread_n == 0), (
- "out_features = {} is undivisible by min_thread_n = {}".format(
- out_features, min_thread_n))
-
- max_workspace_size = ((out_features // min_thread_n) * max_parallel)
-
- self.scratch = torch.zeros(max_workspace_size,
- dtype=torch.int,
- device="cuda")
-
-
-def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
- """
- Repack FP8 weights to gptq format (packed int32 elements)
- """
- assert fp8_tensor.dtype == torch.float8_e4m3fn
- assert fp8_tensor.shape[0] % 4 == 0
-
- # Reshape to prepare for packing
- reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
-
- # Convert fp8 to uint8 (byte) representation
- byte_tensor = reshaped.view(torch.uint8)
+# Newly generated tensors need to replace existing tensors that are
+# already registered as parameters by vLLM (and won't be freed)
+def replace_tensor(layer: torch.nn.Module, name: str,
+ new_t: torch.Tensor) -> None:
+ # It is important to use resize_() here since it ensures
+ # the same buffer is reused
+ getattr(layer, name).resize_(new_t.shape)
+ getattr(layer, name).copy_(new_t)
+ del new_t
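The reason for resize_()/copy_() is that the tensor object vLLM has already registered on the layer stays alive; only its storage is reshaped and overwritten. A minimal sketch (assuming the replace_tensor helper above):

    import torch

    layer = torch.nn.Module()
    layer.weight_packed = torch.nn.Parameter(
        torch.zeros(4, 4, dtype=torch.int32), requires_grad=False)

    before = layer.weight_packed
    replace_tensor(layer, "weight_packed", torch.ones(2, 8, dtype=torch.int32))

    assert layer.weight_packed is before        # same registered Parameter object
    assert layer.weight_packed.shape == (2, 8)  # now holding the repacked data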
+
+
+def apply_marlin_linear(input: torch.Tensor,
+ weight: torch.Tensor,
+ weight_scale: torch.Tensor,
+ g_idx: torch.Tensor,
+ g_idx_sort_indices: torch.Tensor,
+ workspace: torch.Tensor,
+ num_bits: int,
+ output_size_per_partition: int,
+ input_size_per_partition: int,
+ is_k_full: bool,
+ bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+ reshaped_x = input.reshape(-1, input.shape[-1])
+ out_shape = input.shape[:-1] + (output_size_per_partition, )
+
+ output = ops.gptq_marlin_gemm(reshaped_x,
+ weight,
+ weight_scale,
+ g_idx,
+ g_idx_sort_indices,
+ workspace,
+ num_bits,
+ size_m=reshaped_x.shape[0],
+ size_n=output_size_per_partition,
+ size_k=input_size_per_partition,
+ is_k_full=is_k_full)
- # Pack 4 uint8 values into one int32
- packed = (byte_tensor[:, 0].to(torch.int32) |
- (byte_tensor[:, 1].to(torch.int32) << 8) |
- (byte_tensor[:, 2].to(torch.int32) << 16) |
- (byte_tensor[:, 3].to(torch.int32) << 24))
+ if bias is not None:
+ output.add_(bias) # In-place add
- return packed.view(fp8_tensor.shape[0] // 4,
- *fp8_tensor.shape[1:]).contiguous()
+ return output.reshape(out_shape)
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
new file mode 100644
index 0000000000000..e93eb747ba2eb
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py
@@ -0,0 +1,109 @@
+from typing import Optional
+
+import torch
+
+import vllm._custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.utils import print_warning_once
+
+from .marlin_utils import marlin_make_workspace, marlin_permute_scales
+
+
+def is_fp8_marlin_supported():
+ capability = current_platform.get_device_capability()
+ return capability[0] >= 8
+
+
+def apply_fp8_marlin_linear(
+ input: torch.Tensor,
+ weight: torch.Tensor,
+ weight_scale: torch.Tensor,
+ workspace: torch.Tensor,
+ size_n: int,
+ size_k: int,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+ # For GPUs that lack FP8 hardware support, we can leverage the
+ # Marlin kernel for fast weight-only FP8 quantization
+
+ reshaped_x = input.reshape(-1, input.shape[-1])
+ out_shape = input.shape[:-1] + (size_n, )
+
+ output = ops.fp8_marlin_gemm(
+ a=reshaped_x,
+ b_q_weight=weight,
+ b_scales=weight_scale,
+ workspace=workspace,
+ num_bits=8,
+ size_m=reshaped_x.shape[0],
+ size_n=size_n,
+ size_k=size_k,
+ )
+
+ if bias is not None:
+ output.add_(bias) # In-place add
+
+ return output.reshape(out_shape)
+
+
+def prepare_fp8_layer_for_marlin(layer: torch.nn.Module) -> None:
+ print_warning_once(
+ "Your GPU does not have native support for FP8 computation but "
+ "FP8 quantization is being used. Weight-only FP8 compression will "
+ "be used leveraging the Marlin kernel. This may degrade "
+ "performance for compute-heavy workloads.")
+
+ part_size_n = layer.output_size_per_partition
+ part_size_k = layer.input_size_per_partition
+
+ device = layer.weight.device
+
+ # WORKSPACE
+ layer.workspace = marlin_make_workspace(part_size_n, device)
+
+ # WEIGHT
+ # Repack weights to marlin format
+ marlin_qweight = ops.gptq_marlin_repack(b_q_weight=pack_fp8_to_int32(
+ layer.weight),
+ perm=torch.empty(0,
+ dtype=torch.int,
+ device=device),
+ size_k=part_size_k,
+ size_n=part_size_n,
+ num_bits=8)
+ layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False)
+
+ # WEIGHT SCALES
+ # Currently Marlin doesn't support per-tensor scales, so we
+ # expand it to channelwise
+ scales = layer.weight_scale.repeat(1, part_size_n).to(
+ layer.orig_dtype).to(device)
+ # Permute scales
+ marlin_scales = marlin_permute_scales(s=scales,
+ size_k=part_size_k,
+ size_n=part_size_n,
+ group_size=-1)
+ layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False)
+
+
+def pack_fp8_to_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
+ """
+ Repack FP8 weights to gptq format (packed int32 elements)
+ """
+ assert fp8_tensor.dtype == torch.float8_e4m3fn
+ assert fp8_tensor.shape[0] % 4 == 0
+
+ # Reshape to prepare for packing
+ reshaped = fp8_tensor.reshape(-1, 4, *fp8_tensor.shape[1:])
+
+ # Convert fp8 to uint8 (byte) representation
+ byte_tensor = reshaped.view(torch.uint8)
+
+ # Pack 4 uint8 values into one int32
+ packed = (byte_tensor[:, 0].to(torch.int32) |
+ (byte_tensor[:, 1].to(torch.int32) << 8) |
+ (byte_tensor[:, 2].to(torch.int32) << 16) |
+ (byte_tensor[:, 3].to(torch.int32) << 24))
+
+ return packed.view(fp8_tensor.shape[0] // 4,
+ *fp8_tensor.shape[1:]).contiguous()
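For instance (a sketch that assumes a PyTorch build with torch.float8_e4m3fn and the module path introduced in this diff), every four FP8 rows collapse into one packed int32 row:

    import torch
    from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
        pack_fp8_to_int32)

    fp8_weight = torch.randn(8, 16).to(torch.float8_e4m3fn)
    packed = pack_fp8_to_int32(fp8_weight)
    assert packed.dtype == torch.int32
    assert packed.shape == (2, 16)  # 8 fp8 rows -> 2 int32 rows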
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
new file mode 100644
index 0000000000000..1773748a0f228
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test.py
@@ -0,0 +1,120 @@
+"""Utility functions used for tests and benchmarks"""
+
+from typing import List
+
+import numpy
+import torch
+
+from .marlin_utils import GPTQ_MARLIN_TILE, marlin_permute_scales
+from .quant_utils import get_pack_factor, quantize_weights, sort_weights
+
+
+class MarlinWorkspace:
+
+ def __init__(self, out_features, min_thread_n, max_parallel):
+ assert (out_features % min_thread_n == 0), (
+ "out_features = {} is undivisible by min_thread_n = {}".format(
+ out_features, min_thread_n))
+
+ max_workspace_size = ((out_features // min_thread_n) * max_parallel)
+
+ self.scratch = torch.zeros(max_workspace_size,
+ dtype=torch.int,
+ device="cuda")
+
+
+def marlin_permute_weights(q_w, size_k, size_n, perm, tile=GPTQ_MARLIN_TILE):
+ assert q_w.shape == (size_k, size_n)
+ assert size_k % tile == 0, f"size_k = {size_k}, tile = {tile}"
+ assert size_n % tile == 0, f"size_k = {size_n}, tile = {tile}"
+
+ # Permute weights to 16x64 marlin tiles
+ q_w = q_w.reshape((size_k // tile, tile, size_n // tile, tile))
+ q_w = q_w.permute((0, 2, 1, 3))
+ q_w = q_w.reshape((size_k // tile, size_n * tile))
+
+ q_w = q_w.reshape((-1, perm.numel()))[:, perm].reshape(q_w.shape)
+
+ return q_w
+
+
+def marlin_weights(q_w, size_k, size_n, num_bits, perm):
+ # Permute
+ q_w = marlin_permute_weights(q_w, size_k, size_n, perm)
+
+ # Pack
+ pack_factor = get_pack_factor(num_bits)
+ orig_device = q_w.device
+
+ q_w = q_w.cpu().numpy().astype(numpy.uint32)
+
+ q_packed = numpy.zeros((q_w.shape[0], q_w.shape[1] // pack_factor),
+ dtype=numpy.uint32)
+ for i in range(pack_factor):
+ q_packed |= q_w[:, i::pack_factor] << num_bits * i
+
+ q_packed = torch.from_numpy(q_packed.astype(numpy.int32)).to(orig_device)
+
+ return q_packed
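The packing loop above interleaves pack_factor values per 32-bit word. A stand-alone numpy sketch of the same idea for 4-bit values (illustration only; no Marlin tile permutation applied):

    import numpy

    num_bits = 4
    pack_factor = 32 // num_bits  # 8 values per int32

    q = numpy.arange(16, dtype=numpy.uint32).reshape(2, 8)  # toy 4-bit values
    packed = numpy.zeros((q.shape[0], q.shape[1] // pack_factor),
                         dtype=numpy.uint32)
    for i in range(pack_factor):
        packed |= q[:, i::pack_factor] << (num_bits * i)
    # Each uint32 in `packed` now holds eight 4-bit values from one row of `q`.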
+
+
+def get_weight_perm(num_bits: int):
+ perm_list: List[int] = []
+ for i in range(32):
+ perm1: List[int] = []
+ col = i // 4
+ for block in [0, 1]:
+ for row in [
+ 2 * (i % 4),
+ 2 * (i % 4) + 1,
+ 2 * (i % 4 + 4),
+ 2 * (i % 4 + 4) + 1,
+ ]:
+ perm1.append(16 * row + col + 8 * block)
+ for j in range(4):
+ perm_list.extend([p + 256 * j for p in perm1])
+
+ perm = numpy.array(perm_list)
+
+ if num_bits == 4:
+ interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+ elif num_bits == 8:
+ interleave = numpy.array([0, 2, 1, 3])
+ else:
+ raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+ perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+ perm = torch.from_numpy(perm)
+ return perm
+
+
+def marlin_quantize(w: torch.Tensor, num_bits: int, group_size: int,
+ act_order: bool):
+ size_k, size_n = w.shape
+
+ # Normalize group_size
+ if group_size == -1:
+ group_size = size_k
+ assert group_size <= size_k
+
+ # Quantize (and apply act_order if provided)
+ w_ref, q_w, s, g_idx, rand_perm = quantize_weights(w, num_bits, group_size,
+ act_order)
+
+ # For act_order, sort the "weights" and "g_idx" so that group ids are
+ # increasing
+ sort_indices = torch.empty(0, dtype=torch.int, device=w.device)
+ if act_order:
+ q_w, g_idx, sort_indices = sort_weights(q_w, g_idx)
+
+ # Reformat to marlin
+ weight_perm = get_weight_perm(num_bits)
+ marlin_q_w = marlin_weights(q_w, size_k, size_n, num_bits, weight_perm)
+ marlin_s = marlin_permute_scales(s, size_k, size_n, group_size)
+
+ # Create result
+ res_list = [w_ref, marlin_q_w, marlin_s, g_idx, sort_indices, rand_perm]
+ for i in range(len(res_list)):
+ res_list[i] = res_list[i].to(w.device)
+
+ return res_list
diff --git a/vllm/model_executor/layers/quantization/utils/format_24.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
similarity index 71%
rename from vllm/model_executor/layers/quantization/utils/format_24.py
rename to vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
index 01c8cf789204b..648c32249a571 100644
--- a/vllm/model_executor/layers/quantization/utils/format_24.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_test_24.py
@@ -1,9 +1,14 @@
-#
-# Modified by Roberto Lopez Castro (roberto.lopez.castro@udc.es).
-#
+"""Utility functions used for tests and benchmarks"""
+import random
+from typing import List
+
+import numpy
import torch
+from .marlin_utils_test import marlin_weights
+from .quant_utils import quantize_weights
+
# This is PyTorch implementation of main part of reorder_meta()
# function, from tools/util/include/cutlass/util/host_reorder.h file
@@ -306,3 +311,155 @@ def mask_creator(tensor):
mask = w_b.scatter_(dim=1, index=index, value=0).reshape(tensor.shape)
return mask
+
+
+def inject_24(w, size_k, size_n):
+ assert w.shape == (size_k, size_n)
+
+ mask = mask_creator(w.t()).t().cuda().bool()
+
+ return (mask * w).contiguous(), mask.contiguous()
+
+
+def check_24(w, num_rows_to_sample=50, _verbose=False):
+ BLOCK_SIZE = 4
+ MAX_NON_ZEROS = 2
+
+ w = w.t().contiguous()
+
+ print("check_24: w.shape = {}".format(w.shape))
+
+ num_rows, num_cols = w.shape
+ sampled_row_idxs = random.choices(range(num_rows), k=num_rows_to_sample)
+ if _verbose:
+ print(f"Sampled row idxs = {sampled_row_idxs}")
+
+ total_segments = 0
+ non_24_segments = 0
+ for i in sampled_row_idxs:
+ for j in range(0, num_cols - BLOCK_SIZE, BLOCK_SIZE):
+ total_segments += 1
+ block = w[i, j:j + BLOCK_SIZE]
+ num_nonzero = torch.count_nonzero(block)
+ if num_nonzero > MAX_NON_ZEROS:
+ print("i = {} j = {} block = {}".format(i, j, block))
+ non_24_segments += 1
+
+ print(f"{non_24_segments} / {total_segments} do not have 2:4 structure.")
+
+
+def compress_quantized_24_weight(q_24, size_k, size_n, num_bits):
+ assert q_24.shape == (size_k, size_n)
+
+ # Remove zp to normalize over 0
+ max_q_val = (1 << num_bits) - 1
+ zp = (max_q_val + 1) // 2
+ q_24_no_zp = q_24 - zp
+
+ # Compress
+ q_24_no_zp = q_24_no_zp.t().contiguous()
+ q_24_no_zp_comp, meta = sparse_semi_structured_from_dense_cutlass(
+ q_24_no_zp)
+ q_24_no_zp_comp = q_24_no_zp_comp.t().contiguous()
+
+ # Restore zp
+ q_24_comp = q_24_no_zp_comp + zp
+
+ # Resize meta to its actual shape (without moving any data)
+ meta = meta.resize_(meta.shape[1] // 2, meta.shape[0] * 2)
+
+ return q_24_comp, meta
+
+
+def get_scale_perms_24():
+ scale_perm: List[int] = []
+ for i in range(8):
+ scale_perm.extend([i * 8 + j for j in [0, 4, 1, 5, 2, 6, 3, 7]])
+ scale_perm_single: List[int] = []
+ for i in range(8):
+ scale_perm_single.extend([8 * i + j for j in [0, 1, 2, 3, 4, 5, 6, 7]])
+ return scale_perm, scale_perm_single
+
+
+def get_weight_perm_24(num_bits: int):
+ perm_list: List[int] = []
+ for i in range(32):
+ perm1: List[int] = []
+ col = i // 4
+ col_o = col // 2
+ for block in [0, 1]:
+ for row in [
+ 2 * (i % 4),
+ 2 * (i % 4) + 1,
+ 2 * (i % 4 + 4),
+ 2 * (i % 4 + 4) + 1,
+ ]:
+ perm1.append(16 * row + col_o * 256 + 8 * (col % 2) +
+ 4 * block)
+ for j in range(4):
+ perm_list.extend([p + 1 * j for p in perm1])
+ perm = numpy.array(perm_list)
+
+ if num_bits == 4:
+ interleave = numpy.array([0, 2, 4, 6, 1, 3, 5, 7])
+ elif num_bits == 8:
+ interleave = numpy.array([0, 2, 1, 3])
+ else:
+ raise ValueError("num_bits must be 4 or 8, got {}".format(num_bits))
+
+ perm = perm.reshape((-1, len(interleave)))[:, interleave].ravel()
+ perm = torch.from_numpy(perm)
+ return perm
+
+
+def marlin_permute_scales_24(s: torch.Tensor, size_k: int, size_n: int,
+ group_size: int) -> torch.Tensor:
+
+ scale_perm, scale_perm_single = get_scale_perms_24()
+ if group_size < size_k and group_size != -1:
+ s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
+ else:
+ s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
+ s = s.reshape((-1, size_n)).contiguous()
+
+ return s
+
+
+def marlin_24_quantize(
+ w: torch.Tensor,
+ num_bits: int,
+ group_size: int,
+):
+ size_k, size_n = w.shape
+
+ # Normalize group_size
+ if group_size == -1:
+ group_size = size_k
+ assert group_size <= size_k
+
+ # Inject 2:4 sparsity
+ w_24, mask_24 = inject_24(w, size_k, size_n)
+
+ # Quantize
+ w_24_ref, q_w_24, s, g_idx, rand_perm = quantize_weights(w_24,
+ num_bits,
+ group_size,
+ act_order=False)
+
+ # Compress quantized weight
+ q_w_24_comp, meta = compress_quantized_24_weight(q_w_24, size_k, size_n,
+ num_bits)
+ size_k_comp = size_k // 2
+
+ # Reformat to marlin
+ weight_perm = get_weight_perm_24(num_bits)
+ marlin_24_q_w_comp = marlin_weights(q_w_24_comp, size_k_comp, size_n,
+ num_bits, weight_perm)
+ marlin_24_s = marlin_permute_scales_24(s, size_k, size_n, group_size)
+
+ # Create result
+ res_list = [w_24_ref, marlin_24_q_w_comp, meta, marlin_24_s]
+ for i in range(len(res_list)):
+ res_list[i] = res_list[i].to(w.device)
+
+ return res_list
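
check_24 above samples rows and reports blocks that violate the 2:4 pattern the sparse kernels rely on. A self-contained illustration of that property, independent of the CUTLASS compression helpers: in every contiguous block of 4 values along a row, at most 2 are non-zero.

import torch

w = torch.tensor([
    [1.0, 0.0, 2.0, 0.0, 0.0, 3.0, 0.0, 4.0],
    [0.0, 5.0, 0.0, 6.0, 7.0, 0.0, 8.0, 0.0],
])

BLOCK_SIZE, MAX_NON_ZEROS = 4, 2
for row in w:
    for j in range(0, row.numel(), BLOCK_SIZE):
        block = row[j:j + BLOCK_SIZE]
        assert torch.count_nonzero(block) <= MAX_NON_ZEROS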
diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
index 81b7fdb7833d7..f290a6830c91b 100644
--- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py
@@ -112,7 +112,7 @@ def apply_fp8_linear(
# If dynamic, layer.input_scale is None and x_scale computed from x.
# If static, layer.input_scale is scalar and x_scale is input_scale.
- if bias is None and cutlass_fp8_supported:
+ if cutlass_fp8_supported:
qinput, x_scale = ops.scaled_fp8_quant(input, input_scale)
# Fused GEMM_DQ
@@ -120,7 +120,8 @@ def apply_fp8_linear(
weight,
out_dtype=input.dtype,
scale_a=x_scale,
- scale_b=weight_scale)
+ scale_b=weight_scale,
+ bias=bias)
else:
qinput, x_scale = ops.scaled_fp8_quant(input,
@@ -148,9 +149,6 @@ def apply_int8_linear(
input_scale: torch.Tensor,
bias: Optional[torch.Tensor] = None,
):
- if bias is not None:
- raise NotImplementedError("W8A8 with int8 does not yet support bias.")
-
# ops.scaled_int8_quant supports both dynamic and static quant.
# * dynamic, layer.input_scale is None and x_scale computed from x.
# * static, layer.input_scale is scalar and x_scale is input_scale.
@@ -160,4 +158,5 @@ def apply_int8_linear(
weight,
scale_a=x_scale,
scale_b=weight_scale,
- out_dtype=input.dtype)
+ out_dtype=input.dtype,
+ bias=bias)
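
With the two changes above, the CUTLASS scaled-mm path now takes the bias directly instead of falling back to the unfused path (fp8) or raising (int8). A rough pure-PyTorch sketch of what the fused int8 computation amounts to, assuming per-tensor scales and an [in, out] int8 weight layout for simplicity; the real path uses ops.scaled_int8_quant and ops.cutlass_scaled_mm:

import torch

def int8_linear_reference(x, weight_int8, x_scale, w_scale, bias=None):
    # Quantize the activation, accumulate in int32, then dequantize.
    x_int8 = torch.clamp((x / x_scale).round(), -128, 127).to(torch.int8)
    acc = x_int8.to(torch.int32) @ weight_int8.to(torch.int32)
    out = acc.to(x.dtype) * (x_scale * w_scale)
    if bias is not None:
        out = out + bias  # bias is part of the same (fused) epilogue
    return out

x = torch.randn(4, 8)
w = torch.randint(-128, 127, (8, 16), dtype=torch.int8)
y = int8_linear_reference(x, w, x_scale=0.05, w_scale=0.02, bias=torch.randn(16))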
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 60547965063fa..0b269393294ae 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -279,10 +279,6 @@ def load_model(self, *, model_config: ModelConfig,
quant_method = getattr(module, "quant_method", None)
if quant_method is not None:
quant_method.process_weights_after_loading(module)
- # FIXME: Remove this after Mixtral is updated
- # to use quant_method.
- if hasattr(module, "process_weights_after_loading"):
- module.process_weights_after_loading()
return model.eval()
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 943022a3f03c7..c8568b3dc6690 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -431,6 +431,11 @@ def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
def default_weight_loader(param: torch.Tensor,
loaded_weight: torch.Tensor) -> None:
"""Default weight loader."""
+    # If the weight on disk does not have a shape, give it one
+    # (such as scalar scales from AutoFP8 checkpoints).
+ if len(loaded_weight.shape) == 0:
+ loaded_weight = loaded_weight.reshape(1)
+
assert param.size() == loaded_weight.size()
param.data.copy_(loaded_weight)
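
A small illustration of the case this loader fix handles: FP8 checkpoints may store a scale as a 0-dim tensor while the destination parameter has shape (1,), so the size check would otherwise fail.

import torch

loaded_weight = torch.tensor(0.025)  # shape: torch.Size([])
param = torch.zeros(1)               # shape: torch.Size([1])

if len(loaded_weight.shape) == 0:
    loaded_weight = loaded_weight.reshape(1)

assert param.size() == loaded_weight.size()
param.data.copy_(loaded_weight)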
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 096e3f4724014..87508a1168e0c 100644
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -23,6 +23,7 @@
"DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
"DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
"FalconForCausalLM": ("falcon", "FalconForCausalLM"),
+ "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
"Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
@@ -49,6 +50,7 @@
"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"),
+ "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
"PaliGemmaForConditionalGeneration":
("paligemma", "PaliGemmaForConditionalGeneration"),
"PhiForCausalLM": ("phi", "PhiForCausalLM"),
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index d8fbf796b5d3a..b4f628061f19c 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -214,22 +214,24 @@ class CLIPEncoder(nn.Module):
def __init__(self,
config: CLIPVisionConfig,
- quant_config: Optional[QuantizationConfig] = None):
+ quant_config: Optional[QuantizationConfig] = None,
+ num_hidden_layers_override: Optional[int] = None):
super().__init__()
self.config = config
+
+ if num_hidden_layers_override is None:
+ num_hidden_layers = config.num_hidden_layers
+ else:
+ num_hidden_layers = num_hidden_layers_override
self.layers = nn.ModuleList([
CLIPEncoderLayer(config=config, quant_config=quant_config)
- for _ in range(config.num_hidden_layers)
+ for _ in range(num_hidden_layers)
])
- def forward(self,
- inputs_embeds: torch.Tensor,
- vision_feature_layer: int = -1):
+ def forward(self, inputs_embeds: torch.Tensor):
- # Encoder forward pass only up to the required layer
- num_layer = len(self.layers) + vision_feature_layer + 1
hidden_states = inputs_embeds
- for encoder_layer in self.layers[:num_layer]:
+ for encoder_layer in self.layers:
hidden_states = encoder_layer(hidden_states)
return hidden_states
@@ -239,7 +241,8 @@ class CLIPVisionTransformer(nn.Module):
def __init__(self,
config: CLIPVisionConfig,
- quant_config: Optional[QuantizationConfig] = None):
+ quant_config: Optional[QuantizationConfig] = None,
+ num_hidden_layers_override: Optional[int] = None):
super().__init__()
self.config = config
embed_dim = config.hidden_size
@@ -249,18 +252,19 @@ def __init__(self,
# NOTE: This typo of "layrnorm" is not fixed on purpose to match
# the original transformers code and name of the model weights.
self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
- self.encoder = CLIPEncoder(config=config, quant_config=quant_config)
+ self.encoder = CLIPEncoder(
+ config=config,
+ quant_config=quant_config,
+ num_hidden_layers_override=num_hidden_layers_override)
def forward(
self,
pixel_values: torch.Tensor,
- vision_feature_layer: int = -1,
) -> torch.Tensor:
hidden_states = self.embeddings(pixel_values)
hidden_states = self.pre_layrnorm(hidden_states)
- hidden_states = self.encoder(inputs_embeds=hidden_states,
- vision_feature_layer=vision_feature_layer)
+ hidden_states = self.encoder(inputs_embeds=hidden_states)
return hidden_states
@@ -272,17 +276,17 @@ class CLIPVisionModel(nn.Module):
def __init__(self,
config: CLIPVisionConfig,
- quant_config: Optional[QuantizationConfig] = None):
+ quant_config: Optional[QuantizationConfig] = None,
+ num_hidden_layers_override: Optional[int] = None):
super().__init__()
- self.vision_model = CLIPVisionTransformer(config=config,
- quant_config=quant_config)
+ self.vision_model = CLIPVisionTransformer(
+ config=config,
+ quant_config=quant_config,
+ num_hidden_layers_override=num_hidden_layers_override)
- def forward(self,
- pixel_values: Optional[torch.Tensor] = None,
- vision_feature_layer: int = -1):
+ def forward(self, pixel_values: Optional[torch.Tensor] = None):
- return self.vision_model(pixel_values=pixel_values,
- vision_feature_layer=vision_feature_layer)
+ return self.vision_model(pixel_values=pixel_values)
@property
def device(self):
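
The num_hidden_layers_override plumbing above lets callers build only the encoder layers needed to reach their feature layer. A small sketch of how the override is derived from a (possibly negative) vision_feature_layer index, mirroring the llava.py and phi3v.py changes later in this diff:

def layers_to_keep(num_hidden_layers: int, vision_feature_layer: int) -> int:
    if vision_feature_layer < 0:
        return num_hidden_layers + vision_feature_layer + 1
    return vision_feature_layer + 1

assert layers_to_keep(24, -2) == 23  # feature layer -2 of a 24-layer CLIP
assert layers_to_keep(24, -1) == 24
assert layers_to_keep(24, 5) == 6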
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index fb4097fd1e9b3..2d12ceb7f3dbf 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -29,11 +29,10 @@
from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
- get_tensor_model_parallel_world_size,
+from vllm.distributed import (get_tensor_model_parallel_world_size,
tensor_model_parallel_all_reduce)
from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe import fused_experts, grouped_topk
+from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
MergedColumnParallelLinear,
@@ -91,32 +90,34 @@ def __init__(
quant_config: Optional[QuantizationConfig] = None,
):
super().__init__()
- self.config = config
- self.rank = get_tensor_model_parallel_rank()
self.tp_size = get_tensor_model_parallel_world_size()
- self.n_routed_experts = config.n_routed_experts
- self.top_k = config.num_experts_per_tok
self.routed_scaling_factor = config.routed_scaling_factor
- if self.tp_size > self.n_routed_experts:
+ self.n_shared_experts = config.n_shared_experts
+ self.routed_scaling_factor = config.routed_scaling_factor
+ if self.tp_size > config.n_routed_experts:
raise ValueError(
f"Tensor parallel size {self.tp_size} is greater than "
- f"the number of experts {self.n_routed_experts}.")
-
- self.experts = nn.ModuleList([
- DeepseekV2MLP(hidden_size=config.hidden_size,
- intermediate_size=config.moe_intermediate_size,
- hidden_act=config.hidden_act,
- quant_config=quant_config,
- reduce_results=False)
- for idx in range(self.n_routed_experts)
- ])
- self.pack_params()
+ f"the number of experts {config.n_routed_experts}.")
+
+ if config.hidden_act != "silu":
+ raise ValueError(f"Unsupported activation: {config.hidden_act}. "
+ "Only silu is supported for now.")
+
+ self.experts = FusedMoE(num_experts=config.n_routed_experts,
+ top_k=config.num_experts_per_tok,
+ hidden_size=config.hidden_size,
+ intermediate_size=config.moe_intermediate_size,
+ reduce_results=False,
+ renormalize=config.norm_topk_prob,
+ quant_config=quant_config,
+ use_grouped_topk=True,
+ num_expert_group=config.n_group,
+ topk_group=config.topk_group)
self.gate = ReplicatedLinear(config.hidden_size,
- self.n_routed_experts,
+ config.n_routed_experts,
bias=False,
quant_config=None)
-
if config.n_shared_experts is not None:
intermediate_size = (config.moe_intermediate_size *
config.n_shared_experts)
@@ -128,50 +129,21 @@ def __init__(
reduce_results=False,
)
- def pack_params(self):
- w1 = []
- w2 = []
- for expert in self.experts:
- w1.append(expert.gate_up_proj.weight)
- w2.append(expert.down_proj.weight)
- self.w1 = torch._utils._flatten_dense_tensors(w1)
- w1s = torch._utils._unflatten_dense_tensors(self.w1, w1)
- for data, param in zip(w1s, w1):
- param.data = data
- self.w1 = self.w1.view(len(w1), *w1s[0].shape)
-
- self.w2 = torch._utils._flatten_dense_tensors(w2)
- w2s = torch._utils._unflatten_dense_tensors(self.w2, w2)
- for data, param in zip(w2s, w2):
- param.data = data
-
- self.w2 = self.w2.view(len(w2), *w2s[0].shape)
-
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
num_tokens, hidden_dim = hidden_states.shape
hidden_states = hidden_states.view(-1, hidden_dim)
- if self.config.n_shared_experts is not None:
+ if self.n_shared_experts is not None:
shared_output = self.shared_experts(hidden_states)
# router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states)
- topk_weights, topk_ids = grouped_topk(
- hidden_states,
- router_logits,
- self.top_k,
- renormalize=self.config.norm_topk_prob,
- num_expert_group=self.config.n_group,
- topk_group=self.config.topk_group)
- final_hidden_states = fused_experts(
- hidden_states,
- self.w1,
- self.w2,
- topk_weights,
- topk_ids,
- inplace=True) * self.routed_scaling_factor
- if self.config.n_shared_experts is not None:
+ final_hidden_states = self.experts(
+ hidden_states=hidden_states,
+ router_logits=router_logits) * self.routed_scaling_factor
+ if shared_output is not None:
final_hidden_states = final_hidden_states + shared_output
- final_hidden_states = tensor_model_parallel_all_reduce(
- final_hidden_states)
+ if self.tp_size > 1:
+ final_hidden_states = tensor_model_parallel_all_reduce(
+ final_hidden_states)
return final_hidden_states.view(num_tokens, hidden_dim)
@@ -504,34 +476,58 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
("gate_up_proj", "up_proj", 1),
]
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ ckpt_gate_proj_name="gate_proj",
+ ckpt_down_proj_name="down_proj",
+ ckpt_up_proj_name="up_proj",
+ num_experts=self.config.n_routed_experts)
+
params_dict = dict(self.named_parameters())
for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ # Skip non-stacked layers and experts (experts handled below).
if weight_name not in name:
continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since the experts are handled below in expert_params_mapping,
+                # we need to skip here BEFORE the name is updated; otherwise
+                # the name would first be rewritten to
+                # mlp.experts[0].gate_up_proj and then remapped again by
+                # expert_params_mapping, which breaks loading.
+ if (("mlp.experts." in name) and name not in params_dict):
+ continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
- # Skip experts that are not assigned to this worker.
- if (("mlp.experts." in name or "mlp.shared_experts." in name)
- and name not in params_dict):
- continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
- # Skip loading extra bias for GPTQ models.
- if name.endswith(".bias") and name not in params_dict:
- continue
- # Skip experts that are not assigned to this worker.
- if (("mlp.experts." in name or "mlp.shared_experts." in name)
- and name not in params_dict):
- continue
- param = params_dict[name]
- weight_loader = getattr(param, "weight_loader",
- default_weight_loader)
- weight_loader(param, loaded_weight)
+ for mapping in expert_params_mapping:
+ param_name, weight_name, expert_id, shard_id = mapping
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param,
+ loaded_weight,
+ weight_name,
+ shard_id=shard_id,
+ expert_id=expert_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
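
DeepseekV2MoE now delegates routing to FusedMoE with use_grouped_topk enabled. A rough sketch of what grouped top-k routing does (a simplification; the real FusedMoE path also handles the renormalize flag above and runs fused expert kernels): experts are split into num_expert_group groups, the best topk_group groups are kept by their maximum router score, and the final top_k experts are chosen only inside those groups.

import torch

def grouped_topk_sketch(scores, top_k, num_expert_group, topk_group):
    num_tokens, num_experts = scores.shape
    group_scores = scores.view(num_tokens, num_expert_group, -1).max(dim=-1).values
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1)
    score_mask = group_mask.unsqueeze(-1).expand(
        num_tokens, num_expert_group,
        num_experts // num_expert_group).reshape(num_tokens, num_experts)
    masked_scores = scores.masked_fill(score_mask == 0, float("-inf"))
    return torch.topk(masked_scores, k=top_k, dim=-1)

topk_weights, topk_ids = grouped_topk_sketch(
    torch.randn(4, 16).softmax(dim=-1), top_k=2, num_expert_group=4, topk_group=2)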
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
new file mode 100644
index 0000000000000..fdea8ee30ce68
--- /dev/null
+++ b/vllm/model_executor/models/fuyu.py
@@ -0,0 +1,328 @@
+# coding=utf-8
+# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Fuyu model."""
+import math
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from PIL import Image
+from transformers import FuyuConfig, FuyuImageProcessor
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, MultiModalConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import ColumnParallelLinear
+from vllm.model_executor.layers.quantization.base_config import (
+ QuantizationConfig)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.persimmon import PersimmonForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.base import MultiModalInputs
+from vllm.multimodal.image import (cached_get_image_processor,
+ cached_get_tokenizer)
+from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
+
+from .interfaces import SupportsVision
+from .utils import merge_vision_embeddings
+
+logger = init_logger(__name__)
+
+# The following 2 token IDs cannot be found in the HF config.
+_IMAGE_TOKEN_ID = 71011
+_NEWLINE_TOKEN_ID = 71019
+
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080
+MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920
+
+
+class FuyuImagePixelInputs(TypedDict):
+ type: Literal["pixel_values"]
+ data: torch.Tensor
+ """
+ Shape:
+ (batch_size, num_patches, patch_size_x * patch_size_y * num_channels)
+ """
+
+
+def _calculate_num_image_tokens(
+ height: int,
+ width: int,
+) -> Tuple[int, int]:
+ """
+    Calculate the number of image tokens needed for a given image size.
+    The expected Fuyu image prompt is in the format:
+        (image_token * ncols + newline_token) * nrows
+    args:
+        height, width: int - height and width of the image in pixels
+ returns:
+ ncols: int - number of image tokens in x direction
+ nrows: int - number of image tokens in y direction
+ """
+ ncol = math.ceil(width / 30)
+ nrow = math.ceil(height / 30)
+ return ncol, nrow
+
+
+def get_max_fuyu_image_feature_size():
+
+ return _calculate_num_image_tokens(
+ height=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
+ width=MAX_IMAGE_FEATURE_SIZE_WIDTH,
+ )
+
+
+def get_max_fuyu_image_tokens(ctx: InputContext):
+ ncol, nrow = get_max_fuyu_image_feature_size()
+ return (ncol + 1) * nrow
+
+
+def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int):
+ ncol, nrow = get_max_fuyu_image_feature_size()
+ image_feature_size = get_max_fuyu_image_tokens(ctx)
+
+ token_ids = ([_IMAGE_TOKEN_ID] * ncol + [_NEWLINE_TOKEN_ID]) * nrow
+ token_ids += [0] * (seq_len - image_feature_size)
+ return SequenceData(token_ids)
+
+
+def dummy_image_for_fuyu(
+ image_width: int,
+ image_height: int,
+):
+ image = Image.new("RGB", (image_width, image_height), color=0)
+ return {"image": image}
+
+
+def dummy_data_for_fuyu(ctx: InputContext, seq_len: int):
+ seq_data = dummy_seq_data_for_fuyu(ctx, seq_len)
+ mm_data = dummy_image_for_fuyu(MAX_IMAGE_FEATURE_SIZE_WIDTH,
+ MAX_IMAGE_FEATURE_SIZE_HEIGHT)
+ return seq_data, mm_data
+
+
+def _fuyu_image_preprocess(image_processor: FuyuImageProcessor,
+ data: Image.Image):
+ image_encoding = image_processor.preprocess(data, return_tensors="pt")
+ batch_images = torch.stack([img[0] for img in image_encoding["images"]
+ ]).unsqueeze(1)
+ image_unpadded_heights = torch.tensor(
+ image_encoding["image_unpadded_heights"])
+ image_unpadded_widths = torch.tensor(
+ image_encoding["image_unpadded_widths"])
+
+ batch_size = len(image_encoding["images"])
+ image_present = torch.ones(batch_size, 1, 1)
+ model_image_input = image_processor.preprocess_with_tokenizer_info(
+ image_input=batch_images,
+ image_present=image_present,
+ image_unpadded_h=image_unpadded_heights,
+ image_unpadded_w=image_unpadded_widths,
+ image_placeholder_id=_IMAGE_TOKEN_ID,
+ image_newline_id=_NEWLINE_TOKEN_ID,
+ variable_sized=True,
+ )
+ return model_image_input
+
+
+def input_processor_for_fuyu(ctx: InputContext, llm_inputs: LLMInputs):
+ multi_modal_data = llm_inputs.get("multi_modal_data")
+ if multi_modal_data is None or "image" not in multi_modal_data:
+ return llm_inputs
+
+ model_config = ctx.model_config
+ image_data = multi_modal_data["image"]
+ new_multi_modal_data = {}
+ # process image data
+ if isinstance(image_data, Image.Image):
+ # Fuyu's image_processor can also finish token padding
+ image_processor: FuyuImageProcessor = cached_get_image_processor(
+ model_config.model)
+
+ model_image_input = _fuyu_image_preprocess(image_processor, image_data)
+ image_patches = torch.stack([
+ image_patch[0]
+ for image_patch in model_image_input["image_patches"]
+ ])
+ new_multi_modal_data["image"] = image_patches
+
+ elif isinstance(image_data, torch.Tensor):
+ raise NotImplementedError("Embeddings input is not supported yet")
+ else:
+ raise TypeError(f"Invalid image type: {type(image_data)}")
+
+ # process prompts
+ prompt = llm_inputs["prompt"]
+ prompt_token_ids = llm_inputs["prompt_token_ids"]
+ tokenizer = cached_get_tokenizer(model_config.model)
+ # dim0 is batch_size, dim1 is subseq_size which will always be 1
+ image_input_ids: List[List[
+ torch.Tensor]] = model_image_input["image_input_ids"]
+ image_input_ids = image_input_ids[0][0].tolist()
+    bos_token = tokenizer.encode("<s>", add_special_tokens=False)[1:]
+ boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:]
+
+ new_prompt = prompt + "\x04"
+ new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[
+ 1:] + boa_token
+
+ return LLMInputs(prompt=new_prompt,
+ prompt_token_ids=new_prompt_token_ids,
+ multi_modal_data=new_multi_modal_data)
+
+
+def input_mapper_for_fuyu(ctx: InputContext, data: object):
+ model_config = ctx.model_config
+ if isinstance(data, Image.Image):
+ # Fuyu's image_processor can also finish token padding
+ image_processor: FuyuImageProcessor = cached_get_image_processor(
+ model_config.model)
+
+ model_image_input = _fuyu_image_preprocess(image_processor, data)
+ data = torch.stack([
+ image_patch[0]
+ for image_patch in model_image_input["image_patches"]
+ ])
+
+    # The image has already been processed with the prompt in the input processor.
+ return MultiModalInputs({"image_patches": data})
+
+
+@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
+@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens)
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu)
+class FuyuForCausalLM(nn.Module, SupportsVision):
+
+ def __init__(self,
+ config: FuyuConfig,
+ multimodal_config: MultiModalConfig,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None) -> None:
+ super().__init__()
+ self.config = config
+ self.multimodal_config = multimodal_config
+
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.image_token_id = _IMAGE_TOKEN_ID
+ self.image_feature_size = config.patch_size**2 * config.num_channels
+
+ self.vision_embed_tokens = ColumnParallelLinear(
+ self.image_feature_size,
+ config.hidden_size,
+ quant_config=quant_config,
+ )
+ self.language_model = PersimmonForCausalLM(config,
+ cache_config=cache_config,
+ quant_config=quant_config)
+
+ def _parse_and_validate_image_input(self, **kwargs: object):
+ image_patches = kwargs.pop("image_patches", None)
+
+ if isinstance(image_patches, torch.Tensor):
+ expected_feature_size = self.image_feature_size
+ if image_patches.size(-1) != expected_feature_size:
+ raise ValueError(
+ f"Expected image patches to have the last dimension of "
+ f"{expected_feature_size}, got {image_patches.size(-1)}")
+ image_patches = image_patches.to(
+ self.vision_embed_tokens.weight.dtype)
+ return FuyuImagePixelInputs(type="pixel_values",
+ data=image_patches)
+ return None
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+ intermediate_tensors: Optional[IntermediateTensors] = None,
+ **kwargs: object,
+ ):
+ image_input = self._parse_and_validate_image_input(**kwargs)
+
+ if image_input is not None:
+ vision_embeddings, _ = self.vision_embed_tokens(
+ image_input["data"])
+ inputs_embeds = self.language_model.model.embed_tokens(input_ids)
+ inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds,
+ vision_embeddings,
+ self.image_token_id)
+
+ else:
+ inputs_embeds = None
+
+ hidden_states = self.language_model(
+ input_ids=input_ids,
+ positions=positions,
+ kv_caches=kv_caches,
+ attn_metadata=attn_metadata,
+ inputs_embeds=inputs_embeds,
+ )
+ return hidden_states
+
+ def compute_logits(self, hidden_states: torch.Tensor,
+ sampling_metadata: SamplingMetadata) -> torch.Tensor:
+ logits = self.language_model.logits_processor(
+ self.language_model.lm_head, hidden_states, sampling_metadata)
+ return logits
+
+ def sample(
+ self,
+ logits: torch.Tensor,
+ sampling_metadata: SamplingMetadata,
+ ) -> Optional[SamplerOutput]:
+ next_tokens = self.language_model.sampler(logits, sampling_metadata)
+ return next_tokens
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ params_dict = dict(self.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if ("rotary_emb.cos_cached" in name
+ or "rotary_emb.sin_cached" in name):
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ param = params_dict[name]
+
+ if "query_key_value" in name:
+                # copied from vllm/model_executor/models/bloom.py
+ # NOTE: Fuyu's fused QKV's output_dim has the shape of
+ # (num_heads * 3 * head_size), while the
+ # required shape is (3 * num_heads * head_size).
+ # Thus, we need weight conversion.
+ output_dim = getattr(param, "output_dim", None)
+ num_heads = self.config.num_attention_heads
+ if output_dim is not None:
+ loaded_weight_shape = loaded_weight.shape
+ loaded_weight = loaded_weight.view(
+ loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
+ loaded_weight_shape[output_dim + 1:])
+ loaded_weight = loaded_weight.transpose(
+ output_dim, output_dim + 1)
+ loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
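
A worked example of the image-token bookkeeping in fuyu.py: the image is tiled into 30x30 patches, with one image token per patch plus a newline token per row, which is what get_max_fuyu_image_tokens returns for the maximum 1920x1080 feature size.

import math

width, height = 1920, 1080            # MAX_IMAGE_FEATURE_SIZE_WIDTH / _HEIGHT
ncol = math.ceil(width / 30)          # 64 image tokens per row
nrow = math.ceil(height / 30)         # 36 rows
max_image_tokens = (ncol + 1) * nrow  # +1 newline token per row
assert max_image_tokens == 2340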
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 16548c6c1e8c7..7e0888b5f5abd 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -277,6 +277,7 @@ def forward(
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
+ intermediate_tensors: Optional[IntermediateTensors] = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if inputs_embeds is not None:
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index be19f4ba8c71e..d309a2b27f5dd 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -27,7 +27,6 @@
from vllm.config import CacheConfig
from vllm.distributed.parallel_state import (
get_pp_group, get_tensor_model_parallel_world_size)
-from vllm.distributed.utils import get_pp_indices
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
QKVParallelLinear,
@@ -42,6 +41,8 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors, SamplerOutput
+from .utils import is_pp_missing_parameter, make_layers
+
class GPT2Attention(nn.Module):
@@ -183,18 +184,9 @@ def __init__(
self.embed_dim = config.hidden_size
self.wte = VocabParallelEmbedding(config.vocab_size, self.embed_dim)
self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
- self.start_layer, self.end_layer = get_pp_indices(
+ self.start_layer, self.end_layer, self.h = make_layers(
config.num_hidden_layers,
- get_pp_group().rank_in_group,
- get_pp_group().world_size)
- self.h = nn.ModuleList(
- [nn.Identity() for _ in range(self.start_layer)] + [
- GPT2Block(config, cache_config, quant_config)
- for _ in range(self.start_layer, self.end_layer)
- ] + [
- nn.Identity()
- for _ in range(self.end_layer, config.num_hidden_layers)
- ])
+ lambda: GPT2Block(config, cache_config, quant_config))
self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
def forward(
@@ -291,19 +283,20 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
continue
if not name.startswith("transformer."):
name = "transformer." + name
- try:
- param = params_dict[name]
- # The HF's GPT-2 implementation uses Conv1D instead of Linear.
- # Because of this, we need to transpose the weights.
- # Note(zhuohan): the logic below might break quantized models.
- for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
- if conv1d_weight_name not in name:
- continue
- if not name.endswith(".weight"):
- continue
- loaded_weight = loaded_weight.t()
- weight_loader = getattr(param, "weight_loader",
- default_weight_loader)
- weight_loader(param, loaded_weight)
- except KeyError:
+
+ if is_pp_missing_parameter(name, self):
continue
+
+ param = params_dict[name]
+ # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+ # Because of this, we need to transpose the weights.
+ # Note(zhuohan): the logic below might break quantized models.
+ for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+ if conv1d_weight_name not in name:
+ continue
+ if not name.endswith(".weight"):
+ continue
+ loaded_weight = loaded_weight.t()
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
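
make_layers replaces the hand-rolled pipeline-parallel partitioning that gpt2.py (and llama.py below) previously built inline. A simplified sketch of that partitioning; the real helper derives the rank and world size from get_pp_group / get_pp_indices instead of taking them as arguments:

import torch.nn as nn

def make_layers_sketch(num_layers, layer_fn, pp_rank, pp_world_size):
    per_rank = num_layers // pp_world_size
    start, end = pp_rank * per_rank, (pp_rank + 1) * per_rank
    # Ranks keep nn.Identity() placeholders for layers they do not own,
    # so indexing by global layer id still works.
    layers = nn.ModuleList(
        [nn.Identity() for _ in range(start)] +
        [layer_fn() for _ in range(start, end)] +
        [nn.Identity() for _ in range(end, num_layers)])
    return start, end, layers

start, end, layers = make_layers_sketch(
    12, lambda: nn.Linear(8, 8), pp_rank=1, pp_world_size=2)
assert (start, end) == (6, 12)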
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index cc42413d53f4c..fc4e13bbb0e68 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -235,7 +235,7 @@ def forward(
class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
packed_modules_mapping = {"c_attn": ["c_attn"]}
- supported_lora_modules = ["c_fc", "c_proj", "wte", "lm_head", "c_attn"]
+ supported_lora_modules = ["c_fc", "c_proj", "wte", "c_attn"]
embedding_modules = {
"wte": "input_embeddings",
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 77edcd7402db1..a777d1fbfa802 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -29,8 +29,7 @@
from vllm.attention import Attention, AttentionMetadata
from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import (get_pp_group, get_pp_indices,
- get_tensor_model_parallel_rank,
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -51,6 +50,7 @@
from vllm.utils import is_hip, print_warning_once
from .interfaces import SupportsLoRA
+from .utils import is_pp_missing_parameter, make_layers
class LlamaMLP(nn.Module):
@@ -262,20 +262,11 @@ def __init__(
config.hidden_size,
org_num_embeddings=config.vocab_size,
)
- self.start_layer, self.end_layer = get_pp_indices(
+ self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
- get_pp_group().rank_in_group,
- get_pp_group().world_size)
- self.layers = nn.ModuleList(
- [nn.Identity() for _ in range(self.start_layer)] + [
- LlamaDecoderLayer(config=config,
- cache_config=cache_config,
- quant_config=quant_config)
- for _ in range(self.start_layer, self.end_layer)
- ] + [
- nn.Identity()
- for _ in range(self.end_layer, config.num_hidden_layers)
- ])
+ lambda: LlamaDecoderLayer(config=config,
+ cache_config=cache_config,
+ quant_config=quant_config))
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
@@ -455,12 +446,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
- try:
- param = params_dict[name]
- weight_loader = param.weight_loader
- weight_loader(param, loaded_weight, shard_id)
- except KeyError:
- pass
+
+ if is_pp_missing_parameter(name, self):
+ continue
+
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+
break
else:
# Skip loading extra bias for GPTQ models.
@@ -479,13 +472,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
continue
else:
name = remapped_kv_scale_name
- try:
- param = params_dict[name]
- weight_loader = getattr(param, "weight_loader",
- default_weight_loader)
- weight_loader(param, loaded_weight)
- except KeyError:
- pass
+
+ if is_pp_missing_parameter(name, self):
+ continue
+
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
# If this function is called, it should always initialize KV cache scale
# factors (or else raise an exception). Thus, handled exceptions should
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 250d3968715ba..b5dddd5192194 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -128,8 +128,17 @@ def __init__(self,
self.config = config
self.multimodal_config = multimodal_config
+ # Initialize the vision tower only up to the required feature layer
+ vision_feature_layer = config.vision_feature_layer
+ if vision_feature_layer < 0:
+ num_hidden_layers = config.vision_config.num_hidden_layers \
+ + vision_feature_layer + 1
+ else:
+ num_hidden_layers = vision_feature_layer + 1
+
# TODO: Optionally initializes this for supporting embeddings.
- self.vision_tower = CLIPVisionModel(config.vision_config)
+ self.vision_tower = CLIPVisionModel(
+ config.vision_config, num_hidden_layers_override=num_hidden_layers)
self.multi_modal_projector = LlavaMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
text_hidden_size=config.text_config.hidden_size,
@@ -193,8 +202,7 @@ def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
# NOTE: we skip the step to select the vision feature layer since
# this is already done inside the vision tower
- image_features = vision_tower(pixel_values,
- self.config.vision_feature_layer)
+ image_features = vision_tower(pixel_values)
return self._select_image_features(
image_features,
@@ -333,7 +341,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
break
else:
use_default_weight_loading = True
- if use_default_weight_loading:
+ if use_default_weight_loading and name in params_dict:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 7e06f1e95dab1..0c89eed88f21a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -74,19 +74,21 @@ def _get_llava_next_num_unpadded_features(
) -> Tuple[int, int]:
current_height = npatches * num_patch_height
current_width = npatches * num_patch_width
+ current_height = torch.tensor(current_height).to("cuda")
+ current_width = torch.tensor(current_width).to("cuda")
aspect_ratio: float = width / height
current_aspect_ratio: float = current_width / current_height
if aspect_ratio > current_aspect_ratio:
- new_height = (height * current_width) // width
- if new_height % 2 == 1:
- new_height += 1
- current_height = new_height
+ scale_factor = current_width / width
+ new_height = int(height * scale_factor)
+ padding = (current_height - new_height) // 2
+ current_height -= padding * 2
else:
- new_width = (width * current_height) // height
- if new_width % 2 == 1:
- new_width += 1
- current_width = new_width
+ scale_factor = current_height / height
+ new_width = int(width * scale_factor)
+ padding = (current_width - new_width) // 2
+ current_width -= padding * 2
unpadded_features = current_height * current_width
newline_features = current_height
@@ -220,8 +222,17 @@ def __init__(self,
self.config = config
self.multimodal_config = multimodal_config
+ # Initialize the vision tower only up to the required feature layer
+ vision_feature_layer = config.vision_feature_layer
+ if vision_feature_layer < 0:
+ num_hidden_layers = config.vision_config.num_hidden_layers \
+ + vision_feature_layer + 1
+ else:
+ num_hidden_layers = vision_feature_layer + 1
+
# TODO: Optionally initializes this for supporting embeddings.
- self.vision_tower = CLIPVisionModel(config=config.vision_config)
+ self.vision_tower = CLIPVisionModel(
+ config.vision_config, num_hidden_layers_override=num_hidden_layers)
self.multi_modal_projector = LlavaMultiModalProjector(
vision_hidden_size=config.vision_config.hidden_size,
text_hidden_size=config.text_config.hidden_size,
@@ -310,8 +321,7 @@ def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
# NOTE: we skip the step to select the vision feature layer since
# this is already done inside the vision tower
- image_features = vision_tower(pixel_values,
- self.config.vision_feature_layer)
+ image_features = vision_tower(pixel_values)
return self._select_image_features(
image_features,
@@ -559,7 +569,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
break
else:
use_default_weight_loading = True
- if use_default_weight_loading:
+ if use_default_weight_loading and name in params_dict:
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
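
A worked example of the rewritten unpadded-feature computation above, in plain Python (no CUDA tensors), using assumed values of npatches=24, a 2x2 patch grid, and a 768x384 input image:

npatches, num_patch_height, num_patch_width = 24, 2, 2
width, height = 768, 384

current_height = npatches * num_patch_height            # 48
current_width = npatches * num_patch_width              # 48
aspect_ratio = width / height                           # 2.0
current_aspect_ratio = current_width / current_height   # 1.0

if aspect_ratio > current_aspect_ratio:
    scale_factor = current_width / width                # 0.0625
    new_height = int(height * scale_factor)             # 24
    padding = (current_height - new_height) // 2        # 12
    current_height -= padding * 2                       # 24
else:
    scale_factor = current_height / height
    new_width = int(width * scale_factor)
    padding = (current_width - new_width) // 2
    current_width -= padding * 2

unpadded_features = current_height * current_width      # 24 * 48 = 1152
newline_features = current_height                       # 24
assert (unpadded_features, newline_features) == (1152, 24)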
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 7f5e3b9699c91..0c456ada61230 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -88,12 +88,13 @@ def __init__(self,
tp_size=tp_size)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
- num_tokens, hidden_size = hidden_states.shape
+ # NOTE: hidden_states can have either 1D or 2D shape.
+ orig_shape = hidden_states.shape
hidden_states = hidden_states.view(-1, self.hidden_size)
# router_logits: (num_tokens, n_experts)
router_logits, _ = self.gate(hidden_states)
final_hidden_states = self.experts(hidden_states, router_logits)
- return final_hidden_states.view(num_tokens, hidden_size)
+ return final_hidden_states.view(orig_shape)
class MixtralAttention(nn.Module):
@@ -371,31 +372,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
("qkv_proj", "v_proj", "v"),
]
- expert_params_mapping = [
- # These are the weight scales for the experts
- # (param_name, weight_name, expert_id, shard_id)
- ("experts.w13_scale"
- if weight_name in ["w1", "w3"] else "experts.w2_scale",
- f"experts.{expert_id}.{weight_name}.weight_scale", expert_id,
- shard_id) for expert_id in range(self.config.num_local_experts)
- for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
- ] + [
- # These are the weights for the experts
- # (param_name, weight_name, expert_id)
- ("experts.w13_weight"
- if weight_name in ["w1", "w3"] else "experts.w2_weight",
- f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
- for expert_id in range(self.config.num_local_experts)
- for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
- ] + [
- # These are the activation scales for the experts
- # (param_name, weight_name, expert_id)
- ("experts.a13_scale"
- if weight_name in ["w1", "w3"] else "experts.a2_scale",
- f"experts.{expert_id}.{weight_name}.input_scale", expert_id,
- shard_id) for expert_id in range(self.config.num_local_experts)
- for shard_id, weight_name in enumerate(["w1", "w2", "w3"])
- ]
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ ckpt_gate_proj_name="w1",
+ ckpt_down_proj_name="w2",
+ ckpt_up_proj_name="w3",
+ num_experts=self.config.num_local_experts)
params_dict = dict(self.named_parameters())
for name, loaded_weight in weights:
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py
index 97f7ec74292bb..d3aec06a92fdb 100644
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -110,7 +110,7 @@ def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None:
])
self.head = nn.ModuleList([
- nn.Linear(self.inner_dim, self.vocab_size, bias=False)
+ ParallelLMHead(self.vocab_size, self.inner_dim, bias=False)
for _ in range(self.max_speculative_tokens)
])
self.ln = nn.ModuleList([
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 2af2bedd8e48e..8a2bacbd96b67 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -19,7 +19,7 @@
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import cached_get_tokenizer
-from vllm.sequence import SamplerOutput, SequenceData
+from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
from .interfaces import SupportsVision
from .utils import merge_vision_embeddings
@@ -111,7 +111,7 @@ def input_processor_for_paligemma(ctx: InputContext, llm_inputs: LLMInputs):
orig_prompt = llm_inputs.get("prompt")
orig_prompt_ids = llm_inputs.get("prompt_token_ids")
- if image_token_str in orig_prompt:
+ if orig_prompt is not None and image_token_str in orig_prompt:
logger.warning(
"The image token '%s' was detected in the prompt and "
"will be removed. Please follow the proper prompt format"
@@ -214,7 +214,9 @@ def _parse_and_validate_image_input(
def _image_pixels_to_features(self, vision_tower: SiglipVisionModel,
pixel_values: torch.Tensor) -> torch.Tensor:
- image_outputs = vision_tower(pixel_values, output_hidden_states=True)
+ target_dtype = vision_tower.get_input_embeddings().weight.dtype
+ image_outputs = vision_tower(pixel_values.to(dtype=target_dtype),
+ output_hidden_states=True)
selected_image_features = image_outputs.last_hidden_state
@@ -236,9 +238,12 @@ def _process_image_input(
return self.multi_modal_projector(image_features)
- def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+ def forward(self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
+ intermediate_tensors: Optional[IntermediateTensors] = None,
**kwargs: object) -> SamplerOutput:
parsed_image_input = self._parse_and_validate_image_input(**kwargs)
@@ -263,6 +268,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
positions,
kv_caches,
attn_metadata,
+ None,
inputs_embeds=inputs_embeds)
return hidden_states
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
new file mode 100644
index 0000000000000..bc38d4421b79e
--- /dev/null
+++ b/vllm/model_executor/models/persimmon.py
@@ -0,0 +1,333 @@
+# coding=utf-8
+# adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only persimmon model compatible with HuggingFace weights."""
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PersimmonConfig
+from transformers.activations import ReLUSquaredActivation
+
+from vllm.attention import Attention, AttentionMetadata
+from vllm.config import CacheConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.linear import (ColumnParallelLinear,
+ QKVParallelLinear,
+ RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+ QuantizationConfig)
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+ ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors, SamplerOutput
+
+
+class PersimmonMLP(nn.Module):
+
+ def __init__(self,
+ config: PersimmonConfig,
+ quant_config: Optional[QuantizationConfig] = None):
+ super().__init__()
+ self.dense_h_to_4h = ColumnParallelLinear(config.hidden_size,
+ config.intermediate_size,
+ quant_config=quant_config)
+ self.dense_4h_to_h = RowParallelLinear(config.intermediate_size,
+ config.hidden_size,
+ quant_config=quant_config)
+ self.act = ReLUSquaredActivation()
+
+ def forward(self, hidden_states) -> torch.Tensor:
+ hidden_states, _ = self.dense_h_to_4h(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states, _ = self.dense_4h_to_h(hidden_states)
+ return hidden_states
+
+
+class PersimmonAttention(nn.Module):
+
+ def __init__(self,
+ config: PersimmonConfig,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None):
+ super().__init__()
+ self.config = config
+ tensor_parallel_world_size = get_tensor_model_parallel_world_size()
+
+ self.hidden_size = config.hidden_size
+ self.total_num_heads = config.num_attention_heads
+ self.num_heads = self.total_num_heads // tensor_parallel_world_size
+ self.head_dim = self.hidden_size // self.total_num_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.partial_rotary_factor = config.partial_rotary_factor
+ self.is_causal = True
+
+ assert (self.head_dim * self.total_num_heads) == self.hidden_size
+ assert self.total_num_heads % tensor_parallel_world_size == 0
+
+ self.query_key_value = QKVParallelLinear(
+ self.hidden_size,
+ self.head_dim,
+ self.total_num_heads,
+ bias=True,
+ quant_config=quant_config,
+ )
+ self.dense = RowParallelLinear(
+ self.num_heads * self.head_dim,
+ self.hidden_size,
+ bias=True,
+ quant_config=quant_config,
+ )
+ self.is_qk_layernorm = config.qk_layernorm
+
+ if self.is_qk_layernorm:
+ self.q_layernorm = nn.LayerNorm(self.head_dim)
+ self.k_layernorm = nn.LayerNorm(self.head_dim)
+
+ self.rotary_emb = get_rope(
+ self.head_dim,
+ rotary_dim=int(self.partial_rotary_factor * self.head_dim),
+ max_position=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ self.scaling = self.head_dim**-0.5
+ self.attn = Attention(self.num_heads,
+ self.head_dim,
+ scale=self.scaling,
+ cache_config=cache_config,
+ quant_config=quant_config)
+
+ def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
+ # [seq_length, hidden_size] -> [seq_length, num_heads, head_dim]
+ seq_length = x.shape[0]
+ return x.view(seq_length, self.num_heads, self.head_dim)
+
+ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
+ # [seq_length, num_heads, head_dim] -> [seq_length, hidden_size]
+ seq_length = x.shape[0]
+ return x.view(seq_length, self.num_heads * self.head_dim)
+
+ def forward(
+ self,
+ position_ids: torch.Tensor,
+ hidden_states: torch.Tensor,
+ kv_cache: torch.Tensor,
+ attn_metadata: AttentionMetadata,
+ ) -> torch.Tensor:
+ # [seq_length, 3 x hidden_size]
+ qkv, _ = self.query_key_value(hidden_states)
+ q, k, v = qkv.chunk(chunks=3, dim=-1)
+
+ if self.is_qk_layernorm:
+ # [seq_length, num_heads, head_dim]
+ q = self._split_heads(q)
+ k = self._split_heads(k)
+
+ q = self.q_layernorm(q)
+ k = self.k_layernorm(k)
+
+ q = self._merge_heads(q)
+ k = self._merge_heads(k)
+
+ q, k = self.rotary_emb(position_ids, q, k)
+ attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+ output, _ = self.dense(attn_output)
+ return output
+
+
+class PersimmonDecoderLayer(nn.Module):
+
+ def __init__(self,
+ config: PersimmonConfig,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = PersimmonAttention(config=config,
+ cache_config=cache_config,
+ quant_config=quant_config)
+ self.mlp = PersimmonMLP(config, quant_config=quant_config)
+ self.input_layernorm = nn.LayerNorm(config.hidden_size,
+ eps=config.layer_norm_eps)
+ self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
+ eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ position_ids: torch.Tensor,
+ hidden_states: torch.Tensor,
+ kv_cache: torch.Tensor,
+ attn_metadata: AttentionMetadata,
+ ) -> torch.Tensor:
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states = self.self_attn(
+ position_ids=position_ids,
+ hidden_states=hidden_states,
+ kv_cache=kv_cache,
+ attn_metadata=attn_metadata,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+
+ hidden_states = hidden_states + residual
+
+ outputs = hidden_states
+ return outputs
+
+
+class PersimmonModel(nn.Module):
+
+ def __init__(self,
+ config: PersimmonConfig,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None):
+ super().__init__()
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+ config.hidden_size)
+ self.layers = nn.ModuleList([
+ PersimmonDecoderLayer(config,
+ cache_config=cache_config,
+ quant_config=quant_config)
+ for _ in range(config.num_hidden_layers)
+ ])
+ self.final_layernorm = nn.LayerNorm(config.hidden_size,
+ eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+ else:
+ hidden_states = self.embed_tokens(input_ids)
+ for i in range(len(self.layers)):
+ hidden_states = self.layers[i](
+ positions,
+ hidden_states,
+ kv_caches[i],
+ attn_metadata,
+ )
+ hidden_states = self.final_layernorm(hidden_states)
+ return hidden_states
+
+
+class PersimmonForCausalLM(nn.Module):
+
+ def __init__(self,
+ config,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None):
+ super().__init__()
+ self.config = config
+ self.vocab_size = config.vocab_size
+ self.model = PersimmonModel(config,
+ cache_config=cache_config,
+ quant_config=quant_config)
+ self.lm_head = ParallelLMHead(config.vocab_size,
+ config.hidden_size,
+ bias=False)
+ self.logits_processor = LogitsProcessor(config.vocab_size)
+ self.sampler = Sampler()
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ kv_caches: List[torch.Tensor],
+ attn_metadata: AttentionMetadata,
+ intermediate_tensors: Optional[IntermediateTensors] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ ):
+ hidden_states = self.model(
+ input_ids=input_ids,
+ positions=positions,
+ kv_caches=kv_caches,
+ attn_metadata=attn_metadata,
+ inputs_embeds=inputs_embeds,
+ )
+ return hidden_states
+
+ def compute_logits(self, hidden_states: torch.Tensor,
+ sampling_metadata: SamplingMetadata) -> torch.Tensor:
+ logits = self.logits_processor(self.lm_head, hidden_states,
+ sampling_metadata)
+ return logits
+
+ def sample(
+ self,
+ logits: torch.Tensor,
+ sampling_metadata: SamplingMetadata,
+ ) -> Optional[SamplerOutput]:
+ next_tokens = self.sampler(logits, sampling_metadata)
+ return next_tokens
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ params_dict = dict(self.named_parameters(remove_duplicate=False))
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if ("rotary_emb.cos_cached" in name
+ or "rotary_emb.sin_cached" in name):
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ param = params_dict[name]
+
+ if "query_key_value" in name:
+ # copy from vllm/model_executor/models/bloom.py
+                # NOTE: Persimmon's fused QKV weight is laid out along its
+                # output dimension as (num_heads * 3 * head_size), while vLLM
+                # expects (3 * num_heads * head_size), so the weight needs to
+                # be converted before loading.
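+                # The view/transpose/reshape below regroups the per-head
+                # [q, k, v] blocks of the checkpoint weight into contiguous
+                # [all q | all k | all v] blocks along the output dimension.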
+ output_dim = getattr(param, "output_dim", None)
+ num_heads = self.config.num_attention_heads
+ if output_dim is not None:
+ loaded_weight_shape = loaded_weight.shape
+ loaded_weight = loaded_weight.view(
+ loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
+ loaded_weight_shape[output_dim + 1:])
+ loaded_weight = loaded_weight.transpose(
+ output_dim, output_dim + 1)
+ loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 1c6bd106b53f5..8b2c425289f0a 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -80,13 +80,11 @@ def __init__(self, wte=None) -> None:
def get_img_features(self,
img_embeds: torch.FloatTensor) -> torch.FloatTensor:
- LAYER_IDX = self.layer_idx
TYPE_FEATURE = self.type_feature
# NOTE: we skip the step to select the vision feature layer since
# this is already done inside the img_processor
- img_feature = self.img_processor(img_embeds,
- vision_feature_layer=LAYER_IDX)
+ img_feature = self.img_processor(img_embeds)
if TYPE_FEATURE == "patch":
patch_feature = img_feature[:, 1:]
@@ -111,7 +109,17 @@ def __init__(self, config: PretrainedConfig, wte=None) -> None:
config, 'n_embd') else config.hidden_size
clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
- self.img_processor = CLIPVisionModel(clip_config)
+ self.layer_idx = config.img_processor.get('layer_idx', -2)
+
+ # Initialize the CLIP only up to the required feature layer
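+        # (e.g. layer_idx=-2 with the 24-layer CLIP ViT-L/14-336 keeps
+        # 24 - 2 + 1 = 23 layers).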
+ if self.layer_idx < 0:
+ num_hidden_layers = clip_config.num_hidden_layers + \
+ self.layer_idx + 1
+ else:
+ num_hidden_layers = self.layer_idx + 1
+
+ self.img_processor = CLIPVisionModel(
+ clip_config, num_hidden_layers_override=num_hidden_layers)
image_dim_out = config.img_processor['image_dim_out']
self.num_img_tokens = config.img_processor['num_img_tokens']
@@ -142,8 +150,6 @@ def __init__(self, config: PretrainedConfig, wte=None) -> None:
self.img_projection = nn.Sequential(*layers)
self.vocab_size = config.vocab_size
-
- self.layer_idx = config.img_processor.get('layer_idx', -2)
self.type_feature = config.img_processor.get('type_feature', 'patch')
def forward(self, input_ids: torch.LongTensor,
@@ -588,7 +594,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
- param = params_dict[name]
- weight_loader = getattr(param, "weight_loader",
- default_weight_loader)
- weight_loader(param, loaded_weight)
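+                # With the vision tower truncated at the feature layer, some
+                # checkpoint weights no longer have a matching parameter;
+                # those are skipped here.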
+ if name in params_dict:
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index ccaa6f20893e0..2cc2f1440d147 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -50,6 +50,7 @@
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors, SamplerOutput
+from vllm.utils import print_warning_once
class Qwen2MoeMLP(nn.Module):
@@ -126,7 +127,9 @@ def __init__(
bias=False)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
- num_tokens, hidden_dim = hidden_states.shape
+        # NOTE: hidden_states can be either 1D or 2D.
+ orig_shape = hidden_states.shape
+ hidden_dim = hidden_states.shape[-1]
hidden_states = hidden_states.view(-1, hidden_dim)
shared_output = None
if self.shared_expert is not None:
@@ -145,7 +148,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
final_hidden_states = tensor_model_parallel_all_reduce(
final_hidden_states)
- return final_hidden_states.view(num_tokens, hidden_dim)
+ return final_hidden_states.view(orig_shape)
class Qwen2MoeAttention(nn.Module):
@@ -404,15 +407,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
("gate_up_proj", "up_proj", 1),
]
- expert_params_mapping = [
- # These are the weights for the experts
- # (param_name, weight_name, expert_id, shard_id)
- ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
- else "experts.w2_weight",
- f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
- for expert_id in range(self.config.num_experts) for shard_id,
- weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
- ]
+ # Params for weights, fp8 weight scales, fp8 activation scales
+ # (param_name, weight_name, expert_id, shard_id)
+ expert_params_mapping = FusedMoE.make_expert_params_mapping(
+ ckpt_gate_proj_name="gate_proj",
+ ckpt_down_proj_name="down_proj",
+ ckpt_up_proj_name="up_proj",
+ num_experts=self.config.num_experts)
params_dict = dict(self.named_parameters())
for name, loaded_weight in weights:
@@ -459,8 +460,20 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
- if name not in params_dict:
- continue
+ # Remapping the name of FP8 kv-scale.
+ if name.endswith("kv_scale"):
+ remapped_kv_scale_name = name.replace(
+ ".kv_scale", ".attn.kv_scale")
+ if remapped_kv_scale_name not in params_dict:
+ print_warning_once(
+ "Found kv scale in the checkpoint "
+ f"(e.g. {name}), but not found the expected "
+ f"name in the model "
+ f"(e.g. {remapped_kv_scale_name}). "
+ "kv-scale is not loaded.")
+ continue
+ else:
+ name = remapped_kv_scale_name
param = params_dict[name]
weight_loader = getattr(param, "weight_loader",
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index ef2562b073e6f..c135b20352203 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -1,3 +1,5 @@
+from typing import Callable, Dict, List, Tuple
+
import torch
from vllm.multimodal import BatchedTensors
@@ -39,3 +41,60 @@ def merge_vision_embeddings(input_ids: torch.Tensor,
inputs_embeds[mask] = torch.cat(vision_embeddings)
return inputs_embeds
+
+
+class PPMissingLayer(torch.nn.Identity):
+ """
+ A placeholder layer for missing layers in a pipeline parallel model.
+ """
+
+ def __init__(self, *args, **kwargs):
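+        # Accept and ignore any constructor arguments so this placeholder
+        # can stand in wherever a real layer would be built.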
+ super().__init__()
+
+
+def make_layers(
+ num_hidden_layers: int, layer_fn: Callable[[], torch.nn.Module]
+) -> Tuple[int, int, torch.nn.ModuleList]:
+ """Make a list of layers with the given layer function, taking
+ pipeline parallelism into account.
+ """
+ from vllm.distributed.parallel_state import get_pp_group
+ from vllm.distributed.utils import get_pp_indices
+ start_layer, end_layer = get_pp_indices(num_hidden_layers,
+ get_pp_group().rank_in_group,
+ get_pp_group().world_size)
+ modules = torch.nn.ModuleList(
+ [PPMissingLayer() for _ in range(start_layer)] +
+ [layer_fn() for _ in range(start_layer, end_layer)] +
+ [PPMissingLayer() for _ in range(end_layer, num_hidden_layers)])
+ return start_layer, end_layer, modules
+
+
+# NOTE: don't use lru_cache here because it can prevent garbage collection
+_model_to_pp_missing_layer_names: Dict[int, List[str]] = {}
+
+
+def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]:
+ """Get the names of the missing layers in a pipeline parallel model."""
+ model_id = id(model)
+ if model_id in _model_to_pp_missing_layer_names:
+ return _model_to_pp_missing_layer_names[model_id]
+
+ missing_layer_names = []
+ for name, module in model.named_modules():
+ if isinstance(module, PPMissingLayer):
+ # NOTE: the trailing dot is used to match the prefix of the layer.
+ # without the dot, we could match a layer that is not missing,
+ # e.g., 'encoder.layer.1' would match 'encoder.layer.11'
+ missing_layer_names.append(name + '.')
+ _model_to_pp_missing_layer_names[model_id] = missing_layer_names
+
+ return missing_layer_names
+
+
+def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool:
+ """Check if a parameter is missing in a pipeline parallel model."""
+ for missing_layer_name in get_pp_missing_layer_names(model):
+ if name.startswith(missing_layer_name):
+ return True
+ return False
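+
+
+# Illustrative use inside a model's load_weights loop (a hypothetical sketch;
+# the actual call sites live in the model implementations):
+#     if is_pp_missing_parameter(name, self):
+#         continue  # the parameter lives on another pipeline stage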
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index b6d930659a8c1..503dceab5b168 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,5 +1,5 @@
-from .base import (BatchedTensors, MultiModalDataDict, MultiModalInputs,
- MultiModalPlugin)
+from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict,
+ MultiModalInputs, MultiModalPlugin)
from .registry import MultiModalRegistry
MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -13,6 +13,7 @@
__all__ = [
"BatchedTensors",
+ "MultiModalDataBuiltins",
"MultiModalDataDict",
"MultiModalInputs",
"MultiModalPlugin",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 0e31816a8e8ac..3ebc25c5930cf 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -43,9 +43,6 @@ def try_concat(
*,
device: torch.types.Device,
) -> BatchedTensors:
- # Avoid initializing CUDA too early
- import torch
-
unbatched_shape = tensors[0].shape[1:]
for tensor in tensors:
@@ -84,16 +81,21 @@ def batch(
class MultiModalDataBuiltins(TypedDict, total=False):
+ """Modality types that are predefined by vLLM."""
+
image: Image.Image
+ """The input image."""
MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
"""
A dictionary containing an item for each modality type to input.
-The data belonging to each modality is converted into keyword arguments
-to the model by the corresponding mapper. By default, the mapper of
-the corresponding plugin with the same modality key is applied.
+Note:
+ This dictionary also accepts modality keys defined outside
+ :class:`MultiModalDataBuiltins` as long as a customized plugin is registered
+ through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+    Read more on that :ref:`here <adding_multimodal_plugin>`.
"""
MultiModalInputMapper = Callable[[InputContext, object], MultiModalInputs]
@@ -123,6 +125,9 @@ class MultiModalPlugin(ABC):
process the same data differently). This registry is in turn used by
:class:`~MultiModalRegistry` which acts at a higher level
(i.e., the modality of the data).
+
+ See also:
+ :ref:`adding_multimodal_plugin`
"""
def __init__(self) -> None:
@@ -183,8 +188,8 @@ def wrapper(model_cls: N) -> N:
def map_input(self, model_config: ModelConfig,
data: object) -> MultiModalInputs:
"""
- Apply an input mapper to a data passed
- to the model, transforming the data into a dictionary of model inputs.
+ Transform the data into a dictionary of model inputs using the
+ input mapper registered for that model.
The model is identified by ``model_config``.
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index b6c73512350f3..3b37ce9149fb8 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -100,6 +100,7 @@ def repeat_and_pad_image_tokens(
class ImagePlugin(MultiModalPlugin):
+ """Plugin for image data."""
def get_data_key(self) -> str:
return "image"
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index e0716bbf15715..d8e1b68178acd 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -15,10 +15,8 @@
class MultiModalRegistry:
"""
- A registry to dispatch data processing
- according to its modality and the target model.
-
- The registry handles both external and internal data input.
+ A registry that dispatches data processing to the
+ :class:`~vllm.multimodal.MultiModalPlugin` for each modality.
"""
DEFAULT_PLUGINS = (ImagePlugin(), )
@@ -30,6 +28,12 @@ def __init__(
self._plugins = {p.get_data_key(): p for p in plugins}
def register_plugin(self, plugin: MultiModalPlugin) -> None:
+ """
+ Register a multi-modal plugin so it can be recognized by vLLM.
+
+ See also:
+ :ref:`adding_multimodal_plugin`
+ """
data_type_key = plugin.get_data_key()
if data_type_key in self._plugins:
@@ -75,7 +79,11 @@ def map_input(self, model_config: ModelConfig,
data: MultiModalDataDict) -> MultiModalInputs:
"""
Apply an input mapper to the data passed to the model.
-
+
+ The data belonging to each modality is passed to the corresponding
+        plugin, which in turn converts the data into keyword arguments
+ via the input mapper registered for that model.
+
See :meth:`MultiModalPlugin.map_input` for more details.
"""
merged_dict: Dict[str, torch.Tensor] = {}
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index e55b8bbfdeaac..8691a61343ab6 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -35,8 +35,12 @@ def _load_image_from_data_url(image_url: str):
return load_image_from_base64(image_base64)
-def fetch_image(image_url: str) -> Image.Image:
- """Load PIL image from a url or base64 encoded openai GPT4V format"""
+def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image:
+ """
+    Load a PIL image from an HTTP or base64 data URL.
+
+ By default, the image is converted into RGB format.
+ """
if image_url.startswith('http'):
_validate_remote_url(image_url, name="image_url")
@@ -53,7 +57,7 @@ def fetch_image(image_url: str) -> Image.Image:
raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
"with either 'data:image' or 'http'.")
- return image
+ return image.convert(image_mode)
class ImageFetchAiohttp:
@@ -70,8 +74,17 @@ def get_aiohttp_client(cls) -> aiohttp.ClientSession:
return cls.aiohttp_client
@classmethod
- async def fetch_image(cls, image_url: str) -> Image.Image:
- """Load PIL image from a url or base64 encoded openai GPT4V format"""
+ async def fetch_image(
+ cls,
+ image_url: str,
+ *,
+ image_mode: str = "RGB",
+ ) -> Image.Image:
+ """
+        Asynchronously load a PIL image from an HTTP or base64 data URL.
+
+ By default, the image is converted into RGB format.
+ """
if image_url.startswith('http'):
_validate_remote_url(image_url, name="image_url")
@@ -91,7 +104,7 @@ async def fetch_image(cls, image_url: str) -> Image.Image:
"Invalid 'image_url': A valid 'image_url' must start "
"with either 'data:image' or 'http'.")
- return image
+ return image.convert(image_mode)
async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
@@ -99,12 +112,19 @@ async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
return {"image": image}
-def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
- """Encode a pillow image to base64 format."""
+def encode_image_base64(
+ image: Image.Image,
+ *,
+ image_mode: str = "RGB",
+ format: str = "JPEG",
+) -> str:
+ """
+ Encode a pillow image to base64 format.
+ By default, the image is converted into RGB format before being encoded.
+ """
buffered = BytesIO()
- if format == 'JPEG':
- image = image.convert('RGB')
+ image = image.convert(image_mode)
image.save(buffered, format)
return base64.b64encode(buffered.getvalue()).decode('utf-8')
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 2d482010cf760..02ba227460e3f 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -34,11 +34,10 @@ def get_physical_device_capability(device_id: int = 0) -> Tuple[int, int]:
def device_id_to_physical_device_id(device_id: int) -> int:
if "CUDA_VISIBLE_DEVICES" in os.environ:
device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
- device_ids = [int(device_id) for device_id in device_ids]
physical_device_id = device_ids[device_id]
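+        # e.g. with CUDA_VISIBLE_DEVICES="3,5", logical device 1 resolves
+        # to physical device 5.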
+ return int(physical_device_id)
else:
- physical_device_id = device_id
- return physical_device_id
+ return device_id
class CudaPlatform(Platform):
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index a2caae21a86e3..ebe5e0fd34135 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -8,6 +8,11 @@
from pydantic import Field
from typing_extensions import Annotated
+import vllm.envs as envs
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
_SAMPLING_EPS = 1e-5
@@ -184,6 +189,18 @@ def __init__(
self._verify_args()
if self.use_beam_search:
+ # Lazy import to avoid circular imports.
+ from vllm.usage.usage_lib import set_runtime_usage_data
+ set_runtime_usage_data("use_beam_search", True)
+
+ if not envs.VLLM_NO_DEPRECATION_WARNING:
+ logger.warning(
+ "[IMPORTANT] We plan to discontinue the support for beam "
+ "search in the next major release. Please refer to "
+ "https://github.com/vllm-project/vllm/issues/6226 for "
+ "more information. Set VLLM_NO_DEPRECATION_WARNING=1 to "
+ "suppress this warning.")
+
self._verify_beam_search()
else:
self._verify_non_beam_search()
diff --git a/vllm/scripts.py b/vllm/scripts.py
new file mode 100644
index 0000000000000..3f334be925ee8
--- /dev/null
+++ b/vllm/scripts.py
@@ -0,0 +1,154 @@
+# The CLI entrypoint to vLLM.
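+#
+# Example invocations (illustrative; assumes the package installs this module
+# as the `vllm` console script):
+#     vllm serve <model_tag>
+#     vllm complete --url http://localhost:8000/v1
+#     vllm chat --system-prompt "You are a helpful assistant."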
+import argparse
+import os
+import signal
+import sys
+from typing import Optional
+
+from openai import OpenAI
+
+from vllm.entrypoints.openai.api_server import run_server
+from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.utils import FlexibleArgumentParser
+
+
+def register_signal_handlers():
+
+ def signal_handler(sig, frame):
+ sys.exit(0)
+
+ signal.signal(signal.SIGINT, signal_handler)
+ signal.signal(signal.SIGTSTP, signal_handler)
+
+
+def serve(args: argparse.Namespace) -> None:
+ # EngineArgs expects the model name to be passed as --model.
+ args.model = args.model_tag
+
+ run_server(args)
+
+
+def interactive_cli(args: argparse.Namespace) -> None:
+    register_signal_handlers()
+
+ base_url = args.url
+ api_key = args.api_key or os.environ.get("OPENAI_API_KEY", "EMPTY")
+ openai_client = OpenAI(api_key=api_key, base_url=base_url)
+
+ if args.model_name:
+ model_name = args.model_name
+ else:
+ available_models = openai_client.models.list()
+ model_name = available_models.data[0].id
+
+ print(f"Using model: {model_name}")
+
+ if args.command == "complete":
+ complete(model_name, openai_client)
+ elif args.command == "chat":
+ chat(args.system_prompt, model_name, openai_client)
+
+
+def complete(model_name: str, client: OpenAI) -> None:
+ print("Please enter prompt to complete:")
+ while True:
+ input_prompt = input("> ")
+
+ completion = client.completions.create(model=model_name,
+ prompt=input_prompt)
+ output = completion.choices[0].text
+ print(output)
+
+
+def chat(system_prompt: Optional[str], model_name: str,
+ client: OpenAI) -> None:
+ conversation = []
+ if system_prompt is not None:
+ conversation.append({"role": "system", "content": system_prompt})
+
+ print("Please enter a message for the chat model:")
+ while True:
+ input_message = input("> ")
+ message = {"role": "user", "content": input_message}
+ conversation.append(message)
+
+ chat_completion = client.chat.completions.create(model=model_name,
+ messages=conversation)
+
+ response_message = chat_completion.choices[0].message
+ output = response_message.content
+
+ conversation.append(response_message)
+ print(output)
+
+
+def _add_query_options(
+ parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+ parser.add_argument(
+ "--url",
+ type=str,
+ default="http://localhost:8000/v1",
+ help="url of the running OpenAI-Compatible RESTful API server")
+ parser.add_argument(
+ "--model-name",
+ type=str,
+ default=None,
+ help=("The model name used in prompt completion, default to "
+ "the first model in list models API call."))
+ parser.add_argument(
+ "--api-key",
+ type=str,
+ default=None,
+ help=(
+ "API key for OpenAI services. If provided, this api key "
+ "will overwrite the api key obtained through environment variables."
+ ))
+ return parser
+
+
+def main():
+ parser = FlexibleArgumentParser(description="vLLM CLI")
+ subparsers = parser.add_subparsers(required=True)
+
+ serve_parser = subparsers.add_parser(
+ "serve",
+ help="Start the vLLM OpenAI Compatible API server",
+ usage="vllm serve [options]")
+ serve_parser.add_argument("model_tag",
+ type=str,
+ help="The model tag to serve")
+ serve_parser = make_arg_parser(serve_parser)
+ serve_parser.set_defaults(dispatch_function=serve)
+
+ complete_parser = subparsers.add_parser(
+ "complete",
+ help=("Generate text completions based on the given prompt "
+ "via the running API server"),
+ usage="vllm complete [options]")
+ _add_query_options(complete_parser)
+ complete_parser.set_defaults(dispatch_function=interactive_cli,
+ command="complete")
+
+ chat_parser = subparsers.add_parser(
+ "chat",
+ help="Generate chat completions via the running API server",
+ usage="vllm chat [options]")
+ _add_query_options(chat_parser)
+ chat_parser.add_argument(
+ "--system-prompt",
+ type=str,
+ default=None,
+ help=("The system prompt to be added to the chat template, "
+ "used for models that support system prompts."))
+ chat_parser.set_defaults(dispatch_function=interactive_cli, command="chat")
+
+ args = parser.parse_args()
+ # One of the sub commands should be executed.
+ if hasattr(args, "dispatch_function"):
+ args.dispatch_function(args)
+ else:
+ parser.print_help()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/vllm/sequence.py b/vllm/sequence.py
index a3f998b94d795..1cebf68d463db 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -3,8 +3,9 @@
import enum
import math
from abc import ABC, abstractmethod
+from collections import defaultdict
from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union
import torch
@@ -916,6 +917,21 @@ def get_all_seq_ids(
return [seq_id for sg in seq_group_metadata_list for seq_id in sg.seq_data]
+def get_all_seq_ids_and_request_ids(
+ seq_group_metadata_list: List[SequenceGroupMetadata]
+) -> Tuple[List[int], Dict[str, Set[int]]]:
+ """Given a list of SequenceGroupMetadata, create a list of all
+ sequence ids.
+ """
+ seq_ids: List[int] = []
+ request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set)
+ for sg in seq_group_metadata_list:
+ for seq_id in sg.seq_data:
+ seq_ids.append(seq_id)
+ request_id_seq_ids_mapping[sg.request_id].add(seq_id)
+ return seq_ids, request_id_seq_ids_mapping
+
+
class HiddenStates:
"""Hidden states corresponding to in-progress sequences.
Used in speculative decoding to pass hidden states from
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index d236fc0f2cb6b..d109d8edc1b0b 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, Set
import torch
@@ -62,6 +62,9 @@ class SpeculativeProposer(ABC):
def get_spec_proposals(
self,
execute_model_req: ExecuteModelRequest,
+ # If set, this contains all sequence IDs that were assigned
+ # bonus tokens in their last forward pass.
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> SpeculativeProposals:
raise NotImplementedError
diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py
index b72740fc3961c..041ce41e91d05 100644
--- a/vllm/spec_decode/medusa_worker.py
+++ b/vllm/spec_decode/medusa_worker.py
@@ -1,5 +1,5 @@
import weakref
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
import torch
@@ -40,6 +40,8 @@ def sampler_output(
self,
execute_model_req: ExecuteModelRequest,
sample_len: int,
+ # Unused parameter.
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> Tuple[List[SamplerOutput], bool]:
"""Run the model forward pass to generate sample_len future tokens.
Returns the list of sampler output, one per layer, along with indicator
@@ -97,12 +99,14 @@ def _prepare_input_tensors(
def get_spec_proposals(
self,
execute_model_req: ExecuteModelRequest,
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> SpeculativeProposals:
"""Produce speculations given an input batch of sequences. The number of
speculative tokens per sequence is determined by max_proposal_len.
"""
- return self._proposer.get_spec_proposals(execute_model_req)
+ return self._proposer.get_spec_proposals(
+ execute_model_req, seq_ids_with_bonus_token_in_last_step)
def _raise_if_unsupported(
self,
diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py
index 6c1c8da57d188..308573348d443 100644
--- a/vllm/spec_decode/mlp_speculator_worker.py
+++ b/vllm/spec_decode/mlp_speculator_worker.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
import torch
@@ -20,6 +20,9 @@ def sampler_output(
self,
execute_model_req: ExecuteModelRequest,
sample_len: int,
+ # Unused parameter. MLPSpeculatorWorker does not use the KV Cache and
+ # therefore does not need this parameter.
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> Tuple[List[SamplerOutput], bool]:
"""Run the model forward pass to generate sample_len future tokens.
Returns the list of sampler output, one per layer, along with indicator
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index c1a02e1d32e85..09a77f9e870fb 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -1,6 +1,6 @@
import copy
import weakref
-from typing import Dict, List, Tuple
+from typing import Dict, List, Set, Tuple
import torch
@@ -51,6 +51,7 @@ def sampler_output(
self,
execute_model_req: ExecuteModelRequest,
sample_len: int,
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> Tuple[List[SamplerOutput], bool]:
"""Run the model forward pass sample_len times. Returns the list of
sampler output, one per model forward pass, along with indicator of
@@ -60,44 +61,142 @@ def sampler_output(
For multi step worker, this indicator shall be True.
"""
self._raise_if_unsupported(execute_model_req)
-
- # Shallow copy input data so modifications (such as appending tokens)
- # do not cause side-effects.
- copied_seq_group_metadata_list = self._shallow_copy_inputs(
- execute_model_req.seq_group_metadata_list)
- copied_execute_model_req = execute_model_req.clone(
- copied_seq_group_metadata_list)
-
+ # Expand the batch for sequences with a bonus token.
+ # Perform a forward pass on the expanded batch and filter the
+ # response to retain only the original sequences' responses.
+ expanded_request, indices_of_seq_with_bonus_tokens =\
+ self._expand_execute_model_request(
+ execute_model_req, seq_ids_with_bonus_token_in_last_step)
# Run model sample_len times.
model_outputs: List[SamplerOutput] = []
if isinstance(self.model_runner, TP1DraftModelRunner):
- copied_execute_model_req.num_steps = sample_len
+ expanded_request.num_steps = sample_len
model_outputs = self.execute_model(
- execute_model_req=copied_execute_model_req)
+ execute_model_req=expanded_request)
else:
# TODO: Remove this branch once DraftModelRunner supports TP>1.
for _ in range(sample_len):
model_output: List[SamplerOutput] = super().execute_model(
- execute_model_req=copied_execute_model_req)
+ execute_model_req=expanded_request)
assert (len(model_output) == 1
), "composing multistep workers not supported"
model_output = model_output[0]
- self._append_new_tokens(model_output,
- copied_seq_group_metadata_list)
+ self._append_new_tokens(
+ model_output, expanded_request.seq_group_metadata_list)
model_outputs.append(model_output)
- return model_outputs, True
+ filtered_model_outputs = self._filter_model_output(
+ model_outputs, indices_of_seq_with_bonus_tokens)
+ return filtered_model_outputs, True
+
+ @staticmethod
+ def _expand_execute_model_request(
+ execute_model_req: ExecuteModelRequest,
+        seq_with_bonus_token_in_last_step: Set[int],
+ ) -> Tuple[ExecuteModelRequest, List[int]]:
+ """
+ Expands the execute model request based on sequences with bonus
+ tokens.
+
+ For each sequence with a bonus token, this method creates a new
+ sequence without the bonus token and adds it to the execute model
+ request. The original sequence groups are also retained. The indices
+ of the original sequence groups are returned for further processing.
+
+ Args:
+ execute_model_req (ExecuteModelRequest): The original execute
+ model request.
+ seq_with_bonus_token_in_last_step (set): Set of sequence IDs that
+ contain bonus tokens.
+
+ Returns:
+ Tuple[ExecuteModelRequest, List[int]]: The updated execute model
+ request with expanded sequences and a list of indices corresponding
+ to the original sequence groups.
+ """
+ updated_seq_group_metadata_list: List[SequenceGroupMetadata] = []
+ updated_execute_model_req = execute_model_req.clone(
+ updated_seq_group_metadata_list)
+ indices_of_original_sequence_groups = []
+ for seq_group in execute_model_req.seq_group_metadata_list:
+ seq_group_has_bonus_tokens = False
+ for seq_id, _ in seq_group.seq_data.items():
+ # Identify sequences with bonus tokens in the sequence group.
+ if seq_id in seq_with_bonus_token_in_last_step:
+ seq_group_has_bonus_tokens = True
+ break
+ if seq_group_has_bonus_tokens:
+                # Create new sequences without the last bonus token. These new
+                # sequences have the same sequence id as the original sequence.
+ # We create a new sequence group and add them there.
+ updated_seq_group_without_bonus_token = \
+ MultiStepWorker._copy_seq_metadata_excluding_last_token(
+ seq_group, seq_with_bonus_token_in_last_step)
+ updated_seq_group_metadata_list.append(
+ updated_seq_group_without_bonus_token)
+ # Add the original sequence group.
+ updated_seq_group_metadata_list.append(
+ MultiStepWorker._shallow_copy_seq_group_metadata(seq_group))
+ # Record the index of the original sequence group.
+ indices_of_original_sequence_groups.append(
+ len(updated_seq_group_metadata_list) - 1)
+
+ updated_execute_model_req.seq_group_metadata_list =\
+ updated_seq_group_metadata_list
+ return updated_execute_model_req, indices_of_original_sequence_groups
+
+ @staticmethod
+ def _filter_model_output(
+ expanded_batch_outputs: List[SamplerOutput],
+ output_indices_to_retain: List[int]) -> List[SamplerOutput]:
+ """
+ Filters the model output to include only the specified sequence
+ outputs. This method contracts the expanded batch output from the
+ model to retain the outputs of only those sequences indicated by the
+ provided indices.
+
+ Args:
+            expanded_batch_outputs (List[SamplerOutput]): The expanded outputs
+                from the model, one per forward pass.
+ output_indices_to_retain (List[int]): Indices of the model outputs
+ to retain.
+
+ Returns:
+ List[SamplerOutput]: A list containing the filtered model
+ outputs for the specified indices.
+ """
+ return [
+ SamplerOutput(
+ outputs=[
+ expanded_batch_output.outputs[i]
+ for i in output_indices_to_retain
+ ],
+ sampled_token_probs=(
+ expanded_batch_output.
+ sampled_token_probs[output_indices_to_retain]
+ if expanded_batch_output.sampled_token_probs is not None
+ else None),
+ logprobs=(
+ expanded_batch_output.logprobs[output_indices_to_retain]
+ if expanded_batch_output.logprobs is not None else None),
+ sampled_token_ids=(expanded_batch_output.
+ sampled_token_ids[output_indices_to_retain]
+ if expanded_batch_output.sampled_token_ids
+ is not None else None))
+ for expanded_batch_output in expanded_batch_outputs
+ ]
def get_spec_proposals(
self,
execute_model_req: ExecuteModelRequest,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
) -> SpeculativeProposals:
"""Produce speculations given an input batch of sequences. The number of
speculative tokens per sequence is determined by max_proposal_len.
"""
-
- return self._proposer.get_spec_proposals(execute_model_req)
+ return self._proposer.get_spec_proposals(
+ execute_model_req, seq_ids_with_bonus_token_in_last_step)
@staticmethod
def _append_new_tokens(
@@ -123,9 +222,8 @@ def _append_new_tokens(
seq.update_num_computed_tokens(1)
@staticmethod
- def _shallow_copy_inputs(
- seq_group_metadata_list: List[SequenceGroupMetadata]
- ) -> List[SequenceGroupMetadata]:
+ def _shallow_copy_seq_group_metadata(
+ seq_group_metadata: SequenceGroupMetadata, ) -> SequenceGroupMetadata:
"""Copy input data structures to remove side-effects when input data
structures are shared with other modules.
@@ -133,26 +231,62 @@ def _shallow_copy_inputs(
The alternative is deep-copying (or other form of deep copy); this has
performance downsides.
"""
-
- # Shallow-copy the list of SequenceGroupMetadata. This allows us to
+ # Shallow-copy the SequenceGroupMetadata. This allows us to
# append tokens and change is_prompt without external side-effects.
- new_seq_group_metadata_list: List[SequenceGroupMetadata] = []
+ # We must shallow-copy seq_group_metadata as is_prompt could change.
+ new_seq_group_metadata = copy.copy(seq_group_metadata)
- for old_seq_group_metadata in seq_group_metadata_list:
- # We must shallow-copy seq_group_metadata as is_prompt could change.
- seq_group_metadata = copy.copy(old_seq_group_metadata)
- new_seq_group_metadata_list.append(seq_group_metadata)
-
- # We must shallow-copy seq_data as we will append token ids
- new_seq_data: Dict[int, SequenceData] = {}
- for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
- new_seq_data[seq_id] = copy.copy(old_seq_data)
- new_seq_data[
- seq_id].output_token_ids = old_seq_data.output_token_ids[:]
+ # We must shallow-copy seq_data as we will append token ids
+ new_seq_data: Dict[int, SequenceData] = {}
+ for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
+ new_seq_data[seq_id] = copy.copy(old_seq_data)
+ new_seq_data[seq_id].output_token_ids =\
+ old_seq_data.output_token_ids[:]
- seq_group_metadata.seq_data = new_seq_data
+ new_seq_group_metadata.seq_data = new_seq_data
+ return new_seq_group_metadata
- return new_seq_group_metadata_list
+ @staticmethod
+ def _copy_seq_metadata_excluding_last_token(
+ seq_group_metadata: SequenceGroupMetadata,
+ seq_ids_to_copy: Set[int],
+ ) -> SequenceGroupMetadata:
+ """
+ Creates a shallow copy of the given SequenceGroupMetadata, retaining
+ only the sequence IDs specified in seq_ids_to_copy. For each of these
+ sequence IDs, all output_token_ids except the last one are copied.
+ Sequence IDs not in seq_ids_to_copy are excluded from the copy.
+
+        Args:
+ seq_group_metadata (SequenceGroupMetadata): The original sequence
+ group metadata.
+ seq_ids_to_copy (Set[int]): The set of sequence IDs to include in the
+ copy.
+
+ Returns:
+ SequenceGroupMetadata: A shallow copy of the sequence group metadata
+ with the specified modifications.
+ """
+ # Shallow-copy the SequenceGroupMetadata.
+ new_seq_group_metadata = copy.copy(seq_group_metadata)
+ # Shallow-copy seq_data and modify the output_token_ids.
+ new_seq_data: Dict[int, SequenceData] = {}
+ for seq_id, old_seq_data in seq_group_metadata.seq_data.items():
+ if (seq_id in seq_ids_to_copy):
+ new_seq_data[seq_id] = copy.copy(old_seq_data)
+ # Copy all the output token ids except the last.
+ # Also reduce num_computed_tokens by 1 since we are not
+ # including the last output token.
+ # NOTE: num_computed_tokens is not directly used by the
+ # speculative decoding workers, as it is only relevant for
+ # chunked prefill, which is disabled for speculative decoding.
+ # However, to maintain consistency in num_computed_tokens,
+ # we update it here.
+ new_seq_data[seq_id].output_token_ids =\
+ old_seq_data.output_token_ids[:-1]
+ new_seq_data[seq_id].update_num_computed_tokens(-1)
+ new_seq_group_metadata.seq_data = new_seq_data
+ return new_seq_group_metadata
def _assert_enough_kv_space(
self, seq_group_metadata_list: List[SequenceGroupMetadata],
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 23a3e1649914b..07991df52e655 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -1,5 +1,5 @@
import weakref
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
import torch
@@ -48,6 +48,9 @@ def sampler_output(
self,
execute_model_req: ExecuteModelRequest,
sample_len: int,
+ # Unused parameter. NGramWorker does not use the KV Cache and
+ # therefore does not need this parameter.
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]:
"""NGram match algo to pick proposal candidate. Returns the list of
sampler output, one per SequenceGroupMetadata.
@@ -133,12 +136,15 @@ def sampler_output(
def get_spec_proposals(
self,
execute_model_req: ExecuteModelRequest,
+ # Unused parameter. NGramWorker does not use the KV Cache and
+ # therefore does not need this parameter.
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> SpeculativeProposals:
"""Produce speculations given an input batch of sequences. The number of
speculative tokens per sequence is determined by max_proposal_len.
"""
-
- return self._proposer.get_spec_proposals(execute_model_req)
+ return self._proposer.get_spec_proposals(
+ execute_model_req, seq_ids_with_bonus_token_in_last_step)
def _raise_if_unsupported(
self,
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
index b691659fb292b..fffa557121e17 100644
--- a/vllm/spec_decode/proposer_worker_base.py
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.spec_decode.interfaces import SpeculativeProposer
@@ -14,6 +14,13 @@ def sampler_output(
self,
execute_model_req: ExecuteModelRequest,
sample_len: int,
+ # A set containing all sequence IDs that were assigned bonus tokens
+ # in their last forward pass. This set is used to backfill the KV cache
+ # with the key-value pairs of the penultimate token in the sequences.
+ # This parameter is only used by the MultiStepWorker, which relies on
+ # the KV cache for token generation. It is not used by workers that
+ # do not utilize the KV cache.
+ seq_ids_with_bonus_token_in_last_step: Set[int]
) -> Tuple[Optional[List[SamplerOutput]], bool]:
raise NotImplementedError
diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py
index b78e4489513f7..0dbb924d25400 100644
--- a/vllm/spec_decode/smaller_tp_proposer_worker.py
+++ b/vllm/spec_decode/smaller_tp_proposer_worker.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
import torch
@@ -110,13 +110,17 @@ def sampler_output(
self,
execute_model_req: ExecuteModelRequest,
sample_len: int,
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> Tuple[List[SamplerOutput], bool]:
# Do not check _is_dummy, as it's always called by get_spec_proposals
- return self._worker.sampler_output(execute_model_req, sample_len)
+ return self._worker.sampler_output(
+ execute_model_req, sample_len,
+ seq_ids_with_bonus_token_in_last_step)
def get_spec_proposals(
self,
execute_model_req: ExecuteModelRequest,
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> SpeculativeProposals:
"""Produce speculations given an input batch of sequences. The number of
speculative tokens per sequence is determined by max_proposal_len.
@@ -125,7 +129,8 @@ def get_spec_proposals(
return SpeculativeProposals(None, None, None)
with self._patch_tensor_parallel_group():
- return self._worker.get_spec_proposals(execute_model_req)
+ return self._worker.get_spec_proposals(
+ execute_model_req, seq_ids_with_bonus_token_in_last_step)
def execute_model(
self,
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 60a7dab68b7fd..3c8e3dee46831 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -1,5 +1,6 @@
+from collections import defaultdict
from functools import cached_property
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
import torch
@@ -13,7 +14,7 @@
TypicalAcceptanceSampler)
from vllm.sequence import (CompletionSequenceGroupOutput, ExecuteModelRequest,
HiddenStates, SamplerOutput, SequenceGroupMetadata,
- get_all_seq_ids)
+ get_all_seq_ids_and_request_ids)
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
from vllm.spec_decode.interfaces import (SpeculativeProposals,
@@ -112,11 +113,7 @@ def create_worker(
draft_worker_kwargs.pop("ngram_prompt_lookup_max"))
ngram_prompt_lookup_min = (
draft_worker_kwargs.pop("ngram_prompt_lookup_min"))
-
- disable_bonus_tokens = True
-
if ngram_prompt_lookup_max > 0:
- disable_bonus_tokens = False
proposer_worker = NGramWorker(**draft_worker_kwargs)
proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min,
ngram_prompt_lookup_max)
@@ -128,11 +125,9 @@ def create_worker(
if draft_worker_kwargs[
"model_config"].hf_config.model_type == "mlp_speculator":
- disable_bonus_tokens = False
proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs)
elif draft_worker_kwargs[
"model_config"].hf_config.model_type == "medusa":
- disable_bonus_tokens = False
proposer_worker = MedusaWorker(**draft_worker_kwargs)
else:
if draft_tp == 1:
@@ -149,10 +144,10 @@ def create_worker(
spec_decode_sampler: SpecDecodeBaseSampler = None
if draft_token_acceptance_method == "rejection_sampler":
spec_decode_sampler = RejectionSampler(
- disable_bonus_tokens=disable_bonus_tokens, )
+ disable_bonus_tokens=False, )
elif draft_token_acceptance_method == "typical_acceptance_sampler":
spec_decode_sampler = TypicalAcceptanceSampler(
- disable_bonus_tokens=disable_bonus_tokens,
+ disable_bonus_tokens=False,
posterior_threshold=\
typical_acceptance_sampler_posterior_threshold,
posterior_alpha=typical_acceptance_sampler_posterior_alpha,
@@ -200,6 +195,15 @@ def __init__(
self._metrics = AsyncMetricsCollector(
self.spec_decode_sampler
) if metrics_collector is None else metrics_collector
+ # Tracks the sequence IDs that received a bonus token ID in
+ # their last forward pass. Needed only if KV cache is being
+ # used for token generation such as in the case of MultiStepWorker.
+ self._seq_with_bonus_token_in_last_step: Set[int] = set()
+ # Tracks the currently active request ids and the sequence IDs
+ # corresponding to them
+ self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set)
+
self.probs_dtype = self.spec_decode_sampler.probs_dtype
self.token_id_dtype = self.spec_decode_sampler.token_id_dtype
# Lazy initiazliation.
@@ -307,6 +311,7 @@ def execute_model(
broadcast_tensor_dict({}, src=0)
return []
+ self._track_finished_requests(execute_model_req)
disable_all_speculation = self._should_disable_all_speculation(
execute_model_req)
num_lookahead_slots = execute_model_req.num_lookahead_slots
@@ -453,7 +458,8 @@ def _run_speculative_decoding_step(
self.previous_hidden_states = None
# Generate proposals using draft worker.
- proposals = self.proposer_worker.get_spec_proposals(execute_model_req)
+ proposals = self.proposer_worker.get_spec_proposals(
+ execute_model_req, self._seq_with_bonus_token_in_last_step)
proposal_scores = self.scorer.score_proposals(
execute_model_req,
@@ -585,7 +591,9 @@ def _create_output_sampler_list(
# Get the sequence ids and num_logprobs (sampling parameter) in the
# batch.
- seq_ids = get_all_seq_ids(seq_group_metadata_list)
+ seq_ids, request_ids_seq_ids_mapping = get_all_seq_ids_and_request_ids(
+ seq_group_metadata_list)
+
num_logprobs_per_seq = get_all_num_logprobs(seq_group_metadata_list)
# Serialize all tensors to CPU Python lists.
@@ -608,7 +616,6 @@ def _create_output_sampler_list(
for sequence_index in range(batch_size):
# Each sequence may have a different num_logprobs; retrieve it.
num_logprobs = num_logprobs_per_seq[sequence_index]
-
step_output_token_ids.append(
create_sequence_group_output(
token_id=accepted_token_ids_by_step[step_index]
@@ -623,18 +630,48 @@ def _create_output_sampler_list(
topk_logprobs=topk_logprobs_by_step[step_index]
[sequence_index][:num_logprobs],
))
-
sampler_output_list.append(
SamplerOutput(outputs=step_output_token_ids))
+ # Populate the data structures needed to keep track of sequences with
+ # bonus tokens.
+ self._track_sequences_with_bonus_tokens(seq_ids,
+ request_ids_seq_ids_mapping,
+ accepted_token_ids_by_step)
maybe_rejsample_metrics = (
self._metrics.maybe_collect_rejsample_metrics(k))
if maybe_rejsample_metrics is not None:
sampler_output_list[
0].spec_decode_worker_metrics = maybe_rejsample_metrics
-
return sampler_output_list
+ def _track_finished_requests(self, execute_model_req: ExecuteModelRequest):
+ """
+ Removes the finished requests and their associated sequence ids from
+        internal bookkeeping data structures.
+ """
+ for finished_request in execute_model_req.finished_requests_ids:
+ for seq_id in self._request_id_seq_id_mapping[finished_request]:
+ self._seq_with_bonus_token_in_last_step.discard(seq_id)
+ del self._request_id_seq_id_mapping[finished_request]
+
+ def _track_sequences_with_bonus_tokens(
+ self, seq_ids: List[int],
+ request_ids_seq_ids_mapping: Dict[str, Set[int]],
+ accepted_token_ids_by_step: List[List[int]]):
+ """
+ Updates the internal data structures which keep track of sequences
+ which have been assigned bonus tokens in their last forward pass.
+ """
+ for seq_index, seq_id in enumerate(seq_ids):
+ last_token_id = accepted_token_ids_by_step[-1][seq_index]
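+            # A value of -1 means no token was accepted at the final step,
+            # so this sequence did not receive a bonus token.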
+ if last_token_id == -1:
+ self._seq_with_bonus_token_in_last_step.discard(seq_id)
+ else:
+ self._seq_with_bonus_token_in_last_step.add(seq_id)
+ for request_id, sequences in request_ids_seq_ids_mapping.items():
+ self._request_id_seq_id_mapping[request_id].update(sequences)
+
@cached_property
def _vocab_size(self) -> int:
"""Get the vocab size of the model and make sure it's consistent between
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index d3e280e6843b8..7b34b5d34208b 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple
+from typing import List, Optional, Set, Tuple
import torch
@@ -42,6 +42,7 @@ def __init__(
def get_spec_proposals(
self,
execute_model_req: ExecuteModelRequest,
+ seq_ids_with_bonus_token_in_last_step: Set[int],
) -> SpeculativeProposals:
"""Get speculative proposals given the input batch.
@@ -76,6 +77,8 @@ def get_spec_proposals(
maybe_sampler_output, transposed = self._worker.sampler_output(
execute_model_req=nonzero_execute_model_req,
sample_len=proposal_len,
+ seq_ids_with_bonus_token_in_last_step=\
+ seq_ids_with_bonus_token_in_last_step,
)
(
proposal_lens,
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index e8e53f4946efa..cc9a971301afc 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -21,14 +21,17 @@ def get_tokenizer_for_seq(self,
"""Returns the HF tokenizer to use for a given sequence."""
return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
- def decode_prompt_logprobs_inplace(
- self, seq_group: SequenceGroup,
- prompt_logprobs: List[Optional[Dict[int, Logprob]]]) -> None:
+ def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup,
+ prompt_logprobs: List[Optional[Dict[
+ int, Logprob]]],
+ position_offset: int) -> None:
"""Decodes the logprobs for the prompt of a sequence group.
Args:
seq_group: The sequence group to decode.
prompt_logprobs: The logprobs to decode.
+ position_offset: Offset of the first index of the logprobs
+ relative to the start of the sequence (for chunked prefill).
Returns:
The prompt logprobs with the decoded tokens.
@@ -47,8 +50,13 @@ def decode_prompt_logprobs_inplace(
next_iter_tokens: List[str] = []
prev_tokens = None
- for token_position, prompt_logprobs_for_token in enumerate(
+ for token_position_in_logprob, prompt_logprobs_for_token in enumerate(
prompt_logprobs):
+
+ # Absolute token position equals the index in the logprobs
+ # list plus the offset of the entire logprobs list relative
+ # to the start of the sequence.
+ token_position = token_position_in_logprob + position_offset
if not prompt_logprobs_for_token:
continue
for token_id, sample_logprob in prompt_logprobs_for_token.items():
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
new file mode 100644
index 0000000000000..09843e5d1f30b
--- /dev/null
+++ b/vllm/triton_utils/__init__.py
@@ -0,0 +1,6 @@
+from vllm.triton_utils.custom_cache_manager import (
+ maybe_set_triton_cache_manager)
+
+__all__ = [
+ "maybe_set_triton_cache_manager",
+]
diff --git a/vllm/triton_utils/custom_cache_manager.py b/vllm/triton_utils/custom_cache_manager.py
new file mode 100644
index 0000000000000..17039d7ba24c7
--- /dev/null
+++ b/vllm/triton_utils/custom_cache_manager.py
@@ -0,0 +1,53 @@
+import os
+
+from triton.runtime.cache import (FileCacheManager, default_cache_dir,
+ default_dump_dir, default_override_dir)
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def maybe_set_triton_cache_manager() -> None:
+ """Set environment variable to tell Triton to use a
+ custom cache manager"""
+ cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
+ if cache_manger is None:
+ manager = "vllm.triton_utils.custom_cache_manager:CustomCacheManager"
+ logger.info("Setting Triton cache manager to: %s", manager)
+ os.environ["TRITON_CACHE_MANAGER"] = manager
+
+
+class CustomCacheManager(FileCacheManager):
+ """Re-implements Triton's cache manager, ensuring that a
+ unique cache directory is created for each process. This is
+ needed to avoid collisions when running with tp>1 and
+ using multi-processing as the distributed backend.
+
+    Note that this issue was fixed upstream by triton-lang/triton/pull/4295,
+    but the fix is not included in triton==v3.0.0 and is expected to land
+    in a subsequent release.
+ """
+
+ def __init__(self, key, override=False, dump=False):
+ self.key = key
+ self.lock_path = None
+ if dump:
+ self.cache_dir = default_dump_dir()
+ self.cache_dir = os.path.join(self.cache_dir, self.key)
+ self.lock_path = os.path.join(self.cache_dir, "lock")
+ os.makedirs(self.cache_dir, exist_ok=True)
+ elif override:
+ self.cache_dir = default_override_dir()
+ self.cache_dir = os.path.join(self.cache_dir, self.key)
+ else:
+ # create cache directory if it doesn't exist
+ self.cache_dir = os.getenv("TRITON_CACHE_DIR",
+ "").strip() or default_cache_dir()
+ if self.cache_dir:
+ self.cache_dir = f"{self.cache_dir}_{os.getpid()}"
+ self.cache_dir = os.path.join(self.cache_dir, self.key)
+ self.lock_path = os.path.join(self.cache_dir, "lock")
+ os.makedirs(self.cache_dir, exist_ok=True)
+ else:
+ raise RuntimeError("Could not create or locate cache dir")
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index afb3007a528b4..6907d8b9becd2 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -7,7 +7,7 @@
from enum import Enum
from pathlib import Path
from threading import Thread
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, Union
from uuid import uuid4
import cpuinfo
@@ -25,6 +25,13 @@
_USAGE_STATS_ENABLED = None
_USAGE_STATS_SERVER = envs.VLLM_USAGE_STATS_SERVER
+_GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {}
+
+
+def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None:
+ """Set global usage data that will be sent with every usage heartbeat."""
+ _GLOBAL_RUNTIME_DATA[key] = value
+
def is_usage_stats_enabled():
"""Determine whether or not we can send usage stats to the server.
@@ -187,7 +194,11 @@ def _report_continous_usage(self):
"""
while True:
time.sleep(600)
- data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
+ data = {
+ "uuid": self.uuid,
+ "log_time": _get_current_timestamp_ns(),
+ }
+ data.update(_GLOBAL_RUNTIME_DATA)
self._write_to_file(data)
self._send_to_server(data)
diff --git a/vllm/utils.py b/vllm/utils.py
index a3d15d7979228..8be1528230b5f 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -386,10 +386,6 @@ def get_open_port() -> int:
def update_environment_variables(envs: Dict[str, str]):
- if is_hip() and "CUDA_VISIBLE_DEVICES" in envs:
- # Propagate changes to CUDA_VISIBLE_DEVICES to
- # ROCm's HIP_VISIBLE_DEVICES as well
- envs["HIP_VISIBLE_DEVICES"] = envs["CUDA_VISIBLE_DEVICES"]
for k, v in envs.items():
if k in os.environ and os.environ[k] != v:
logger.warning(
diff --git a/vllm/version.py b/vllm/version.py
index dd9b22cccc1fd..94333a8fa28dd 100644
--- a/vllm/version.py
+++ b/vllm/version.py
@@ -1 +1,12 @@
-__version__ = "0.5.1"
+import warnings
+
+try:
+ import vllm.commit_id
+ __commit__ = vllm.commit_id.__commit__
+except Exception as e:
+ warnings.warn(f"Failed to read commit hash:\n{e}",
+ RuntimeWarning,
+ stacklevel=2)
+ __commit__ = "COMMIT_HASH_PLACEHOLDER"
+
+__version__ = "0.5.2"
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index b082f45344863..93ffea9106501 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -11,7 +11,7 @@
from vllm.lora.request import LoRARequest
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
SamplerOutput)
-from vllm.utils import (enable_trace_function_call_for_thread, is_hip,
+from vllm.utils import (enable_trace_function_call_for_thread,
update_environment_variables)
from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase
@@ -309,14 +309,6 @@ def update_environment_variables(envs: Dict[str, str]) -> None:
# overwriting CUDA_VISIBLE_DEVICES is desired behavior
# suppress the warning in `update_environment_variables`
del os.environ[key]
- if is_hip():
- hip_env_var = "HIP_VISIBLE_DEVICES"
- if hip_env_var in os.environ:
- logger.warning(
- "Ignoring pre-set environment variable `%s=%s` as "
- "%s has also been set, which takes precedence.",
- hip_env_var, os.environ[hip_env_var], key)
- os.environ.pop(hip_env_var, None)
update_environment_variables(envs)
def init_worker(self, *args, **kwargs):
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index e03f24fdfc41a..876abb3bf94d1 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -109,9 +109,6 @@ def __init__(
self.kv_cache_dtype = kv_cache_dtype
self.block_size = cache_config.block_size
- self.max_context_len_to_capture = (
- self.model_config.max_context_len_to_capture
- if self.model_config is not None else 0)
self.attn_backend = get_attn_backend(
self.model_config.get_num_attention_heads(self.parallel_config),