diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
index b2e910e1ba8a7..a67fc89d54e60 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
@@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do
done
lm_eval --model hf \
- --model_args pretrained=$MODEL,parallelize=True \
- --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
- --batch_size $BATCH_SIZE
+ --model_args "pretrained=$MODEL,parallelize=True" \
+ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+ --batch_size "$BATCH_SIZE"
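
For context, the quoting added here guards against word splitting and globbing of unquoted expansions (shellcheck SC2086). A minimal sketch of the difference, using a made-up model name:

    #!/bin/bash
    # An unquoted expansion is split on whitespace before the command runs.
    MODEL="my org/model"                          # hypothetical value containing a space
    set -- pretrained=$MODEL,parallelize=True     # unquoted
    echo "unquoted -> $# arguments"               # prints 2
    set -- "pretrained=$MODEL,parallelize=True"   # quoted
    echo "quoted   -> $# arguments"               # prints 1
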
diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
index 4d32b49a4fac3..65be3c5d93b20 100644
--- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
+++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
done
lm_eval --model vllm \
- --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \
- --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
- --batch_size $BATCH_SIZE
+ --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+ --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+ --batch_size "$BATCH_SIZE"
diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh
index b4fdde6dab425..26f33b744289a 100644
--- a/.buildkite/lm-eval-harness/run-tests.sh
+++ b/.buildkite/lm-eval-harness/run-tests.sh
@@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do
done
# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
do
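
As a side note, the IFS=$'\n' read -d '' -r -a idiom above loads one config name per line into a bash array. A self-contained sketch with a throwaway file and made-up names:

    #!/bin/bash
    # Write a temporary config list, then read it into an array line by line.
    CONFIG="$(mktemp)"
    printf 'model-a.yaml\nmodel-b.yaml\n' > "$CONFIG"
    # read returns non-zero at end of input when -d '' is used, hence || true
    IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" || true
    for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"; do
        echo "would test: $MODEL_CONFIG"
    done
    rm -f "$CONFIG"
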
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index f90e464288cf1..7cf05610b9953 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -56,7 +56,7 @@
def read_markdown(file):
if os.path.exists(file):
- with open(file, "r") as f:
+ with open(file) as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
@@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving):
# collect results
for test_file in results_folder.glob("*.json"):
- with open(test_file, "r") as f:
+ with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving):
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
@@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving):
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 6059588fe7277..052060c576300 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -72,7 +72,7 @@ def main(args):
# collect results
for test_file in results_folder.glob("*_nightly_results.json"):
- with open(test_file, "r") as f:
+ with open(test_file) as f:
results = results + json.loads(f.read())
# generate markdown table
@@ -80,7 +80,7 @@ def main(args):
md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
- with open(args.description, "r") as f:
+ with open(args.description) as f:
description = f.read()
description = description.format(
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
index e9d7d6a8d760a..fb5063db86942 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh
@@ -50,31 +50,30 @@ launch_trt_server() {
git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
git lfs install
cd tensorrtllm_backend
- git checkout $trt_llm_version
- tensorrtllm_backend_dir=$(pwd)
+ git checkout "$trt_llm_version"
git submodule update --init --recursive
# build trtllm engine
cd /tensorrtllm_backend
- cd ./tensorrt_llm/examples/${model_type}
+ cd "./tensorrt_llm/examples/${model_type}"
python3 convert_checkpoint.py \
- --model_dir ${model_path} \
- --dtype ${model_dtype} \
- --tp_size ${model_tp_size} \
- --output_dir ${trt_model_path}
+ --model_dir "${model_path}" \
+ --dtype "${model_dtype}" \
+ --tp_size "${model_tp_size}" \
+ --output_dir "${trt_model_path}"
trtllm-build \
- --checkpoint_dir ${trt_model_path} \
+ --checkpoint_dir "${trt_model_path}" \
--use_fused_mlp \
--reduce_fusion disable \
--workers 8 \
- --gpt_attention_plugin ${model_dtype} \
- --gemm_plugin ${model_dtype} \
- --tp_size ${model_tp_size} \
- --max_batch_size ${max_batch_size} \
- --max_input_len ${max_input_len} \
- --max_seq_len ${max_seq_len} \
- --max_num_tokens ${max_num_tokens} \
- --output_dir ${trt_engine_path}
+ --gpt_attention_plugin "${model_dtype}" \
+ --gemm_plugin "${model_dtype}" \
+ --tp_size "${model_tp_size}" \
+ --max_batch_size "${max_batch_size}" \
+ --max_input_len "${max_input_len}" \
+ --max_seq_len "${max_seq_len}" \
+ --max_num_tokens "${max_num_tokens}" \
+ --output_dir "${trt_engine_path}"
# handle triton protobuf files and launch triton server
cd /tensorrtllm_backend
@@ -82,15 +81,15 @@ launch_trt_server() {
cp -r all_models/inflight_batcher_llm/* triton_model_repo/
cd triton_model_repo
rm -rf ./tensorrt_llm/1/*
- cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+ cp -r "${trt_engine_path}"/* ./tensorrt_llm/1
python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false
- python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5
- python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false
- python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size
- python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1
+ python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5"
+ python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false"
+ python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size"
+ python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1"
cd /tensorrtllm_backend
python3 scripts/launch_triton_server.py \
- --world_size=${model_tp_size} \
+ --world_size="${model_tp_size}" \
--model_repo=/tensorrtllm_backend/triton_model_repo &
}
@@ -98,10 +97,7 @@ launch_trt_server() {
launch_tgi_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
@@ -129,10 +125,7 @@ launch_tgi_server() {
launch_lmdeploy_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
server_command="lmdeploy serve api_server $model \
@@ -149,10 +142,7 @@ launch_sglang_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
@@ -185,10 +175,7 @@ launch_vllm_server() {
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
- dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
- dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
- num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
server_args=$(json2args "$server_params")
if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
@@ -217,19 +204,19 @@ launch_vllm_server() {
main() {
- if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then
launch_trt_server
fi
- if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then
launch_tgi_server
fi
- if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
launch_lmdeploy_server
fi
- if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then
+ if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then
launch_sglang_server
fi
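
For reference, the launch functions above all pull individual settings out of a JSON blob with jq -r. A minimal sketch with a fabricated config (not a real test config):

    #!/bin/bash
    common_params='{"model": "facebook/opt-125m", "tp": 2, "port": 8000}'
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    port=$(echo "$common_params" | jq -r '.port')
    echo "would launch $model with tensor parallel size $tp on port $port"
    # jq -e exits non-zero when the key is missing, which is how optional
    # flags such as fp8 are detected in the functions above:
    if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
        echo "fp8 requested"
    fi
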
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index c6a1bbdeb7d48..686f70dbece6c 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -16,10 +16,10 @@ main() {
fi
# initial annotation
- description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+ #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
# download results
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
mkdir -p results/
/workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
ls
@@ -30,15 +30,15 @@ main() {
/workspace/buildkite-agent artifact upload "results.zip"
# upload benchmarking scripts
- cd $VLLM_SOURCE_CODE_LOC/
+ cd "$VLLM_SOURCE_CODE_LOC/"
zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
/workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
- cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+ cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# upload benchmarking pipeline
/workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
- cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+ cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
/workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
@@ -75,4 +75,4 @@ main() {
# /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
}
-main "$@"
\ No newline at end of file
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
index dd8c15e0700eb..3f38cf5137535 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -12,7 +12,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
- declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
echo "GPU type is $gpu_type"
}
@@ -102,7 +102,7 @@ kill_gpu_processes() {
pkill -f text-generation
pkill -f lmdeploy
- while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+ while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
}
@@ -119,8 +119,8 @@ wait_for_server() {
ensure_installed() {
# Ensure that the given command is installed by apt-get
local cmd=$1
- if ! which $cmd >/dev/null; then
- apt-get update && apt-get install -y $cmd
+ if ! which "$cmd" >/dev/null; then
+ apt-get update && apt-get install -y "$cmd"
fi
}
@@ -173,13 +173,11 @@ run_serving_tests() {
echo "Reuse previous server for test case $test_name"
else
kill_gpu_processes
- bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \
+ bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
"$server_params" "$common_params"
fi
- wait_for_server
-
- if [ $? -eq 0 ]; then
+ if wait_for_server; then
echo ""
echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
else
@@ -190,13 +188,13 @@ run_serving_tests() {
# prepare tokenizer
# this is required for lmdeploy.
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
# change model name for lmdeploy (it will not follow standard hf name)
@@ -307,11 +305,11 @@ run_serving_tests() {
prepare_dataset() {
# download sharegpt dataset
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# duplicate sonnet by 4x, to allow benchmarking with input length 2048
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
echo "" > sonnet_4x.txt
for _ in {1..4}
do
@@ -339,17 +337,17 @@ main() {
prepare_dataset
- cd $VLLM_SOURCE_CODE_LOC/benchmarks
+ cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
- BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/
+ BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
# run the test
- run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+ run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
# upload benchmark results to buildkite
python3 -m pip install tabulate pandas
- python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+ python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
upload_to_buildkite
}
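
A small sketch of the "if wait_for_server; then" pattern used above, which branches on the function's exit status directly instead of inspecting $? afterwards. The stub below polls a placeholder endpoint and is not the repo's actual helper:

    #!/bin/bash
    # Stub health check: succeed once curl reaches the endpoint, give up after 60s.
    wait_for_server() {
        timeout 60 bash -c '
            until curl -s localhost:8000/v1/completions > /dev/null; do
                sleep 5
            done'
    }
    if wait_for_server; then
        echo "server is up and running."
    else
        echo "server failed to start within the timeout."
    fi
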
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index a0b9a409b758d..d397b05cdff23 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -17,7 +17,7 @@ check_gpus() {
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
- declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+ declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
echo "GPU type is $gpu_type"
}
@@ -93,7 +93,7 @@ kill_gpu_processes() {
# wait until GPU memory usage smaller than 1GB
- while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do
+ while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
sleep 1
done
@@ -117,7 +117,7 @@ upload_to_buildkite() {
fi
# Use the determined command to annotate and upload artifacts
- $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md
+ $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md"
$BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*"
}
@@ -150,7 +150,7 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
@@ -206,9 +206,9 @@ run_throughput_tests() {
throughput_args=$(json2args "$throughput_params")
# check if there is enough GPU to run the test
- tp=$(echo $throughput_params | jq -r '.tensor_parallel_size')
+ tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
@@ -270,7 +270,7 @@ run_serving_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [[ $gpu_count -lt $tp ]]; then
- echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+ echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
@@ -278,7 +278,7 @@ run_serving_tests() {
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
- echo "Server model and client model must be the same. Skip testcase $testname."
+ echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
@@ -293,8 +293,7 @@ run_serving_tests() {
server_pid=$!
# wait until the server is alive
- wait_for_server
- if [ $? -eq 0 ]; then
+ if wait_for_server; then
echo ""
echo "vllm server is up and running."
else
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 4e4d4cd4ca3c6..92d6fad73a94c 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -36,11 +36,11 @@
# collect results
for test_file in results_folder.glob("*.json"):
- with open(test_file, "r") as f:
+ with open(test_file) as f:
raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result
- with open(test_file.with_suffix(".commands"), "r") as f:
+ with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
raw_result.update(command)
diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
index f16862907def1..19f7160e68a4d 100644
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
@@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10
retries=0
while [ $retries -lt 1000 ]; do
- if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+ if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
exit 0
fi
@@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do
sleep 5
done
-exit 1
\ No newline at end of file
+exit 1
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index df201cdc7c554..902e162720b89 100755
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script runs test inside the corresponding ROCm docker container.
set -o pipefail
@@ -31,8 +33,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
- # Remove unused volumes
- docker volume prune -f
+ # Remove unused volumes / force the system prune for old images as well.
+ docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
@@ -57,17 +59,17 @@ done
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-docker pull ${image_name}
+docker pull "${image_name}"
remove_docker_container() {
- docker rm -f ${container_name} || docker image rm -f ${image_name} || true
+ docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
}
trap remove_docker_container EXIT
echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p ${HF_CACHE}
+mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
commands=$@
@@ -107,35 +109,36 @@ fi
PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
+ # assign job count as the number of shards used
+ commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
- #replace shard arguments
- commands=${commands//"--shard-id= "/"--shard-id=${GPU} "}
- commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
- echo "Shard ${GPU} commands:$commands"
+ # assign shard-id for each shard
+ commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
+ echo "Shard ${GPU} commands:$commands_gpu"
docker run \
--device /dev/kfd --device /dev/dri \
--network host \
--shm-size=16gb \
--rm \
- -e HIP_VISIBLE_DEVICES=${GPU} \
+ -e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
- -v ${HF_CACHE}:${HF_MOUNT} \
- -e HF_HOME=${HF_MOUNT} \
- --name ${container_name}_${GPU} \
- ${image_name} \
- /bin/bash -c "${commands}" \
+ -v "${HF_CACHE}:${HF_MOUNT}" \
+ -e "HF_HOME=${HF_MOUNT}" \
+ --name "${container_name}_${GPU}" \
+ "${image_name}" \
+ /bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!)
done
#wait for all processes to finish and collect exit codes
- for pid in ${PIDS[@]}; do
- wait ${pid}
+ for pid in "${PIDS[@]}"; do
+ wait "${pid}"
STATUS+=($?)
done
- for st in ${STATUS[@]}; do
+ for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st"
- exit ${st}
+ exit "${st}"
fi
done
else
@@ -146,9 +149,9 @@ else
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
- -v ${HF_CACHE}:${HF_MOUNT} \
- -e HF_HOME=${HF_MOUNT} \
- --name ${container_name} \
- ${image_name} \
+ -v "${HF_CACHE}:${HF_MOUNT}" \
+ -e "HF_HOME=${HF_MOUNT}" \
+ --name "${container_name}" \
+ "${image_name}" \
/bin/bash -c "${commands}"
fi
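
The shard loop above also quotes the array expansions so each PID and status stays a single word. A self-contained sketch of the same wait-and-collect pattern with dummy background jobs:

    #!/bin/bash
    PIDS=()
    STATUS=()
    for i in 0 1 2; do
        (sleep 1; exit "$i") &    # dummy job; only the first one succeeds
        PIDS+=($!)
    done
    for pid in "${PIDS[@]}"; do   # quoted: one word per PID
        wait "$pid"
        STATUS+=($?)
    done
    for st in "${STATUS[@]}"; do
        if [[ $st -ne 0 ]]; then
            echo "one of the jobs failed with $st"
        fi
    done
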
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index cbf6dda677c53..1641c1faa9d6a 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex
diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index fd60f5b6afeca..a63c95e51002f 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
@@ -13,7 +15,7 @@ remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
source /etc/environment
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index c331a9c49c0d0..064d7c77ab570 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
@@ -46,7 +48,7 @@ docker exec cpu-test bash -c "
docker exec cpu-test bash -c "
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=48-92
- python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+ python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh
index 7ac4dcc4c786d..530bf90a855fe 100755
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
@@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
shift 4
COMMANDS=("$@")
-if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
+if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
@@ -23,7 +23,7 @@ fi
echo "List of commands"
for command in "${COMMANDS[@]}"; do
- echo $command
+ echo "$command"
done
start_network() {
@@ -36,7 +36,7 @@ start_nodes() {
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
- if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+ if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
@@ -49,17 +49,20 @@ start_nodes() {
# 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11)
- docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+ docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
+ -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
+ --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
+ /bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster
- if [ $node -eq 0 ]; then
+ if [ "$node" -eq 0 ]; then
# start the ray head node
- docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+ docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready
sleep 10
else
# start the ray worker nodes, and connect them to the head node
- docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+ docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi
done
@@ -79,22 +82,22 @@ run_nodes() {
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
- if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+ if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
- if [ $node -ne 0 ]; then
- docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+ if [ "$node" -ne 0 ]; then
+ docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else
- docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+ docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
- docker stop node$node
+ docker stop "node$node"
done
docker network rm docker-net
}
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
index 252c0f7fecd12..9259391aaed49 100644
--- a/.buildkite/run-neuron-test.sh
+++ b/.buildkite/run-neuron-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
@@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f
- echo $current_time > /tmp/neuron-docker-build-timestamp
+ echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
- echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+ date "+%s" > /tmp/neuron-docker-build-timestamp
fi
docker build -t neuron -f Dockerfile.neuron .
@@ -34,7 +36,7 @@ wait_for_server_to_start() {
timeout=300
counter=0
- while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+ while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 70e56596c4a86..6b12f424fd828 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
@@ -11,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh
index 6989c94d46a89..770dad6ffa3a1 100644
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
set -e
# Build the docker image.
@@ -12,4 +14,4 @@ remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
+docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh
index 6ffa66d5ef3d6..faeac8e2ded36 100644
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
@@ -1,3 +1,5 @@
+#!/bin/bash
+
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8c98aa36ac0ff..705e81d15ad65 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
+# nightly(bool): run this test in nightly pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
@@ -119,6 +120,7 @@ steps:
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands:
+ - pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
@@ -229,6 +231,9 @@ steps:
- tests/compile
commands:
- pytest -v -s compile/test_basic_correctness.py
+ # these tests need to be separated; they cannot be combined into one pytest run
+ - pytest -v -s compile/piecewise/test_simple.py
+ - pytest -v -s compile/piecewise/test_toy_llama.py
- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
@@ -317,7 +322,6 @@ steps:
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language/test_models.py
- - pytest -v -s models/decoder_only/language/test_big_models.py
- label: Decoder-only Language Models Test (Extended) # 1h20min
nightly: true
@@ -325,17 +329,30 @@ steps:
- vllm/
- tests/models/decoder_only/language
commands:
- - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+ - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language
commands:
- - pytest -v -s models/decoder_only/audio_language
- - pytest -v -s models/decoder_only/vision_language
+ - pytest -v -s models/decoder_only/audio_language -m core_model
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model
+
+- label: Decoder-only Multi-Modal Models Test (Extended)
+ nightly: true
+ source_file_dependencies:
+ - vllm/
+ - tests/models/decoder_only/audio_language
+ - tests/models/decoder_only/vision_language
+ commands:
+ - pytest -v -s models/decoder_only/audio_language -m 'not core_model'
+ # HACK - run phi3v tests separately to sidestep this transformers bug
+ # https://github.com/huggingface/transformers/issues/34307
+ - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+ - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model'
- label: Other Models Test # 6min
#mirror_hardwares: [amd]
@@ -410,12 +427,11 @@ steps:
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+ - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
@@ -494,6 +510,7 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
+ - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- pytest -v -s -x lora/test_mixtral.py
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 6fddca0d6e4b9..4f54eea564ecb 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,3 +5,28 @@ updates:
directory: "/"
schedule:
interval: "weekly"
+ - package-ecosystem: "pip"
+ directory: "/"
+ schedule:
+ interval: "weekly"
+ labels: ["dependencies"]
+ open-pull-requests-limit: 5
+ reviewers: ["khluu", "simon-mo"]
+ allow:
+ - dependency-type: "all"
+ ignore:
+ - dependency-name: "torch"
+ - dependency-name: "torchvision"
+ - dependency-name: "xformers"
+ - dependency-name: "lm-format-enforcer"
+ - dependency-name: "gguf"
+ - dependency-name: "compressed-tensors"
+ - dependency-name: "ray[adag]"
+ - dependency-name: "lm-eval"
+ groups:
+ patch-update:
+ applies-to: version-updates
+ update-types: ["patch"]
+ minor-update:
+ applies-to: version-updates
+ update-types: ["minor"]
diff --git a/.github/mergify.yml b/.github/mergify.yml
new file mode 100644
index 0000000000000..ca4bd7ee2b87f
--- /dev/null
+++ b/.github/mergify.yml
@@ -0,0 +1,60 @@
+pull_request_rules:
+- name: label-documentation
+ description: Automatically apply documentation label
+ conditions:
+ - or:
+ - files~=^[^/]+\.md$
+ - files~=^docs/
+ actions:
+ label:
+ add:
+ - documentation
+
+- name: label-ci-build
+ description: Automatically apply ci/build label
+ conditions:
+ - or:
+ - files~=^\.github/
+ - files~=\.buildkite/
+ - files~=^cmake/
+ - files=CMakeLists.txt
+ - files~=^Dockerfile
+ - files~=^requirements.*\.txt
+ - files=setup.py
+ actions:
+ label:
+ add:
+ - ci/build
+
+- name: label-frontend
+ description: Automatically apply frontend label
+ conditions:
+ - files~=^vllm/entrypoints/
+ actions:
+ label:
+ add:
+ - frontend
+
+- name: ping author on conflicts and add 'needs-rebase' label
+ conditions:
+ - conflict
+ - -closed
+ actions:
+ label:
+ add:
+ - needs-rebase
+ comment:
+ message: |
+ This pull request has merge conflicts that must be resolved before it can be
+ merged. Please rebase the PR, @{{author}}.
+
+ https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+
+- name: remove 'needs-rebase' label when conflict is resolved
+ conditions:
+ - -conflict
+ - -closed
+ actions:
+ label:
+ remove:
+ - needs-rebase
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
new file mode 100755
index 0000000000000..3b2da7b9f8966
--- /dev/null
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+set -eu
+
+# ensure 1 argument is passed
+if [ "$#" -ne 1 ]; then
+ echo "Usage: $0 <pr_number>"
+ exit 1
+fi
+
+PR_NUMBER=$1
+OLD=/tmp/orig_pr_body.txt
+NEW=/tmp/new_pr_body.txt
+
+gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
+cp "${OLD}" "${NEW}"
+
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
+
+# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
+sed -i '/FIX #xxxx.*$/d' "${NEW}"
+
+# Remove "FILL IN THE PR DESCRIPTION HERE"
+sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
+
+# Run this only if ${NEW} is different than ${OLD}
+if ! cmp -s "${OLD}" "${NEW}"; then
+ echo "Updating PR body"
+ gh pr edit --body-file "${NEW}" "${PR_NUMBER}"
+else
+ echo "No changes needed"
+fi
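
The cleanup can be exercised locally without gh by running the same sed and cmp steps on a fabricated body (GNU sed assumed, as in the script):

    #!/bin/bash
    OLD=$(mktemp) NEW=$(mktemp)
    printf '%s\n' \
        'FILL IN THE PR DESCRIPTION HERE' \
        'This PR fixes shell quoting in the CI scripts.' \
        'FIX #xxxx (*link existing issues this PR will resolve*)' \
        '**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**' \
        '- [ ] checklist item' > "$OLD"
    cp "$OLD" "$NEW"
    sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "$NEW"
    sed -i '/FIX #xxxx.*$/d' "$NEW"
    sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "$NEW"
    cmp -s "$OLD" "$NEW" || { echo "body would be updated to:"; cat "$NEW"; }
    rm -f "$OLD" "$NEW"
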
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index b80749aaa8fec..0226cf0ca00e9 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -6,12 +6,14 @@ on:
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
+ - '.github/workflows/matchers/actionlint.json'
pull_request:
branches:
- "main"
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
+ - '.github/workflows/matchers/actionlint.json'
env:
LC_ALL: en_US.UTF-8
@@ -28,7 +30,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index 68d60d7365ed1..68149d2dc019f 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -6,9 +6,21 @@ on:
push:
branches:
- main
+ paths:
+ - '**/*.h'
+ - '**/*.cpp'
+ - '**/*.cu'
+ - '**/*.cuh'
+ - '.github/workflows/clang-format.yml'
pull_request:
branches:
- main
+ paths:
+ - '**/*.h'
+ - '**/*.cpp'
+ - '**/*.cu'
+ - '**/*.cuh'
+ - '.github/workflows/clang-format.yml'
jobs:
clang-format:
@@ -17,9 +29,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
new file mode 100644
index 0000000000000..7cf7242e130c8
--- /dev/null
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -0,0 +1,23 @@
+name: Cleanup PR Body
+
+on:
+ pull_request:
+ types: [opened, edited, synchronize]
+
+jobs:
+ update-description:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+ - name: Set up Python
+ uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ with:
+ python-version: '3.12'
+
+ - name: Update PR description
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
new file mode 100644
index 0000000000000..68887adaae54b
--- /dev/null
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,45 @@
+name: codespell
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ paths:
+ - "**/*.py"
+ - "**/*.md"
+ - "**/*.rst"
+ - pyproject.toml
+ - requirements-lint.txt
+ - .github/workflows/codespell.yml
+ pull_request:
+ branches:
+ - main
+ paths:
+ - "**/*.py"
+ - "**/*.md"
+ - "**/*.rst"
+ - pyproject.toml
+ - requirements-lint.txt
+ - .github/workflows/codespell.yml
+
+jobs:
+ codespell:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.12"]
+ steps:
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Spelling check with codespell
+ run: |
+ codespell --toml pyproject.toml
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index 5f1e5f8eeaf7d..73eeacf1fa562 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -6,20 +6,35 @@ on:
push:
branches:
- main
+ paths:
+ - '**/*.py'
+ - '.github/workflows/mypy.yaml'
+ - 'tools/mypy.sh'
+ - 'pyproject.toml'
pull_request:
branches:
- main
+ # This workflow is only relevant when one of the following files changes.
+ # However, we have github configured to expect and require this workflow
+ # to run and pass before github will auto-merge a pull request. Until github
+ # allows a more flexible auto-merge policy, we can just run this on every PR.
+ # It doesn't take that long to run, anyway.
+ #paths:
+ # - '**/*.py'
+ # - '.github/workflows/mypy.yaml'
+ # - 'tools/mypy.sh'
+ # - 'pyproject.toml'
jobs:
mypy:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
@@ -33,4 +48,4 @@ jobs:
- name: Mypy
run: |
echo "::add-matcher::.github/workflows/matchers/mypy.json"
- tools/mypy.sh 1
+ tools/mypy.sh 1 ${{ matrix.python-version }}
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index f959a1cacf866..c1051d10a4860 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -21,7 +21,7 @@ jobs:
upload_url: ${{ steps.create_release.outputs.upload_url }}
steps:
- name: Checkout
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Extract branch info
shell: bash
@@ -48,13 +48,13 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-20.04']
- python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+ python-version: ['3.9', '3.10', '3.11', '3.12']
pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
- uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Setup ccache
uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
@@ -68,7 +68,7 @@ jobs:
bash -x .github/workflows/scripts/env.sh
- name: Set up Python
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index 9cc8a9e914474..7266cc378cfb0 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -6,33 +6,47 @@ on:
push:
branches:
- main
+ paths:
+ - "**/*.py"
+ - pyproject.toml
+ - requirements-lint.txt
+ - .github/workflows/matchers/ruff.json
+ - .github/workflows/ruff.yml
pull_request:
branches:
- main
+ # This workflow is only relevant when one of the following files changes.
+ # However, we have github configured to expect and require this workflow
+ # to run and pass before github will auto-merge a pull request. Until github
+ # allows a more flexible auto-merge policy, we can just run this on every PR.
+ # It doesn't take that long to run, anyway.
+ #paths:
+ # - "**/*.py"
+ # - pyproject.toml
+ # - requirements-lint.txt
+ # - .github/workflows/matchers/ruff.json
+ # - .github/workflows/ruff.yml
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.12"]
steps:
- - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r requirements-lint.txt
- - name: Analysing the code with ruff
- run: |
- echo "::add-matcher::.github/workflows/matchers/ruff.json"
- ruff check --output-format github .
- - name: Spelling check with codespell
- run: |
- codespell --toml pyproject.toml
- - name: Run isort
- run: |
- isort . --check-only
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install -r requirements-lint.txt
+ - name: Analysing the code with ruff
+ run: |
+ echo "::add-matcher::.github/workflows/matchers/ruff.json"
+ ruff check --output-format github .
+ - name: Run isort
+ run: |
+ isort . --check-only
diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh
index 312c6e82f33a3..3d0b7a1fe0402 100644
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@@ -1,16 +1,16 @@
#!/bin/bash
# Replace '.' with '-' ex: 11.8 -> 11-8
-cuda_version=$(echo $1 | tr "." "-")
+cuda_version=$(echo "$1" | tr "." "-")
# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
-OS=$(echo $2 | tr -d ".\-")
+OS=$(echo "$2" | tr -d ".\-")
# Installs CUDA
-wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb"
sudo dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
sudo apt -qq update
-sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}"
sudo apt clean
# Test nvcc
diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh
index dfc1851d7692c..e3cda7dad2d17 100644
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@@ -6,7 +6,7 @@ cuda_version=$3
# Install torch
$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
+$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}"
# Print version information
$python_executable --version
diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml
new file mode 100644
index 0000000000000..4b1587e373e17
--- /dev/null
+++ b/.github/workflows/shellcheck.yml
@@ -0,0 +1,37 @@
+name: Lint shell scripts
+on:
+ push:
+ branches:
+ - "main"
+ paths:
+ - '**/*.sh'
+ - '.github/workflows/shellcheck.yml'
+ pull_request:
+ branches:
+ - "main"
+ paths:
+ - '**/*.sh'
+ - '.github/workflows/shellcheck.yml'
+
+env:
+ LC_ALL: en_US.UTF-8
+
+defaults:
+ run:
+ shell: bash
+
+permissions:
+ contents: read
+
+jobs:
+ shellcheck:
+ runs-on: ubuntu-latest
+ steps:
+ - name: "Checkout"
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ with:
+ fetch-depth: 0
+
+ - name: "Check shell scripts"
+ run: |
+ tools/shellcheck.sh
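
A rough local approximation of this check, assuming shellcheck is already on PATH (the repo's tools/shellcheck.sh may differ, for example by downloading a pinned shellcheck release):

    #!/bin/bash
    # Lint every *.sh file outside .git; shellcheck picks up .shellcheckrc
    # from the working directory automatically.
    find . -name '*.sh' -not -path './.git/*' -print0 \
        | xargs -0 -r shellcheck
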
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index becf2f4f74616..81e7c9b050760 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -10,10 +10,15 @@ jobs:
permissions:
issues: write
pull-requests: write
+ actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
with:
+ # Increasing this value ensures that changes to this workflow
+ # propagate to all issues and PRs in days rather than months
+ operations-per-run: 1000
+
exempt-draft-pr: true
exempt-issue-labels: 'keep-open'
exempt-pr-labels: 'keep-open'
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index 9f06b35c19e32..ff441f94435ad 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -6,26 +6,33 @@ on:
push:
branches:
- main
+ paths:
+ - "**/*.py"
+ - .github/workflows/yapf.yml
pull_request:
branches:
- main
+ paths:
+ - "**/*.py"
+ - .github/workflows/yapf.yml
+
jobs:
yapf:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+ python-version: ["3.12"]
steps:
- - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install yapf==0.32.0
- pip install toml==0.10.2
- - name: Running yapf
- run: |
- yapf --diff --recursive .
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install yapf==0.32.0
+ pip install toml==0.10.2
+ - name: Running yapf
+ run: |
+ yapf --diff --recursive .
diff --git a/.gitignore b/.gitignore
index 1ea6e3419db2a..ceef6a5fba456 100644
--- a/.gitignore
+++ b/.gitignore
@@ -202,3 +202,4 @@ benchmarks/*.json
# Linting
actionlint
+shellcheck*/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 42cbf18a0f712..284196bc2d279 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -6,17 +6,16 @@ version: 2
build:
os: ubuntu-22.04
tools:
- python: "3.8"
+ python: "3.12"
sphinx:
- configuration: docs/source/conf.py
- fail_on_warning: true
+ configuration: docs/source/conf.py
+ fail_on_warning: true
# If using Sphinx, optionally build your docs in additional formats such as PDF
formats: []
# Optionally declare the Python requirements required to build your docs
python:
- install:
- - requirements: docs/requirements-docs.txt
-
+ install:
+ - requirements: docs/requirements-docs.txt
diff --git a/.shellcheckrc b/.shellcheckrc
new file mode 100644
index 0000000000000..f3b6eedf8d907
--- /dev/null
+++ b/.shellcheckrc
@@ -0,0 +1,9 @@
+# rules currently disabled:
+#
+# SC1091 (info): Not following: <file> was not specified as input (see shellcheck -x)
+# SC2004 (style): $/${} is unnecessary on arithmetic variables.
+# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects.
+# SC2155 (warning): Declare and assign separately to avoid masking return values.
+# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails.
+#
+disable=SC1091,SC2004,SC2129,SC2155,SC2164
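
For illustration, SC2164 (disabled above) asks for a guarded cd so a failed directory change stops the script instead of letting later commands run from the wrong place:

    #!/bin/bash
    workdir=$(mktemp -d)
    cd "$workdir" || exit 1
    echo "now working in $PWD"
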
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d1956f3d409b4..25c0865a90a67 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,7 +31,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
# Supported python versions. These versions will be searched in order, the
# first match will be selected. These should be kept in sync with setup.py.
#
-set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
#
# Try to find python package with an executable that exactly matches
@@ -128,9 +128,9 @@ endif()
if(VLLM_GPU_LANG STREQUAL "CUDA")
#
- # For cuda we want to be able to control which architectures we compile for on
+ # For cuda we want to be able to control which architectures we compile for on
# a per-file basis in order to cut down on compile time. So here we extract
- # the set of architectures we want to compile for and remove the from the
+ # the set of architectures we want to compile for and remove them from the
# CMAKE_CUDA_FLAGS so that they are not applied globally.
#
clear_cuda_arches(CUDA_ARCH_FLAGS)
@@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "CUDA target architectures: ${CUDA_ARCHS}")
# Filter the target architectures by the supported archs
# since for some files we will build for all CUDA_ARCHS.
- cuda_archs_loose_intersection(CUDA_ARCHS
+ cuda_archs_loose_intersection(CUDA_ARCHS
"${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}")
message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}")
else()
@@ -195,7 +195,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
"csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
- "csrc/moe_align_block_size_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/torch_bindings.cpp")
@@ -237,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS})
if (MARLIN_ARCHS)
- set(MARLIN_SRCS
+ set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
@@ -278,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"in CUDA target architectures")
endif()
- # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
+ # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
endif()
@@ -286,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
- cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
+ cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
@@ -317,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
#
- # For the Machete kernels we automatically generate sources for various
+ # For the Machete kernels we automatically generate sources for various
# preselected input type pairs and schedules.
# Generate sources:
- set(MACHETE_GEN_SCRIPT
+ set(MACHETE_GEN_SCRIPT
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py)
file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH)
@@ -330,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH}
OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH})
execute_process(
- COMMAND ${CMAKE_COMMAND} -E env
- PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+ COMMAND ${CMAKE_COMMAND} -E env
+ PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT}
RESULT_VARIABLE machete_generation_result
OUTPUT_VARIABLE machete_generation_output
@@ -341,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
if (NOT machete_generation_result EQUAL 0)
message(FATAL_ERROR "Machete generation failed."
- " Result: \"${machete_generation_result}\""
+ " Result: \"${machete_generation_result}\""
"\nCheck the log for details: "
"${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log")
else()
- set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
+ set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH}
CACHE STRING "Last run machete generate script hash" FORCE)
message(STATUS "Machete generation completed successfully.")
endif()
@@ -367,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
else()
- if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+ if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
AND MACHETE_ARCHS)
message(STATUS "Not building Machete kernels as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
@@ -393,8 +392,8 @@ define_gpu_extension_target(
USE_SABI 3
WITH_SOABI)
-# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
-# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
+# If CUTLASS is compiled on NVCC >= 12.5, it by default uses
+# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the
# driver API. This causes problems when linking with earlier versions of CUDA.
# Setting this variable sidesteps the issue by calling the driver directly.
target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
@@ -405,6 +404,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
set(VLLM_MOE_EXT_SRC
"csrc/moe/torch_bindings.cpp"
+ "csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
set_gencode_flags_for_srcs(
@@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
return()
endif ()
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis)
+# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
+# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
+# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
@@ -507,7 +507,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
- GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd
+ GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 5f79356bd32f7..b39fd75b5fb70 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,12 +11,14 @@ We also believe in the power of community support; thus, answering queries, offe
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
+## License
+
+See [LICENSE](LICENSE).
## Developing
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details.
-
## Testing
```bash
@@ -33,6 +35,14 @@ pytest tests/
## Contribution Guidelines
+### DCO and Signed-off-by
+
+When contributing changes to this project, you must agree to the [DCO](DCO).
+Commits must include a `Signed-off-by:` header which certifies agreement with
+the terms of the [DCO](DCO).
+
+Using `-s` with `git commit` will automatically add this header.
+
### Issues
If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
diff --git a/DCO b/DCO
new file mode 100644
index 0000000000000..49b8cb0549267
--- /dev/null
+++ b/DCO
@@ -0,0 +1,34 @@
+Developer Certificate of Origin
+Version 1.1
+
+Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+
+Developer's Certificate of Origin 1.1
+
+By making a contribution to this project, I certify that:
+
+(a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+(b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+(c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+(d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
diff --git a/Dockerfile b/Dockerfile
index 0a562253c537b..4c0f5aebe859d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -191,6 +191,9 @@ ADD . /vllm-workspace/
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
+# Copy in the v1 package for testing (it isn't distributed yet)
+COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
+
# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
@@ -206,7 +209,7 @@ FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10
+ pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
ENV VLLM_USAGE_SOURCE production-docker-image
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index f1a21d6bd13fc..287b4958da4e5 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
RUN echo 'ulimit -c 0' >> ~/.bashrc
-RUN pip install intel_extension_for_pytorch==2.4.0
+RUN pip install intel_extension_for_pytorch==2.5.0
WORKDIR /workspace
diff --git a/Dockerfile.hpu b/Dockerfile.hpu
new file mode 100644
index 0000000000000..f481c8c6a57bf
--- /dev/null
+++ b/Dockerfile.hpu
@@ -0,0 +1,16 @@
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-hpu.txt
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+WORKDIR /workspace/
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
index 3d9d8e7da487c..2143315d2a078 100644
--- a/Dockerfile.neuron
+++ b/Dockerfile.neuron
@@ -31,11 +31,11 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install -U \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements-neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
- pip install --no-build-isolation -v -e . \
+ pip install --no-build-isolation -v -e .
CMD ["/bin/bash"]
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index cd5fcf481f07c..b19c6ddec7948 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \
# These packages will be in rocketce eventually
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
+ 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index d35889f053e27..8fb79afaebe97 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip uninstall -y torch torchvision \
&& python3 -m pip install --pre \
torch==2.6.0.dev20240918 \
- setuptools-scm>=8 \
+ 'setuptools-scm>=8' \
torchvision==0.20.0.dev20240918 \
--extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
*) ;; esac
@@ -121,6 +121,8 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
+RUN python3 -m pip install --upgrade pip
+
# Package upgrades for useful functionality or to avoid dependency issues
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
diff --git a/Dockerfile.tpu b/Dockerfile.tpu
index bdfab3f61910f..0a507b6ecdf60 100644
--- a/Dockerfile.tpu
+++ b/Dockerfile.tpu
@@ -1,4 +1,4 @@
-ARG NIGHTLY_DATE="20240828"
+ARG NIGHTLY_DATE="20241017"
ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE"
FROM $BASE_IMAGE
@@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \
git \
ffmpeg libsm6 libxext6 libgl1
-# Install the TPU and Pallas dependencies.
-RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
-RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-
# Build vLLM.
COPY . .
ARG GIT_REPO_CHECK=0
@@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 -m pip install \
- cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-tpu.txt
RUN python3 setup.py develop
diff --git a/Dockerfile.xpu b/Dockerfile.xpu
index 0ecb46df6256c..63bc682770422 100644
--- a/Dockerfile.xpu
+++ b/Dockerfile.xpu
@@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
- --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt
+RUN git clone https://github.com/intel/pti-gpu && \
+ cd pti-gpu/sdk && \
+ git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+ mkdir build && \
+ cd build && \
+ cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+ make -j && \
+ cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
COPY . .
ARG GIT_REPO_CHECK
RUN --mount=type=bind,source=.git,target=.git \
diff --git a/README.md b/README.md
index 0836d872358fb..b75bfc5c699a7 100644
--- a/README.md
+++ b/README.md
@@ -13,9 +13,19 @@ Easy, fast, and cheap LLM serving for everyone
| Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |
+---
+
+**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo**
+
+We are excited to announce the last in-person vLLM meetup of the year!
+Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist!
+Register [here](https://lu.ma/h0qvrajz) and be a part of the event!
+
+---
+
*Latest News* 🔥
-- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
+- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users!
- [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing).
- [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing).
@@ -42,7 +52,7 @@ vLLM is fast with:
- Speculative decoding
- Chunked prefill
-**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
+**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
vLLM is flexible and easy to use with:
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 192d6c4022c83..2aa4a285021f1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -6,3 +6,14 @@ You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
+
+## Downloading the ShareGPT4V dataset
+
+The JSON file refers to several image datasets (coco, llava, etc.). The benchmark scripts
+will skip a datapoint if the referenced image is missing.
+```bash
+wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
+mkdir coco -p
+wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
+unzip coco/train2017.zip -d coco/
+```
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 4813fde27f0bc..a42e70170ba28 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -79,7 +79,7 @@ async def async_request_tgi(
# any data, we should skip it.
if chunk_bytes.startswith(":"):
continue
- chunk = remove_prefix(chunk_bytes, "data:")
+ chunk = chunk_bytes.removeprefix("data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
@@ -144,8 +144,8 @@ async def async_request_trt_llm(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"),
- "data:")
+ chunk = chunk_bytes.decode("utf-8").removeprefix(
+ "data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
@@ -261,8 +261,8 @@ async def async_request_openai_completions(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"),
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix(
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -324,7 +324,7 @@ async def async_request_openai_chat_completions(
},
],
"temperature": 0.0,
- "max_tokens": request_func_input.output_len,
+ "max_completion_tokens": request_func_input.output_len,
"stream": True,
"ignore_eos": request_func_input.ignore_eos,
}
@@ -349,8 +349,8 @@ async def async_request_openai_chat_completions(
if not chunk_bytes:
continue
- chunk = remove_prefix(chunk_bytes.decode("utf-8"),
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix(
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
return output
-# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
- if text.startswith(prefix):
- return text[len(prefix):]
- return text
-
-
def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
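
Not part of the patch: the `removeprefix` calls above rely on the built-in available since Python 3.9 (consistent with the Python 3.8 drop elsewhere in this patch). It behaves exactly like the removed `remove_prefix` helper, returning the string unchanged when the prefix is absent:

```python
# Minimal illustration of str.removeprefix (Python >= 3.9).
assert "data: [DONE]".removeprefix("data: ") == "[DONE]"
assert "event: ping".removeprefix("data: ") == "event: ping"  # no prefix -> unchanged
```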
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 1aac029992dbf..6d33096ca1d11 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -118,7 +118,7 @@ def main(args):
random.seed(args.seed)
if args.dataset_path is not None:
print(f"Start to sample {args.num_prompts} prompts"
- "from {args.dataset_path}")
+ f"from {args.dataset_path}")
filtered_datasets = sample_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
@@ -142,13 +142,6 @@ def main(args):
repeat_count=args.repeat_count,
sort=args.sort)
- print("------warm up------")
- test_prefix(
- llm=llm,
- prompts=prompts,
- sampling_params=sampling_params,
- )
-
print("------start generating------")
test_prefix(
llm=llm,
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 0d205014b15bf..bdb8ea8e2a5dc 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -297,8 +297,33 @@ def sample_random_requests(
async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
+ burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[str, int, int], None]:
+ """
+ Asynchronously generates requests at a specified rate
+ with OPTIONAL burstiness.
+
+ Args:
+ input_requests:
+ A list of input requests, each represented as a tuple.
+ request_rate:
+ The rate at which requests are generated (requests/s).
+ burstiness (optional):
+ The burstiness factor of the request generation.
+ Only takes effect when request_rate is not inf.
+ Default value is 1, which follows a Poisson process.
+ Otherwise, the request intervals follow a gamma distribution.
+ A lower burstiness value (0 < burstiness < 1) results
+ in more bursty requests, while a higher burstiness value
+ (burstiness > 1) results in a more uniform arrival of requests.
+ """
input_requests = iter(input_requests)
+
+ # Calculate scale parameter theta to maintain the desired request_rate.
+ assert burstiness > 0, (
+ f"A positive burstiness factor is expected, but given {burstiness}.")
+ theta = 1.0 / (request_rate * burstiness)
+
for request in input_requests:
yield request
@@ -306,8 +331,9 @@ async def get_request(
# If the request rate is infinity, then we don't need to wait.
continue
- # Sample the request interval from the exponential distribution.
- interval = np.random.exponential(1.0 / request_rate)
+ # Sample the request interval from the gamma distribution.
+ # If burstiness is 1, it follows exponential distribution.
+ interval = np.random.gamma(shape=burstiness, scale=theta)
# The next request will be sent after the interval.
await asyncio.sleep(interval)
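
Not part of the patch: a small numpy sketch of the interval sampling introduced above. With shape `burstiness` and scale `1 / (request_rate * burstiness)`, the mean interval is always `1 / request_rate`; `burstiness == 1` reduces to the exponential (Poisson-process) case, and smaller values give a heavier-tailed, burstier arrival pattern.

```python
import numpy as np

def sample_intervals(request_rate: float, burstiness: float, n: int = 100_000):
    """Draw n inter-arrival times following the gamma scheme used above."""
    assert burstiness > 0
    theta = 1.0 / (request_rate * burstiness)
    return np.random.gamma(shape=burstiness, scale=theta, size=n)

poisson_like = sample_intervals(request_rate=5.0, burstiness=1.0)
bursty = sample_intervals(request_rate=5.0, burstiness=0.5)
# Both means are ~0.2 s; the bursty run has the larger variance.
print(poisson_like.mean(), poisson_like.var())
print(bursty.mean(), bursty.var())
```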
@@ -406,9 +432,9 @@ def calculate_metrics(
median_itl_ms=np.median(itls or 0) * 1000,
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
for p in selected_percentiles],
- mean_e2el_ms=np.median(e2els or 0) * 1000,
+ mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
- median_e2el_ms=np.mean(e2els or 0) * 1000,
+ median_e2el_ms=np.median(e2els or 0) * 1000,
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
)
@@ -426,6 +452,7 @@ async def benchmark(
logprobs: Optional[int],
best_of: int,
request_rate: float,
+ burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
@@ -480,7 +507,13 @@ async def benchmark(
if profile_output.success:
print("Profiler started")
+ if burstiness == 1.0:
+ distribution = "Poisson process"
+ else:
+ distribution = "Gamma distribution"
+
print(f"Traffic request rate: {request_rate}")
+ print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -502,7 +535,7 @@ async def limited_request_func(request_func_input, pbar):
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
- async for request in get_request(input_requests, request_rate):
+ async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
prompt=prompt,
@@ -769,6 +802,7 @@ def main(args: argparse.Namespace):
logprobs=args.logprobs,
best_of=args.best_of,
request_rate=args.request_rate,
+ burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
@@ -807,6 +841,7 @@ def main(args: argparse.Namespace):
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
+ result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
# Merge with benchmark result
@@ -922,8 +957,20 @@ def main(args: argparse.Namespace):
default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
- "Otherwise, we use Poisson process to synthesize "
- "the request arrival times.",
+ "Otherwise, we use Poisson process or gamma distribution "
+ "to synthesize the request arrival times.",
+ )
+ parser.add_argument(
+ "--burstiness",
+ type=float,
+ default=1.0,
+ help="Burstiness factor of the request generation. "
+ "Only take effect when request_rate is not inf. "
+ "Default value is 1, which follows Poisson process. "
+ "Otherwise, the request intervals follow a gamma distribution. "
+ "A lower burstiness value (0 < burstiness < 1) results in more "
+ "bursty requests. A higher burstiness value (burstiness > 1) "
+ "results in a more uniform arrival of requests.",
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument(
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index ee41c8ea38382..159cf055737ce 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -4,10 +4,11 @@
import json
import random
import time
-from typing import List, Optional, Tuple
+from typing import List, Optional
import torch
import uvloop
+from PIL import Image
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
@@ -15,16 +16,56 @@
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
+from vllm.inputs import TextPrompt
+from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
-def sample_requests(
- dataset_path: str,
- num_requests: int,
- tokenizer: PreTrainedTokenizerBase,
- fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+@dataclasses.dataclass
+class SampleRequest:
+ """A class representing a single inference request for benchmarking.
+
+ Attributes:
+ prompt: The input text prompt for the model.
+ multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+ images).
+ prompt_len: The length of the prompt in tokens.
+ expected_output_len: The expected length of the output in tokens.
+ """
+ prompt: str
+ prompt_len: int
+ expected_output_len: int
+ multi_modal_data: Optional[MultiModalDataDict] = None
+
+
+def _get_prompt_for_image_model(question: str, *, model: str) -> str:
+ """Prepend and append special tokens around the question to form a prompt.
+
+ Args:
+ question: The input question text to wrap with special tokens
+ model: The name of the model being used, to determine which special
+ tokens to add
+
+ Returns:
+ The formatted prompt string with appropriate special tokens for the
+ model
+
+ Raises:
+ ValueError: If an unsupported model name is provided
+ """
+ model = model.lower()
+ if "pixtral" in model:
+ return f"[INST]{question}\n[IMG][/INST]"
+ raise ValueError(f"Unsupported model {model}")
+
+
+def sample_requests(tokenizer: PreTrainedTokenizerBase,
+ args: argparse.Namespace) -> List[SampleRequest]:
+ dataset_path: str = args.dataset
+ num_requests: int = args.num_prompts
+ fixed_output_len: Optional[int] = args.output_len
+ model: str = args.model
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@@ -33,23 +74,36 @@ def sample_requests(
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
- # Only keep the first two turns of each conversation.
- dataset = [(data["conversations"][0]["value"],
- data["conversations"][1]["value"]) for data in dataset]
-
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
- filtered_dataset: List[Tuple[str, int, int]] = []
- for i in range(len(dataset)):
+ filtered_dataset: List[SampleRequest] = []
+ for data in dataset:
if len(filtered_dataset) == num_requests:
break
+ # Only keep the first two turns of each conversation.
+ prompt = data["conversations"][0]["value"]
+ completion = data["conversations"][1]["value"]
+
+ multi_modal_data: Optional[MultiModalDataDict] = None
+ if "image" in data:
+ multi_modal_data = multi_modal_data or {}
+ image_path = data["image"]
+ # TODO(vllm-project/vllm/issues/9778): Support multiple images.
+ assert isinstance(image_path,
+ str), "Only support single image input"
+ try:
+ multi_modal_data["image"] = Image.open(image_path).convert(
+ "RGB")
+ except FileNotFoundError:
+ # Ignore datapoint where asset is missing
+ continue
+ prompt = _get_prompt_for_image_model(question=prompt, model=model)
+
# Tokenize the prompts and completions.
- prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
- completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
@@ -60,13 +114,17 @@ def sample_requests(
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
- filtered_dataset.append((prompt, prompt_len, output_len))
+ filtered_dataset.append(
+ SampleRequest(prompt=prompt,
+ prompt_len=prompt_len,
+ expected_output_len=output_len,
+ multi_modal_data=multi_modal_data))
return filtered_dataset
def run_vllm(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
n: int,
engine_args: EngineArgs,
) -> float:
@@ -74,17 +132,19 @@ def run_vllm(
llm = LLM(**dataclasses.asdict(engine_args))
# Add the requests to the engine.
- prompts: List[str] = []
+ prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
- for prompt, _, output_len in requests:
- prompts.append(prompt)
+ for request in requests:
+ prompts.append(
+ TextPrompt(prompt=request.prompt,
+ multi_modal_data=request.multi_modal_data))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
- max_tokens=output_len,
+ max_tokens=request.expected_output_len,
))
use_beam_search = False
@@ -94,11 +154,11 @@ def run_vllm(
llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
else:
- prompts = [prompt for prompt, _, _ in requests]
+ prompts = [request.prompt for request in requests]
# output_len should be the same for all requests.
-        output_len = requests[0][2]
+        output_len = requests[0].expected_output_len
- for prompt, input_len, _output_len in requests:
- assert _output_len == output_len
+ for request in requests:
+ assert request.expected_output_len == output_len
start = time.perf_counter()
llm.beam_search(
prompts,
@@ -112,7 +172,7 @@ def run_vllm(
async def run_vllm_async(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
@@ -123,17 +183,19 @@ async def run_vllm_async(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
- prompts: List[str] = []
+ prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
- for prompt, _, output_len in requests:
- prompts.append(prompt)
+ for request in requests:
+ prompts.append(
+ TextPrompt(prompt=request.prompt,
+ multi_modal_data=request.multi_modal_data))
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
- max_tokens=output_len,
+ max_tokens=request.expected_output_len,
))
generators = []
@@ -149,7 +211,7 @@ async def run_vllm_async(
def run_hf(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
@@ -207,14 +269,14 @@ def run_hf(
def run_mii(
- requests: List[Tuple[str, int, int]],
+ requests: List[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
) -> float:
from mii import client, serve
llm = serve(model, tensor_parallel=tensor_parallel_size)
- prompts = [prompt for prompt, _, _ in requests]
+ prompts = [request.prompt for request in requests]
start = time.perf_counter()
llm.generate(prompts, max_new_tokens=output_len)
@@ -243,12 +305,17 @@ def main(args: argparse.Namespace):
else:
raise ValueError(
f"Failed to synthesize a prompt with {args.input_len} tokens.")
- requests = [(prompt, args.input_len, args.output_len)
- for _ in range(args.num_prompts)]
+ requests = [
+ SampleRequest(prompt=prompt,
+ prompt_len=args.input_len,
+ expected_output_len=args.output_len)
+ for _ in range(args.num_prompts)
+ ]
else:
- requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
- args.output_len)
+ requests = sample_requests(tokenizer, args)
+ is_multi_modal = any(request.multi_modal_data is not None
+ for request in requests)
if args.backend == "vllm":
if args.async_engine:
elapsed_time = uvloop.run(
@@ -270,9 +337,15 @@ def main(args: argparse.Namespace):
args.output_len)
else:
raise ValueError(f"Unknown backend: {args.backend}")
- total_num_tokens = sum(prompt_len + output_len
- for _, prompt_len, output_len in requests)
- total_output_tokens = sum(output_len for _, _, output_len in requests)
+ total_num_tokens = sum(request.prompt_len + request.expected_output_len
+ for request in requests)
+ total_output_tokens = sum(request.expected_output_len
+ for request in requests)
+ if is_multi_modal:
+ print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+ "following metrics are not accurate because image tokens are not"
+ " counted. See vllm-project/vllm/issues/9778 for details.")
+ # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
@@ -299,7 +372,9 @@ def main(args: argparse.Namespace):
parser.add_argument("--dataset",
type=str,
default=None,
- help="Path to the dataset.")
+ help="Path to the dataset. The dataset is expected to "
+ "be a json in form of List[Dict[..., conversations: "
+ "List[Dict[..., value: ]]]]")
parser.add_argument("--input-len",
type=int,
default=None,
diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py
index 92f6053cc6d7e..7acea6087fdfd 100644
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -3,8 +3,8 @@
import torch
from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
@@ -16,7 +16,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device("cuda")
layer = RMSNorm(hidden_size).to(dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py
index b70c4b94c97a1..665b50bf18cf0 100644
--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -269,10 +269,10 @@ def run_square_bench(args):
def run_range_bench(args):
- m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")]
- m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")]
+ m_start, k_start, n_start = (int(x) for x in args.dim_start.split(","))
+ m_end, k_end, n_end = (int(x) for x in args.dim_end.split(","))
m_increment, k_increment, n_increment = \
- [int(x) for x in args.dim_increment.split(",")]
+ (int(x) for x in args.dim_increment.split(","))
Ms = list(range(m_start, m_end + 1, m_increment))
Ks = list(range(k_start, k_end + 1, k_increment))
Ns = list(range(n_start, n_end + 1, n_increment))
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index c2ad98b7e2656..8f538c21f7f7e 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -10,7 +10,8 @@
from transformers import AutoConfig
from vllm.model_executor.layers.fused_moe.fused_moe import *
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
class BenchmarkConfig(TypedDict):
@@ -88,22 +89,23 @@ def prepare(i: int):
input_gating.copy_(gating_output[i])
def run():
- fused_moe(
- x,
- w1,
- w2,
- input_gating,
- topk,
- renormalize=True,
- inplace=True,
- override_config=config,
- use_fp8_w8a8=use_fp8_w8a8,
- use_int8_w8a16=use_int8_w8a16,
- w1_scale=w1_scale,
- w2_scale=w2_scale,
- a1_scale=a1_scale,
- a2_scale=a2_scale,
- )
+ from vllm.model_executor.layers.fused_moe import override_config
+ with override_config(config):
+ fused_moe(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ use_fp8_w8a8=use_fp8_w8a8,
+ use_int8_w8a16=use_int8_w8a16,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ )
# JIT compilation & warmup
run()
@@ -166,7 +168,7 @@ class BenchmarkWorker:
def __init__(self, seed: int) -> None:
torch.set_default_device("cuda")
- seed_everything(seed)
+ current_platform.seed_everything(seed)
self.seed = seed
def benchmark(
@@ -180,7 +182,7 @@ def benchmark(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]:
- seed_everything(self.seed)
+ current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 87864d038d593..14eef00b855ac 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -5,8 +5,9 @@
import torch
from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- create_kv_caches_with_random, seed_everything)
+ create_kv_caches_with_random)
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -28,7 +29,7 @@ def main(
device: str = "cuda",
kv_cache_dtype: Optional[str] = None,
) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
scale = float(1.0 / (head_size**0.5))
query = torch.empty(num_seqs,
diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py
index 743a5744e8614..1d62483448946 100644
--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -3,8 +3,8 @@
import torch
from vllm import _custom_ops as ops
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
- seed_everything)
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
@torch.inference_mode()
@@ -17,7 +17,7 @@ def main(num_tokens: int,
do_profile: bool = False,
num_warmup_iters: int = 5,
num_iters: int = 100) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device("cuda")
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 784b1cf9844e4..250d505168d09 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -6,7 +6,8 @@
from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding,
get_rope)
-from vllm.utils import FlexibleArgumentParser, seed_everything
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
def benchmark_rope_kernels_multi_lora(
@@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
max_position: int = 8192,
base: int = 10000,
) -> None:
- seed_everything(seed)
+ current_platform.seed_everything(seed)
torch.set_default_device(device)
if rotary_dim is None:
rotary_dim = head_size
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
index 8c5cd454fbbee..ba7383d88dc49 100755
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,13 +4,13 @@ PORT=8000
MODEL=$1
TOKENS=$2
-docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
- -v $PWD/data:/data \
+docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \
+ -v "$PWD/data:/data" \
ghcr.io/huggingface/text-generation-inference:2.2.0 \
- --model-id $MODEL \
+ --model-id "$MODEL" \
--sharded false \
--max-input-length 1024 \
--max-total-tokens 2048 \
--max-best-of 5 \
--max-concurrent-requests 5000 \
- --max-batch-total-tokens $TOKENS
+ --max-batch-total-tokens "$TOKENS"
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 7237d246ddf55..5912c5c02ede7 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
#
list(APPEND CXX_COMPILE_FLAGS
"-fopenmp"
+ "-mf16c"
"-DVLLM_CPU_EXTENSION")
execute_process(COMMAND cat /proc/cpuinfo
@@ -92,7 +93,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
- GIT_TAG v3.5.3
+ GIT_TAG v3.6
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 24bb7299338ac..40430dae10c5b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if (GPU_LANGUAGE STREQUAL "CUDA")
- if ("${CUDA_CUDA_LIB}" STREQUAL "")
- set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}")
- endif()
- target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
- ${CUDA_LIBRARIES})
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
else()
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
endif()
diff --git a/collect_env.py b/collect_env.py
index 80403d576d78f..254c19b19a5ac 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -1,17 +1,19 @@
# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
-# Unlike the rest of the PyTorch this file must be python2 compliant.
-# This script outputs relevant system environment info
-# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
import datetime
import locale
import os
import re
import subprocess
import sys
+# Unlike the rest of the PyTorch this file must be python2 compliant.
+# This script outputs relevant system environment info
+# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple
+from vllm.envs import environment_variables
+
try:
import torch
TORCH_AVAILABLE = True
@@ -52,6 +54,7 @@
'vllm_version', # vllm specific field
'vllm_build_flags', # vllm specific field
'gpu_topo', # vllm specific field
+ 'env_vars',
])
DEFAULT_CONDA_PATTERNS = {
@@ -512,6 +515,22 @@ def is_xnnpack_available():
else:
return "N/A"
+def get_env_vars():
+ env_vars = ''
+ secret_terms=('secret', 'token', 'api', 'access', 'password')
+ report_prefix = ("TORCH", "NCCL", "PYTORCH",
+ "CUDA", "CUBLAS", "CUDNN",
+ "OMP_", "MKL_",
+ "NVIDIA")
+ for k, v in os.environ.items():
+ if any(term in k.lower() for term in secret_terms):
+ continue
+ if k in environment_variables:
+ env_vars = env_vars + "{}={}".format(k, v) + "\n"
+ if k.startswith(report_prefix):
+ env_vars = env_vars + "{}={}".format(k, v) + "\n"
+
+ return env_vars
def get_env_info():
run_lambda = run
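
Not part of the patch: a tiny illustration of the filter implemented by `get_env_vars()` above; the sample names are arbitrary examples, not output from the script.

```python
secret_terms = ('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
                 "OMP_", "MKL_", "NVIDIA")

for name in ["HF_TOKEN", "CUDA_VISIBLE_DEVICES", "NCCL_DEBUG", "HOME"]:
    if any(term in name.lower() for term in secret_terms):
        print(name, "-> dropped (looks secret)")
    elif name.startswith(report_prefix):
        print(name, "-> reported")
    else:
        print(name, "-> reported only if it is a known vLLM env var")
```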
@@ -583,6 +602,7 @@ def get_version_or_na(cfg, prefix):
vllm_version=vllm_version,
vllm_build_flags=vllm_build_flags,
gpu_topo=gpu_topo,
+ env_vars=get_env_vars(),
)
@@ -631,6 +651,8 @@ def get_version_or_na(cfg, prefix):
{vllm_build_flags}
GPU Topology:
{gpu_topo}
+
+{env_vars}
""".strip()
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index abb4e3bea14bb..e3953c7c45719 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -22,6 +22,16 @@ struct KernelVecType {
using v_load_vec_type = vec_op::FP32Vec16;
};
+template <>
+struct KernelVecType<c10::Half> {
+ using q_load_vec_type = vec_op::FP16Vec8;
+ using q_vec_type = vec_op::FP32Vec16;
+ using k_load_vec_type = vec_op::FP16Vec16;
+ using k_vec_type = vec_op::FP32Vec16;
+ using qk_acc_vec_type = vec_op::FP32Vec16;
+ using v_load_vec_type = vec_op::FP16Vec16;
+};
+
#ifdef __AVX512BF16__
template <>
struct KernelVecType<c10::BFloat16> {
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index a325153b470cc..4bb4eb0f491ac 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation.");
namespace vec_op {
-// FIXME: FP16 is not fully supported in Torch-CPU
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
- AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
+ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
@@ -50,37 +50,37 @@ template struct Vec {
struct FP32Vec8;
struct FP32Vec16;
-#ifdef __AVX512FP16__
struct FP16Vec8 : public Vec<FP16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
- __m128h reg;
+ __m128i reg;
- explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
+ explicit FP16Vec8(const void *ptr)
+ : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
- explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
+ explicit FP16Vec8(const FP32Vec8 &);
- explicit FP16Vec8(__m128h data) : reg(data) {}
+ void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
+};
- FP16Vec8 operator*(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_mul_ph(reg, b.reg));
- }
+struct FP16Vec16 : public Vec<FP16Vec16> {
+ constexpr static int VEC_ELEM_NUM = 16;
- FP16Vec8 operator+(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_add_ph(reg, b.reg));
- }
+ __m256i reg;
- FP16Vec8 operator-(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_sub_ph(reg, b.reg));
- }
+ explicit FP16Vec16(const void *ptr)
+ : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
- FP16Vec8 operator/(const FP16Vec8 &b) const {
- return FP16Vec8(_mm_div_ph(reg, b.reg));
- }
+ explicit FP16Vec16(const FP32Vec16 &);
- void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
+ void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
+
+ void save(void* ptr, const int elem_num) const {
+ constexpr uint32_t M = 0xFFFFFFFF;
+ __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
+ _mm256_mask_storeu_epi16(ptr, mask, reg);
+ }
};
-#endif
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
@@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec {
explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}
-#ifdef __AVX512FP16__
- explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
-#endif
+ explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {}
explicit FP32Vec8(const BF16Vec8 &v)
: reg(_mm256_castsi256_ps(
@@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec {
: reg(_mm512_castsi512_ps(
_mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}
+ explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {}
+
+ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
explicit FP32Vec16(const INT32Vec16 &v)
@@ -430,6 +432,16 @@ struct FP32Vec16 : public Vec {
explicit FP32Vec16(const FP32Vec8 &data)
: reg_low(data.reg), reg_high(data.reg) {}
+ explicit FP32Vec16(const FP16Vec16 &v) {
+ __m128i low = _mm256_extractf128_si256(v.reg, 0);
+ __m128i high = _mm256_extractf128_si256(v.reg, 1);
+
+ reg_low = _mm256_cvtph_ps(low);
+ reg_high = _mm256_cvtph_ps(high);
+ }
+
+ explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
+
explicit FP32Vec16(const BF16Vec16 &v) {
__m128i low = _mm256_extractf128_si256(v.reg, 0);
__m128i high = _mm256_extractf128_si256(v.reg, 1);
@@ -534,24 +546,34 @@ template using vec_t = typename VecType::vec_type;
template <> struct VecType<float> { using vec_type = FP32Vec8; };
-#ifdef __AVX512FP16__
-template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
-#endif
+template <> struct VecType<c10::Half> { using vec_type = FP16Vec8; };
template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };
template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }
-#ifdef __AVX512FP16__
-template <> inline void storeFP32(float v, c10::Half *ptr) {
- *reinterpret_cast<_Float16 *>(ptr) = v;
-}
-#endif
-
inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
acc = acc + a * b;
}
+template <> inline void storeFP32(float v, c10::Half *ptr) {
+ *reinterpret_cast<unsigned short *>(ptr) =
+ _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
+inline FP16Vec8::FP16Vec8(const FP32Vec8 &v)
+ : reg(_mm256_cvtps_ph(v.reg,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
+
+#ifdef __AVX512F__
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
+ : reg(_mm512_cvtps_ph(v.reg,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
+#else
+inline FP16Vec16::FP16Vec16(const FP32Vec16 &v)
+ : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {}
+#endif
+
#ifdef __AVX512BF16__
template <> inline void storeFP32(float v, c10::BFloat16 *ptr) {
*reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp
index 024ad4ae43da8..8b5011dc065f0 100644
--- a/csrc/cpu/dnnl_helper.hpp
+++ b/csrc/cpu/dnnl_helper.hpp
@@ -2,6 +2,7 @@
#define DNNL_HELPER_HPP
#include <c10/util/BFloat16.h>
+#include <c10/util/Half.h>
#include "oneapi/dnnl/dnnl.hpp"
@@ -32,6 +33,11 @@ struct DNNLType {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};
+template <>
+struct DNNLType<c10::Half> {
+ static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16;
+};
+
template <typename T>
constexpr inline dnnl::memory::data_type get_dnnl_type() {
  return DNNLType<std::decay_t<T>>::type;
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
index b493fd793818a..f42fa2361a2db 100644
--- a/csrc/cpu/quant.cpp
+++ b/csrc/cpu/quant.cpp
@@ -23,6 +23,13 @@ struct KernelVecType {
using cvt_vec_type = vec_op::FP32Vec16;
};
+template <>
+struct KernelVecType<c10::Half> {
+ using load_vec_type = vec_op::FP16Vec16;
+ using azp_adj_load_vec_type = vec_op::INT32Vec16;
+ using cvt_vec_type = vec_op::FP32Vec16;
+};
+
#ifdef __AVX512F__
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu
index 9b82bec44c3c6..123278bfed71d 100644
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -5,32 +5,29 @@
#include "custom_all_reduce.cuh"
-// fake pointer type, must match fptr_t type in ops.h
+// Fake pointer type, must match fptr_t type in ops.h.
+// We use this type alias to indicate when pointers are passed in as int64_t.
using fptr_t = int64_t;
static_assert(sizeof(void*) == sizeof(fptr_t));
-fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
- const std::vector<std::string>& handles,
- const std::vector<int64_t>& offsets, int64_t rank,
+fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
+ torch::Tensor& rank_data, int64_t rank,
bool full_nvlink) {
- int world_size = offsets.size();
+ int world_size = fake_ipc_ptrs.size();
if (world_size > 8)
throw std::invalid_argument("world size > 8 is not supported");
if (world_size % 2 != 0)
throw std::invalid_argument("Odd num gpus is not supported for now");
- if (world_size != handles.size())
- throw std::invalid_argument(
- "handles length should equal to offsets length");
if (rank < 0 || rank >= world_size)
throw std::invalid_argument("invalid rank passed in");
- cudaIpcMemHandle_t ipc_handles[8];
+ vllm::Signal* ipc_ptrs[8];
for (int i = 0; i < world_size; i++) {
- std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
+ ipc_ptrs[i] = reinterpret_cast<vllm::Signal*>(fake_ipc_ptrs[i]);
}
- return (fptr_t) new vllm::CustomAllreduce(
- reinterpret_cast<vllm::Signal*>(meta.data_ptr()), rank_data.data_ptr(),
- rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
+ return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(),
+ rank_data.numel(), rank, world_size,
+ full_nvlink);
}
/**
@@ -55,26 +52,48 @@ bool _is_weak_contiguous(torch::Tensor& t) {
t.numel() * t.element_size());
}
-void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
- cudaStream_t stream) {
+/**
+ * Performs an out-of-place allreduce and stores result in out.
+ *
+ * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered.
+ * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first
+ * copied into _reg_buffer.
+ */
+void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
+ fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) {
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
+ auto stream = c10::cuda::getCurrentCUDAStream().stream();
+
+ TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
+ TORCH_CHECK_EQ(inp.numel(), out.numel());
TORCH_CHECK(_is_weak_contiguous(out));
+ TORCH_CHECK(_is_weak_contiguous(inp));
+ auto input_size = inp.numel() * inp.element_size();
+ auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
+ if (reg_buffer) {
+ TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes);
+ AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size,
+ cudaMemcpyDeviceToDevice, stream));
+ } else {
+ reg_buffer = inp.data_ptr();
+ }
switch (out.scalar_type()) {
case at::ScalarType::Float: {
- fa->allreduce<float>(stream, reinterpret_cast<float*>(inp.data_ptr()),
+ fa->allreduce<float>(stream, reinterpret_cast<float*>(reg_buffer),
reinterpret_cast<float*>(out.data_ptr()),
out.numel());
break;
}
case at::ScalarType::Half: {
- fa->allreduce<half>(stream, reinterpret_cast<half*>(inp.data_ptr()),
+ fa->allreduce<half>(stream, reinterpret_cast<half*>(reg_buffer),
reinterpret_cast<half*>(out.data_ptr()), out.numel());
break;
}
#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
case at::ScalarType::BFloat16: {
fa->allreduce<nv_bfloat16>(
- stream, reinterpret_cast<nv_bfloat16*>(inp.data_ptr()),
+ stream, reinterpret_cast<nv_bfloat16*>(reg_buffer),
reinterpret_cast<nv_bfloat16*>(out.data_ptr()), out.numel());
break;
}
@@ -85,57 +104,41 @@ void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
}
}
-void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) {
- const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
- auto stream = c10::cuda::getCurrentCUDAStream().stream();
- TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
- TORCH_CHECK_EQ(inp.numel(), out.numel());
- _all_reduce(_fa, inp, out, stream);
-}
-
-void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
- torch::Tensor& out) {
- const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
- auto stream = c10::cuda::getCurrentCUDAStream().stream();
-
- auto input_size = inp.numel() * inp.element_size();
- TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
- TORCH_CHECK_EQ(inp.numel(), out.numel());
- TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(),
- "registered buffer is too small to contain the input");
- AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(),
- input_size, cudaMemcpyDeviceToDevice, stream));
- _all_reduce(_fa, reg_buffer, out, stream);
-}
-
void dispose(fptr_t _fa) {
- auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
- delete fa;
+ delete reinterpret_cast<vllm::CustomAllreduce*>(_fa);
}
int64_t meta_size() { return sizeof(vllm::Signal); }
-void register_buffer(fptr_t _fa, torch::Tensor& t,
- const std::vector<std::string>& handles,
- const std::vector<int64_t>& offsets) {
+void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs) {
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
- fa->register_buffer(handles, offsets, t.data_ptr());
+ TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_);
+ void* ipc_ptrs[8];
+ for (int i = 0; i < fake_ipc_ptrs.size(); i++) {
+ ipc_ptrs[i] = reinterpret_cast<void*>(fake_ipc_ptrs[i]);
+ }
+ fa->register_buffer(ipc_ptrs);
}
-std::tuple> get_graph_buffer_ipc_meta(
- fptr_t _fa) {
+// Use vector to represent byte data for python binding compatibility.
+std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+get_graph_buffer_ipc_meta(fptr_t _fa) {
auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
- auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
- auto options =
- torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
- auto handles =
- torch::empty({static_cast(handle_bytes.size())}, options);
- std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
- return {handles, std::move(offsets)};
+ auto [handle, offsets] = fa->get_graph_buffer_ipc_meta();
+ std::vector<int64_t> bytes(handle.begin(), handle.end());
+ return std::make_tuple(bytes, offsets);
}
-void register_graph_buffers(fptr_t _fa, const std::vector& handles,
+// Use vector to represent byte data for python binding compatibility.
+void register_graph_buffers(fptr_t _fa,
+ const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets) {
auto fa = reinterpret_cast(_fa);
- fa->register_graph_buffers(handles, offsets);
+ std::vector<std::string> bytes;
+ bytes.reserve(handles.size());
+ for (int i = 0; i < handles.size(); i++) {
+ bytes.emplace_back(handles[i].begin(), handles[i].end());
+ }
+ fa->register_graph_buffers(bytes, offsets);
}
diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh
index a2f7e43300002..6be4d4f2b2eb8 100644
--- a/csrc/custom_all_reduce.cuh
+++ b/csrc/custom_all_reduce.cuh
@@ -285,46 +285,52 @@ class CustomAllreduce {
int world_size_;
bool full_nvlink_;
- // below are device pointers
RankSignals sg_;
+ // Stores a map from a pointer to its peer pointers from all ranks.
std::unordered_map<void*, RankData*> buffers_;
Signal* self_sg_;
- // stores the registered device pointers from all ranks
+ // Stores rank data from all ranks. This is mainly for cuda graph purposes.
+ // For cuda graph to work, all kernel arguments must be fixed during graph
+ // capture time. However, the peer pointers are not known during graph capture
+ // time. Therefore, during capture, we increment the rank data pointer and use
+ // that as the argument to the kernel. The kernel arguments are stored in
+ // graph_unreg_buffers_. The actual peer pointers will be filled in at the
+ // memory pointed to by the pointers in graph_unreg_buffers_ when
+ // the IPC handles are exchanged between ranks.
+ //
+ // The overall process looks like this:
+ // 1. Graph capture.
+ // 2. Each rank obtains the IPC handles for each address used during cuda
+ // graph capture using get_graph_buffer_ipc_meta.
+ // 3. (In Python) all gather the IPC handles.
+ // 4. Obtain the peer pointers by opening the IPC handles, and store them in
+ // the rank data array at corresponding positions.
RankData *d_rank_data_base_, *d_rank_data_end_;
std::vector<void*> graph_unreg_buffers_;
// a map from IPC handles to opened IPC pointers
std::map<IPC_KEY, char*> ipc_handles_;
/**
- * meta is a pointer to device metadata and temporary buffer for allreduce.
+ * Signals are an array of ipc-enabled buffers from all ranks.
+ * For each buffer, the layout is as follows:
+ * | -- sizeof(Signal) -- | ------ a few MB ----- |
+ * The first section is for allreduce synchronization, and the second section
+ * is for storing the intermediate results required by some allreduce algos.
*
- * There's a total of sizeof(Signal) of prefix before the actual data,
- * so meta + 1 points to actual temporary buffer.
- *
- * note: this class does not own any device memory. Any required buffers
- * are passed in from the constructor
+ * Note: this class does not own any device memory. Any required buffers
+ * are passed in from the constructor.
*/
- CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz,
- const cudaIpcMemHandle_t* handles,
- const std::vector& offsets, int rank,
- bool full_nvlink = true)
+ CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz,
+ int rank, int world_size, bool full_nvlink = true)
: rank_(rank),
- world_size_(offsets.size()),
+ world_size_(world_size),
full_nvlink_(full_nvlink),
- self_sg_(meta),
+ self_sg_(signals[rank]),
d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
for (int i = 0; i < world_size_; i++) {
- Signal* rank_sg;
- if (i != rank_) {
- char* handle = open_ipc_handle(&handles[i]);
- handle += offsets[i];
- rank_sg = (Signal*)handle;
- } else {
- rank_sg = self_sg_;
- }
- sg_.signals[i] = rank_sg;
+ sg_.signals[i] = signals[i];
}
}
@@ -341,11 +347,10 @@ class CustomAllreduce {
return it->second;
}
- std::pair<std::vector<uint8_t>, std::vector<int64_t>>
- get_graph_buffer_ipc_meta() {
+ std::pair<std::string, std::vector<int64_t>> get_graph_buffer_ipc_meta() {
auto num_buffers = graph_unreg_buffers_.size();
auto handle_sz = sizeof(cudaIpcMemHandle_t);
- std::vector<uint8_t> handles(handle_sz * num_buffers, 0);
+ std::string handles(handle_sz * num_buffers, static_cast<char>(0));
std::vector<int64_t> offsets(num_buffers);
for (int i = 0; i < num_buffers; i++) {
auto ptr = graph_unreg_buffers_[i];
@@ -370,26 +375,22 @@ class CustomAllreduce {
std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
}
- void register_buffer(const std::vector& handles,
- const std::vector& offsets, void* self) {
+ /**
+ * Register already-shared IPC pointers.
+ */
+ void register_buffer(void** ptrs) {
check_rank_data_capacity();
RankData data;
for (int i = 0; i < world_size_; i++) {
- if (i != rank_) {
- char* handle = open_ipc_handle(handles[i].data());
- handle += offsets[i];
- data.ptrs[i] = handle;
- } else {
- data.ptrs[i] = self;
- }
+ data.ptrs[i] = ptrs[i];
}
auto d_data = d_rank_data_base_++;
CUDACHECK(
cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
- buffers_[self] = d_data;
+ buffers_[ptrs[rank_]] = d_data;
}
- // note: when registering graph buffers, we intentionally choose to not
+ // Note: when registering graph buffers, we intentionally choose to not
// deduplicate the addresses. That means if the allocator reuses some
// addresses, they will be registered again. This is to account for the remote
// possibility of different allocation patterns between ranks. For example,
@@ -424,11 +425,13 @@ class CustomAllreduce {
}
/**
- * This is the result after careful grid search. Using 36 blocks give the best
- * or close to the best runtime on the devices I tried: A100, A10, A30, T4,
- * V100. You'll notice that NCCL kernels also only take a small amount of SMs.
- * Not quite sure the underlying reason, but my guess is that too many SMs
- * will cause contention on NVLink bus.
+ * Performs allreduce, assuming input has already been registered.
+ *
+ * Block and grid default configs are the results of a careful grid search. Using
+ * 36 blocks gives the best or close to the best runtime on the devices I
+ * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only
+ * use a small number of SMs. Not quite sure of the underlying reason, but my
+ * guess is that too many SMs will cause contention on the NVLink bus.
*/
template <typename T>
void allreduce(cudaStream_t stream, T* input, T* output, int size,
diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu
index 376687e91cfda..b59ea40d980f4 100644
--- a/csrc/custom_all_reduce_test.cu
+++ b/csrc/custom_all_reduce_test.cu
@@ -135,24 +135,26 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
void* rank_data;
size_t rank_data_sz = 16 * 1024 * 1024;
CUDACHECK(cudaMalloc(&rank_data, rank_data_sz));
- std::vector offsets(nRanks, 0);
- vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles,
- offsets, myRank);
+ vllm::Signal* ipc_ptrs[8];
+ for (int i = 0; i < nRanks; i++) {
+ if (i == myRank)
+ ipc_ptrs[i] = buffer;
+ else
+ CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i],
+ cudaIpcMemLazyEnablePeerAccess));
+ }
+ vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks);
auto* self_data =
reinterpret_cast<T*>(reinterpret_cast<char*>(buffer) +
sizeof(vllm::Signal) + data_size * sizeof(T));
// hack buffer registration
{
- std::vector handles;
- handles.reserve(nRanks);
+ void* data[8];
for (int i = 0; i < nRanks; i++) {
- char* begin = (char*)&data_handles[i];
- char* end = (char*)&data_handles[i + 1];
- handles.emplace_back(begin, end);
+ data[i] =
+ ((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T);
}
- std::vector offsets(nRanks,
- sizeof(vllm::Signal) + data_size * sizeof(T));
- fa.register_buffer(handles, offsets, self_data);
+ fa.register_buffer(data);
}
double* ground_truth;
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index 3a464c5f327ad..498d069c05f0d 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize);
}
out += kChunkSize;
+
+ int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize);
+ // in case the final state is separated between the last "smem_exchange"
+ // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2),
+ // (which occurs when `final_state_position` is a non-positive index),
+ // we load the correct data from smem_exchange from both chunks: the last chunk iteration and the one before it
+ if (final_state_position < 0 && seqlen > kWidth){
+ input_t vals_load[kNElts] = {0};
+ if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){
+ // chunk = n_chunks - 2, a segment of the final state sits in the last index
+ reinterpret_cast<vec_t*>(vals_load)[0] = smem_exchange[kNThreads - 1];
+ #pragma unroll
+ for (int w = 0; w < -final_state_position; ++w){
+ conv_states[w] = vals_load[kNElts + final_state_position + w];
+ }
+ }
+ if ((chunk == n_chunks - 1) && tidx == 0){
+ // chunk = n_chunks - 1, the second segment of the final state sits in the first positions
+ reinterpret_cast<vec_t*>(vals_load)[0] = smem_exchange[0];
+ for (int w = -final_state_position; w < kWidth - 1; ++w){
+ conv_states[w] = vals_load[w + final_state_position];
+ }
+ return;
+ }
+ }
}
// Final state is stored in the smem_exchange last token slot,
// in case seqlen < kWidth, we would need to take the final state from the
@@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) {
}
else {
// in case the final state is in between the threads data
- reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1];
- reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread];
const int offset = ((seqlen - (kWidth - 1)) % (kNElts));
+ if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){
+ // In case last_thread == kNThreads - 1, accessing last_thread + 1 would result in an
+ // illegal access error on H100.
+ // Therefore, we access last_thread + 1 only if the final state data sits there.
+ reinterpret_cast<vec_t*>(x_vals_load)[1] = smem_exchange[last_thread + 1];
+ }
+ reinterpret_cast<vec_t*>(x_vals_load)[0] = smem_exchange[last_thread];
#pragma unroll
for (int w = 0; w < kWidth - 1; ++w){
conv_states[w] = x_vals_load[offset + w ];
diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
similarity index 59%
rename from csrc/moe_align_block_size_kernels.cu
rename to csrc/moe/moe_align_sum_kernels.cu
index 1f8d75da83bb8..fff7ce34c838a 100644
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -1,15 +1,17 @@
#include
#include
+#include
#include
#include
-#include "cuda_compat.h"
-#include "dispatch_utils.h"
+#include "../cuda_compat.h"
+#include "../dispatch_utils.h"
#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
namespace vllm {
+namespace moe {
namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
@@ -32,10 +34,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
extern __shared__ int32_t shared_mem[];
int32_t* tokens_cnts =
- shared_mem; // 2d tensor with shape (num_experts + 1, num_experts)
+ shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts)
int32_t* cumsum =
- shared_mem + (num_experts + 1) *
- num_experts; // 1d tensor with shape (num_experts + 1)
+ shared_mem +
+ (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1)
for (int i = 0; i < num_experts; ++i) {
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -53,10 +55,12 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
__syncthreads();
// For each expert we accumulate the token counts from the different threads.
- tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
- for (int i = 1; i <= blockDim.x; ++i) {
- tokens_cnts[index(num_experts, i, threadIdx.x)] +=
- tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+ if (threadIdx.x < num_experts) {
+ tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
+ for (int i = 1; i <= blockDim.x; ++i) {
+ tokens_cnts[index(num_experts, i, threadIdx.x)] +=
+ tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
+ }
}
__syncthreads();
@@ -79,9 +83,11 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
* For each expert, each thread processes the tokens of the corresponding
* blocks and stores the corresponding expert_id for each block.
*/
- for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
- i += block_size) {
- expert_ids[i / block_size] = threadIdx.x;
+ if (threadIdx.x < num_experts) {
+ for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
+ i += block_size) {
+ expert_ids[i / block_size] = threadIdx.x;
+ }
}
/**
@@ -106,6 +112,24 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
}
}
+
+template <typename scalar_t, int TOPK>
+__global__ void moe_sum_kernel(
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., topk, d]
+ const int d) {
+ const int64_t token_idx = blockIdx.x;
+ for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
+ scalar_t x = 0.0;
+#pragma unroll
+ for (int k = 0; k < TOPK; ++k) {
+ x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]);
+ }
+ out[token_idx * d + idx] = x;
+ }
+}
+
+} // namespace moe
} // namespace vllm
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
@@ -117,18 +141,62 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
// tensors
+ const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
const int32_t shared_mem =
- ((num_experts + 1) * num_experts + (num_experts + 1)) *
+ ((num_thread + 1) * num_experts + (num_experts + 1)) *
sizeof(int32_t);
// set dynamic shared mem
- auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
+ auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
(void*)kernel, shared_mem));
- kernel<<<1, num_experts, shared_mem, stream>>>(
+ kernel<<<1, num_thread, shared_mem, stream>>>(
topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
experts_ids.data_ptr<int32_t>(),
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
topk_ids.numel());
});
}
+
+void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size]
+ torch::Tensor& output) // [num_tokens, hidden_size]
+{
+ const int hidden_size = input.size(-1);
+ const int num_tokens = output.numel() / hidden_size;
+ const int topk = input.size(1);
+
+ dim3 grid(num_tokens);
+ dim3 block(std::min(hidden_size, 1024));
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(output));
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+ switch (topk) {
+ case 2:
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
+ vllm::moe::moe_sum_kernel<scalar_t, 2><<<grid, block, 0, stream>>>(
+ output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+ hidden_size);
+ });
+ break;
+
+ case 3:
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
+ vllm::moe::moe_sum_kernel<scalar_t, 3><<<grid, block, 0, stream>>>(
+ output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+ hidden_size);
+ });
+ break;
+
+ case 4:
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] {
+ vllm::moe::moe_sum_kernel<scalar_t, 4><<<grid, block, 0, stream>>>(
+ output.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
+ hidden_size);
+ });
+ break;
+
+ default:
+ at::sum_out(output, input, 1);
+ break;
+ }
+}
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index a251730aa765a..596cc0aa6c855 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -5,3 +5,10 @@
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
torch::Tensor& token_expert_indices,
torch::Tensor& gating_output);
+
+void moe_sum(torch::Tensor& input, torch::Tensor& output);
+
+void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
+ int64_t block_size, torch::Tensor sorted_token_ids,
+ torch::Tensor experts_ids,
+ torch::Tensor num_tokens_post_pad);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 019c6cedd3d80..f3a558c14ab93 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -8,6 +8,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"token_expert_indices, Tensor gating_output) -> ()");
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
+ // Calculate the result of moe by summing up the partial results
+ // from all selected experts.
+ m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+ m.impl("moe_sum", torch::kCUDA, &moe_sum);
+
+ // Aligning the number of tokens to be processed by each expert such
+ // that it is divisible by the block size.
+ m.def(
+ "moe_align_block_size(Tensor topk_ids, int num_experts,"
+ " int block_size, Tensor! sorted_token_ids,"
+ " Tensor! experts_ids,"
+ " Tensor! num_tokens_post_pad) -> ()");
+ m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
+
#ifndef USE_ROCM
m.def(
"marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, "
diff --git a/csrc/ops.h b/csrc/ops.h
index 11a2970695545..e0775ee1891df 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -5,6 +5,30 @@
#include "core/scalar_type.hpp"
+#include
+
+torch::Tensor weak_ref_tensor(torch::Tensor& tensor) {
+ // Ensure tensor is on CUDA
+ if (!tensor.is_cuda()) {
+ throw std::runtime_error("Tensor must be on CUDA device");
+ }
+
+ // Get the raw data pointer
+ void* data_ptr = tensor.data_ptr();
+
+ // Get tensor sizes and strides
+ std::vector<int64_t> sizes = tensor.sizes().vec();
+ std::vector<int64_t> strides = tensor.strides().vec();
+
+ // Get tensor options (dtype, device)
+ auto options = tensor.options();
+
+ // Create a new tensor from the raw data pointer
+ auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options);
+
+ return new_tensor;
+}
+
void paged_attention_v1(
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
@@ -145,11 +169,6 @@ void dynamic_per_token_scaled_fp8_quant(
torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale,
c10::optional const& scale_ub);
-void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
- int64_t block_size, torch::Tensor sorted_token_ids,
- torch::Tensor experts_ids,
- torch::Tensor num_tokens_post_pad);
-
void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta,
const torch::Tensor& A, const torch::Tensor& B,
const torch::Tensor& C,
@@ -180,20 +199,16 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight,
#ifndef USE_ROCM
using fptr_t = int64_t;
-fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
- const std::vector& handles,
- const std::vector& offsets, int64_t rank,
- bool full_nvlink);
-void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
-void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
- torch::Tensor& out);
+fptr_t init_custom_ar(const std::vector<fptr_t>& fake_ipc_ptrs,
+ torch::Tensor& rank_data, int64_t rank, bool full_nvlink);
+void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
+ fptr_t reg_buffer, int64_t reg_buffer_sz_bytes);
void dispose(fptr_t _fa);
int64_t meta_size();
-void register_buffer(fptr_t _fa, torch::Tensor& t,
- const std::vector& handles,
- const std::vector& offsets);
-std::tuple> get_graph_buffer_ipc_meta(
- fptr_t _fa);
-void register_graph_buffers(fptr_t _fa, const std::vector& handles,
+void register_buffer(fptr_t _fa, const std::vector<fptr_t>& fake_ipc_ptrs);
+std::tuple<std::vector<int64_t>, std::vector<int64_t>>
+get_graph_buffer_ipc_meta(fptr_t _fa);
+void register_graph_buffers(fptr_t _fa,
+ const std::vector<std::vector<int64_t>>& handles,
const std::vector<std::vector<int64_t>>& offsets);
#endif
diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py
index ebbe76cfb944a..d126af1849024 100644
--- a/csrc/quantization/machete/generate.py
+++ b/csrc/quantization/machete/generate.py
@@ -468,7 +468,7 @@ def generate():
impl_configs = []
GPTQ_kernel_type_configs = list(
- (TypeConfig(
+ TypeConfig(
element_a=element_a,
element_b=element_b,
element_b_scale=element_a,
@@ -476,7 +476,7 @@ def generate():
element_d=element_a,
accumulator=DataType.f32,
) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128)
- for element_a in (DataType.f16, DataType.bf16)))
+ for element_a in (DataType.f16, DataType.bf16))
GPTQ_kernel_specializations = [
Specialization(with_C=False, with_zeropoints=False, with_scales=True)
@@ -490,7 +490,7 @@ def generate():
]
AWQ_kernel_type_configs = list(
- (TypeConfig(
+ TypeConfig(
element_a=element_a,
element_b=element_b,
element_b_scale=element_a,
@@ -498,7 +498,7 @@ def generate():
element_d=element_a,
accumulator=DataType.f32,
) for element_b in (DataType.u4, DataType.u8)
- for element_a in (DataType.f16, DataType.bf16)))
+ for element_a in (DataType.f16, DataType.bf16))
AWQ_kernel_specializations = [
Specialization(with_C=False, with_zeropoints=True, with_scales=True)
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 826f918c82e78..971a45d50ffa4 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -18,6 +18,9 @@
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops
+ ops.def("weak_ref_tensor(Tensor input) -> Tensor");
+ ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
+
// Attention ops
// Compute the attention between an input query and the cached
// keys/values using PagedAttention.
@@ -336,15 +339,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
&dynamic_per_token_scaled_fp8_quant);
- // Aligning the number of tokens to be processed by each expert such
- // that it is divisible by the block size.
- ops.def(
- "moe_align_block_size(Tensor topk_ids, int num_experts,"
- " int block_size, Tensor! sorted_token_ids,"
- " Tensor! experts_ids,"
- " Tensor! num_tokens_post_pad) -> ()");
- ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
-
// Compute int8 quantized tensor for given scaling factor.
ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
@@ -417,27 +411,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels
custom_ar.def(
- "init_custom_ar(Tensor meta, Tensor rank_data, "
- "str[] handles, int[] offsets, int rank, "
- "bool full_nvlink) -> int");
+ "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
+ "int rank, bool full_nvlink) -> int");
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
-
- custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
- custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
-
custom_ar.def(
- "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> "
- "()");
- custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
+ "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
+ "int reg_buffer_sz_bytes) -> ()");
+ custom_ar.impl("all_reduce", torch::kCUDA, &all_reduce);
custom_ar.def("dispose", &dispose);
custom_ar.def("meta_size", &meta_size);
- custom_ar.def(
- "register_buffer(int fa, Tensor t, str[] handles, "
- "int[] offsets) -> ()");
- custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer);
-
+ custom_ar.def("register_buffer", ®ister_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", ®ister_graph_buffers);
}
diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index d58f226136918..e3e35844405ac 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -13,5 +13,7 @@ torch
py-cpuinfo
transformers
mistral_common >= 1.3.4
+aiohttp
+starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 8435129e752e1..96ad9a4c26b09 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -96,7 +96,6 @@ def setup(app):
# Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [
- "aiohttp",
"compressed_tensors",
"cpuinfo",
"cv2",
@@ -117,6 +116,7 @@ def setup(app):
"soundfile",
"gguf",
"lark",
+ "decord",
]
for mock_target in autodoc_mock_imports:
@@ -143,6 +143,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
"python": ("https://docs.python.org/3", None),
"typing_extensions":
("https://typing-extensions.readthedocs.io/en/latest", None),
+ "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
"pillow": ("https://pillow.readthedocs.io/en/stable", None),
"numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable", None),
diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst
new file mode 100644
index 0000000000000..334e0287aff09
--- /dev/null
+++ b/docs/source/dev/pooling_params.rst
@@ -0,0 +1,5 @@
+Pooling Parameters
+==================
+
+.. autoclass:: vllm.PoolingParams
+ :members:
diff --git a/docs/source/dev/profiling/profiling_index.rst b/docs/source/dev/profiling/profiling_index.rst
index 9e8b2f1817567..a422b1fcda521 100644
--- a/docs/source/dev/profiling/profiling_index.rst
+++ b/docs/source/dev/profiling/profiling_index.rst
@@ -1,5 +1,6 @@
-Profiling vLLM
-=================================
+==============
+Profiling vLLM
+==============
We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/``
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index 301337aebcf4c..ece5d785e0c65 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -13,8 +13,6 @@ Requirements
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* ROCm 6.2
-Note: PyTorch 2.5+/ROCm6.2 dropped the support for python 3.8.
-
Installation options:
#. :ref:`Build from source with docker `
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index d12aeebbbc184..69530fd778c55 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -3,13 +3,13 @@
Installation with CPU
========================
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features:
+vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features:
- Tensor Parallel (``-tp = N``)
- Quantization (``INT8 W8A8, AWQ``)
.. note::
- FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon.
+ More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon.
Table of contents:
@@ -72,8 +72,6 @@ Build from source
$ VLLM_TARGET_DEVICE=cpu python setup.py install
.. note::
- - BF16 is the default data type in the current CPU backend (that means the backend will cast FP16 to BF16), and is compatible will all CPUs with AVX512 ISA support.
-
- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building.
diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
new file mode 100644
index 0000000000000..68c1a56660fa4
--- /dev/null
+++ b/docs/source/getting_started/gaudi-installation.rst
@@ -0,0 +1,402 @@
+Installation with Intel® Gaudi® AI Accelerators
+===============================================
+
+This page provides instructions on running vLLM with Intel Gaudi devices.
+
+Requirements and Installation
+=============================
+
+Please follow the instructions provided in the `Gaudi Installation
+Guide `__
+to set up the execution environment. To achieve the best performance,
+please follow the methods outlined in the `Optimizing Training Platform
+Guide `__.
+
+Requirements
+------------
+
+- OS: Ubuntu 22.04 LTS
+- Python: 3.10
+- Intel Gaudi accelerator
+- Intel Gaudi software version 1.18.0
+
+
+Quick start using Dockerfile
+----------------------------
+.. code:: console
+
+ $ docker build -f Dockerfile.hpu -t vllm-hpu-env .
+ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+
+
+.. tip::
+ If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered.
+
+
+Build from source
+-----------------
+
+Environment verification
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To verify that the Intel Gaudi software was correctly installed, run:
+
+.. code:: console
+
+ $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible
+ $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed
+ $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed
+ $ pip list | grep neural # verify that neural_compressor is installed
+
+Refer to `Intel Gaudi Software Stack
+Verification `__
+for more details.
+
+Run Docker Image
+~~~~~~~~~~~~~~~~
+
+It is highly recommended to use the latest Docker image from Intel Gaudi
+vault. Refer to the `Intel Gaudi
+documentation `__
+for more details.
+
+Use the following commands to run a Docker image:
+
+.. code:: console
+
+ $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+Build and Install vLLM
+~~~~~~~~~~~~~~~~~~~~~~
+
+To build and install vLLM from source, run:
+
+.. code:: console
+
+ $ git clone https://github.com/vllm-project/vllm.git
+ $ cd vllm
+ $ python setup.py develop
+
+
+Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to the vLLM main repo. To install the latest `HabanaAI/vLLM-fork `__, run the following:
+
+.. code:: console
+
+ $ git clone https://github.com/HabanaAI/vllm-fork.git
+ $ cd vllm-fork
+ $ git checkout habana_main
+ $ python setup.py develop
+
+
+Supported Features
+==================
+
+- `Offline batched
+ inference `__
+- Online inference via `OpenAI-Compatible
+ Server `__
+- HPU autodetection - no need to manually select device within vLLM
+- Paged KV cache with algorithms enabled for Intel Gaudi accelerators
+- Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
+ prefill attention, Root Mean Square Layer Normalization, Rotary
+ Positional Encoding
+- Tensor parallelism support for multi-card inference
+- Inference with `HPU Graphs `__
+ for accelerating low-batch latency and throughput
+- Attention with Linear Biases (ALiBi)
+
+Unsupported Features
+====================
+
+- Beam search
+- LoRA adapters
+- Quantization
+- Prefill chunking (mixed-batch inferencing)
+
+Supported Configurations
+========================
+
+The following configurations have been validated to function with
+Gaudi2 devices. Configurations that are not listed may or may not work.
+
+- `meta-llama/Llama-2-7b `__
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- `meta-llama/Llama-2-7b-chat-hf `__
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3-8B `__
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3-8B-Instruct `__
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3.1-8B `__
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3.1-8B-Instruct `__
+ on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16
+ datatype with random or greedy sampling
+- `meta-llama/Llama-2-70b `__
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- `meta-llama/Llama-2-70b-chat-hf `__
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3-70B `__
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3-70B-Instruct `__
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3.1-70B `__
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+- `meta-llama/Meta-Llama-3.1-70B-Instruct `__
+ with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling
+
+Performance Tuning
+==================
+
+Execution modes
+---------------
+
+Currently, vLLM for HPU supports four execution modes, depending on the selected HPU PyTorch Bridge backend (via the ``PT_HPU_LAZY_MODE`` environment variable) and the ``--enforce-eager`` flag.
+
+.. list-table:: vLLM execution modes
+ :widths: 25 25 50
+ :header-rows: 1
+
+ * - ``PT_HPU_LAZY_MODE``
+ - ``enforce_eager``
+ - execution mode
+ * - 0
+ - 0
+ - torch.compile
+ * - 0
+ - 1
+ - PyTorch eager mode
+ * - 1
+ - 0
+ - HPU Graphs
+ * - 1
+ - 1
+ - PyTorch lazy mode
+
+.. warning::
+ In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+
+
+Bucketing mechanism
+-------------------
+
+Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
+In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``.
+
+.. note::
+ Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
+
+Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
+
+.. code-block::
+
+ INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+ INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+
+``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, the interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two until ``step`` is reached. We call this the ramp-up phase, and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+
+Example (with ramp-up)
+
+.. code-block::
+
+ min = 2, step = 32, max = 64
+ => ramp_up = (2, 4, 8, 16)
+ => stable = (32, 64)
+ => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
+
+Example (without ramp-up)
+
+.. code-block::
+
+ min = 128, step = 128, max = 512
+ => ramp_up = ()
+ => stable = (128, 256, 384, 512)
+ => buckets = ramp_up + stable => (128, 256, 384, 512)
+
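+The same rule can be expressed as a short Python sketch (an illustration of the bucketing description above, not vLLM's actual implementation):
+
+.. code-block:: python
+
+   def generate_buckets(bmin: int, step: int, bmax: int) -> list[int]:
+       # Ramp-up phase: powers of two starting from bmin, up to (but not including) step.
+       ramp_up = []
+       value = bmin
+       while value < step:
+           ramp_up.append(value)
+           value *= 2
+       # Stable phase: multiples of step, from step up to bmax.
+       stable = list(range(step, bmax + 1, step))
+       return ramp_up + stable
+
+   print(generate_buckets(2, 32, 64))      # [2, 4, 8, 16, 32, 64]
+   print(generate_buckets(128, 128, 512))  # [128, 256, 384, 512]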
+
+In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
+
+.. warning::
+ If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
+
+As an example, if a request of 3 sequences, with a max sequence length of 412, comes in to an idle vLLM server, it will be padded and executed as a ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (the closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (the closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a ``(4, 512)`` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished) - in which case it will become a ``(2, 512)`` bucket - or the context length increases above 512 tokens, in which case it will become a ``(4, 640)`` bucket.
+
+.. note::
+ Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
+
+Warmup
+------
+
+Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
+
+.. code-block::
+
+ INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
+ INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
+ INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
+ ...
+ INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+ INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
+ INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
+ INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
+ ...
+ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB
+ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+
+This example uses the same buckets as in the *Bucketing mechanism* section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
+
+.. tip::
+ Compiling all the buckets might take some time and can be turned off with the ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations when executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
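+
+For example, a development run can skip warmup by setting the variable in the launching process (a convenience sketch; exporting it in the shell works equally well):
+
+.. code-block:: python
+
+   import os
+
+   # Skip warmup; expect graph compilations the first time each bucket is executed.
+   os.environ["VLLM_SKIP_WARMUP"] = "true"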
+
+HPU Graph capture
+-----------------
+
+`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management.
+
+
+When HPU Graphs are being used, they share the common memory pool ("usable memory") with the KV cache, as determined by the ``gpu_memory_utilization`` flag (``0.9`` by default).
+Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage.
+Only after that is the ``gpu_memory_utilization`` flag applied - at its default value, it will mark 90% of the free device memory at that point as usable.
+Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
+Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture.
+With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
+Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints.
+A lower value corresponds to less usable graph memory reserved for the prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
+
+.. note::
+ ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
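+
+As a rough illustration of how these knobs compose (example numbers only; the variable names follow the description above, and ``VLLM_GRAPH_PROMPT_RATIO=0.2`` is the non-default example value mentioned earlier):
+
+.. code-block:: python
+
+   free_after_profiling = 80.0    # GiB free after weights are loaded and the profile run is done
+   gpu_memory_utilization = 0.9   # default
+   graph_reserved_ratio = 0.1     # VLLM_GRAPH_RESERVED_MEM default
+   prompt_ratio = 0.2             # example VLLM_GRAPH_PROMPT_RATIO value
+
+   usable = free_after_profiling * gpu_memory_utilization  # 72.0 GiB of "usable memory"
+   graph_mem = usable * graph_reserved_ratio                # 7.2 GiB of "usable graph memory"
+   kv_cache_mem = usable - graph_mem                        # 64.8 GiB for KV cache
+   prompt_graph_mem = graph_mem * prompt_ratio              # 1.44 GiB for prefill graphs
+   decode_graph_mem = graph_mem - prompt_graph_mem          # 5.76 GiB for decode graphs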
+
+The user can also configure the strategy for capturing HPU Graphs for the prompt and decode stages separately. The strategy affects the order in which graphs are captured. There are two strategies implemented:
+
+- ``max_bs`` - the graph capture queue is sorted in descending order by batch size. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``); this is the default strategy for decode
+- ``min_tokens`` - the graph capture queue is sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``); this is the default strategy for prompt
+
+When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, the decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by the ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the ``min_tokens`` strategy.
+
+
+.. note::
+ ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, and next it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of this mechanism can be observed in the example below.
+
+
+Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
+
+.. code-block::
+
+ INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024]
+ INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)]
+ INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048]
+ INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+ INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+ INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used)
+ INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used)
+ INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used)
+ INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache
+ INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0
+ INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used)
+ INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB
+ ...
+ INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
+ INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3)
+ INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+ ...
+ INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB
+ INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB
+ ...
+ INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB
+ INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB
+ INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB
+ INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB
+ INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB
+ INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)]
+ INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)]
+ INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory
+ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used)
+
+
+Recommended vLLM Parameters
+---------------------------
+
+- We recommend running inference on Gaudi 2 with ``block_size`` of 128
+ for BF16 data type. Using default values (16, 32) might lead to
+ sub-optimal performance due to Matrix Multiplication Engine
+ under-utilization (see `Gaudi
+ Architecture `__).
+- For max throughput on Llama 7B, we recommend running with a batch size
+ of 128 or 256 and a max context length of 2048 with HPU Graphs enabled
+ (see the example below). If you encounter out-of-memory issues, see the
+ troubleshooting section.
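+
+A minimal offline-inference sketch following these recommendations (the model name is just an example taken from the supported configurations; other engine arguments are left at their defaults):
+
+.. code-block:: python
+
+   from vllm import LLM, SamplingParams
+
+   llm = LLM(
+       model="meta-llama/Llama-2-7b-chat-hf",
+       dtype="bfloat16",
+       block_size=128,      # recommended block size on Gaudi 2
+       max_num_seqs=256,    # max batch size; try 128 if you run out of memory
+       max_model_len=2048,  # max context length
+   )
+
+   outputs = llm.generate(["Hello, my name is"],
+                          SamplingParams(temperature=0.8, max_tokens=64))
+   print(outputs[0].outputs[0].text)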
+
+Environment variables
+---------------------
+
+**Diagnostic and profiling knobs:**
+
+- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default.
+- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations for each vLLM engine step, but only when there were any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default.
+- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations for each vLLM engine step, always, even if there were none. Disabled by default.
+- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log CPU fallbacks for each vLLM engine step, but only when there were any. Disabled by default.
+- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log CPU fallbacks for each vLLM engine step, always, even if there were none. Disabled by default.
+
+**Performance tuning knobs:**
+
+- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
+- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
+- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default
+- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
+- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
+- ``VLLM_{phase}_{dim}_BUCKET_{param}``: a collection of 12 environment variables configuring the ranges of the bucketing mechanism (see the sketch after this list)
+
+ - ``{phase}`` is either ``PROMPT`` or ``DECODE``
+ - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK``
+ - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX``
+ - Default values:
+
+ - Prompt:
+ - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1``
+ - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
+ - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)``
+ - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size``
+ - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size``
+ - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len``
+
+ - Decode:
+ - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1``
+ - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
+ - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs``
+ - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size``
+ - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size``
+ - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
+
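+A minimal sketch of overriding a few of these knobs (the values below are purely illustrative, not recommendations); the variables must be set before the vLLM engine is initialized:
+
+.. code-block:: python
+
+    import os
+
+    # Hypothetical example: widen the decode batch-size bucketing range.
+    os.environ["VLLM_DECODE_BS_BUCKET_MIN"] = "1"
+    os.environ["VLLM_DECODE_BS_BUCKET_STEP"] = "32"
+    os.environ["VLLM_DECODE_BS_BUCKET_MAX"] = "128"
+
+    from vllm import LLM
+
+    llm = LLM(model="meta-llama/Llama-2-7b-hf")  # placeholder model name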
+
+Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
+
+- ``PT_HPU_LAZY_MODE``: if ``0``, the PyTorch eager backend for Gaudi is used; if ``1``, the PyTorch lazy backend for Gaudi is used. ``1`` is the default.
+- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs
+
+Troubleshooting: Tweaking HPU Graphs
+====================================
+
+If you experience device out-of-memory issues or want to attempt
+inference at higher batch sizes, try tweaking HPU Graphs as follows:
+
+- Tweak the ``gpu_memory_utilization`` knob. Lowering it decreases the
+  KV cache allocation, leaving some headroom for capturing graphs with
+  a larger batch size. By default, ``gpu_memory_utilization`` is set to
+  0.9, which attempts to allocate ~90% of the HBM left for the KV cache
+  after a short profiling run. Note that decreasing it reduces the
+  number of KV cache blocks available and therefore the effective
+  maximum number of tokens you can handle at a given time.
+
+- If this method is not sufficient, you can disable ``HPUGraph``
+  completely. With HPU Graphs disabled, you are trading latency and
+  throughput at lower batch sizes for potentially higher throughput at
+  higher batch sizes. You can do that by adding the ``--enforce-eager``
+  flag to the server (for online inference), or by passing the
+  ``enforce_eager=True`` argument to the ``LLM`` constructor (for
+  offline inference), as shown in the sketch below.
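+
+A minimal sketch of the offline-inference variant (the model name is a placeholder):
+
+.. code-block:: python
+
+    from vllm import LLM
+
+    # Run offline inference with HPU Graphs disabled.
+    llm = LLM(model="meta-llama/Llama-2-7b-hf", enforce_eager=True)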
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index a706b285edede..f02626bda4c64 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -10,7 +10,7 @@ Requirements
============
* OS: Linux
-* Python: 3.8 - 3.12
+* Python: 3.9 -- 3.12
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
Install released versions
@@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata.
+Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
Another way to access the latest code is to use the docker images:
@@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
.. tip::
Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
- For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` .
+ For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` .
As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
@@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example
$ export MAX_JOBS=6
$ pip install -e .
-This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
-A side effect is a much slower build process.
+This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
+A side effect is a much slower build process.
Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
Unsupported OS build
--------------------
-vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
+vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing:
diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst
index ec99fc013057b..025ba6ef7ebd8 100644
--- a/docs/source/getting_started/neuron-installation.rst
+++ b/docs/source/getting_started/neuron-installation.rst
@@ -11,7 +11,7 @@ Requirements
------------
* OS: Linux
-* Python: 3.8 -- 3.11
+* Python: 3.9 -- 3.11
* Accelerator: NeuronCore_v2 (in trn1/inf2 instances)
* Pytorch 2.0.1/2.1.1
* AWS Neuron SDK 2.16/2.17 (Verified on python 3.8)
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index 80b19ac672936..0c0491c860563 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -1,38 +1,50 @@
.. _quickstart:
+==========
Quickstart
==========
-This guide shows how to use vLLM to:
+This guide will help you quickly get started with vLLM to:
-* run offline batched inference on a dataset;
-* build an API server for a large language model;
-* start an OpenAI-compatible API server.
+* :ref:`Run offline batched inference `
+* :ref:`Run OpenAI-compatible inference `
-Be sure to complete the :ref:`installation instructions ` before continuing with this guide.
+Prerequisites
+--------------
+- OS: Linux
+- Python: 3.9 -- 3.12
+- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
-.. note::
+Installation
+--------------
+
+You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments.
+
+.. code-block:: console
- By default, vLLM downloads model from `HuggingFace `_. If you would like to use models from `ModelScope `_ in the following examples, please set the environment variable:
+ $ conda create -n myenv python=3.10 -y
+ $ conda activate myenv
+ $ pip install vllm
- .. code-block:: shell
+Please refer to the :ref:`installation documentation ` for more details on installing vLLM.
- export VLLM_USE_MODELSCOPE=True
+.. _offline_batched_inference:
Offline Batched Inference
-------------------------
-We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
+With vLLM installed, you can start generating texts for a list of input prompts (i.e. offline batched inference). The example script for this section can be found `here `__.
+
+The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`:
-Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
-The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
-The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.
+- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine.
+- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process.
.. code-block:: python
from vllm import LLM, SamplingParams
-Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_.
+The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__.
.. code-block:: python
@@ -44,46 +56,46 @@ Define the list of input prompts and the sampling parameters for generation. The
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `.
+The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `.
.. code-block:: python
llm = LLM(model="facebook/opt-125m")
-Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens.
+.. note::
+
+ By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine.
+
+Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens.
.. code-block:: python
outputs = llm.generate(prompts, sampling_params)
- # Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-The code example can also be found in `examples/offline_inference.py `_.
+.. _openai_compatible_server:
OpenAI-Compatible Server
------------------------
vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models `_, `create chat completion `_, and `create completion `_ endpoints. We are actively adding support for more endpoints.
+By default, it starts the server at ``http://localhost:8000``. You can specify the address with the ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_.
-Start the server:
+Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model:
.. code-block:: console
- $ vllm serve facebook/opt-125m
+ $ vllm serve Qwen/Qwen2.5-1.5B-Instruct
-By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
-
-.. code-block:: console
+.. note::
- $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja
+ By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__.
-This server can be queried in the same format as OpenAI API. For example, list the models:
+This server can be queried in the same format as OpenAI API. For example, to list the models:
.. code-block:: console
@@ -91,17 +103,17 @@ This server can be queried in the same format as OpenAI API. For example, list t
You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header.
-Using OpenAI Completions API with vLLM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+OpenAI Completions API with vLLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Query the model with input prompts:
+Once your server is started, you can query the model with input prompts:
.. code-block:: console
$ curl http://localhost:8000/v1/completions \
$ -H "Content-Type: application/json" \
$ -d '{
- $ "model": "facebook/opt-125m",
+ $ "model": "Qwen/Qwen2.5-1.5B-Instruct",
$ "prompt": "San Francisco is a",
$ "max_tokens": 7,
$ "temperature": 0
@@ -120,36 +132,32 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key,
base_url=openai_api_base,
)
- completion = client.completions.create(model="facebook/opt-125m",
+ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
print("Completion result:", completion)
-For a more detailed client example, refer to `examples/openai_completion_client.py `_.
-
-Using OpenAI Chat API with vLLM
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+A more detailed client example can be found `here `__.
-The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+OpenAI Chat Completions API with vLLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Querying the model using OpenAI Chat API:
+vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
-You can use the `create chat completion `_ endpoint to communicate with the model in a chat-like interface:
+You can use the `create chat completion `_ endpoint to interact with the model:
.. code-block:: console
$ curl http://localhost:8000/v1/chat/completions \
$ -H "Content-Type: application/json" \
$ -d '{
- $ "model": "facebook/opt-125m",
+ $ "model": "Qwen/Qwen2.5-1.5B-Instruct",
$ "messages": [
$ {"role": "system", "content": "You are a helpful assistant."},
$ {"role": "user", "content": "Who won the world series in 2020?"}
$ ]
$ }'
-Python Client Example:
-
-Using the `openai` python package, you can also communicate with the model in a chat-like manner:
+Alternatively, you can use the ``openai`` python package:
.. code-block:: python
@@ -164,12 +172,10 @@ Using the `openai` python package, you can also communicate with the model in a
)
chat_response = client.chat.completions.create(
- model="facebook/opt-125m",
+ model="Qwen/Qwen2.5-1.5B-Instruct",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
)
print("Chat response:", chat_response)
-
-For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation.
diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst
index 217028839e347..75ab2b6ba02dc 100644
--- a/docs/source/getting_started/tpu-installation.rst
+++ b/docs/source/getting_started/tpu-installation.rst
@@ -1,80 +1,164 @@
.. _installation_tpu:
+#####################
Installation with TPU
-=====================
+#####################
-vLLM supports Google Cloud TPUs using PyTorch XLA.
+Tensor Processing Units (TPUs) are Google's custom-developed application-specific
+integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
+are available in different versions, each with different hardware specifications.
+For more information about TPUs, see `TPU System Architecture `_.
+For more information on the TPU versions supported with vLLM, see:
+
+* `TPU v6e `_
+* `TPU v5e `_
+* `TPU v5p `_
+* `TPU v4 `_
+
+These TPU versions allow you to configure the physical arrangements of the TPU
+chips. This can improve throughput and networking performance. For more
+information see:
+
+* `TPU v6e topologies `_
+* `TPU v5e topologies `_
+* `TPU v5p topologies `_
+* `TPU v4 topologies `_
+
+To use Cloud TPUs, you need to have TPU quota granted to your
+Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a
+GCP project and are specified in terms of TPU version, the number of TPUs you
+want to use, and quota type. For more information, see `TPU quota `_.
+
+For TPU pricing information, see `Cloud TPU pricing `_.
+
+You may need additional persistent storage for your TPU VMs. For more
+information, see `Storage options for Cloud TPU data `_.
Requirements
------------
-* Google Cloud TPU VM (single & multi host)
-* TPU versions: v5e, v5p, v4
-* Python: 3.10
+* Google Cloud TPU VM
+* TPU versions: v6e, v5e, v5p, v4
+* Python: 3.10 or newer
-Installation options:
+Provision Cloud TPUs
+====================
-1. :ref:`Build a docker image with Dockerfile `.
-2. :ref:`Build from source `.
+You can provision Cloud TPUs using the `Cloud TPU API `_
+or the `queued resources `_
+API. This section shows how to create TPUs using the queued resource API.
+For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_.
+`Queued resources `_
+enable you to request Cloud TPU resources in a queued manner. When you request
+queued resources, the request is added to a queue maintained by the Cloud TPU
+service. When the requested resource becomes available, it's assigned to your
+Google Cloud project for your immediate exclusive use.
-.. _build_docker_tpu:
+Provision a Cloud TPU with the queued resource API
+--------------------------------------------------
+Create a TPU v5e with 4 TPU chips:
-Build a docker image with :code:`Dockerfile.tpu`
-------------------------------------------------
+.. code-block:: console
-`Dockerfile.tpu `_ is provided to build a docker image with TPU support.
+ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
+ --node-id TPU_NAME \
+ --project PROJECT_ID \
+ --zone ZONE \
+ --accelerator-type ACCELERATOR_TYPE \
+ --runtime-version RUNTIME_VERSION \
+ --service-account SERVICE_ACCOUNT
-.. code-block:: console
+.. list-table:: Parameter descriptions
+ :header-rows: 1
- $ docker build -f Dockerfile.tpu -t vllm-tpu .
+ * - Parameter name
+ - Description
+ * - QUEUED_RESOURCE_ID
+ - The user-assigned ID of the queued resource request.
+ * - TPU_NAME
+ - The user-assigned name of the TPU which is created when the queued
+ resource request is allocated.
+ * - PROJECT_ID
+ - Your Google Cloud project
+ * - ZONE
+ - The `zone `_ where you
+ want to create your Cloud TPU.
+ * - ACCELERATOR_TYPE
+ - The TPU version you want to use. Specify the TPU version, followed by a
+ '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU
+ with 4 cores. For more information, see `TPU versions `_.
+ * - RUNTIME_VERSION
+ - The TPU VM runtime version to use. For more information see `TPU VM images `_.
+ * - SERVICE_ACCOUNT
+ - The email address for your service account. You can find it in the IAM
+ Cloud Console under *Service Accounts*. For example:
+ `tpu-service-account@.iam.gserviceaccount.com`
+Connect to your TPU using SSH:
-You can run the docker image with the following command:
+.. code-block:: bash
-.. code-block:: console
+ gcloud compute tpus tpu-vm ssh TPU_NAME
- $ # Make sure to add `--privileged --net host --shm-size=16G`.
- $ docker run --privileged --net host --shm-size=16G -it vllm-tpu
+Create and activate a Conda environment for vLLM:
+.. code-block:: bash
-.. _build_from_source_tpu:
+ conda create -n vllm python=3.10 -y
+ conda activate vllm
-Build from source
------------------
+Clone the vLLM repository and go to the vLLM directory:
-You can also build and install the TPU backend from source.
+.. code-block:: bash
-First, install the dependencies:
+ git clone https://github.com/vllm-project/vllm.git && cd vllm
-.. code-block:: console
+Uninstall the existing `torch` and `torch_xla` packages:
+
+.. code-block:: bash
- $ # (Recommended) Create a new conda environment.
- $ conda create -n myenv python=3.10 -y
- $ conda activate myenv
+ pip uninstall torch torch-xla -y
- $ # Clean up the existing torch and torch-xla packages.
- $ pip uninstall torch torch-xla -y
+Install build dependencies:
- $ # Install PyTorch and PyTorch XLA.
- $ export DATE="20240828"
- $ export TORCH_VERSION="2.5.0"
- $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
- $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
+.. code-block:: bash
- $ # Install JAX and Pallas.
- $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
- $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+ pip install -r requirements-tpu.txt
+ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
- $ # Install other build dependencies.
- $ pip install -r requirements-tpu.txt
+Run the setup script:
+.. code-block:: bash
-Next, build vLLM from source. This will only take a few seconds:
+ VLLM_TARGET_DEVICE="tpu" python setup.py develop
+
+
+Provision Cloud TPUs with GKE
+-----------------------------
+
+For more information about using TPUs with GKE, see
+https://cloud.google.com/kubernetes-engine/docs/how-to/tpus
+https://cloud.google.com/kubernetes-engine/docs/concepts/tpus
+https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus
+
+.. _build_docker_tpu:
+
+Build a docker image with :code:`Dockerfile.tpu`
+------------------------------------------------
+
+You can use `Dockerfile.tpu `_
+to build a Docker image with TPU support.
.. code-block:: console
- $ VLLM_TARGET_DEVICE="tpu" python setup.py develop
+ $ docker build -f Dockerfile.tpu -t vllm-tpu .
+
+Run the Docker image with the following command:
+
+.. code-block:: console
+ $ # Make sure to add `--privileged --net host --shm-size=16G`.
+ $ docker run --privileged --net host --shm-size=16G -it vllm-tpu
.. note::
@@ -82,7 +166,6 @@ Next, build vLLM from source. This will only take a few seconds:
The compilation time may take 20~30 minutes in the first run.
However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default).
-
.. tip::
If you encounter the following error:
@@ -93,7 +176,7 @@ Next, build vLLM from source. This will only take a few seconds:
ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory
- Please install OpenBLAS with the following command:
+ Install OpenBLAS with the following command:
.. code-block:: console
diff --git a/docs/source/getting_started/xpu-installation.rst b/docs/source/getting_started/xpu-installation.rst
index 151ebb5f1811f..b1868acbc84b0 100644
--- a/docs/source/getting_started/xpu-installation.rst
+++ b/docs/source/getting_started/xpu-installation.rst
@@ -60,3 +60,21 @@ Build from source
- FP16 is the default data type in the current XPU backend. The BF16 data
type will be supported in the future.
+
+Distributed inference and serving
+---------------------------------
+
+The XPU platform supports tensor-parallel inference/serving, and also supports pipeline parallelism as a beta feature for online serving. We require Ray as the distributed runtime backend. For example, a reference execution looks like the following:
+
+.. code-block:: console
+
+ $ python -m vllm.entrypoints.openai.api_server \
+ $ --model=facebook/opt-13b \
+ $ --dtype=bfloat16 \
+ $ --device=xpu \
+ $ --max_model_len=1024 \
+ $ --distributed-executor-backend=ray \
+ $ --pipeline-parallel-size=2 \
+ $ -tp=8
+
+By default, a Ray instance will be launched automatically if no existing one is detected in the system, with ``num-gpus`` equal to ``parallel_config.world_size``. We recommend properly starting a Ray cluster before execution; refer to the helper `script `_.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c328c049b430c..38dad25e18c02 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,7 +43,7 @@ vLLM is flexible and easy to use with:
* Tensor parallelism and pipeline parallelism support for distributed inference
* Streaming outputs
* OpenAI-compatible API server
-* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
* Prefix caching support
* Multi-lora support
@@ -66,6 +66,7 @@ Documentation
getting_started/amd-installation
getting_started/openvino-installation
getting_started/cpu-installation
+ getting_started/gaudi-installation
getting_started/neuron-installation
getting_started/tpu-installation
getting_started/xpu-installation
@@ -125,15 +126,16 @@ Documentation
.. toctree::
:maxdepth: 1
- :caption: Performance benchmarks
+ :caption: Performance
- performance_benchmark/benchmarks
+ performance/benchmarks
.. toctree::
:maxdepth: 2
:caption: Developer Documentation
dev/sampling_params
+ dev/pooling_params
dev/offline_inference/offline_index
dev/engine/engine_index
dev/kernel/paged_attention
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index ae09259c0756c..c6d88cc38e99b 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -133,7 +133,9 @@ If you are running api server with :code:`vllm serve `, you can wrap the e
from vllm import ModelRegistry
from your_code import YourModelForCausalLM
ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
- import runpy
- runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
+
+ if __name__ == '__main__':
+ import runpy
+ runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__')
Save the above code in a file and run it with :code:`python your_file.py `.
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 98d804052b575..5a474043078db 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -160,13 +160,13 @@ Text Generation
-
- ✅︎
* - :code:`GraniteForCausalLM`
- - PowerLM
- - :code:`ibm/PowerLM-3b` etc.
+ - Granite 3.0, PowerLM
+ - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc.
- ✅︎
- ✅︎
* - :code:`GraniteMoeForCausalLM`
- - PowerMoE
- - :code:`ibm/PowerMoE-3b` etc.
+ - Granite 3.0 MoE, PowerMoE
+ - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc.
- ✅︎
- ✅︎
* - :code:`InternLMForCausalLM`
@@ -277,11 +277,11 @@ Text Generation
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
- -
+ - ✅︎
- ✅︎
* - :code:`Qwen2ForCausalLM`
- Qwen2
- - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
+ - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc.
- ✅︎
- ✅︎
* - :code:`Qwen2MoeForCausalLM`
@@ -333,7 +333,7 @@ Text Embedding
* - :code:`MistralModel`
- Mistral-based
- :code:`intfloat/e5-mistral-7b-instruct`, etc.
- -
+ - ✅︎
- ✅︎
.. important::
@@ -361,6 +361,28 @@ Reward Modeling
.. note::
As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes.
+Classification
+---------------
+
+.. list-table::
+ :widths: 25 25 50 5 5
+ :header-rows: 1
+
+ * - Architecture
+ - Models
+ - Example HF Models
+ - :ref:`LoRA `
+ - :ref:`PP `
+ * - :code:`Qwen2ForSequenceClassification`
+ - Qwen2-based
+ - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc.
+ -
+ - ✅︎
+
+.. note::
+  As an interim measure, these models are supported via the Embeddings API. They will be supported via a Classification API in the future (no reference APIs exist now).
+
+
Multimodal Language Models
^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -418,6 +440,18 @@ Text Generation
- :code:`THUDM/glm-4v-9b` etc.
-
- ✅︎
+ * - :code:`H2OVLChatModel`
+ - H2OVL
+ - T + I\ :sup:`E+`
+ - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc.
+ -
+ - ✅︎
+ * - :code:`Idefics3ForConditionalGeneration`
+ - Idefics3
+ - T + I
+ - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc.
+ -
+ -
* - :code:`InternVLChatModel`
- InternVL2
- T + I\ :sup:`E+`
@@ -444,7 +478,7 @@ Text Generation
- ✅︎
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- - T + I\ :sup:`+` + V
+ - T + I\ :sup:`+` + V\ :sup:`+`
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-
- ✅︎
@@ -456,7 +490,7 @@ Text Generation
- ✅︎
* - :code:`MllamaForConditionalGeneration`
- Llama 3.2
- - T + I
+ - T + I\ :sup:`+`
- :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc.
-
-
@@ -494,7 +528,7 @@ Text Generation
- Qwen-VL
- T + I\ :sup:`E+`
- :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc.
- -
+ - ✅︎
- ✅︎
* - :code:`Qwen2AudioForConditionalGeneration`
- Qwen2-Audio
@@ -506,7 +540,7 @@ Text Generation
- Qwen2-VL
- T + I\ :sup:`E+` + V\ :sup:`+`
- :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc.
- -
+ - ✅︎
- ✅︎
* - :code:`UltravoxModel`
- Ultravox
@@ -518,6 +552,9 @@ Text Generation
| :sup:`E` Pre-computed embeddings can be inputted for this modality.
| :sup:`+` Multiple items can be inputted per text prompt for this modality.
+.. note::
+ vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+
.. note::
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index a47902ab4fc9d..112e9db6a41de 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -185,7 +185,7 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
.. important::
- Since OpenAI Vision API is based on `Chat Completions `_ API,
+ Since OpenAI Vision API is based on `Chat Completions API `_,
a chat template is **required** to launch the API server.
Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it.
@@ -240,8 +240,15 @@ To consume the server, you can use the OpenAI client like in the example below:
)
print("Chat completion output:", chat_response.choices[0].message.content)
+A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_.
-A full code example can be found in `examples/openai_api_client_for_multimodal.py `_.
+.. tip::
+ Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine,
+ and pass the file path as ``url`` in the API request.
+
+.. tip::
+ There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
+ In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
.. note::
@@ -251,5 +258,56 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p
$ export VLLM_IMAGE_FETCH_TIMEOUT=
-.. note::
- There is no need to format the prompt in the API request since it will be handled by the server.
+Chat Embeddings API
+^^^^^^^^^^^^^^^^^^^
+
+vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_,
+where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models.
+
+.. tip::
+ The schema of ``messages`` is exactly the same as in Chat Completions API.
+
+In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model.
+
+.. code-block:: bash
+
+ vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \
+ --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
+
+.. important::
+
+ Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding``
+ to run this model in embedding mode instead of text generation mode.
+
+.. important::
+
+ VLM2Vec does not expect chat-based input. We use a `custom chat template `_
+ to combine the text and images together.
+
+Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level ``requests`` library:
+
+.. code-block:: python
+
+ import requests
+
+ image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+ response = requests.post(
+ "http://localhost:8000/v1/embeddings",
+ json={
+ "model": "TIGER-Lab/VLM2Vec-Full",
+ "messages": [{
+ "role": "user",
+ "content": [
+ {"type": "image_url", "image_url": {"url": image_url}},
+ {"type": "text", "text": "Represent the given image."},
+ ],
+ }],
+ "encoding_format": "float",
+ },
+ )
+ response.raise_for_status()
+ response_json = response.json()
+ print("Embedding output:", response_json["data"][0]["embedding"])
+
+A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_.
diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst
new file mode 100644
index 0000000000000..6d4d7b544cb5d
--- /dev/null
+++ b/docs/source/performance/benchmarks.rst
@@ -0,0 +1,33 @@
+.. _benchmarks:
+
+================
+Benchmark Suites
+================
+
+vLLM contains two sets of benchmarks:
+
++ :ref:`Performance benchmarks `
++ :ref:`Nightly benchmarks `
+
+
+.. _performance_benchmarks:
+
+Performance Benchmarks
+----------------------
+
+The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM.
+
+The latest performance results are hosted on the public `vLLM Performance Dashboard `_.
+
+More information on the performance benchmarks and their parameters can be found `here `__.
+
+.. _nightly_benchmarks:
+
+Nightly Benchmarks
+------------------
+
+These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels.
+
+The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_.
+
+More information on the nightly benchmarks and their parameters can be found `here `__.
\ No newline at end of file
diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst
deleted file mode 100644
index e5c8d6a55de63..0000000000000
--- a/docs/source/performance_benchmark/benchmarks.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-.. _benchmarks:
-
-Benchmark suites of vLLM
-========================
-
-
-
-vLLM contains two sets of benchmarks:
-
-+ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results.
-
-+ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_.
-
-
-Trigger a benchmark
--------------------
-
-The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`.
-
-
-.. note::
-
- Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics.
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
index cac0605ca132b..f629b3ca78318 100644
--- a/docs/source/serving/compatibility_matrix.rst
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -283,7 +283,7 @@ Feature x Feature
- ✅
- ✅
- ✅
- - ✗
+ - `✗ `__
- ?
- ✅
- ✅
@@ -359,7 +359,7 @@ Feature x Hardware
- ✅
- ✅
- ✅
- - `✗ `__
+ - ✅
- ✗
* - :abbr:`logP (Logprobs)`
- ✅
diff --git a/docs/source/serving/distributed_serving.rst b/docs/source/serving/distributed_serving.rst
index fcb2646df50d3..4d57206e53a05 100644
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@@ -22,7 +22,7 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh
Details for Distributed Inference and Serving
----------------------------------------------
-vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
+vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray.
Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case.
@@ -49,9 +49,6 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip
$ --tensor-parallel-size 4 \
$ --pipeline-parallel-size 2
-.. note::
- Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models.
-
Multi-Node Inference and Serving
--------------------------------
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 413c87ab28755..a196f8b1e574e 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -26,13 +26,26 @@ print(completion.choices[0].message)
```
## API Reference
-Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except:
-- Chat: `tools`, and `tool_choice`.
-- Completions: `suffix`.
-vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst).
+We currently support the following OpenAI APIs:
+
+- [Completions API](https://platform.openai.com/docs/api-reference/completions)
+ - *Note: `suffix` parameter is not supported.*
+- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
+ - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst).
+ - *Note: `image_url.detail` parameter is not supported.*
+ - We also support `audio_url` content type for audio files.
+ - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema.
+ - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).*
+ - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
+- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings)
+ - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API),
+ which will be treated as a single prompt to the model according to its chat template.
+ - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst).
+  - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode. A client-side sketch follows this list.*
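+
+For example, a minimal sketch of a plain text Embeddings API request with the OpenAI Python client (the model name below is a placeholder for an embedding model served with `--task embedding`):
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+responses = client.embeddings.create(
+    model="intfloat/e5-mistral-7b-instruct",  # placeholder embedding model
+    input=["Hello, my name is", "vLLM is a fast inference engine"],
+)
+print(len(responses.data[0].embedding))  # embedding dimension
+```
+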
## Extra Parameters
+
vLLM supports a set of parameters that are not part of the OpenAI API.
In order to use them, you can pass them as extra parameters in the OpenAI client.
Or directly merge them into the JSON payload if you are using HTTP call directly.
@@ -49,7 +62,26 @@ completion = client.chat.completions.create(
)
```
-### Extra Parameters for Chat API
+### Extra Parameters for Completions API
+
+The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-sampling-params
+:end-before: end-completion-sampling-params
+```
+
+The following extra parameters are supported:
+
+```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
+:language: python
+:start-after: begin-completion-extra-params
+:end-before: end-completion-extra-params
+```
+
+### Extra Parameters for Chat Completions API
+
The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
@@ -66,21 +98,22 @@ The following extra parameters are supported:
:end-before: end-chat-completion-extra-params
```
-### Extra Parameters for Completions API
-The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported.
+### Extra Parameters for Embeddings API
+
+The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported.
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
-:start-after: begin-completion-sampling-params
-:end-before: end-completion-sampling-params
+:start-after: begin-embedding-pooling-params
+:end-before: end-embedding-pooling-params
```
The following extra parameters are supported:
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
:language: python
-:start-after: begin-completion-extra-params
-:end-before: end-completion-extra-params
+:start-after: begin-embedding-extra-params
+:end-before: end-embedding-extra-params
```
## Chat Template
@@ -127,14 +160,7 @@ this, unless explicitly specified.
:func: create_parser_for_docs
:prog: vllm serve
```
-## Tool Calling in the Chat Completion API
-### Named Function Calling
-vLLM supports only named function calling in the chat completion API by default. It does so using Outlines, so this is
-enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a
-high-quality one.
-To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
-specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
### Config file
@@ -163,12 +189,22 @@ The order of priorities is `command line > config file values > defaults`.
---
## Tool calling in the chat completion API
-vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap.
+
+vLLM supports named function calling and `auto` tool choice in the chat completion API. The `tool_choice` option `required` is **not yet supported** but is on the roadmap.
It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt.
+
+### Named Function Calling
+vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this works
+with any supported model. You are guaranteed a validly-parsable function call, though not necessarily a
+high-quality one.
+
vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
+To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
+specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
+
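+A minimal sketch with the OpenAI Python client (the tool definition and model name below are illustrative placeholders, not part of vLLM):
+
+```python
+from openai import OpenAI
+
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+# Hypothetical tool used only for illustration.
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get the current weather for a city.",
+        "parameters": {
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"],
+        },
+    },
+}]
+
+chat_response = client.chat.completions.create(
+    model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model
+    messages=[{"role": "user", "content": "What is the weather in Berlin?"}],
+    tools=tools,
+    # Named function calling: force the model to call this specific tool.
+    tool_choice={"type": "function", "function": {"name": "get_weather"}},
+)
+print(chat_response.choices[0].message.tool_calls)
+```
+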
### Automatic Function Calling
To enable this feature, you should set the following flags:
@@ -185,7 +221,9 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso
If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!
+
#### Hermes Models (`hermes`)
+
All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported.
* `NousResearch/Hermes-2-Pro-*`
* `NousResearch/Hermes-2-Theta-*`
@@ -197,7 +235,9 @@ step in their creation_.
Flags: `--tool-call-parser hermes`
+
#### Mistral Models (`mistral`)
+
Supported models:
* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed)
* Additional mistral function-calling models are compatible as well.
@@ -216,7 +256,9 @@ when tools are provided, that results in much better reliability when working wi
Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja`
+
#### Llama Models (`llama3_json`)
+
Supported models:
* `meta-llama/Meta-Llama-3.1-8B-Instruct`
* `meta-llama/Meta-Llama-3.1-70B-Instruct`
@@ -236,7 +278,24 @@ it works better with vLLM.
Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja`
+#### IBM Granite
+
+Supported models:
+* `ibm-granite/granite-3.0-8b-instruct`
+
+Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja`
+
+`examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported.
+
+* `ibm-granite/granite-20b-functioncalling`
+
+Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja`
+
+`examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
+
+
#### InternLM Models (`internlm`)
+
Supported models:
* `internlm/internlm2_5-7b-chat` (confirmed)
* Additional internlm2.5 function-calling models are compatible as well
@@ -246,6 +305,7 @@ Known issues:
Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja`
+
#### Jamba Models (`jamba`)
AI21's Jamba-1.5 models are supported.
* `ai21labs/AI21-Jamba-1.5-Mini`
@@ -312,5 +372,5 @@ Then you can use this plugin in the command line like this.
--tool-parser-plugin
--tool-call-parser example \
--chat-template