diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
new file mode 100644
index 0000000000000..75ad094fa1382
--- /dev/null
+++ b/.buildkite/check-wheel-size.py
@@ -0,0 +1,36 @@
+import os
+import zipfile
+
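+# Maximum allowed wheel size in MB; wheels larger than this fail the check.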
+MAX_SIZE_MB = 200
+
+
+def print_top_10_largest_files(zip_file):
+    with zipfile.ZipFile(zip_file, 'r') as z:
+        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
+        file_sizes.sort(key=lambda x: x[1], reverse=True)
+        for f, size in file_sizes[:10]:
+            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")
+
+
+def check_wheel_size(directory):
+    for root, _, files in os.walk(directory):
+        for f in files:
+            if f.endswith(".whl"):
+                wheel_path = os.path.join(root, f)
+                wheel_size = os.path.getsize(wheel_path)
+                wheel_size_mb = wheel_size / (1024 * 1024)
+                if wheel_size_mb > MAX_SIZE_MB:
+                    print(
+                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                        f"compared to the allowed size ({MAX_SIZE_MB} MB).")
+                    print_top_10_largest_files(wheel_path)
+                    return 1
+                else:
+                    print(f"Wheel {wheel_path} is within the allowed size "
+                          f"({wheel_size_mb} MB).")
+    return 0
+
+
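+# Usage: python3 check-wheel-size.py <directory containing .whl files>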
+if __name__ == "__main__":
+    import sys
+    sys.exit(check_wheel_size(sys.argv[1]))
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 83a56e25aca73..bde8ab6184d3c 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,38 +1,73 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script runs the tests inside the corresponding ROCm docker container.
set -ex
# Print ROCm version
+echo "--- ROCm info"
rocminfo
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
+# Clean up old Docker images and volumes if disk usage is above the threshold.
+cleanup_docker() {
+  # Get Docker's root directory
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Check disk usage of the filesystem where Docker's root directory is located
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Define the threshold
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes
+    docker volume prune -f
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image
-docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
-
-# Wait for the server to start
-wait_for_server_to_start() {
- timeout=300
- counter=0
-
- while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
- sleep 1
- counter=$((counter + 1))
- if [ $counter -ge $timeout ]; then
- echo "Timeout after $timeout seconds"
- break
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
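+# Poll the GPU state file until the driver reports that the GPUs are clean.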
+while true; do
+    sleep 3
+    if grep -q clean /opt/amdgpu/etc/gpu_state; then
+        echo "GPUs state is \"clean\""
+        break
fi
- done
+done
+
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+image_name=rocm_${sha}
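+# Use the short git SHA plus a random suffix so concurrent runs get unique container names.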
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
+docker build \
+ -t ${image_name} \
+ -f Dockerfile.rocm \
+ --progress plain \
+ .
+
+remove_docker_container() {
+ docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
-wait_for_server_to_start
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+docker run \
+ --device /dev/kfd --device /dev/dri \
+ --network host \
+ --rm \
+ -e HF_TOKEN \
+ --name ${container_name} \
+ ${image_name} \
+ /bin/bash -c "${@}"
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
- localhost:8000/generate \
- -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index f6a542afe1a3d..1efc96395933f 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -53,6 +53,11 @@ echo '```' >> benchmark_results.md
tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
echo '```' >> benchmark_results.md
+# If the buildkite-agent binary is not found, skip uploading the results and exit 0.
+if [ ! -f /workspace/buildkite-agent ]; then
+ exit 0
+fi
+
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -69,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
-/workspace/buildkite-agent artifact upload openai-*.json
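+# Remove the downloaded dataset so it is not picked up by the artifact upload below.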
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index f187d1f181724..414045fe163e5 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 examples/offline_inference.py
+docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
new file mode 100644
index 0000000000000..252c0f7fecd12
--- /dev/null
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,51 @@
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+
+# Prune old images and containers to save disk space, but only once a day,
+# tracked via a timestamp file in /tmp.
+if [ -f /tmp/neuron-docker-build-timestamp ]; then
+    last_build=$(cat /tmp/neuron-docker-build-timestamp)
+    current_time=$(date +%s)
+    if [ $((current_time - last_build)) -gt 86400 ]; then
+        docker system prune -f
+        echo $current_time > /tmp/neuron-docker-build-timestamp
+    fi
+else
+    echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+fi
+
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
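+# Both Neuron devices are exposed and the model is sharded across them with tensor parallelism.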
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+ --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start
+wait_for_server_to_start() {
+    timeout=300
+    counter=0
+
+    while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ $counter -ge $timeout ]; then
+            echo "Timeout after $timeout seconds"
+            break
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt
+curl -X POST -H "Content-Type: application/json" \
+ localhost:8000/generate \
+ -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 27e44463a30a6..21cbd9ba13780 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -5,89 +5,155 @@
steps:
- label: Regression Test
+ mirror_hardwares: [amd]
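+  # mirror_hardwares: the template also generates an AMD copy of this step (see .buildkite/test-template.j2).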
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
+ #mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
- command: pytest -v -s basic_correctness
+ mirror_hardwares: [amd]
+ commands:
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
+ mirror_hardwares: [amd]
command: pytest -v -s core
- label: Distributed Comm Ops Test
- command: pytest -v -s test_comm_ops.py
- working_dir: "/vllm-workspace/tests/distributed"
- num_gpus: 2 # only support 1 or 2 for now.
+ #mirror_hardwares: [amd]
+ command: pytest -v -s distributed/test_comm_ops.py
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
- label: Distributed Tests
- working_dir: "/vllm-workspace/tests/distributed"
- num_gpus: 2 # only support 1 or 2 for now.
+ mirror_hardwares: [amd]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ commands:
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - pytest -v -s spec_decode/e2e/test_integration_dist.py
+
+- label: Distributed Tests (Multiple Groups)
+ #mirror_hardwares: [amd]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
commands:
- - pytest -v -s test_pynccl.py
- - TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
+ - pytest -v -s distributed/test_pynccl.py
- label: Engine Test
- command: pytest -v -s engine tokenization test_sequence.py test_config.py
+ mirror_hardwares: [amd]
+ command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
+ mirror_hardwares: [amd]
+
commands:
- # these tests have to be separated, because each one will allocate all posible GPU memory
- - pytest -v -s entrypoints --ignore=entrypoints/test_server_oot_registration.py
- - pytest -v -s entrypoints/test_server_oot_registration.py
+ - pytest -v -s test_inputs.py
+ - pytest -v -s entrypoints -m llm
+ - pytest -v -s entrypoints -m openai
- label: Examples Test
working_dir: "/vllm-workspace/examples"
+ mirror_hardwares: [amd]
commands:
# install aws cli for llava_example.py
- - pip install awscli
+ # install tensorizer for tensorize_vllm_model.py
+ - pip install awscli tensorizer
- python3 offline_inference.py
- python3 offline_inference_with_prefix.py
- python3 llm_engine_example.py
- python3 llava_example.py
+ - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- label: Kernels Test %N
+ #mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
+ #mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- - pytest -v -s models --ignore=models/test_llava.py --ignore=models/test_mistral.py
+ - pytest -v -s models --ignore=models/test_llava.py
- label: Llava Test
+ mirror_hardwares: [amd]
commands:
- bash ../.buildkite/download-images.sh
- pytest -v -s models/test_llava.py
- label: Prefix Caching Test
+ mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching
- label: Samplers Test
+ #mirror_hardwares: [amd]
command: pytest -v -s samplers
- label: LogitsProcessor Test
+ mirror_hardwares: [amd]
command: pytest -v -s test_logits_processor.py
+- label: Utils Test
+ command: pytest -v -s test_utils.py
+
- label: Worker Test
+ mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
+ #mirror_hardwares: [amd]
command: pytest -v -s spec_decode
- label: LoRA Test %N
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ #mirror_hardwares: [amd]
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
+- label: LoRA Long Context (Distributed)
+ #mirror_hardwares: [amd]
+ num_gpus: 4
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
+ commands:
+ # Temporarily run this way because we cannot clean up GPU mem usage
+ # for multi GPU tests.
+ # TODO(sang): Fix it.
+ - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
+ - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
+ - pytest -v -s lora/test_long_context.py::test_self_consistency
+ - pytest -v -s lora/test_long_context.py::test_quality
+ - pytest -v -s lora/test_long_context.py::test_max_len
+
+- label: Tensorizer Test
+ #mirror_hardwares: [amd]
+ command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+
- label: Metrics Test
+ mirror_hardwares: [amd]
command: pytest -v -s metrics
+- label: Quantization Test
+ #mirror_hardwares: [amd]
+ command: pytest -v -s quantization
+
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
+ mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 3ed23c62c005d..265833e2ccf6e 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -3,16 +3,8 @@
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- - label: "AMD Test"
- agents:
- queue: amd
- command: bash .buildkite/run-amd-test.sh
-
- - label: "CPU Test"
- command: bash .buildkite/run-cpu-test.sh
-
- label: ":docker: build image"
- commands:
+ commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
@@ -21,8 +13,35 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
- wait
+ - group: "AMD Tests"
+ depends_on: ~
+ steps:
+ {% for step in steps %}
+ {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
+ - label: "AMD: {{ step.label }}"
+ agents:
+ queue: amd
+ command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
+ env:
+ DOCKER_BUILDKIT: "1"
+ {% endif %}
+ {% endfor %}
+
+ - label: "Neuron Test"
+ depends_on: ~
+ agents:
+ queue: neuron
+ command: bash .buildkite/run-neuron-test.sh
+ soft_fail: true
+
+ - label: "Intel Test"
+ depends_on: ~
+ command: bash .buildkite/run-cpu-test.sh
+
{% for step in steps %}
- label: "{{ step.label }}"
agents:
@@ -35,9 +54,14 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
plugins:
- kubernetes:
podSpec:
+ {% if step.num_gpus %}
+ priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+ {% endif %}
volumes:
- name: dshm
emptyDir:
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000000..7f9e6d720fae5
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,26 @@
+BasedOnStyle: Google
+UseTab: Never
+IndentWidth: 2
+ColumnLimit: 80
+
+# Force pointers to the type for C++.
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Reordering #include statements can (and currently will) introduce errors
+SortIncludes: false
+
+# Style choices
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+IndentPPDirectives: BeforeHash
+
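+# The include categories below only take effect if SortIncludes is re-enabled; lower Priority values sort first.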
+IncludeCategories:
+ - Regex: '^<'
+ Priority: 4
+ - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
+ Priority: 3
+ - Regex: '^"(qoda|\.\.)/'
+ Priority: 2
+ - Regex: '.*'
+ Priority: 1
diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml
index 4c6c96187cc6c..df41ade8c3c01 100644
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and run the latest version of the script, as vLLM frequently updates the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml
index 88227b4b2e7b9..54763af1058f6 100644
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and run the latest version of the script, as vLLM frequently updates the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index f1124dfa78bbc..ce980c3f4a01d 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and run the latest version of the script, as vLLM frequently updates the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
@@ -57,6 +58,10 @@ body:
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+ Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging and help debug potential issues.
+
+ If you experience crashes or hangs, it would be helpful to run vLLM with `export VLLM_TRACE_FUNCTION=1`. All function calls in vLLM will then be recorded. Inspect these log files and report which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
index 9e8e7b4aa3530..4f8843420a94e 100644
--- a/.github/ISSUE_TEMPLATE/700-performance discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
@@ -39,6 +39,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and run the latest version of the script, as vLLM frequently updates the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
new file mode 100644
index 0000000000000..5382b124dcd79
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -0,0 +1,49 @@
+name: 💬 Request for comments (RFC).
+description: Ask for feedback on major architectural changes or design choices.
+title: "[RFC]: "
+labels: ["RFC"]
+
+body:
+- type: markdown
+ attributes:
+ value: >
+ #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
+- type: textarea
+ attributes:
+ label: Motivation.
+ description: >
+ The motivation of the RFC.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Proposed Change.
+ description: >
+ The proposed change of the RFC.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Feedback Period.
+ description: >
+ The feedback period of the RFC. Usually at least one week.
+ validations:
+ required: false
+- type: textarea
+ attributes:
+ label: CC List.
+ description: >
+ The list of people you want to CC.
+ validations:
+ required: false
+- type: textarea
+ attributes:
+ label: Any Other Things.
+ description: >
+ Any other things you would like to mention.
+ validations:
+ required: false
+- type: markdown
+ attributes:
+ value: >
+ Thanks for contributing 🎉!
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
new file mode 100644
index 0000000000000..e9b6e28fa6bcb
--- /dev/null
+++ b/.github/workflows/clang-format.yml
@@ -0,0 +1,42 @@
+name: clang-format
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+jobs:
+ clang-format:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.11"]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install clang-format==18.1.5
+ - name: Running clang-format
+ run: |
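+        # Build the exclusion list, then run clang-format in dry-run mode; --Werror turns any formatting diff into an error.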
+ EXCLUDES=(
+ 'csrc/moe/topk_softmax_kernels.cu'
+ 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+ 'csrc/punica/bgmv/bgmv_config.h'
+ 'csrc/punica/bgmv/bgmv_impl.cuh'
+ 'csrc/punica/bgmv/vec_dtypes.cuh'
+ 'csrc/punica/punica_ops.cu'
+ 'csrc/punica/type_convert.h'
+ )
+ find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
+ | xargs clang-format --dry-run --Werror
\ No newline at end of file
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
new file mode 100644
index 0000000000000..a20753d8a7702
--- /dev/null
+++ b/.github/workflows/mypy.yaml
@@ -0,0 +1,50 @@
+name: mypy
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+jobs:
+  mypy:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install mypy==1.9.0
+ pip install types-setuptools
+ pip install types-PyYAML
+ pip install types-requests
+ - name: Mypy
+ run: |
+ mypy vllm/attention --config-file pyproject.toml
+ mypy vllm/core --config-file pyproject.toml
+ mypy vllm/distributed --config-file pyproject.toml
+ mypy vllm/entrypoints --config-file pyproject.toml
+ mypy vllm/executor --config-file pyproject.toml
+ mypy vllm/usage --config-file pyproject.toml
+ mypy vllm/*.py --config-file pyproject.toml
+ mypy vllm/transformers_utils --config-file pyproject.toml
+ mypy vllm/engine --config-file pyproject.toml
+ mypy vllm/worker --config-file pyproject.toml
+ mypy vllm/spec_decode --config-file pyproject.toml
+ mypy vllm/model_executor --config-file pyproject.toml
+ mypy vllm/lora --config-file pyproject.toml
+ mypy vllm/logging --config-file pyproject.toml
+
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index fc97e33c19af2..9c35ede5f6781 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -49,13 +49,19 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
- pytorch-version: ['2.2.1'] # Must be the most recent version that meets requirements-cuda.txt.
+ pytorch-version: ['2.3.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']
steps:
- name: Checkout
uses: actions/checkout@v3
+ - name: Setup ccache
+ uses: hendrikmuhs/ccache-action@v1.2
+ with:
+ create-symlink: true
+ key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
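+        # Cache compiler output per job, Python version, and CUDA version to speed up repeated wheel builds.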
+
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
@@ -76,6 +82,8 @@ jobs:
- name: Build wheel
shell: bash
+ env:
+ CMAKE_BUILD_TYPE: Release # do not compile with debug symbols to reduce wheel size
run: |
bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
wheel_name=$(ls dist/*whl | xargs -n 1 basename)
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index e8060e369a889..e71033f828006 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/scripts/create_release.js b/.github/workflows/scripts/create_release.js
index 0f25624b4c21c..475742118afeb 100644
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
generate_release_notes: true,
name: process.env.RELEASE_TAG,
owner: context.repo.owner,
- prerelease: false,
+ prerelease: true,
repo: context.repo.repo,
tag_name: process.env.RELEASE_TAG,
});
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index b163c960db555..04f307bcf8b0e 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.gitignore b/.gitignore
index b1513ef0ddb0c..e077366d1e4a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,8 @@ instance/
# Sphinx documentation
docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
# PyBuilder
.pybuilder/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3b6ea4b570a99..ad562d9c996f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -31,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.2.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
@@ -174,17 +174,52 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
- "csrc/quantization/fp8/convert_kernel.cu"
- "csrc/quantization/fp8/gemm_kernel.cu"
+ "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+ "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
+if(VLLM_GPU_LANG STREQUAL "HIP")
+ list(APPEND VLLM_EXT_SRC
+ "csrc/quantization/fp8/amd/gemm_kernel.cu")
+endif()
+
if(VLLM_GPU_LANG STREQUAL "CUDA")
+ include(FetchContent)
+  set(CUTLASS_ENABLE_HEADERS_ONLY ON)
+ FetchContent_Declare(
+ cutlass
+ GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+ # CUTLASS 3.5.0
+ GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+ )
+ FetchContent_MakeAvailable(cutlass)
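+  # Fetching a pinned commit keeps the CUTLASS-based w8a8 kernels building against a known CUTLASS version.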
+
list(APPEND VLLM_EXT_SRC
+ "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
- "csrc/quantization/marlin/marlin_cuda_kernel.cu"
- "csrc/custom_all_reduce.cu")
+ "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+ "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+ "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+ "csrc/custom_all_reduce.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+ #
+ # The CUTLASS kernels for Hopper require sm90a to be enabled.
+ # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+  # That adds an extra 17MB to the compiled binary, so instead we selectively enable it.
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+ set_source_files_properties(
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+ PROPERTIES
+ COMPILE_FLAGS
+ "-gencode arch=compute_90a,code=sm_90a")
+ endif()
+
endif()
define_gpu_extension_target(
@@ -194,6 +229,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
WITH_SOABI)
@@ -240,24 +276,13 @@ define_gpu_extension_target(
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
- "csrc/punica/punica_ops.cc")
+ "csrc/punica/punica_ops.cu"
+ "csrc/punica/punica_pybind.cpp")
#
# Copy GPU compilation flags+update for punica
@@ -281,6 +306,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+ set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+ message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
@@ -317,10 +345,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
add_dependencies(default _C)
add_dependencies(default _custom_C)
message(STATUS "Enabling moe extension.")
- add_dependencies(default _moe_C)
-endif()
+ add_dependencies(default _moe_C)
-if(VLLM_GPU_LANG STREQUAL "CUDA")
# Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
# VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
# there are supported target arches.
@@ -330,3 +356,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
add_dependencies(default _punica_C)
endif()
endif()
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+ message(STATUS "Enabling moe extension.")
+ add_dependencies(default _moe_C)
+endif()
diff --git a/Dockerfile b/Dockerfile
index d1d29177b0f44..eb96bf3c1db2b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,13 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
+# If you make changes here, please also update
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
#################### BASE BUILD IMAGE ####################
# prepare basic build environment
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
@@ -12,7 +16,7 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
WORKDIR /workspace
@@ -71,34 +75,15 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
python3 setup.py bdist_wheel --dist-dir=dist
-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
- pip cache remove vllm_nccl*
-#################### EXTENSION Build IMAGE ####################
-
-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# flash attention version
-ARG flash_attn_version=v2.5.6
-ENV FLASH_ATTN_VERSION=${flash_attn_version}
+# check the size of the wheel, as we cannot upload wheels larger than 100MB
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+RUN python3 check-wheel-size.py dist
-WORKDIR /usr/src/flash-attention-v2
-
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
- --no-build-isolation --no-deps --no-cache-dir
-
-#################### FLASH_ATTENTION Build IMAGE ####################
+#################### EXTENSION Build IMAGE ####################
#################### vLLM installation IMAGE ####################
# image with vLLM installed
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace
RUN apt-get update -y \
@@ -108,16 +93,12 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
pip install dist/*.whl --verbose
-
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
- --mount=type=cache,target=/root/.cache/pip \
- pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 4251fddd6cc3b..aec79824213f3 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -17,4 +17,6 @@ RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.py
RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+WORKDIR /workspace/
+
CMD ["/bin/bash"]
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 0000000000000..fe42b4ef393f1
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,36 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+ && python3 -m pip install -U -r requirements-neuron.txt
+
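+# Signal the vLLM build to target the Neuron backend.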
+ENV VLLM_BUILD_WITH_NEURON 1
+RUN cd /app/vllm \
+ && pip install -e . \
+ && cd ..
+
+CMD ["/bin/bash"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 04102560452a1..6d0dd31d346f1 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -153,7 +153,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/gradlib/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
-COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/patch_xformers.rocm.sh /
# -----------------------
# Final vLLM image
@@ -199,14 +198,13 @@ RUN --mount=type=bind,from=export_triton,src=/,target=/install \
fi
RUN python3 -m pip install --upgrade numba
-RUN python3 -m pip install xformers==0.0.23 --no-deps
# Install vLLM (and gradlib)
+# Make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements-rocm.txt \
- && if [ "$BUILD_FA" = "1" ]; then \
- bash patch_xformers.rocm.sh; fi \
&& case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
*"rocm-6.0"*) \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
@@ -215,11 +213,9 @@ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
*) ;; esac \
&& pip install *.whl
-# Update Ray to latest version + set environment variable to ensure it works on TP > 1
-RUN python3 -m pip install --no-cache-dir 'ray[all]>=2.10.0'
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# HIPgraph performance environment variable.
+# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"]
diff --git a/MANIFEST.in b/MANIFEST.in
index d385f194c6c0f..82be639ef4d73 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,9 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
include CMakeLists.txt
recursive-include cmake *
diff --git a/README.md b/README.md
index d53227b82d87a..d63819c3815c0 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,17 @@ Easy, fast, and cheap LLM serving for everyone
+---
+
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare join us on stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
+---
+
*Latest News* 🔥
- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
@@ -51,40 +62,14 @@ vLLM is flexible and easy to use with:
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
-vLLM seamlessly supports many Hugging Face models, including the following architectures:
-
-- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
-- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
-- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
-- Command-R (`CohereForAI/c4ai-command-r-v01`, etc.)
-- DBRX (`databricks/dbrx-base`, `databricks/dbrx-instruct` etc.)
-- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
-- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
-- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
-- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
-- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
-- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
-- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
-- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
-- Jais (`core42/jais-13b`, `core42/jais-13b-chat`, `core42/jais-30b-v3`, `core42/jais-30b-chat-v3`, etc.)
-- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
-- MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
-- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
-- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
-- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
-- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
-- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
-- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
-- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
-- Qwen2 (`Qwen/Qwen1.5-7B`, `Qwen/Qwen1.5-7B-Chat`, etc.)
-- Qwen2MoE (`Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.)
-- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
-- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
-- Xverse (`xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.)
-- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+- Transformer-like LLMs (e.g., Llama)
+- Mixture-of-Expert LLMs (e.g., Mixtral)
+- Multi-modal LLMs (e.g., LLaVA)
+
+Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## Getting Started
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
@@ -92,9 +77,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm
```
-## Getting Started
-
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -104,6 +87,32 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+
+
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Dropbox
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Trainy
+- UC Berkeley
+- UC San Diego
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
diff --git a/ROCm_performance.md b/ROCm_performance.md
index 0f12ed1adc9af..180c848a21950 100644
--- a/ROCm_performance.md
+++ b/ROCm_performance.md
@@ -18,30 +18,3 @@ Define the following environment symbol: `PYTORCH_TUNABLEOP_ENABLED=1` in order
On ROCm, to have better performance, a custom paged attention is available by switching on the env variable: `VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1`.
Currently, this env variable is enabled by default. To fallback to PagedAttention v2 kernel assign the env variable to 0.
The custom PagedAttention kernel is enabled for dtype: fp16, block-size=16, head-size=128, and max context length <= 16k, with GQA ratio (num_heads//num_kv_heads) between 1 to 16. On all the other cases, we fallback to PagedAttention v2 kernel.
-
-## Fp8 Quantization
-
-To use fp8 quantization, first step is to quantize your model to fp8 format. Generating a safetensor file that contains the quantized weights and the corresponding scaling factors of your model. The safetensor file should be added under your model folder along with a file called `serenity_config.json`, which contains a json object with a key: `"quantized_weights": "quantized/osf/rank0.safetensors"`, the value should be the relative path of your safetensor file containing the quantized weights.
-
-Then we can run a model with fp8 quantization using vllm, just add a parameter `quantization="fp8"` when creating the `vllm.LLM` object.
-
-## Gemm Tunning for Fp8
-
-To get better performance of fp8 quantization, we will need to tune the gemm with the information of all the shapes used in the execution of the model.
-
-To obtain all the shapes of gemms during the execution of the model, set the env value TUNE_FP8=1 and the run the model as usual. We will get the a file called `/fp8_shapes.csv`.
-
-Next, run gradlib to obtain the best solutions of these shapes:
-
-```
-cd gradlib_fp8
-python3 -m pip uninstall gradlib
-python3 setup.py install
-python3 gemm_tuner.py --input_file /fp8_shapes.csv --tuned_file /tuned_fp8_16.csv
-cd ../gradlib
-python3 -m pip uninstall gradlib
-python3 setup.py install
-cd ..
-```
-
-Now, when running inference with fp8, we are using the tunned gemm for best performance.
\ No newline at end of file
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index ad428bd1c3644..58dcc6167efa6 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -27,8 +27,8 @@ class RequestFuncInput:
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
- latency: float = 0
- ttft: float = 0 # Time to first token
+ latency: float = 0.0
+ ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
prompt_len: int = 0
@@ -58,23 +58,24 @@ async def async_request_tgi(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
- async for chunk in response.content:
- chunk = chunk.strip()
- if not chunk:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
continue
- chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data:")
data = json.loads(chunk)
timestamp = time.perf_counter()
# First token
- if ttft == 0:
+ if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@@ -88,6 +89,9 @@ async def async_request_tgi(
output.latency = most_recent_timestamp - st
output.success = True
output.generated_text = data["generated_text"]
+ else:
+ output.error = response.reason or ""
+ output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@@ -119,23 +123,25 @@ async def async_request_trt_llm(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
- async for chunk in response.content:
- chunk = chunk.strip()
- if not chunk:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
continue
- chunk = remove_prefix(chunk.decode("utf-8"), "data:")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data:")
data = json.loads(chunk)
+ output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
- if ttft == 0:
+ if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@@ -147,11 +153,10 @@ async def async_request_trt_llm(
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
- output.generated_text = json.loads(data)["text_output"]
output.success = True
else:
- output.error = response.reason
+ output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@@ -195,7 +200,7 @@ async def async_request_deepspeed_mii(
output.generated_text = parsed_resp["text"][0]
output.success = True
else:
- output.error = response.reason
+ output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
@@ -234,19 +239,20 @@ async def async_request_openai_completions(
output.prompt_len = request_func_input.prompt_len
generated_text = ""
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
- async for chunk in response.content:
- chunk = chunk.strip()
- if not chunk:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
continue
- chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -255,7 +261,7 @@ async def async_request_openai_completions(
if data["choices"][0]["text"]:
timestamp = time.perf_counter()
# First token
- if ttft == 0:
+ if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@@ -273,6 +279,9 @@ async def async_request_openai_completions(
output.generated_text = generated_text
output.success = True
output.latency = latency
+ else:
+ output.error = response.reason or ""
+ output.success = False
except Exception:
output.success = False
exc_info = sys.exc_info()
@@ -315,19 +324,20 @@ async def async_request_openai_chat_completions(
output.prompt_len = request_func_input.prompt_len
generated_text = ""
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
- async for chunk in response.content:
- chunk = chunk.strip()
- if not chunk:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
continue
- chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
@@ -337,7 +347,7 @@ async def async_request_openai_chat_completions(
delta = data["choices"][0]["delta"]
if delta.get("content", None):
# First token
- if ttft == 0:
+ if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
@@ -354,7 +364,7 @@ async def async_request_openai_chat_completions(
output.success = True
output.latency = latency
else:
- output.error = response.reason
+ output.error = response.reason or ""
output.success = False
except Exception:
output.success = False
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index a160e4cabf3f8..2aca1b23f9b6f 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,14 +1,17 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
+import json
import time
from pathlib import Path
-from typing import Optional
+from typing import List, Optional
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
+from vllm.inputs import PromptStrictInputs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def main(args: argparse.Namespace):
@@ -17,6 +20,8 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(model=args.model,
+ speculative_model=args.speculative_model,
+ num_speculative_tokens=args.num_speculative_tokens,
tokenizer=args.tokenizer,
quantization=args.quantization,
tensor_parallel_size=args.tensor_parallel_size,
@@ -28,9 +33,11 @@ def main(args: argparse.Namespace):
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
worker_use_ray=args.worker_use_ray,
+ use_v2_block_manager=args.use_v2_block_manager,
enable_chunked_prefill=args.enable_chunked_prefill,
download_dir=args.download_dir,
- block_size=args.block_size)
+ block_size=args.block_size,
+ gpu_memory_utilization=args.gpu_memory_utilization)
sampling_params = SamplingParams(
n=args.n,
@@ -44,7 +51,9 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
- dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
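+    # Wrap each batch of token IDs in the dict form accepted by llm.generate().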
+    dummy_inputs: List[PromptStrictInputs] = [{
+        "prompt_token_ids": batch
+    } for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@@ -55,13 +64,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
- llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+ llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
- llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+ llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@@ -93,16 +102,28 @@ def run_to_completion(profile_dir: Optional[str] = None):
for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds')
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
+ parser.add_argument('--speculative-model', type=str, default=None)
+ parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
@@ -137,15 +158,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
- "--kv-cache-dtype",
+ '--kv-cache-dtype',
type=str,
- choices=['auto', 'fp8'],
- default='auto',
- help=
- 'Data type for kv cache storage. If "auto", will use model data type. '
- 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
- 'common inference criteria.')
+ choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+ default="auto",
+ help='Data type for kv cache storage. If "auto", will use model '
+ 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+ 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -178,10 +197,10 @@ def run_to_completion(profile_dir: Optional[str] = None):
help='block size of key/value cache')
parser.add_argument(
'--enable-chunked-prefill',
- type=bool,
- default=False,
+ action='store_true',
help='If True, the prefill requests can be chunked based on the '
'max_num_batched_tokens')
+ parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
@@ -197,5 +216,16 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the latency results in JSON format.')
+ parser.add_argument('--gpu-memory-utilization',
+ type=float,
+ default=0.9,
+ help='the fraction of GPU memory to be used for '
+ 'the model executor, which can range from 0 to 1. '
+ 'If unspecified, will use the default value of 0.9.')
args = parser.parse_args()
main(args)
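Note on the input-format change above: LLM.generate now takes a list of PromptStrictInputs-style dicts instead of the removed prompt_token_ids= keyword. A minimal sketch of the new calling convention (the model name and token ids below are illustrative only, not part of the patch):

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=1.0, max_tokens=16, ignore_eos=True)

token_ids = [[1, 2, 3, 4], [5, 6, 7, 8]]  # pre-tokenized prompts

# Old style (removed by this diff):
#   llm.generate(prompt_token_ids=token_ids, sampling_params=params)

# New style: one dict per request, keyed by "prompt_token_ids".
inputs = [{"prompt_token_ids": ids} for ids in token_ids]
outputs = llm.generate(inputs, sampling_params=params, use_tqdm=False)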
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 1f3274a28cad5..089966986984f 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -16,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
def main(args):
- llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
+ llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
+ use_v2_block_manager=args.use_v2_block_manager,
+ tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
- sampling_params = SamplingParams(temperature=0, max_tokens=100)
+ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
test_prefix(
llm=llm,
- prompts=prompts[:1],
+ prompts=prompts,
sampling_params=sampling_params,
)
@@ -45,8 +47,16 @@ def main(args):
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
+ parser.add_argument('--model',
+ type=str,
+ default='baichuan-inc/Baichuan2-13B-Chat')
+ parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+ parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
+ parser.add_argument('--use-v2-block-manager',
+ action='store_true',
+ help='Use BlockSpaceManagerV2')
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 6054df439fa57..f3d71de775f82 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -17,6 +17,10 @@
--dataset-path \
--request-rate \ # By default is inf
--num-prompts # By default is 1000
+
+ When using the TGI backend, add
+ --endpoint /generate_stream
+ to the end of the command above.
"""
import argparse
import asyncio
@@ -27,7 +31,7 @@
import warnings
from dataclasses import dataclass
from datetime import datetime
-from typing import AsyncGenerator, List, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -58,7 +62,11 @@ def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
+ fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
+ if fixed_output_len is not None and fixed_output_len < 4:
+ raise ValueError("output_len too small")
+
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -68,38 +76,32 @@ def sample_sharegpt_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
- # some of these will be filtered out, so sample more than we need
- sampled_indices = random.sample(range(len(dataset)),
- int(num_requests * 1.2))
- dataset = [dataset[i] for i in sampled_indices]
-
- # Tokenize the prompts and completions.
- prompts = [prompt for prompt, _ in dataset]
- prompt_token_ids = tokenizer(prompts).input_ids
- completions = [completion for _, completion in dataset]
- completion_token_ids = tokenizer(completions).input_ids
- tokenized_dataset = []
- for i in range(len(dataset)):
- output_len = len(completion_token_ids[i])
- tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+ # Shuffle the dataset.
+ random.shuffle(dataset)
- # Filter out too long sequences.
+ # Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
- for prompt, prompt_token_ids, output_len in tokenized_dataset:
+ for i in range(len(dataset)):
+ if len(filtered_dataset) == num_requests:
+ break
+
+ # Tokenize the prompts and completions.
+ prompt = dataset[i][0]
+ prompt_token_ids = tokenizer(prompt).input_ids
+ completion = dataset[i][1]
+ completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
+ output_len = len(completion_token_ids
+ ) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
- # This is because TGI causes errors when the input or output length
- # is too short.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
- # Sample the requests.
- sampled_requests = random.sample(filtered_dataset, num_requests)
- return sampled_requests
+ return filtered_dataset
def sample_sonnet_requests(
@@ -213,6 +215,11 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
+ if completed == 0:
+ warnings.warn(
+ "All requests failed. This is likely due to a misconfiguration "
+ "on the benchmark arguments.",
+ stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -224,9 +231,9 @@ def calculate_metrics(
1000, # ttfts is empty if streaming is not supported by backend
median_ttft_ms=np.median(ttfts or 0) * 1000,
p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
- mean_tpot_ms=np.mean(tpots) * 1000,
- median_tpot_ms=np.median(tpots) * 1000,
- p99_tpot_ms=np.percentile(tpots, 99) * 1000,
+ mean_tpot_ms=np.mean(tpots or 0) * 1000,
+ median_tpot_ms=np.median(tpots or 0) * 1000,
+ p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
)
return metrics, actual_output_lens
@@ -248,6 +255,24 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")
+ print("Starting initial single prompt test run...")
+ test_prompt, test_prompt_len, test_output_len = input_requests[0]
+ test_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_prompt,
+ api_url=api_url,
+ prompt_len=test_prompt_len,
+ output_len=test_output_len,
+ best_of=best_of,
+ use_beam_search=use_beam_search,
+ )
+ test_output = await request_func(request_func_input=test_input)
+ if not test_output.success:
+ raise ValueError(
+ "Initial test run failed - Please make sure benchmark arguments "
+ f"are correctly specified. Error: {test_output.error}")
+ else:
+ print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -361,6 +386,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
+ fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sharegpt":
@@ -368,6 +394,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
+ fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sonnet":
@@ -524,6 +551,12 @@ def main(args: argparse.Namespace):
default=1000,
help="Number of prompts to process.",
)
+ parser.add_argument(
+ "--sharegpt-output-len",
+ type=int,
+ default=None,
+ help="Output length for each request. Overrides the output length "
+ "from the ShareGPT dataset.")
parser.add_argument(
"--sonnet-input-len",
type=int,
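Note on the `tpots or 0` guard in calculate_metrics: when no request returns more than one output token, the TPOT list is empty and numpy statistics over an empty list warn or fail and yield NaN. A small sketch of the fallback behaviour, using plain numpy:

import numpy as np

tpots = []  # no request produced a second output token

# np.mean([]) warns and returns nan; np.percentile([], 99) fails outright.
# With the guard, the empty list falls back to the scalar 0:
mean_tpot_ms = np.mean(tpots or 0) * 1000
p99_tpot_ms = np.percentile(tpots or 0, 99) * 1000
print(mean_tpot_ms, p99_tpot_ms)  # 0.0 0.0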
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 9248010e38063..fdfbf23c721b3 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -10,6 +10,8 @@
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
def sample_requests(
dataset_path: str,
@@ -74,48 +76,51 @@ def run_vllm(
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
+ enable_chunked_prefill: bool,
+ max_num_batched_tokens: int,
gpu_memory_utilization: float = 0.9,
worker_use_ray: bool = False,
download_dir: Optional[str] = None,
) -> float:
from vllm import LLM, SamplingParams
- llm = LLM(model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- quantization_param_path=quantization_param_path,
- device=device,
- enable_prefix_caching=enable_prefix_caching,
- worker_use_ray=worker_use_ray,
- download_dir=download_dir)
+ llm = LLM(
+ model=model,
+ tokenizer=tokenizer,
+ quantization=quantization,
+ tensor_parallel_size=tensor_parallel_size,
+ seed=seed,
+ trust_remote_code=trust_remote_code,
+ dtype=dtype,
+ max_model_len=max_model_len,
+ gpu_memory_utilization=gpu_memory_utilization,
+ enforce_eager=enforce_eager,
+ kv_cache_dtype=kv_cache_dtype,
+ quantization_param_path=quantization_param_path,
+ device=device,
+ enable_prefix_caching=enable_prefix_caching,
+ worker_use_ray=worker_use_ray,
+ download_dir=download_dir,
+ enable_chunked_prefill=enable_chunked_prefill,
+ max_num_batched_tokens=max_num_batched_tokens,
+ )
# Add the requests to the engine.
+ prompts = []
+ sampling_params = []
for prompt, _, output_len in requests:
- sampling_params = SamplingParams(
- n=n,
- temperature=0.0 if use_beam_search else 1.0,
- top_p=1.0,
- use_beam_search=use_beam_search,
- ignore_eos=True,
- max_tokens=output_len,
- )
- # FIXME(woosuk): Do not use internal method.
- llm._add_request(
- prompt=prompt,
- prompt_token_ids=None,
- sampling_params=sampling_params,
- )
+ prompts.append(prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=0.0 if use_beam_search else 1.0,
+ top_p=1.0,
+ use_beam_search=use_beam_search,
+ ignore_eos=True,
+ max_tokens=output_len,
+ ))
start = time.perf_counter()
- # FIXME(woosuk): Do not use internal method.
- llm._run_engine(use_tqdm=True)
+ llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start
@@ -221,7 +226,8 @@ def main(args: argparse.Namespace):
args.trust_remote_code, args.dtype, args.max_model_len,
args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
- args.enable_prefix_caching, args.gpu_memory_utilization,
+ args.enable_prefix_caching, args.enable_chunked_prefill,
+ args.max_num_batched_tokens, args.gpu_memory_utilization,
args.worker_use_ray, args.download_dir)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
@@ -238,6 +244,18 @@ def main(args: argparse.Namespace):
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "elapsed_time": elapsed_time,
+ "num_requests": len(requests),
+ "total_num_tokens": total_num_tokens,
+ "requests_per_second": len(requests) / elapsed_time,
+ "tokens_per_second": total_num_tokens / elapsed_time,
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
@@ -262,7 +280,7 @@ def main(args: argparse.Namespace):
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
@@ -307,15 +325,13 @@ def main(args: argparse.Namespace):
action="store_true",
help="enforce eager execution")
parser.add_argument(
- "--kv-cache-dtype",
+ '--kv-cache-dtype',
type=str,
- choices=["auto", "fp8"],
+ choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
- help=
- 'Data type for kv cache storage. If "auto", will use model data type. '
- 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
- 'common inference criteria.')
+ help='Data type for kv cache storage. If "auto", will use model '
+ 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+ 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -341,11 +357,24 @@ def main(args: argparse.Namespace):
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU '
'unless on ROCm where the default is torchrun')
+ parser.add_argument("--enable-chunked-prefill",
+ action='store_true',
+ help="enable chunked prefill for vLLM backend.")
+ parser.add_argument('--max-num-batched-tokens',
+ type=int,
+ default=None,
+ help='maximum number of batched tokens per '
+ 'iteration')
parser.add_argument('--download-dir',
type=str,
default=None,
help='directory to download and load the weights, '
'default to the default cache dir of huggingface')
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the throughput results in JSON format.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
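Note on dropping the internal _add_request/_run_engine calls: llm.generate accepts parallel lists of prompts and per-prompt SamplingParams, so each request keeps its own output length. A minimal sketch of that pattern (model name and prompts illustrative only):

import time
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")

requests = [("Hello, my name is", 32), ("The capital of France is", 16)]
prompts = [p for p, _ in requests]
# One SamplingParams per prompt so each request keeps its own output length.
sampling_params = [
    SamplingParams(temperature=1.0, ignore_eos=True, max_tokens=out_len)
    for _, out_len in requests
]

start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True)
elapsed = time.perf_counter() - start
print(f"{len(requests) / elapsed:.2f} requests/s")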
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
new file mode 100644
index 0000000000000..59392947b15c8
--- /dev/null
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
+import argparse
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.aqlm import (
+ dequantize_weight, generic_dequantize_gemm, get_int_dtype,
+ optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+ input: torch.Tensor, # [..., in_features]
+ weights: torch.Tensor,
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+ output = F.linear(input, weights)
+ return output
+
+
+def dequant_out_scale(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ if bias is None:
+ output = F.linear(input, weights, bias)
+ orig_shape = output.shape
+ flattened_output = output.view(-1, output.size(-1))
+ f_scales = scales.view(-1, scales.shape[0])
+ b_scales = f_scales.expand(flattened_output.shape[0], -1)
+ flattened_output *= b_scales
+ return flattened_output.view(orig_shape)
+ else:
+ b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+ -1, weights.shape[1])
+ weights *= b_scales
+ return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+ -1, weights.shape[1])
+ weights *= b_scales
+ return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+
+ n = parts.sum().item()
+
+ device = torch.device('cuda:0')
+
+ code_range = (1 << bits) // 2
+ ingroups = 8
+
+ codes = torch.randint(-code_range,
+ code_range,
+ size=(n, k // ingroups, nbooks),
+ dtype=get_int_dtype(bits),
+ device=device)
+
+ codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+ dtype=torch.float16,
+ device=device)
+
+ count = 0
+ for index in range(16):
+ for i in range(8):
+ for book in range(nbooks):
+ codebooks[book, index, 0, i] = count * (10**book)
+ count += 1
+
+ print("codes shape", codes.shape)
+
+ for i in range(16):
+ for book in range(nbooks):
+ codes[0, i, book] = i
+ codes[0, -i, book] = i
+
+ weights = dequantize_weight(codes, codebooks, None)
+ weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+ print("weights shape:", weights.shape)
+ print("weights2 shape:", weights2.shape)
+
+ print("weights are:", weights)
+ print("weights2 are:", weights2)
+
+ print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+ print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+ print("last 128 weights are", weights[0, -128:])
+ print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+ parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+
+ # Add arguments
+ parser.add_argument("--nbooks",
+ type=int,
+ default=1,
+ help="Number of codebooks (default: 1)")
+ parser.add_argument("--bits",
+ type=int,
+ default=16,
+ help="Number of bits per code element (default: 16)")
+ parser.add_argument(
+ "--test",
+ action="store_true",
+ help="Run the decompression/dequant tester rather than benchmarking "
+ "(default: False)")
+
+ # Parse the arguments
+ args = parser.parse_args()
+
+ # Extract values
+ nbooks = args.nbooks
+ bits = args.bits
+
+ if args.test:
+ dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+ return
+
+ # Otherwise, benchmark.
+ methods = [
+ ops.aqlm_gemm,
+ dequant_out_scale,
+ generic_dequantize_gemm,
+ optimized_dequantize_gemm,
+ dequant_weight_scale,
+ torch_mult,
+ dequant_no_scale,
+ ]
+
+ filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+ print(f"writing benchmarks to file {filename}")
+ with open(filename, "w") as f:
+ sys.stdout = f
+
+ print('m | k | n | n parts', end='')
+ for method in methods:
+ print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+ print('')
+
+ # These are reasonable prefill sizes.
+ ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+ (4096, (11008, 11008)), (11008, (4096, )))
+
+ # reasonable ranges for m.
+ for m in [
+ 1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+ 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+ ]:
+ print(f'{m}', file=sys.__stdout__)
+ for ksp in ksandpartions:
+ run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+ methods)
+
+ sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+ methods):
+
+ # I didn't see visible improvements from increasing these, but feel free :)
+ num_warmup_trials = 1
+ num_trials = 1
+
+ num_calls = 100
+
+ # warmup.
+ for method in methods:
+ for _ in range(num_warmup_trials):
+ run_timing(
+ num_calls=num_calls,
+ m=m,
+ k=k,
+ parts=parts,
+ nbooks=nbooks,
+ bits=bits,
+ method=method,
+ )
+
+ n = parts.sum().item()
+ print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+ for method in methods:
+ best_time_us = 1e20
+ for _ in range(num_trials):
+ kernel_dur_ms = run_timing(
+ num_calls=num_calls,
+ m=m,
+ k=k,
+ parts=parts,
+ nbooks=nbooks,
+ bits=bits,
+ method=method,
+ )
+
+ kernel_dur_us = 1000 * kernel_dur_ms
+
+ if kernel_dur_us < best_time_us:
+ best_time_us = kernel_dur_us
+
+ print(f' | {kernel_dur_us:.0f}', end='')
+
+ print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+ nbooks: int, bits: int, method) -> float:
+
+ n = parts.sum().item()
+
+ device = torch.device('cuda:0')
+
+ input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+ code_range = (1 << bits) // 2
+ ingroups = 8
+
+ codes = torch.randint(-code_range,
+ code_range,
+ size=(n, k // ingroups, nbooks),
+ dtype=get_int_dtype(bits),
+ device=device)
+
+ codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+ dtype=torch.float16,
+ device=device)
+
+ scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+ # for comparison to just a pytorch mult.
+ weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+
+ start_event.record()
+
+ if method is torch_mult:
+ for i in range(num_calls):
+ torch_mult(input, weights, scales)
+ else:
+ for i in range(num_calls):
+ method(input, codes, codebooks, scales, parts, None)
+
+ end_event.record()
+ end_event.synchronize()
+
+ dur_ms = start_event.elapsed_time(end_event) / num_calls
+ return dur_ms
+
+
+if __name__ == "__main__":
+ sys.exit(main())
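The timing loop in run_timing above uses CUDA events, so the measurement reflects GPU execution time rather than Python launch overhead. The same pattern in isolation, as a small standalone sketch:

import torch

def time_gpu(fn, num_calls: int = 100) -> float:
    """Return average milliseconds per call, measured with CUDA events."""
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(num_calls):
        fn()
    end.record()
    end.synchronize()  # wait for the GPU before reading the elapsed time
    return start.elapsed_time(end) / num_calls

# Example: time a half-precision matmul on the first GPU.
a = torch.randn(1024, 1024, dtype=torch.float16, device="cuda:0")
b = torch.randn(1024, 1024, dtype=torch.float16, device="cuda:0")
print(f"{time_gpu(lambda: a @ b):.3f} ms/call")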
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
new file mode 100644
index 0000000000000..b771911781574
--- /dev/null
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -0,0 +1,233 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+ GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+ MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ gptq_pack, quantize_weights, sort_weights)
+
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
+ size_m, size_k, size_n):
+ label = "Quant Matmul"
+
+ sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+ "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+ group_size, size_m, size_k, size_n))
+
+ print(f"Testing: {sub_label}")
+
+ a = torch.randn(size_m, size_k).to(torch.half).cuda()
+ b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+ a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+ # Marlin quant
+ (
+ marlin_w_ref,
+ marlin_q_w,
+ marlin_s,
+ marlin_g_idx,
+ marlin_sort_indices,
+ marlin_rand_perm,
+ ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+ # Marlin_24 quant
+ (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+ marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+ # GPTQ quant
+ (w_ref, q_w, s, g_idx,
+ rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+ q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+ # For act_order, sort the "weights" and "g_idx"
+ # so that group ids are increasing
+ repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+ if act_order:
+ (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+ # Prepare
+ marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_MAX_PARALLEL)
+
+ marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+ GPTQ_MARLIN_24_MAX_PARALLEL)
+
+ globals = {
+ # Gen params
+ "num_bits": num_bits,
+ "group_size": group_size,
+ "size_m": size_m,
+ "size_n": size_n,
+ "size_k": size_k,
+ "a": a,
+ "a_tmp": a_tmp,
+ # Marlin params
+ "marlin_w_ref": marlin_w_ref,
+ "marlin_q_w": marlin_q_w,
+ "marlin_s": marlin_s,
+ "marlin_g_idx": marlin_g_idx,
+ "marlin_sort_indices": marlin_sort_indices,
+ "marlin_rand_perm": marlin_rand_perm,
+ "marlin_workspace": marlin_workspace,
+ "is_k_full": is_k_full,
+ # Marlin_24 params
+ "marlin_24_w_ref": marlin_24_w_ref,
+ "marlin_24_q_w_comp": marlin_24_q_w_comp,
+ "marlin_24_meta": marlin_24_meta,
+ "marlin_24_s": marlin_24_s,
+ "marlin_24_workspace": marlin_24_workspace,
+ # GPTQ params
+ "q_w_gptq": q_w_gptq,
+ "repack_sort_indices": repack_sort_indices,
+ # Kernels
+ "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+ "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+ "gptq_marlin_repack": ops.gptq_marlin_repack,
+ }
+
+ min_run_time = 1
+
+ # Warmup pytorch
+ for i in range(5):
+ torch.matmul(a, marlin_w_ref)
+
+ results.append(
+ benchmark.Timer(
+ stmt="torch.matmul(a, marlin_w_ref)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="pytorch_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+ and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_24_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_repack",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+
+def main(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ results = []
+
+ for model in args.models:
+ for layer in WEIGHT_SHAPES[model]:
+ size_k = layer[0]
+ size_n = layer[1]
+
+ if len(args.limit_k) > 0 and size_k not in args.limit_k:
+ continue
+
+ if len(args.limit_n) > 0 and size_n not in args.limit_n:
+ continue
+
+ for act_order in ACT_ORDER_OPTS:
+ if len(args.limit_act_order
+ ) > 0 and act_order not in args.limit_act_order:
+ continue
+
+ for is_k_full in K_FULL_OPTS:
+ if len(args.limit_k_full
+ ) > 0 and is_k_full not in args.limit_k_full:
+ continue
+
+ for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+ if len(args.limit_num_bits
+ ) > 0 and num_bits not in args.limit_num_bits:
+ continue
+
+ for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+ if len(
+ args.limit_group_size
+ ) > 0 and group_size not in args.limit_group_size:
+ continue
+
+ # For act_order, the group_size must be less than
+ # size_k
+ if act_order and (group_size == size_k
+ or group_size == -1):
+ continue
+
+ for size_m in args.batch_sizes:
+ bench_run(results, model, act_order, is_k_full,
+ num_bits, group_size, size_m, size_k,
+ size_n)
+
+ compare = benchmark.Compare(results)
+ compare.print()
+
+
+# For quick benchmarking use:
+# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Benchmark Marlin across specified models/shapes/batches")
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES.keys(),
+ )
+ parser.add_argument("--batch-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_BATCH_SIZES)
+ parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+ args = parser.parse_args()
+ main(args)
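benchmark_marlin.py relies on torch.utils.benchmark.Timer plus Compare, which groups rows by label/sub_label and columns by description. A stripped-down sketch of that reporting mechanism (the toy statements are illustrative only):

import torch
import torch.utils.benchmark as benchmark

a = torch.randn(256, 256)
b = torch.randn(256, 256)

results = []
for description, stmt in [("matmul", "a @ b"), ("add", "a + b")]:
    results.append(
        benchmark.Timer(
            stmt=stmt,
            globals={"a": a, "b": b},
            label="Toy ops",
            sub_label="256x256 fp32",
            description=description,
        ).blocked_autorange(min_run_time=0.2))

benchmark.Compare(results).print()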
diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
index 8e976fbcb3028..196ec8cfce88e 100644
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ b/benchmarks/kernels/benchmark_mixtral_moe.py
@@ -1,3 +1,4 @@
+import argparse
import json
import os
import sys
@@ -5,68 +6,70 @@
import torch
import torch.nn.functional as F
import triton
+from tqdm import tqdm
from vllm.model_executor.layers.fused_moe import (fused_moe,
get_config_file_name)
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-
-def main():
+def main(model, tp_size, gpu, dtype: str):
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
method = fused_moe
for bs in [
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
2048, 3072, 4096
]:
- run_grid(bs, method=method)
-
-
-def run_grid(bs, method):
- d_model = 4096
+ run_grid(bs,
+ model=model,
+ method=method,
+ gpu=gpu,
+ tp_size=tp_size,
+ dtype=dtype)
+
+
+def run_grid(bs, model, method, gpu, tp_size, dtype: str):
+ if model == '8x7B':
+ d_model = 4096
+ model_intermediate_size = 14336
+ num_layers = 32
+ elif model == '8x22B':
+ d_model = 6144
+ model_intermediate_size = 16384
+ num_layers = 56
+ else:
+ raise ValueError(f'Unsupported Mixtral model {model}')
num_total_experts = 8
top_k = 2
- tp_size = 2
- model_intermediate_size = 14336
- num_layers = 32
+ # tp_size = 2
num_calls = 100
num_warmup_trials = 1
num_trials = 1
configs = []
- if bs <= 16:
- BLOCK_SIZES_M = [16]
- elif bs <= 32:
- BLOCK_SIZES_M = [16, 32]
- elif bs <= 64:
- BLOCK_SIZES_M = [16, 32, 64]
- elif bs <= 128:
- BLOCK_SIZES_M = [16, 32, 64, 128]
- else:
- BLOCK_SIZES_M = [16, 32, 64, 128, 256]
for block_size_n in [32, 64, 128, 256]:
- for block_size_m in BLOCK_SIZES_M:
+ for block_size_m in [16, 32, 64, 128, 256]:
for block_size_k in [64, 128, 256]:
for group_size_m in [1, 16, 32, 64]:
for num_warps in [4, 8]:
- configs.append({
- "BLOCK_SIZE_M": block_size_m,
- "BLOCK_SIZE_N": block_size_n,
- "BLOCK_SIZE_K": block_size_k,
- "GROUP_SIZE_M": group_size_m,
- "num_warps": num_warps,
- "num_stages": 4,
- })
+ for num_stages in [2, 3, 4, 5]:
+ configs.append({
+ "BLOCK_SIZE_M": block_size_m,
+ "BLOCK_SIZE_N": block_size_n,
+ "BLOCK_SIZE_K": block_size_k,
+ "GROUP_SIZE_M": group_size_m,
+ "num_warps": num_warps,
+ "num_stages": num_stages,
+ })
best_config = None
best_time_us = 1e20
- for config in configs:
- print(f'{tp_size=} {bs=}')
- print(f'{config}')
+ print(f'{tp_size=} {bs=}')
+
+ for config in tqdm(configs):
# warmup
- print('warming up')
try:
for _ in range(num_warmup_trials):
run_timing(
@@ -79,12 +82,12 @@ def run_grid(bs, method):
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
+ dtype=dtype,
)
except triton.runtime.autotuner.OutOfResources:
continue
# trial
- print('benchmarking')
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
@@ -96,6 +99,7 @@ def run_grid(bs, method):
model_intermediate_size=model_intermediate_size,
method=method,
config=config,
+ dtype=dtype,
)
kernel_dur_us = 1000 * kernel_dur_ms
@@ -105,16 +109,18 @@ def run_grid(bs, method):
best_config = config
best_time_us = kernel_dur_us
- print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
- f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
- f'{d_model=} {model_intermediate_size=} {num_layers=}')
+ tqdm.write(
+ f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
+ f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
+ f'{d_model=} {model_intermediate_size=} {num_layers=}')
print("best_time_us", best_time_us)
print("best_config", best_config)
# holds Dict[str, Dict[str, int]]
filename = get_config_file_name(num_total_experts,
- model_intermediate_size // tp_size)
+ model_intermediate_size // tp_size,
+ "float8" if dtype == "float8" else None)
print(f"writing config to file {filename}")
existing_content = {}
if os.path.exists(filename):
@@ -128,27 +134,48 @@ def run_grid(bs, method):
def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
top_k: int, tp_size: int, model_intermediate_size: int, method,
- config) -> float:
+ config, dtype: str) -> float:
shard_intermediate_size = model_intermediate_size // tp_size
hidden_states = torch.rand(
(bs, d_model),
device="cuda:0",
- dtype=torch.bfloat16,
+ dtype=torch.float16,
)
- ws = torch.rand(
+ w1 = torch.rand(
(num_total_experts, 2 * shard_intermediate_size, d_model),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
- w2s = torch.rand(
+ w2 = torch.rand(
(num_total_experts, d_model, shard_intermediate_size),
device=hidden_states.device,
dtype=hidden_states.dtype,
)
+ w1_scale = None
+ w2_scale = None
+ a1_scale = None
+ a2_scale = None
+
+ if dtype == "float8":
+ w1 = w1.to(torch.float8_e4m3fn)
+ w2 = w2.to(torch.float8_e4m3fn)
+ w1_scale = torch.ones(num_total_experts,
+ device=hidden_states.device,
+ dtype=torch.float32)
+ w2_scale = torch.ones(num_total_experts,
+ device=hidden_states.device,
+ dtype=torch.float32)
+ a1_scale = torch.ones(1,
+ device=hidden_states.device,
+ dtype=torch.float32)
+ a2_scale = torch.ones(1,
+ device=hidden_states.device,
+ dtype=torch.float32)
+
gating_output = F.softmax(torch.rand(
(num_calls, bs, num_total_experts),
device=hidden_states.device,
@@ -163,13 +190,18 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
for i in range(num_calls):
hidden_states = method(
hidden_states=hidden_states,
- w1=ws,
- w2=w2s,
+ w1=w1,
+ w2=w2,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
gating_output=gating_output[i],
topk=2,
renormalize=True,
inplace=True,
override_config=config,
+ use_fp8=dtype == "float8",
)
end_event.record()
end_event.synchronize()
@@ -179,4 +211,29 @@ def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
if __name__ == "__main__":
- sys.exit(main())
+ parser = argparse.ArgumentParser(
+ prog='benchmark_mixtral_moe',
+ description='Benchmark and tune the fused_moe kernel',
+ )
+ parser.add_argument(
+ '--dtype',
+ type=str,
+ default='float16',
+ choices=['float8', 'float16'],
+ help='Data type used for fused_moe kernel computations',
+ )
+ parser.add_argument('--model',
+ type=str,
+ default='8x7B',
+ choices=['8x7B', '8x22B'],
+ help='The Mixtral model to benchmark')
+ parser.add_argument('--tp-size',
+ type=int,
+ default=2,
+ help='Tensor parallel size')
+ parser.add_argument('--gpu',
+ type=int,
+ default=0,
+ help="GPU ID for benchmarking")
+ args = parser.parse_args()
+ sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype))
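The tail of run_grid (outside this hunk) merges best_config into the JSON file named by get_config_file_name, preserving results for batch sizes tuned earlier. A sketch of that merge, assuming keys are the stringified batch size; the filename and config values below are illustrative only:

import json
import os

def save_best_config(filename: str, bs: int, best_config: dict) -> None:
    # Merge into the existing tuning file (if any) so earlier batch sizes
    # are preserved; keys are assumed to be the stringified batch size.
    existing = {}
    if os.path.exists(filename):
        with open(filename) as f:
            existing = json.load(f)
    existing[str(bs)] = best_config
    with open(filename, "w") as f:
        json.dump(existing, f, indent=4)

save_best_config("E=8,N=7168.json", 64, {"BLOCK_SIZE_M": 64, "num_warps": 8})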
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 24f734ce8cce4..0fcfc0a295ca2 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -5,7 +5,7 @@
import torch
-from vllm._C import ops
+from vllm import _custom_ops as ops
from vllm._custom_C import paged_attention_custom
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
@@ -17,7 +17,7 @@
def main(
version: str,
num_seqs: int,
- context_len: int,
+ seq_len: int,
num_query_heads: int,
num_kv_heads: int,
head_size: int,
@@ -49,12 +49,12 @@ def main(
dtype=torch.float,
device=device)
- context_lens = [context_len for _ in range(num_seqs)]
- max_context_len = max(context_lens)
- context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
+ seq_lens = [seq_len for _ in range(num_seqs)]
+ max_seq_len = max(seq_lens)
+ seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
# Create the block tables.
- max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+ max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = []
for _ in range(num_seqs):
block_table = [
@@ -81,7 +81,7 @@ def main(
if not args.custom_paged_attn:
global PARTITION_SIZE
PARTITION_SIZE = 512
- num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
+ num_partitions = ((max_seq_len + PARTITION_SIZE - 1) //
PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -114,9 +114,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
num_kv_heads,
scale,
block_tables,
- context_lens,
+ seq_lens,
block_size,
- max_context_len,
+ max_seq_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
@@ -134,9 +134,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
num_kv_heads,
scale,
block_tables,
- context_lens,
+ seq_lens,
block_size,
- max_context_len,
+ max_seq_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
@@ -153,9 +153,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
num_kv_heads,
scale,
block_tables,
- context_lens,
+ seq_lens,
block_size,
- max_context_len,
+ max_seq_len,
alibi_slopes,
kv_cache_dtype,
)
@@ -189,12 +189,12 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument("--context-len", type=int, default=4096)
+ parser.add_argument("--seq_len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 256],
+ choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")
@@ -207,13 +207,11 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
parser.add_argument(
"--kv-cache-dtype",
type=str,
- choices=["auto", "fp8"],
+ choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
default="auto",
- help=
- 'Data type for kv cache storage. If "auto", will use model data type. '
- 'FP8_E5M2 (without scaling) is only supported on cuda version greater '
- 'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
- 'common inference criteria.')
+ help="Data type for kv cache storage. If 'auto', will use model "
+ "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+ "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
parser.add_argument("--custom-paged-attn",
action="store_true",
help="Use custom paged attention")
@@ -225,7 +223,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
main(
version=args.version,
num_seqs=args.batch_size,
- context_len=args.context_len,
+ seq_len=args.seq_len,
num_query_heads=args.num_query_heads,
num_kv_heads=args.num_kv_heads,
head_size=args.head_size,
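The renamed seq_lens/max_seq_len values feed the same ceil-division (DIVIDE_ROUND_UP on the C++ side) that sizes the per-sequence block table and, for the v2 kernel, the number of partitions. A small sketch of that arithmetic:

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

max_seq_len = 4096
block_size = 16
partition_size = 512

max_num_blocks_per_seq = ceil_div(max_seq_len, block_size)  # 256 blocks
num_partitions = ceil_div(max_seq_len, partition_size)      # 8 partitions
print(max_num_blocks_per_seq, num_partitions)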
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index 9188e811e2982..00e55f6060b52 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -93,7 +93,7 @@ def benchmark_rope_kernels_multi_lora(
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 256],
+ choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
new file mode 100644
index 0000000000000..4eeeca35a37cc
--- /dev/null
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -0,0 +1,75 @@
+WEIGHT_SHAPES = {
+ "ideal": [[4 * 256 * 32, 256 * 32]],
+ "mistralai/Mistral-7B-v0.1/TP1": [
+ [4096, 6144],
+ [4096, 4096],
+ [4096, 28672],
+ [14336, 4096],
+ ],
+ "mistralai/Mistral-7B-v0.1/TP2": [
+ [4096, 3072],
+ [2048, 4096],
+ [4096, 14336],
+ [7168, 4096],
+ ],
+ "mistralai/Mistral-7B-v0.1/TP4": [
+ [4096, 1536],
+ [1024, 4096],
+ [4096, 7168],
+ [3584, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP1": [
+ [4096, 12288],
+ [4096, 4096],
+ [4096, 22016],
+ [11008, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP2": [
+ [4096, 6144],
+ [2048, 4096],
+ [4096, 11008],
+ [5504, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP4": [
+ [4096, 3072],
+ [1024, 4096],
+ [4096, 5504],
+ [2752, 4096],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP1": [
+ [5120, 15360],
+ [5120, 5120],
+ [5120, 27648],
+ [13824, 5120],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP2": [
+ [5120, 7680],
+ [2560, 5120],
+ [5120, 13824],
+ [6912, 5120],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP4": [
+ [5120, 3840],
+ [1280, 5120],
+ [5120, 6912],
+ [3456, 5120],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP1": [
+ [8192, 10240],
+ [8192, 8192],
+ [8192, 57344],
+ [28672, 8192],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP2": [
+ [8192, 5120],
+ [4096, 8192],
+ [8192, 28672],
+ [14336, 8192],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP4": [
+ [8192, 2560],
+ [2048, 8192],
+ [8192, 14336],
+ [7168, 8192],
+ ],
+}
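Each entry above is a [K, N] GEMM shape for one projection layer. Judging from the numbers, the TP2/TP4 variants shard N for the column-parallel layers (QKV and gate/up) and K for the row-parallel layers (attention output and down projection). A sketch that reproduces the TP2 entries from the TP1 ones under that assumption:

LLAMA2_7B_TP1 = [
    [4096, 12288],  # QKV proj   (column-parallel: shard N)
    [4096, 4096],   # attn out   (row-parallel:    shard K)
    [4096, 22016],  # gate/up    (column-parallel: shard N)
    [11008, 4096],  # down proj  (row-parallel:    shard K)
]

def shard(tp1_shapes, tp):
    sharded = []
    for i, (k, n) in enumerate(tp1_shapes):
        if i % 2 == 0:   # column-parallel layer: split the output dim N
            sharded.append([k, n // tp])
        else:            # row-parallel layer: split the input dim K
            sharded.append([k // tp, n])
    return sharded

print(shard(LLAMA2_7B_TP1, 2))  # matches "meta-llama/Llama-2-7b-hf/TP2" above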
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
index 64d3c4f4b3889..f491c90d0683e 100755
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,7 +4,7 @@ PORT=8000
MODEL=$1
TOKENS=$2
-docker run --gpus all --shm-size 1g -p $PORT:80 \
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
ghcr.io/huggingface/text-generation-inference:1.4.0 \
--model-id $MODEL \
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
new file mode 100644
index 0000000000000..c846e47de1fcf
--- /dev/null
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -0,0 +1,63 @@
+import argparse
+import cProfile
+import pstats
+
+from vllm import LLM, SamplingParams
+
+# A very long prompt, total number of tokens is about 15k.
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"
+ ] * 1000
+LONG_PROMPT = ' '.join(LONG_PROMPT)
+
+
+def main(args):
+ llm = LLM(
+ model=args.model,
+ enforce_eager=True,
+ enable_prefix_caching=True,
+ tensor_parallel_size=args.tensor_parallel_size,
+ use_v2_block_manager=args.use_v2_block_manager,
+ )
+
+ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+ profiler = cProfile.Profile()
+
+ print("------warm up------")
+ for i in range(3):
+ output = llm.generate(LONG_PROMPT, sampling_params)
+ print(output[0].outputs[0].text)
+
+ print("------start generating------")
+ for i in range(3):
+ profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
+ globals(), locals())
+
+ # analyze the runtime of hashing function
+ stats = pstats.Stats(profiler)
+ stats.sort_stats('cumulative')
+ total_time = 0
+ total_calls = 0
+ for func in stats.stats:
+ if 'hash_of_block' in func[2]:
+ total_time = stats.stats[func][3]
+ total_calls = stats.stats[func][0]
+ percentage = (total_time / stats.total_tt) * 100
+ print(f"Hashing took {total_time:.2f} seconds,"
+ f"{percentage:.2f}% of the total runtime.")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description='Benchmark the performance of the hashing function in '
+ 'automatic prefix caching.')
+ parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
+ parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+ parser.add_argument('--output-len', type=int, default=10)
+ parser.add_argument('--enable-prefix-caching',
+ action='store_true',
+ help='enable prefix caching')
+ parser.add_argument('--use-v2-block-manager',
+ action='store_true',
+ help='Use BlockSpaceManagerV2')
+ args = parser.parse_args()
+ main(args)
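The pstats lookup above indexes stats.stats[func][3] and [0]; for reference, each value in stats.stats is a (cc, nc, tt, ct, callers) tuple keyed by (filename, lineno, funcname), where ct is cumulative time and cc the primitive call count. A small standalone sketch:

import cProfile
import pstats

def work():
    return sum(hash(i) for i in range(100_000))

profiler = cProfile.Profile()
profiler.runctx("work()", globals(), locals())
stats = pstats.Stats(profiler)

# stats.stats maps (filename, lineno, funcname) -> (cc, nc, tt, ct, callers):
# cc = primitive call count, nc = total calls, tt = own time, ct = cumulative.
for func, (cc, nc, tt, ct, _callers) in stats.stats.items():
    if func[2] == "work":
        print(f"work: {ct:.4f}s cumulative over {cc} call(s)")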
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 8339fa6cccb0d..b173ee106d562 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -99,7 +99,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"Failed to determine torch nvcc compiler flags")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
- list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
+ list(APPEND GPU_FLAGS "-DENABLE_FP8")
endif()
if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
list(REMOVE_ITEM GPU_FLAGS
@@ -119,7 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
list(APPEND GPU_FLAGS
"-DUSE_ROCM"
- "-DENABLE_FP8_E4M3"
+ "-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc")
diff --git a/collect_env.py b/collect_env.py
index 8982fba024274..1ecfeb8e22e2f 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -63,6 +63,7 @@
"magma",
"triton",
"optree",
+ "nccl",
}
DEFAULT_PIP_PATTERNS = {
@@ -73,6 +74,7 @@
"triton",
"optree",
"onnx",
+ "nccl",
}
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 24d972702c858..867f63f12de4b 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -10,11 +10,11 @@
namespace vllm {
// Activation and gating kernel template.
-template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void act_and_mul_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., 2, d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., 2, d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
@@ -23,72 +23,66 @@ __global__ void act_and_mul_kernel(
}
}
-template<typename T>
+template <typename T>
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
- return (T) (((float) x) / (1.0f + expf((float) -x)));
+ return (T)(((float)x) / (1.0f + expf((float)-x)));
}
-template<typename T>
+template <typename T>
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
- const float f = (float) x;
+ const float f = (float)x;
constexpr float ALPHA = M_SQRT1_2;
- return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+ return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
-template<typename T>
+template <typename T>
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
- const float f = (float) x;
+ const float f = (float)x;
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = f * f * f;
float inner = BETA * (f + KAPPA * x_cube);
- return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
+ return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
}
-} // namespace vllm
+} // namespace vllm
// Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
- int d = input.size(-1) / 2; \
- int64_t num_tokens = input.numel() / input.size(-1); \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), \
- "act_and_mul_kernel", \
- [&] { \
- vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
- out.data_ptr<scalar_t>(), \
- input.data_ptr<scalar_t>(), \
- d); \
- });
-
-void silu_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
+ int d = input.size(-1) / 2; \
+ int64_t num_tokens = input.numel() / input.size(-1); \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), "act_and_mul_kernel", [&] { \
+ vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
+ <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
+ input.data_ptr<scalar_t>(), d); \
+ });
+
+void silu_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
-void gelu_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}
-void gelu_tanh_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}
@@ -96,11 +90,11 @@ void gelu_tanh_and_mul(
namespace vllm {
// Element-wise activation kernel template.
-template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
__global__ void activation_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
@@ -108,54 +102,49 @@ __global__ void activation_kernel(
}
}
-} // namespace vllm
+} // namespace vllm
// Launch element-wise activation kernel.
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
- int d = input.size(-1); \
- int64_t num_tokens = input.numel() / d; \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), \
- "activation_kernel", \
- [&] { \
- vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
- out.data_ptr<scalar_t>(), \
- input.data_ptr<scalar_t>(), \
- d); \
- });
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
+ int d = input.size(-1); \
+ int64_t num_tokens = input.numel() / d; \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
+ vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
+ <<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
+ input.data_ptr<scalar_t>(), d); \
+ });
namespace vllm {
-template<typename T>
+template <typename T>
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
- const float x3 = (float) (x * x * x);
- const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
- return ((T) 0.5) * x * (((T) 1.0) + t);
+ const float x3 = (float)(x * x * x);
+ const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
+ return ((T)0.5) * x * (((T)1.0) + t);
}
-template<typename T>
+template <typename T>
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
- const float f = (float) x;
- const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
- return ((T) 0.5) * x * (((T) 1.0) + t);
+ const float f = (float)x;
+ const T t =
+ (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
+ return ((T)0.5) * x * (((T)1.0) + t);
}
-} // namespace vllm
+} // namespace vllm
-void gelu_new(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_new(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}
-void gelu_fast(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_fast(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
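For reference, the act-and-mul kernels reformatted above fuse an activation over the first half of the last dimension with an element-wise multiply by the second half. A torch sketch of the intended semantics (not the CUDA implementation):

import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    """Reference for silu_and_mul: x has shape [..., 2 * d]."""
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 16)           # a batch of 4 tokens, 2 * d = 16
out = silu_and_mul_reference(x)  # shape [4, 8]
print(out.shape)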
diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh
index 31fb401cbe2c1..62409c0cce93e 100644
--- a/csrc/attention/attention_generic.cuh
+++ b/csrc/attention/attention_generic.cuh
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -22,31 +23,31 @@
namespace vllm {
// A vector type to store Q, K, V elements.
-template<typename T, int VEC_SIZE>
+template <typename T, int VEC_SIZE>
struct Vec {};
// A vector type to store FP32 accumulators.
-template<typename T>
+template <typename T>
struct FloatVec {};
// Template vector operations.
-template<typename Acc, typename A, typename B>
+template <typename Acc, typename A, typename B>
inline __device__ Acc mul(A a, B b);
-template<typename T>
+template <typename T>
inline __device__ float sum(T v);
-template<typename T>
+template <typename T>
inline __device__ float dot(T a, T b) {
return sum(mul<T, T, T>(a, b));
}
-template<typename A, typename T>
+template <typename A, typename T>
inline __device__ float dot(T a, T b) {
return sum(mul<A, T, T>(a, b));
}
-template<typename T>
+template <typename T>
inline __device__ void zero(T& dst) {
constexpr int WORDS = sizeof(T) / 4;
union {
@@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) {
dst = tmp.raw;
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index b114ab0cfdb57..ece7e749e7312 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -19,27 +20,23 @@
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
+#include <algorithm>
#include "attention_dtypes.h"
#include "attention_utils.cuh"
-#if defined(ENABLE_FP8_E5M2)
-#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
-#elif defined(ENABLE_FP8_E4M3)
-#include "../quantization/fp8/amd_detail/quant_utils.cuh"
-#endif
-
-#include <algorithm>
-
#ifdef USE_ROCM
#include <hip/hip_bf16.h>
- typedef __hip_bfloat16 __nv_bfloat16;
+ #include "../quantization/fp8/amd/quant_utils.cuh"
+typedef __hip_bfloat16 __nv_bfloat16;
+#else
+ #include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif
#ifndef USE_ROCM
-#define WARP_SIZE 32
+ #define WARP_SIZE 32
#else
-#define WARP_SIZE warpSize
+ #define WARP_SIZE warpSize
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -49,7 +46,7 @@
namespace vllm {
// Utility function for attention softmax.
-template<int NUM_WARPS>
+template <int NUM_WARPS>
inline __device__ float block_sum(float* red_smem, float sum) {
// Decompose the thread index into warp / lane.
int warp = threadIdx.x / WARP_SIZE;
@@ -86,58 +83,65 @@ inline __device__ float block_sum(float* red_smem, float sum) {
// TODO(woosuk): Merge the last two dimensions of the grid.
// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_KV_CACHE,
- int PARTITION_SIZE = 0> // Zero means no partitioning.
+template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
+          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE, int PARTITION_SIZE = 0>  // Zero means no partitioning.
__device__ void paged_attention_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride,
- const float kv_scale) {
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions,
+ // head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
const int seq_idx = blockIdx.y;
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
- const int context_len = context_lens[seq_idx];
- if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+ const int seq_len = seq_lens[seq_idx];
+ if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
// No work to do. Terminate the thread block.
return;
}
- const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
- const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+ const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
+ const int num_blocks_per_partition =
+ USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
// [start_block_idx, end_block_idx) is the range of blocks to process.
- const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
- const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+ const int start_block_idx =
+ USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+ const int end_block_idx =
+ MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
const int num_blocks = end_block_idx - start_block_idx;
// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
- const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+ const int end_token_idx =
+ MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
const int num_tokens = end_token_idx - start_token_idx;
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
- constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
+ constexpr int NUM_THREAD_GROUPS =
+ NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE
+ // divides NUM_THREADS
assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
- constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+ constexpr int NUM_TOKENS_PER_THREAD_GROUP =
+ DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int thread_idx = threadIdx.x;
const int warp_idx = thread_idx / WARP_SIZE;
@@ -147,19 +151,18 @@ __device__ void paged_attention_kernel(
const int num_heads = gridDim.x;
const int num_queries_per_kv = num_heads / num_kv_heads;
const int kv_head_idx = head_idx / num_queries_per_kv;
- const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+ const float alibi_slope =
+ alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
// A vector type to store a part of a key or a query.
- // The vector size is configured in such a way that the threads in a thread group
- // fetch or compute 16 bytes at a time.
- // For example, if the size of a thread group is 4 and the data type is half,
- // then the vector size is 16 / (4 * sizeof(half)) == 2.
+ // The vector size is configured in such a way that the threads in a thread
+ // group fetch or compute 16 bytes at a time. For example, if the size of a
+ // thread group is 4 and the data type is half, then the vector size is 16 /
+ // (4 * sizeof(half)) == 2.
constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
  using K_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
  using Q_vec = typename Vec<scalar_t, VEC_SIZE>::Type;
-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
  using Quant_vec = typename Vec<cache_t, VEC_SIZE>::Type;
-#endif
constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
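
Aside (editor's sketch, not part of the patch): a quick worked check of the constants above under one hypothetical configuration (WARP_SIZE 32, BLOCK_SIZE 16, HEAD_SIZE 128, 2-byte scalars). Each thread group ends up fetching 16 bytes per iteration, which is the stated design goal:

#include <cstdio>

// Hypothetical configuration, chosen only to illustrate the arithmetic above.
constexpr int WARP_SIZE = 32;
constexpr int BLOCK_SIZE = 16;
constexpr int HEAD_SIZE = 128;
constexpr int SCALAR_BYTES = 2;  // e.g. half

constexpr int THREAD_GROUP_SIZE =
    (WARP_SIZE / BLOCK_SIZE > 1) ? WARP_SIZE / BLOCK_SIZE : 1;          // 2
constexpr int VEC_SIZE =
    (16 / (THREAD_GROUP_SIZE * SCALAR_BYTES) > 1)
        ? 16 / (THREAD_GROUP_SIZE * SCALAR_BYTES) : 1;                  // 4
constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;     // 64
constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;    // 16

int main() {
  std::printf("group=%d vec=%d elems/thread=%d vecs/thread=%d bytes/group=%d\n",
              THREAD_GROUP_SIZE, VEC_SIZE, NUM_ELEMS_PER_THREAD,
              NUM_VECS_PER_THREAD, THREAD_GROUP_SIZE * VEC_SIZE * SCALAR_BYTES);
  return 0;
}
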
@@ -169,18 +172,21 @@ __device__ void paged_attention_kernel(
// Load the query to registers.
// Each thread in a thread group has a different part of the query.
- // For example, if the the thread group size is 4, then the first thread in the group
- // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
- // th vectors of the query, and so on.
- // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
+ // For example, if the the thread group size is 4, then the first thread in
+ // the group has 0, 4, 8, ... th vectors of the query, and the second thread
+ // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
+ // q is split from a qkv tensor, it may not be contiguous.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
#pragma unroll
- for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD;
+ i += NUM_THREAD_GROUPS) {
const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
-    q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
+    q_vecs[thread_group_offset][i] =
+        *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
}
- __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs
+ __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a
+ // memory wall right before we use q_vecs
// Memory planning.
extern __shared__ char shared_mem[];
@@ -199,58 +205,101 @@ __device__ void paged_attention_kernel(
// Each thread group in a warp fetches a key from the block, and computes
// dot product with the query.
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
- // because int32 can lead to overflow when this variable is multiplied by large numbers
- // (e.g., kv_block_stride).
-    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
+
+ // blocksparse specific vars
+ int bs_block_offset;
+ int q_bs_block_id;
+ if constexpr (IS_BLOCK_SPARSE) {
+ // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
+ // blocksparse_block_size);
+ q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
+ if (blocksparse_head_sliding_step >= 0)
+ // sliding on q heads
+ bs_block_offset =
+ (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
+ else
+ // sliding on kv heads
+ bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
+ (-blocksparse_head_sliding_step) +
+ 1;
+ }
+
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+ block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+ // int64 because int32 can lead to overflow when this variable is multiplied
+ // by large numbers (e.g., kv_block_stride).
+ // For blocksparse attention: skip computation on blocks that are not
+ // attended
+ if constexpr (IS_BLOCK_SPARSE) {
+ const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+ const bool is_remote =
+ ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
+ const bool is_local =
+ (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
+ if (!is_remote && !is_local) {
+ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+ const int physical_block_offset =
+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+
+ if (thread_group_offset == 0) {
+ // NOTE(linxihui): assign very large number to skipped tokens to
+ // avoid contribution to the sumexp softmax normalizer. This will
+ // not be used at computing sum(softmax*v) as the blocks will be
+ // skipped.
+ logits[token_idx - start_token_idx] = -FLT_MAX;
+ }
+ }
+ continue;
+ }
+ }
+    const int64_t physical_block_number =
+        static_cast<int64_t>(block_table[block_idx]);
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
- // For example, if the the thread group size is 4, then the first thread in the group
- // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
- // vectors of the key, and so on.
+ // For example, if the the thread group size is 4, then the first thread in
+ // the group has 0, 4, 8, ... th vectors of the key, and the second thread
+ // has 1, 5, 9, ... th vectors of the key, and so on.
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
- const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int physical_block_offset =
+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
K_vec k_vecs[NUM_VECS_PER_THREAD];
#pragma unroll
for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
- const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride
- + physical_block_offset * x;
+ const cache_t* k_ptr =
+ k_cache + physical_block_number * kv_block_stride +
+ kv_head_idx * kv_head_stride + physical_block_offset * x;
const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
const int offset1 = (vec_idx * VEC_SIZE) / x;
const int offset2 = (vec_idx * VEC_SIZE) % x;
-      if constexpr (IS_FP8_KV_CACHE) {
-#if defined(ENABLE_FP8_E5M2)
-        Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
-        // Vector conversion from Quant_vec to K_vec.
-        k_vecs[j] = fp8_e5m2_unscaled::vec_conversion<K_vec, Quant_vec>(k_vec_quant);
-#elif defined(ENABLE_FP8_E4M3)
-        Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
-        // Vector conversion from Quant_vec to K_vec. Use scaled_vec_conversion to convert FP8_E4M3 quantized k
-        // cache vec to k vec in higher precision (FP16, BFloat16, etc.)
-        k_vecs[j] = fp8_e4m3::scaled_vec_conversion<K_vec, Quant_vec>(k_vec_quant, kv_scale);
-#else
-        assert(false);
-#endif
+
+        if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
+          k_vecs[j] = *reinterpret_cast<const K_vec*>(
+              k_ptr + offset1 * BLOCK_SIZE * x + offset2);
      } else {
-        k_vecs[j] = *reinterpret_cast<const K_vec*>(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+          // Vector conversion from Quant_vec to K_vec.
+          Quant_vec k_vec_quant = *reinterpret_cast<const Quant_vec*>(
+              k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+          k_vecs[j] = fp8::scaled_convert<K_vec, Quant_vec, KV_DTYPE>(
+              k_vec_quant, kv_scale);
}
}
// Compute dot product.
// This includes a reduction across the threads in the same thread group.
-    float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
+    float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(
+                           q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
- qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
+ qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
if (thread_group_offset == 0) {
// Store the partial reductions to shared memory.
// NOTE(woosuk): It is required to zero out the masked logits.
- const bool mask = token_idx >= context_len;
+ const bool mask = token_idx >= seq_len;
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
// Update the max value.
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
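
Aside (editor's sketch, not part of the patch): the is_remote/is_local test in the hunk above decides which KV blocks a head actually attends to; skipped blocks get -FLT_MAX logits so they vanish from the softmax normalizer. The same predicate, lifted into plain host C++ with hypothetical parameter values so it can be inspected in isolation:

#include <cstdio>

// A KV block is kept if its (offset) id falls on the vertical stride
// ("remote" block) or if it lies inside the local window that ends at the
// query's block, mirroring the is_remote/is_local test above.
bool keep_kv_block(int k_bs_block_id, int q_bs_block_id, int bs_block_offset,
                   int blocksparse_vert_stride, int blocksparse_local_blocks) {
  const bool is_remote =
      ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
  const bool is_local =
      (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
  return is_remote || is_local;
}

int main() {
  // Hypothetical setup: vertical stride 8, local window of 4 blocks,
  // query sitting in blocksparse block 10, head offset 1.
  for (int k = 0; k <= 10; ++k) {
    std::printf("kv block %2d -> %s\n", k,
                keep_kv_block(k, /*q_bs_block_id=*/10, /*bs_block_offset=*/1,
                              /*vert_stride=*/8, /*local_blocks=*/4)
                    ? "keep"
                    : "skip");
  }
  return 0;
}
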
@@ -298,13 +347,12 @@ __device__ void paged_attention_kernel(
// If partitioning is enabled, store the max logit and exp_sum.
if (USE_PARTITIONING && thread_idx == 0) {
- float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions
- + partition_idx;
+ float* max_logits_ptr = max_logits +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions + partition_idx;
*max_logits_ptr = qk_max;
- float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions
- + partition_idx;
+ float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions + partition_idx;
*exp_sums_ptr = exp_sum;
}
@@ -312,14 +360,13 @@ __device__ void paged_attention_kernel(
constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
  using V_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
  using L_vec = typename Vec<scalar_t, V_VEC_SIZE>::Type;
-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
  using V_quant_vec = typename Vec<cache_t, V_VEC_SIZE>::Type;
-#endif
  using Float_L_vec = typename FloatVec<L_vec>::Type;
constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
- constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
+ constexpr int NUM_ROWS_PER_THREAD =
+ DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
// NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
float accs[NUM_ROWS_PER_THREAD];
@@ -330,48 +377,55 @@ __device__ void paged_attention_kernel(
scalar_t zero_value;
zero(zero_value);
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
- // because int32 can lead to overflow when this variable is multiplied by large numbers
- // (e.g., kv_block_stride).
-    const int64_t physical_block_number = static_cast<int64_t>(block_table[block_idx]);
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+ block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+ // int64 because int32 can lead to overflow when this variable is multiplied
+ // by large numbers (e.g., kv_block_stride).
+ // For blocksparse attention: skip computation on blocks that are not
+ // attended
+ if constexpr (IS_BLOCK_SPARSE) {
+ int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+ if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) &&
+ !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) {
+ continue;
+ }
+ }
+    const int64_t physical_block_number =
+        static_cast<int64_t>(block_table[block_idx]);
const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
L_vec logits_vec;
-    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx - start_token_idx));
+    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx -
+                                                           start_token_idx));
- const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride;
+ const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride +
+ kv_head_idx * kv_head_stride;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
if (row_idx < HEAD_SIZE) {
const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
V_vec v_vec;
-        if constexpr (IS_FP8_KV_CACHE) {
-#if defined(ENABLE_FP8_E5M2)
-          V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
-          // Vector conversion from V_quant_vec to V_vec.
-          v_vec = fp8_e5m2_unscaled::vec_conversion<V_vec, V_quant_vec>(v_quant_vec);
-#elif defined(ENABLE_FP8_E4M3)
-          V_quant_vec v_quant_vec = *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
-          // Vector conversion from V_quant_vec to V_vec. Use scaled_vec_conversion to convert
-          // FP8_E4M3 quantized v cache vec to v vec in higher precision (FP16, BFloat16, etc.)
-          v_vec = fp8_e4m3::scaled_vec_conversion<V_vec, V_quant_vec>(v_quant_vec, kv_scale);
-#else
-          assert(false);
-#endif
-        } else {
+
+        if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
          v_vec = *reinterpret_cast<const V_vec*>(v_ptr + offset);
+        } else {
+          V_quant_vec v_quant_vec =
+              *reinterpret_cast<const V_quant_vec*>(v_ptr + offset);
+          // Vector conversion from V_quant_vec to V_vec.
+          v_vec = fp8::scaled_convert<V_vec, V_quant_vec, KV_DTYPE>(v_quant_vec,
+                                                                    kv_scale);
}
- if (block_idx == num_context_blocks - 1) {
- // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
- // we should explicitly zero out the values since they may contain NaNs.
- // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
+ if (block_idx == num_seq_blocks - 1) {
+ // NOTE(woosuk): When v_vec contains the tokens that are out of the
+ // context, we should explicitly zero out the values since they may
+ // contain NaNs. See
+ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
        scalar_t* v_vec_ptr = reinterpret_cast<scalar_t*>(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
- v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
+ v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
}
}
accs[i] += dot(logits_vec, v_vec);
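
Aside (editor's sketch, not part of the patch): for the non-kAuto branch above, the assumed shape of the scaled conversion is "decode the 8-bit cache code to a float, then multiply by kv_scale". The toy codec below is deliberately not the real FP8 E4M3/E5M2 format; it only stands in for the decode step so the scaling can be seen end to end:

#include <cstdio>
#include <cstdint>
#include <initializer_list>

// Toy 8-bit codec: maps 0..255 to roughly [-8, 8). Illustration only.
float toy_decode_fp8(uint8_t q) { return (static_cast<int>(q) - 128) / 16.0f; }

// Assumed dequantization shape: decode, then apply the per-cache scale.
float dequantize(uint8_t q, float kv_scale) {
  return toy_decode_fp8(q) * kv_scale;
}

int main() {
  const float kv_scale = 0.5f;
  for (uint8_t q : {0, 64, 128, 192, 255}) {
    std::printf("code %3d -> %+.4f\n", static_cast<int>(q),
                dequantize(q, kv_scale));
  }
  return 0;
}
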
@@ -390,8 +444,8 @@ __device__ void paged_attention_kernel(
accs[i] = acc;
}
- // NOTE(woosuk): A barrier is required because the shared memory space for logits
- // is reused for the output.
+ // NOTE(woosuk): A barrier is required because the shared memory space for
+ // logits is reused for the output.
__syncthreads();
// Perform reduction across warps.
@@ -428,9 +482,9 @@ __device__ void paged_attention_kernel(
// Write the final output.
if (warp_idx == 0) {
- scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE
- + partition_idx * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -442,89 +496,96 @@ __device__ void paged_attention_kernel(
}
// Grid: (num_heads, num_seqs, 1).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_KV_CACHE>
+template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
+          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE>
__global__ void paged_attention_v1_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride,
- const float kv_scale) {
-  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE>(
- /* exp_sums */ nullptr, /* max_logits */ nullptr,
- out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
- max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride, kv_scale);
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
+                         KV_DTYPE, IS_BLOCK_SPARSE>(
+ /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
+ v_cache, num_kv_heads, scale, block_tables, seq_lens,
+ max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
+ kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
+ blocksparse_vert_stride, blocksparse_block_size,
+ blocksparse_head_sliding_step);
}
// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_KV_CACHE,
- int PARTITION_SIZE>
+template <typename scalar_t, typename cache_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS,
+          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE, int PARTITION_SIZE>
__global__ void paged_attention_v2_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride,
- const float kv_scale) {
-  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE, PARTITION_SIZE>(
- exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
- block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
- q_stride, kv_block_stride, kv_head_stride, kv_scale);
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
+ // max_num_partitions, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+  paged_attention_kernel<scalar_t, cache_t, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS,
+                         KV_DTYPE, IS_BLOCK_SPARSE, PARTITION_SIZE>(
+ exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
+ block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
+ kv_block_stride, kv_head_stride, kv_scale, tp_rank,
+ blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
+ blocksparse_head_sliding_step);
}
// Grid: (num_heads, num_seqs).
-template<
- typename scalar_t,
- int HEAD_SIZE,
- int NUM_THREADS,
- int PARTITION_SIZE>
+template <typename scalar_t, int HEAD_SIZE, int NUM_THREADS, int PARTITION_SIZE>
__global__ void paged_attention_v2_reduce_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_partitions) {
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const float* __restrict__ exp_sums, // [num_seqs, num_heads,
+ // max_num_partitions]
+ const float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
+ // max_num_partitions, head_size]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
- const int context_len = context_lens[seq_idx];
- const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+ const int seq_len = seq_lens[seq_idx];
+ const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
- scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr =
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE;
for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
out_ptr[i] = tmp_out_ptr[i];
}
@@ -543,8 +604,9 @@ __global__ void paged_attention_v2_reduce_kernel(
// Load max logits to shared memory.
  float* shared_max_logits = reinterpret_cast<float*>(shared_mem);
- const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions;
+ const float* max_logits_ptr = max_logits +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions;
float max_logit = -FLT_MAX;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
const float l = max_logits_ptr[i];
@@ -573,9 +635,11 @@ __global__ void paged_attention_v2_reduce_kernel(
max_logit = VLLM_SHFL_SYNC(max_logit, 0);
// Load rescaled exp sums to shared memory.
-  float* shared_exp_sums = reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
- const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions;
+  float* shared_exp_sums =
+      reinterpret_cast<float*>(shared_mem + sizeof(float) * num_partitions);
+ const float* exp_sums_ptr = exp_sums +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions;
float global_exp_sum = 0.0f;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
float l = shared_max_logits[i];
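
Aside (editor's sketch, not part of the patch): this hunk and the next merge per-partition softmax statistics in the v2 reduce kernel. The same arithmetic restated as host C++ over hypothetical per-partition (max_logit, exp_sum) pairs: rescale each exp sum by exp(max_i - global_max), add them into the global normalizer, then weight each partition's partial output by its rescaled share:

#include <cstdio>
#include <cmath>
#include <vector>

int main() {
  // Hypothetical statistics reported by three partitions of one sequence;
  // each exp_sum was computed relative to that partition's own max logit.
  std::vector<float> max_logits = {2.0f, 3.5f, 1.0f};
  std::vector<float> exp_sums = {10.0f, 4.0f, 7.0f};

  float global_max = -INFINITY;
  for (float m : max_logits) global_max = std::fmax(global_max, m);

  float global_exp_sum = 0.0f;
  std::vector<float> rescaled(exp_sums.size());
  for (size_t i = 0; i < exp_sums.size(); ++i) {
    rescaled[i] = exp_sums[i] * std::exp(max_logits[i] - global_max);
    global_exp_sum += rescaled[i];
  }

  // Each partition's partial output is weighted by this factor when the
  // final result is aggregated (the kernel adds 1e-6 for safety, as above).
  for (size_t i = 0; i < rescaled.size(); ++i) {
    std::printf("partition %zu weight = %.4f\n", i,
                rescaled[i] / (global_exp_sum + 1e-6f));
  }
  return 0;
}
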
@@ -588,65 +652,56 @@ __global__ void paged_attention_v2_reduce_kernel(
const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
// Aggregate tmp_out to out.
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE;
- scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr =
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
#pragma unroll
for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
float acc = 0.0f;
for (int j = 0; j < num_partitions; ++j) {
- acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
+ acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] *
+ inv_global_exp_sum;
}
from_float(out_ptr[i], acc);
}
}
-} // namespace vllm
-
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
- VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
-    ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE>), shared_mem_size); \
-  vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE><<<grid, block, shared_mem_size, stream>>>( \
- out_ptr, \
- query_ptr, \
- key_cache_ptr, \
- value_cache_ptr, \
- num_kv_heads, \
- scale, \
- block_tables_ptr, \
- context_lens_ptr, \
- max_num_blocks_per_seq, \
- alibi_slopes_ptr, \
- q_stride, \
- kv_block_stride, \
- kv_head_stride, \
- kv_scale);
+} // namespace vllm
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
+ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
+      ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE>), \
+ shared_mem_size); \
+  vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,         \
+                                  NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE>    \
+      <<<grid, block, shared_mem_size, stream>>>(                            \
+ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
+ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
+ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
+ kv_scale, tp_rank, blocksparse_local_blocks, \
+ blocksparse_vert_stride, blocksparse_block_size, \
+ blocksparse_head_sliding_step);
// TODO(woosuk): Tune NUM_THREADS.
-template<
- typename T,
- typename CACHE_T,
- int BLOCK_SIZE,
- bool IS_FP8_KV_CACHE,
+template <typename T, typename CACHE_T, int BLOCK_SIZE, vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
#ifdef USE_ROCM
-  int NUM_THREADS = 1024>
+          int NUM_THREADS = 1024>
#else
- int NUM_THREADS = 128>
+ int NUM_THREADS = 128>
#endif
void paged_attention_v1_launcher(
- torch::Tensor& out,
- torch::Tensor& query,
- torch::Tensor& key_cache,
- torch::Tensor& value_cache,
- int num_kv_heads,
- float scale,
- torch::Tensor& block_tables,
- torch::Tensor& context_lens,
- int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
- float kv_scale) {
+ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
+ const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -659,20 +714,22 @@ void paged_attention_v1_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr = alibi_slopes ?
-    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+ : nullptr;
  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
- int logits_size = padded_max_context_len * sizeof(float);
+ int padded_max_seq_len =
+ DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+ int logits_size = padded_max_seq_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
// Keep that in sync with the logic here!
@@ -701,6 +758,9 @@ void paged_attention_v1_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V1(128);
break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V1(192);
+ break;
case 256:
LAUNCH_PAGED_ATTENTION_V1(256);
break;
@@ -710,133 +770,97 @@ void paged_attention_v1_launcher(
}
}
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE>( \
- out, \
- query, \
- key_cache, \
- value_cache, \
- num_kv_heads, \
- scale, \
- block_tables, \
- context_lens, \
- max_context_len, \
- alibi_slopes, \
- kv_scale);
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,              \
+                              IS_BLOCK_SPARSE>(                              \
+ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
+ seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \
- switch (block_size) { \
- case 8: \
- CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \
- break; \
- case 16: \
- CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \
- break; \
- case 32: \
- CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v1(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
- int num_kv_heads, // [num_heads]
- float scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& context_lens, // [num_seqs]
- int block_size,
- int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype,
- float kv_scale) {
- if (kv_cache_dtype == "auto") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else if (kv_cache_dtype == "fp8") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else {
- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
- }
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes,
+ const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+ const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+ const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V1_LAUNCHER_BLOCK_SIZE)
}
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
-  vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE, NUM_THREADS, IS_FP8_KV_CACHE, PARTITION_SIZE> \
-  <<<grid, block, shared_mem_size, stream>>>( \
- exp_sums_ptr, \
- max_logits_ptr, \
- tmp_out_ptr, \
- query_ptr, \
- key_cache_ptr, \
- value_cache_ptr, \
- num_kv_heads, \
- scale, \
- block_tables_ptr, \
- context_lens_ptr, \
- max_num_blocks_per_seq, \
- alibi_slopes_ptr, \
- q_stride, \
- kv_block_stride, \
- kv_head_stride, \
- kv_scale); \
-  vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS, PARTITION_SIZE> \
-  <<<reduce_grid, block, reduce_shared_mem_size, stream>>>( \
- out_ptr, \
- exp_sums_ptr, \
- max_logits_ptr, \
- tmp_out_ptr, \
- context_lens_ptr, \
- max_num_partitions);
-
-template<
- typename T,
- typename CACHE_T,
- int BLOCK_SIZE,
- bool IS_FP8_KV_CACHE,
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
+  vllm::paged_attention_v2_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,          \
+                                  NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE,     \
+                                  PARTITION_SIZE>                             \
+      <<<grid, block, shared_mem_size, stream>>>(                             \
+ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
+ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
+ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
+ kv_block_stride, kv_head_stride, kv_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step); \
+  vllm::paged_attention_v2_reduce_kernel<T, HEAD_SIZE, NUM_THREADS,           \
+                                         PARTITION_SIZE>                      \
+      <<<reduce_grid, block, reduce_shared_mem_size, stream>>>(               \
+ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
+ max_num_partitions);
+
+template <typename T, typename CACHE_T, int BLOCK_SIZE, vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
#ifdef USE_ROCM
-  int NUM_THREADS = 1024, int PARTITION_SIZE = 1024>
+          int NUM_THREADS = 1024, int PARTITION_SIZE = 1024>
#else
- int NUM_THREADS = 128,
- int PARTITION_SIZE = 512>
+ int NUM_THREADS = 128, int PARTITION_SIZE = 512>
#endif
void paged_attention_v2_launcher(
- torch::Tensor& out,
- torch::Tensor& exp_sums,
- torch::Tensor& max_logits,
- torch::Tensor& tmp_out,
- torch::Tensor& query,
- torch::Tensor& key_cache,
- torch::Tensor& value_cache,
- int num_kv_heads,
- float scale,
- torch::Tensor& block_tables,
- torch::Tensor& context_lens,
- int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
- float kv_scale) {
+ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes, float kv_scale,
+ const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -849,9 +873,10 @@ void paged_attention_v2_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr = alibi_slopes ?
-    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+ : nullptr;
  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
@@ -861,10 +886,10 @@ void paged_attention_v2_launcher(
  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
  CACHE_T* value_cache_ptr = reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+ int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
int logits_size = PARTITION_SIZE * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
@@ -897,6 +922,9 @@ void paged_attention_v2_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V2(128);
break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V2(192);
+ break;
case 256:
LAUNCH_PAGED_ATTENTION_V2(256);
break;
@@ -906,84 +934,68 @@ void paged_attention_v2_launcher(
}
}
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE>( \
- out, \
- exp_sums, \
- max_logits, \
- tmp_out, \
- query, \
- key_cache, \
- value_cache, \
- num_kv_heads, \
- scale, \
- block_tables, \
- context_lens, \
- max_context_len, \
- alibi_slopes, \
- kv_scale);
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+  paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,              \
+                              IS_BLOCK_SPARSE>(                              \
+ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
+ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
+ kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_KV_CACHE) \
- switch (block_size) { \
- case 8: \
- CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_KV_CACHE); \
- break; \
- case 16: \
- CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_KV_CACHE); \
- break; \
- case 32: \
- CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_KV_CACHE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v2(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
- int num_kv_heads, // [num_heads]
- float scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& context_lens, // [num_seqs]
- int block_size,
- int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes,
- const std::string& kv_cache_dtype,
- float kv_scale) {
- if (kv_cache_dtype == "auto") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else if (kv_cache_dtype == "fp8") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else {
- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
- }
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor&
+ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes,
+ const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+ const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+ const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V2_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
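
Aside (editor's sketch, not part of the patch): the DISPATCH_BY_KV_CACHE_DTYPE / CALL_*_LAUNCHER_* macros above turn runtime values (KV-cache dtype string, block size, block-sparsity flag) into compile-time template arguments. A conceptual, purely illustrative C++ version of that layering (the names below are not the real macros, whose definitions live elsewhere in the repo):

#include <cstdio>
#include <stdexcept>
#include <string>

enum class KvDtype { kAuto, kFp8 };

// Stand-in for the fully templated launcher.
template <typename T, KvDtype KV_DTYPE, int BLOCK_SIZE, bool IS_BLOCK_SPARSE>
void launcher() {
  std::printf("instantiated: block=%d fp8=%d sparse=%d\n", BLOCK_SIZE,
              KV_DTYPE == KvDtype::kFp8 ? 1 : 0, IS_BLOCK_SPARSE ? 1 : 0);
}

// Runtime bool -> compile-time bool (the role of CALL_*_LAUNCHER_SPARSITY).
template <typename T, KvDtype KV_DTYPE, int BLOCK_SIZE>
void dispatch_sparsity(bool is_block_sparse) {
  if (is_block_sparse)
    launcher<T, KV_DTYPE, BLOCK_SIZE, true>();
  else
    launcher<T, KV_DTYPE, BLOCK_SIZE, false>();
}

// Runtime block size -> compile-time int (the role of CALL_*_LAUNCHER_BLOCK_SIZE).
template <typename T, KvDtype KV_DTYPE>
void dispatch_block_size(int block_size, bool is_block_sparse) {
  switch (block_size) {
    case 8:  dispatch_sparsity<T, KV_DTYPE, 8>(is_block_sparse); break;
    case 16: dispatch_sparsity<T, KV_DTYPE, 16>(is_block_sparse); break;
    case 32: dispatch_sparsity<T, KV_DTYPE, 32>(is_block_sparse); break;
    default: throw std::runtime_error("Unsupported block size");
  }
}

int main() {
  const std::string kv_cache_dtype = "auto";
  const int block_size = 16;
  const bool is_block_sparse = (/*blocksparse_vert_stride=*/8 > 1);
  if (kv_cache_dtype == "auto")
    dispatch_block_size<float, KvDtype::kAuto>(block_size, is_block_sparse);
  else
    dispatch_block_size<float, KvDtype::kFp8>(block_size, is_block_sparse);
  return 0;
}
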
diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh
index 22273c11d483e..826b0edffae67 100644
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -26,7 +27,7 @@
namespace vllm {
// Q*K^T operation.
-template<int THREAD_GROUP_SIZE, typename Vec, int N>
+template <int THREAD_GROUP_SIZE, typename Vec, int N>
inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
  using A_vec = typename FloatVec<Vec>::Type;
// Compute the parallel products for Q*K^T (treat vector lanes separately).
@@ -45,12 +46,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
return qk;
}
-template<typename T, int THREAD_GROUP_SIZE>
+template <typename T, int THREAD_GROUP_SIZE>
struct Qk_dot {
-  template<typename Vec, int N>
+  template <typename Vec, int N>
  static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
    return qk_dot_<THREAD_GROUP_SIZE>(q, k);
}
};
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh
index 31e0cee01d2e1..3cdcb95e08099 100644
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -28,8 +30,8 @@
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
- typedef __hip_bfloat162 __nv_bfloat162;
- typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
#endif
#include <stdint.h>
@@ -50,37 +52,37 @@ struct bf16_8_t {
};
// BF16 vector types for Q, K, V.
-template<>
+template <>
struct Vec<__nv_bfloat16, 1> {
using Type = __nv_bfloat16;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 2> {
using Type = __nv_bfloat162;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 4> {
using Type = bf16_4_t;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 8> {
using Type = bf16_8_t;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec<__nv_bfloat16> {
using Type = float;
};
-template<>
+template <>
struct FloatVec<__nv_bfloat162> {
using Type = float2;
};
-template<>
+template <>
struct FloatVec<bf16_4_t> {
using Type = Float4_;
};
-template<>
+template <>
struct FloatVec<bf16_8_t> {
using Type = Float8_;
};
@@ -108,9 +110,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
assert(false);
#else
#ifndef USE_ROCM
- return a + b;
+ return a + b;
#else
- return __hadd(a, b);
+ return __hadd(a, b);
#endif
#endif
}
@@ -161,7 +163,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
@@ -170,7 +172,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
#endif
}
-template<>
+template <>
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
@@ -179,12 +181,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
#endif
}
-template<>
+template <>
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
}
-template<>
+template <>
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
bf16_4_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -192,7 +194,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_4_t c;
@@ -201,7 +203,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
bf16_8_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -211,7 +213,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_8_t c;
@@ -222,26 +224,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
return c;
}
-template<>
+template <>
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
float fa = __bfloat162float(a);
float fb = __bfloat162float(b);
return fa * fb;
}
-template<>
+template <>
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
float2 fa = bf1622float2(a);
float2 fb = bf1622float2(b);
  return mul<float2, float2, float2>(fa, fb);
}
-template<>
+template <>
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
  return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
}
-template<>
+template <>
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
Float4_ fc;
  fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -249,7 +251,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float4_ fc;
@@ -258,7 +260,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
Float8_ fc;
  fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -268,7 +270,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float8_ fc;
@@ -280,7 +282,8 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
}
// Vector fused multiply-add.
-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
+ __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
@@ -288,7 +291,8 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bf
#endif
}
-inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
+ __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
@@ -379,23 +383,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(__nv_bfloat16 v) {
return __bfloat162float(v);
}
-template<>
+template <>
inline __device__ float sum(__nv_bfloat162 v) {
float2 vf = bf1622float2(v);
return vf.x + vf.y;
}
-template<>
+template <>
inline __device__ float sum(bf16_4_t v) {
return sum(v.x) + sum(v.y);
}
-template<>
+template <>
inline __device__ float sum(bf16_8_t v) {
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
}
@@ -448,4 +452,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
#endif
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh
index d3271e69cd69d..3a1815f0ed4fc 100644
--- a/csrc/attention/dtype_float16.cuh
+++ b/csrc/attention/dtype_float16.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -30,37 +32,37 @@
namespace vllm {
// FP16 vector types for Q, K, V.
-template<>
+template <>
struct Vec<uint16_t, 1> {
using Type = uint16_t;
};
-template<>
+template <>
struct Vec<uint16_t, 2> {
using Type = uint32_t;
};
-template<>
+template <>
struct Vec<uint16_t, 4> {
using Type = uint2;
};
-template<>
+template <>
struct Vec<uint16_t, 8> {
using Type = uint4;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec<uint16_t> {
using Type = float;
};
-template<>
+template <>
struct FloatVec<uint32_t> {
using Type = float2;
};
-template<>
+template <>
struct FloatVec<uint2> {
using Type = Float4_;
};
-template<>
+template <>
struct FloatVec<uint4> {
using Type = Float8_;
};
@@ -73,8 +75,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) {
return b;
#else
union {
- uint32_t u32;
- uint16_t u16[2];
+ uint32_t u32;
+ uint16_t u16[2];
} tmp;
tmp.u16[0] = a;
tmp.u16[1] = a;
@@ -130,10 +132,12 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
} tmp;
#ifndef USE_ROCM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
- asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+ asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
+ : "=r"(tmp.u32)
+ : "f"(f.y), "f"(f.x));
#else
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
#endif
#else
tmp.u16[0] = float_to_half(f.x);
@@ -201,7 +205,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
uint16_t c;
#ifndef USE_ROCM
@@ -212,7 +216,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
return c;
}
-template<>
+template <>
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
uint32_t c;
#ifndef USE_ROCM
@@ -223,12 +227,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
return c;
}
-template<>
+template <>
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
  return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
}
-template<>
+template <>
inline __device__ uint2 mul(uint2 a, uint2 b) {
uint2 c;
c.x = mul(a.x, b.x);
@@ -236,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
return c;
}
-template<>
+template <>
inline __device__ uint2 mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
uint2 c;
@@ -245,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
return c;
}
-template<>
+template <>
inline __device__ uint4 mul(uint4 a, uint4 b) {
uint4 c;
c.x = mul(a.x, b.x);
@@ -255,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
return c;
}
-template<>
+template <>
inline __device__ uint4 mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
uint4 c;
@@ -266,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
return c;
}
-template<>
+template <>
inline __device__ float mul(uint16_t a, uint16_t b) {
float fa = half_to_float(a);
float fb = half_to_float(b);
return fa * fb;
}
-template<>
+template <>
inline __device__ float2 mul(uint32_t a, uint32_t b) {
float2 fa = half2_to_float2(a);
float2 fb = half2_to_float2(b);
return mul(fa, fb);
}
-template<>
+template <>
inline __device__ float2 mul(uint16_t a, uint32_t b) {
return mul(h0_h0(a), b);
}
-template<>
+template <>
inline __device__ Float4_ mul(uint2 a, uint2 b) {
Float4_ fc;
fc.x = mul(a.x, b.x);
@@ -293,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
Float4_ fc;
@@ -302,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(uint4 a, uint4 b) {
Float8_ fc;
fc.x = mul(a.x, b.x);
@@ -312,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
Float8_ fc;
@@ -327,9 +331,13 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
uint32_t d;
#ifndef USE_ROCM
- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+ : "=r"(d)
+ : "r"(a), "r"(b), "r"(c));
#else
- asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
+ asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
+ : "=v"(d)
+ : "v"(a), "v"(b), "v"(c));
#endif
return d;
}
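The reformatted asm above is the packed half2 fused multiply-add: per 16-bit lane it computes d = a * b + c, issued as a single fma.rn.f16x2 instruction on CUDA and v_pk_fma_f16 on ROCm. The same operation can be expressed with the standard __hfma2 intrinsic; the helper below is only an illustrative equivalent of the asm path, not what the kernel actually uses:

#include <stdint.h>
#include <cuda_fp16.h>

// d.lo = a.lo * b.lo + c.lo and d.hi = a.hi * b.hi + c.hi, with the two
// packed halves carried in 32-bit registers exactly as in the kernel code.
__device__ uint32_t fma_f16x2(uint32_t a, uint32_t b, uint32_t c) {
  __half2 ha = *reinterpret_cast<__half2*>(&a);
  __half2 hb = *reinterpret_cast<__half2*>(&b);
  __half2 hc = *reinterpret_cast<__half2*>(&c);
  __half2 hd = __hfma2(ha, hb, hc);
  return *reinterpret_cast<uint32_t*>(&hd);
}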
@@ -423,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(uint16_t v) {
return half_to_float(v);
}
-template<>
+template <>
inline __device__ float sum(uint32_t v) {
float2 tmp = half2_to_float2(v);
return tmp.x + tmp.y;
}
-template<>
+template <>
inline __device__ float sum(uint2 v) {
uint32_t c = add(v.x, v.y);
return sum(c);
}
-template<>
+template <>
inline __device__ float sum(uint4 v) {
uint32_t c = add(v.x, v.y);
c = add(c, v.z);
@@ -470,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
}
// From float16 to float32.
-inline __device__ float to_float(uint16_t u) {
- return half_to_float(u);
-}
+inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
-inline __device__ float2 to_float(uint32_t u) {
- return half2_to_float2(u);
-}
+inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
inline __device__ Float4_ to_float(uint2 u) {
Float4_ tmp;
@@ -495,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) {
}
// Zero-out a variable.
-inline __device__ void zero(uint16_t& dst) {
- dst = uint16_t(0);
-}
+inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
-} // namespace vllm
+} // namespace vllm
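Taken together, these fp16 helpers exist so the attention kernels can compute Q·K dot products on packed half data: load a Vec, multiply into the matching FloatVec accumulator, then sum down to a single float. A hedged usage sketch follows; the include path, kernel, and data layout are assumptions made for illustration, the real consumer is the paged-attention kernel elsewhere in csrc/attention:

#include <stdint.h>
#include "csrc/attention/dtype_float16.cuh"  // assumed include path

// One thread handles four packed halves of Q and of K (a uint2 each).
__global__ void qk_dot_demo(const uint2* q, const uint2* k, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= n) return;
  using Acc = typename vllm::FloatVec<uint2>::Type;  // Float4_
  // The accumulator type cannot be deduced from the arguments, so it is
  // supplied explicitly as the first template argument of mul.
  Acc prod = vllm::mul<Acc, uint2, uint2>(q[i], k[i]);
  out[i] = vllm::sum(prod);  // fold the four fp32 lanes into one float
}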
diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh
index b200d2d226eb0..7c6a686db3ba9 100644
--- a/csrc/attention/dtype_float32.cuh
+++ b/csrc/attention/dtype_float32.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -38,37 +40,35 @@ struct Float8_ {
};
// FP32 vector types for Q, K, V.
-template<>
+template <>
struct Vec<float, 1> {
using Type = float;
};
-template<>
+template <>
struct Vec<float, 2> {
using Type = float2;
};
-template<>
+template <>
struct Vec<float, 4> {
using Type = float4;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec<float> {
using Type = float;
};
-template<>
+template <>
struct FloatVec<float2> {
using Type = float2;
};
-template<>
+template <>
struct FloatVec<float4> {
using Type = float4;
};
// Vector addition.
-inline __device__ float add(float a, float b) {
- return a + b;
-}
+inline __device__ float add(float a, float b) { return a + b; }
inline __device__ float2 add(float2 a, float2 b) {
float2 c;
@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ float mul(float a, float b) {
return a * b;
}
-template<>
+template <>
inline __device__ float2 mul(float2 a, float2 b) {
float2 c;
c.x = a.x * b.x;
@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
return c;
}
-template<>
+template <>
inline __device__ float2 mul(float a, float2 b) {
float2 c;
c.x = a * b.x;
@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
return c;
}
-template<>
+template <>
inline __device__ float4 mul(float4 a, float4 b) {
float4 c;
c.x = a.x * b.x;
@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
return c;
}
-template<>
+template <>
inline __device__ float4 mul(float a, float4 b) {
float4 c;
c.x = a * b.x;
@@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) {
}
// Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) {
- return a * b + c;
-}
+inline __device__ float fma(float a, float b, float c) { return a * b + c; }
inline __device__ float2 fma(float2 a, float2 b, float2 c) {
float2 d;
@@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(float v) {
return v;
}
-template<>
+template <>
inline __device__ float sum(float2 v) {
return v.x + v.y;
}
-template<>
+template <>
inline __device__ float sum(float4 v) {
return v.x + v.y + v.z + v.w;
}
-template<>
+template <>
inline __device__ float sum(Float4_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y;
}
-template<>
+template <>
inline __device__ float sum(Float8_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
}
// Vector dot product.
-inline __device__ float dot(float a, float b) {
- return a * b;
-}
+inline __device__ float dot(float a, float b) { return a * b; }
inline __device__ float dot(float2 a, float2 b) {
float2 c = mul(a, b);
@@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
}
// From float to float.
-inline __device__ void from_float(float& dst, float src) {
- dst = src;
-}
+inline __device__ void from_float(float& dst, float src) { dst = src; }
-inline __device__ void from_float(float2& dst, float2 src) {
- dst = src;
-}
+inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
-inline __device__ void from_float(float4& dst, float4 src) {
- dst = src;
-}
+inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
// From float to float.
-inline __device__ float to_float(float u) {
- return u;
-}
+inline __device__ float to_float(float u) { return u; }
-inline __device__ float2 to_float(float2 u) {
- return u;
-}
+inline __device__ float2 to_float(float2 u) { return u; }
-inline __device__ float4 to_float(float4 u) {
- return u;
-}
+inline __device__ float4 to_float(float4 u) { return u; }
-inline __device__ Float4_ to_float(Float4_ u) {
- return u;
-}
+inline __device__ Float4_ to_float(Float4_ u) { return u; }
-inline __device__ Float8_ to_float(Float8_ u) {
- return u;
-}
+inline __device__ Float8_ to_float(Float8_ u) { return u; }
// Zero-out a variable.
-inline __device__ void zero(float& dst) {
- dst = 0.f;
-}
+inline __device__ void zero(float& dst) { dst = 0.f; }
-} // namespace vllm
+} // namespace vllm
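The fp32 versions make the dot-product decomposition explicit: dot(a, b) is just sum(mul(a, b)), a lane-wise multiply followed by a horizontal add. A plain-CUDA sketch of the same shape for float4, kept deliberately free of the vLLM templates:

#include <vector_types.h>

// Lane-wise multiply, then horizontal add: the same decomposition as
// dot(float4, float4) = sum(mul(float4, float4)) in the header above.
__device__ float dot4(float4 a, float4 b) {
  float4 p;
  p.x = a.x * b.x;
  p.y = a.y * b.y;
  p.z = a.z * b.z;
  p.w = a.w * b.w;
  return p.x + p.y + p.z + p.w;
}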
diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh
index d11dee91ebe87..e714e321b0beb 100644
--- a/csrc/attention/dtype_fp8.cuh
+++ b/csrc/attention/dtype_fp8.cuh
@@ -3,33 +3,39 @@
#include "attention_generic.cuh"
#include <stdint.h>
-#ifdef ENABLE_FP8_E5M2
-#include <cuda_fp8.h>
-#endif
+#ifdef ENABLE_FP8
+ #ifndef USE_ROCM
+ #include <cuda_fp8.h>
+ #endif // USE_ROCM
+#endif // ENABLE_FP8
namespace vllm {
-#if defined(ENABLE_FP8_E5M2) || defined(ENABLE_FP8_E4M3)
-// fp8 vector types for quantization of kv cache
-template<>
+enum class Fp8KVCacheDataType {
+ kAuto = 0,
+ kFp8E4M3 = 1,
+ kFp8E5M2 = 2,
+};
+
+// fp8 vector types for quantization of kv cache
+template <>
struct Vec<uint8_t, 1> {
- using Type = uint8_t;
+ using Type = uint8_t;
};
-template<>
+template <>
struct Vec<uint8_t, 2> {
- using Type = uint16_t;
+ using Type = uint16_t;
};
-template<>
+template <>
struct Vec<uint8_t, 4> {
- using Type = uint32_t;
+ using Type = uint32_t;
};
-template<>
+template <>
struct Vec<uint8_t, 8> {
- using Type = uint2;
+ using Type = uint2;
};
-#endif // ENABLE_FP8_E5M2
-} // namespace vllm
+} // namespace vllm
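With the E5M2-only guard replaced, the header now exposes a generic Fp8KVCacheDataType selector and treats fp8 KV-cache elements as raw bytes (uint8_t carriers, packed just like the other dtypes). As a reference for how one value round-trips through e4m3, here is a sketch built on the stock cuda_fp8.h conversion intrinsics; it is illustrative only and is not the conversion routine this repo uses, which typically also applies a scaling factor during quantization:

#include <cuda_fp16.h>
#include <cuda_fp8.h>

// Quantize one fp32 value to e4m3 (a single byte, as stored in the KV cache)
// and dequantize it back to half. __NV_SATFINITE clamps out-of-range inputs
// to the largest representable e4m3 magnitude.
__device__ __half fp8_e4m3_roundtrip(float x) {
  __nv_fp8_storage_t q = __nv_cvt_float_to_fp8(x, __NV_SATFINITE, __NV_E4M3);
  __half_raw h = __nv_cvt_fp8_to_halfraw(q, __NV_E4M3);
  return __half(h);
}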
diff --git a/csrc/cache.h b/csrc/cache.h
index fa26ddb688588..064815b7403db 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -5,21 +5,20 @@
#include