diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000000000..78347f63fa793 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.356 + - name: "exact_match,flexible-extract" + value: 0.358 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 64a0f428587af..6057229ac50f3 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -1,6 +1,6 @@ Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml -Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml diff --git a/.github/mergify.yml b/.github/mergify.yml new file mode 100644 index 0000000000000..2a3dee7c662d1 --- /dev/null +++ b/.github/mergify.yml @@ -0,0 +1,57 @@ +pull_request_rules: +- name: label-documentation + description: Automatically apply documentation label + conditions: + - or: + - files~=^[^/]+\.md$ + - files~=^docs/ + actions: + label: + add: + - documentation + +- name: label-ci-build + description: Automatically apply ci/build label + conditions: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py + actions: + label: + add: + - ci/build + +- name: label-frontend + description: Automatically apply frontend label + conditions: + - files~=^vllm/entrypoints/ + actions: + label: + add: + - frontend + +- name: ping author on conflicts and add 'needs-rebase' label + conditions: + - conflict + - -closed + actions: + label: + add: + - needs-rebase + comment: + message: | + This pull request has merge conflicts that must be resolved before it can be + merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + +- name: remove 'needs-rebase' label when conflict is resolved + conditions: + - -conflict + - -closed + actions: + label: + remove: + - needs-rebase diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 0000000000000..81e7c9b050760 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,52 @@ +name: 'Close inactive issues and PRs' + +on: + schedule: + # Daily at 1:30 AM UTC + - cron: '30 1 * * *' + +jobs: + close-issues-and-pull-requests: + permissions: + issues: write + pull-requests: write + actions: write + runs-on: ubuntu-latest + steps: + - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 + with: + # Increasing this value ensures that changes to this workflow + # propagate to all issues and PRs in days rather than months + operations-per-run: 1000 + + exempt-draft-pr: true + exempt-issue-labels: 'keep-open' + exempt-pr-labels: 'keep-open' + + labels-to-add-when-unstale: 'unstale' + labels-to-remove-when-stale: 'unstale' + + days-before-issue-stale: 90 + days-before-issue-close: 30 + stale-issue-label: 'stale' + stale-issue-message: > + This issue has been automatically marked as stale because it has not + had any activity within 90 days. It will be automatically closed if no + further activity occurs within 30 days. Leave a comment if + you feel this issue should remain open. Thank you! + close-issue-message: > + This issue has been automatically closed due to inactivity. Please + feel free to reopen if you feel it is still relevant. Thank you! + + days-before-pr-stale: 90 + days-before-pr-close: 30 + stale-pr-label: 'stale' + stale-pr-message: > + This pull request has been automatically marked as stale because it + has not had any activity within 90 days. It will be automatically + closed if no further activity occurs within 30 days. Leave a comment + if you feel this pull request should remain open. Thank you! + close-pr-message: > + This pull request has been automatically closed due to inactivity. + Please feel free to reopen if you intend to continue working on it. + Thank you! diff --git a/CMakeLists.txt b/CMakeLists.txt index f3fd608894e6d..873da9795bcbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") # @@ -196,12 +196,12 @@ endif() # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. -# Configure it to place files in vllm/.deps, in order to play nicely with sccache. +# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. +# Each dependency that produces build artifacts should override its BINARY_DIR to avoid +# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/. # include(FetchContent) -get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) -file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}") -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps") +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") # @@ -229,7 +229,6 @@ set(VLLM_EXT_SRC "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" "csrc/cuda_utils_kernels.cu" - "csrc/moe_align_block_size_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/torch_bindings.cpp") @@ -286,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() # @@ -444,6 +443,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" + "csrc/moe/moe_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") set_gencode_flags_for_srcs( @@ -471,7 +471,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() endif() @@ -549,8 +549,10 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd + GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() diff --git a/Dockerfile.openvino b/Dockerfile.openvino index c89864da91180..a05ff452cd36e 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -15,11 +15,11 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi # install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace -COPY examples/ /workspace/vllm/examples -COPY benchmarks/ /workspace/vllm/benchmarks +COPY examples/ /workspace/examples +COPY benchmarks/ /workspace/benchmarks CMD ["/bin/bash"] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index ffe93f7799e1e..0a14aedd5feba 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,5 +1,6 @@ """Benchmark the latency of processing a single batch of requests.""" import argparse +import dataclasses import json import time from pathlib import Path @@ -10,44 +11,19 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): print(args) + engine_args = EngineArgs.from_cli_args(args) + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - num_scheduler_steps=args.num_scheduler_steps, - ) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams( n=args.n, @@ -126,19 +102,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--input-len', type=int, default=32) parser.add_argument('--output-len', type=int, default=128) parser.add_argument('--batch-size', type=int, default=8) @@ -155,45 +118,6 @@ def run_to_completion(profile_dir: Optional[str] = None): type=int, default=30, help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', @@ -204,83 +128,12 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') - parser.add_argument( - "--num-scheduler-steps", - type=int, - default=1, - help="Maximum number of forward steps per scheduler call.") + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index a354358e43aa3..1aac029992dbf 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -25,6 +25,7 @@ --input-length-range 128:256 """ +import dataclasses import json import random import time @@ -33,6 +34,7 @@ from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.utils import FlexibleArgumentParser try: @@ -129,12 +131,9 @@ def main(args): filtered_datasets = [(PROMPT, prompt_len, args.output_len) ] * args.num_prompts - llm = LLM(model=args.model, - tokenizer_mode='auto', - trust_remote_code=True, - enforce_eager=True, - tensor_parallel_size=args.tensor_parallel_size, - enable_prefix_caching=args.enable_prefix_caching) + engine_args = EngineArgs.from_cli_args(args) + + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) @@ -162,18 +161,11 @@ def main(args): parser = FlexibleArgumentParser( description= 'Benchmark the performance with or without automatic prefix caching.') - parser.add_argument('--model', - type=str, - default='baichuan-inc/Baichuan2-13B-Chat') parser.add_argument("--dataset-path", type=str, default=None, help="Path to the dataset.") - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='enable prefix caching') parser.add_argument('--num-prompts', type=int, default=1, @@ -190,9 +182,7 @@ def main(args): default='128:256', help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') - parser.add_argument("--seed", - type=int, - default=0, - help='Random seed for reproducibility') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 8843e3a927a01..e0c9e6a6db502 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,5 +1,6 @@ """Benchmark offline prioritization.""" import argparse +import dataclasses import json import random import time @@ -7,7 +8,8 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -62,46 +64,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - gpu_memory_utilization: float = 0.9, - download_dir: Optional[str] = None, + engine_args: EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - disable_log_stats=False, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts = [] @@ -142,16 +109,8 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.trust_remote_code, - args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, - args.enable_chunked_prefill, - args.max_num_batched_tokens, - args.gpu_memory_utilization, args.download_dir) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -173,7 +132,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -191,13 +150,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -206,81 +158,13 @@ def main(args: argparse.Namespace): type=int, default=200, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index e26706af606b0..ee41c8ea38382 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,5 +1,6 @@ """Benchmark offline inference throughput.""" import argparse +import dataclasses import json import random import time @@ -11,10 +12,9 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -67,53 +67,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts: List[str] = [] @@ -155,56 +113,11 @@ def run_vllm( async def run_vllm_async( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, ) -> float: from vllm import SamplingParams - engine_args = AsyncEngineArgs( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - worker_use_ray=False, - disable_log_requests=True, - ) async with build_async_engine_client_from_engine_args( engine_args, disable_frontend_multiprocessing) as llm: @@ -320,7 +233,16 @@ def main(args: argparse.Namespace): args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: # Synthesize a prompt with the given input length. - prompt = "hi" * (args.input_len - 1) + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for i in range(-10, 10): + prompt = "hi " * (args.input_len + i) + tokenized_prompt = tokenizer(prompt).input_ids + if len(tokenized_prompt) == args.input_len: + break + else: + raise ValueError( + f"Failed to synthesize a prompt with {args.input_len} tokens.") requests = [(prompt, args.input_len, args.output_len) for _ in range(args.num_prompts)] else: @@ -328,23 +250,17 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - run_args = [ - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.num_scheduler_steps, - args.download_dir, args.load_format, args.disable_async_output_proc - ] - if args.async_engine: - run_args.append(args.disable_frontend_multiprocessing) - elapsed_time = uvloop.run(run_vllm_async(*run_args)) + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + )) else: - elapsed_time = run_vllm(*run_args) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -356,8 +272,10 @@ def main(args: argparse.Namespace): raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len for _, prompt_len, output_len in requests) + total_output_tokens = sum(output_len for _, _, output_len in requests) print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} tokens/s") + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s") # Output JSON results if specified if args.output_json: @@ -391,13 +309,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -406,123 +317,15 @@ def main(args: argparse.Namespace): type=int, default=1000, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.") - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument( - "--num-scheduler-steps", - type=int, - default=1, - help="Maximum number of forward steps per scheduler call.") - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - "--disable-async-output-proc", - action='store_true', - default=False, - help="Disable async output processor for vLLM backend.") parser.add_argument("--async-engine", action='store_true', default=False, @@ -531,6 +334,7 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 92f6053cc6d7e..7acea6087fdfd 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -3,8 +3,8 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -16,7 +16,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") layer = RMSNorm(hidden_size).to(dtype=dtype) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2ad98b7e2656..8f538c21f7f7e 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -10,7 +10,8 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -88,22 +89,23 @@ def prepare(i: int): input_gating.copy_(gating_output[i]) def run(): - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - override_config=config, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - ) + from vllm.model_executor.layers.fused_moe import override_config + with override_config(config): + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) # JIT compilation & warmup run() @@ -166,7 +168,7 @@ class BenchmarkWorker: def __init__(self, seed: int) -> None: torch.set_default_device("cuda") - seed_everything(seed) + current_platform.seed_everything(seed) self.seed = seed def benchmark( @@ -180,7 +182,7 @@ def benchmark( use_fp8_w8a8: bool, use_int8_w8a16: bool, ) -> Tuple[Dict[str, int], float]: - seed_everything(self.seed) + current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index f6ccbcc322be2..483584dd804ef 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -5,8 +5,9 @@ import torch from vllm import _custom_ops as ops +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - create_kv_caches_with_random, is_hip, seed_everything) + create_kv_caches_with_random) NUM_BLOCKS = 1024 * 1024 PARTITION_SIZE = 512 @@ -28,7 +29,7 @@ def main( device: str = "cuda", kv_cache_dtype: Optional[str] = None, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) scale = float(1.0 / (head_size**0.5)) query = torch.empty(num_seqs, @@ -77,7 +78,7 @@ def main( # Prepare for the paged attention kernel. output = torch.empty_like(query) if version == "v2": - if is_hip() and not args.custom_paged_attn: + if current_platform.is_rocm() and not args.custom_paged_attn: global PARTITION_SIZE PARTITION_SIZE = 1024 num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 743a5744e8614..1d62483448946 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -3,8 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -17,7 +17,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") x = torch.randn(num_tokens, hidden_size, dtype=dtype) diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 784b1cf9844e4..250d505168d09 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -6,7 +6,8 @@ from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 3866ba58a6e11..81ac34d55b03f 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of # dependencies that are not necessary and may not be installed. if (GPU_LANGUAGE STREQUAL "CUDA") - if ("${CUDA_CUDA_LIB}" STREQUAL "") - set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}") - endif() - target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB} - ${CUDA_LIBRARIES}) + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) else() target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) endif() diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 2dff7de3fd3fd..967f64a0f6b4c 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -131,6 +131,48 @@ void gelu_tanh_and_mul(torch::Tensor& out, // [..., d] namespace vllm { +template +__device__ __forceinline__ T fatrelu_kernel(const T& x, const float threshold) { + const float f = (float)x; + return (T)(f > threshold ? f : 0.0f); +} + +template +__global__ void act_and_mul_kernel_with_param( + scalar_t* __restrict__ out, const scalar_t* __restrict__ input, const int d, + const float param) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); + const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); + out[token_idx * d + idx] = ACT_FN(x, param) * y; + } +} + +} // namespace vllm + +#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "act_and_mul_kernel_with_param", [&] { \ + vllm::act_and_mul_kernel_with_param> \ + <<>>(out.data_ptr(), \ + input.data_ptr(), d, \ + PARAM); \ + }); + +void fatrelu_and_mul(torch::Tensor& out, // [..., d], + torch::Tensor& input, // [..., 2 * d] + double threshold) { + LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold); +} +namespace vllm { + // Element-wise activation kernel template. template __global__ void activation_kernel( diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu similarity index 59% rename from csrc/moe_align_block_size_kernels.cu rename to csrc/moe/moe_align_sum_kernels.cu index 1f8d75da83bb8..fff7ce34c838a 100644 --- a/csrc/moe_align_block_size_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -1,15 +1,17 @@ #include #include +#include #include #include -#include "cuda_compat.h" -#include "dispatch_utils.h" +#include "../cuda_compat.h" +#include "../dispatch_utils.h" #define CEILDIV(x, y) (((x) + (y) - 1) / (y)) namespace vllm { +namespace moe { namespace { __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, @@ -32,10 +34,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, extern __shared__ int32_t shared_mem[]; int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (num_experts + 1, num_experts) + shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) int32_t* cumsum = - shared_mem + (num_experts + 1) * - num_experts; // 1d tensor with shape (num_experts + 1) + shared_mem + + (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -53,10 +55,12 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, __syncthreads(); // For each expert we accumulate the token counts from the different threads. - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += - tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; + if (threadIdx.x < num_experts) { + tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; + for (int i = 1; i <= blockDim.x; ++i) { + tokens_cnts[index(num_experts, i, threadIdx.x)] += + tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; + } } __syncthreads(); @@ -79,9 +83,11 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, * For each expert, each thread processes the tokens of the corresponding * blocks and stores the corresponding expert_id for each block. */ - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; + if (threadIdx.x < num_experts) { + for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; + i += block_size) { + expert_ids[i / block_size] = threadIdx.x; + } } /** @@ -106,6 +112,24 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)]; } } + +template +__global__ void moe_sum_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., topk, d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + scalar_t x = 0.0; +#pragma unroll + for (int k = 0; k < TOPK; ++k) { + x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); + } + out[token_idx * d + idx] = x; + } +} + +} // namespace moe } // namespace vllm void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, @@ -117,18 +141,62 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors + const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); const int32_t shared_mem = - ((num_experts + 1) * num_experts + (num_experts + 1)) * + ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); // set dynamic shared mem - auto kernel = vllm::moe_align_block_size_kernel; + auto kernel = vllm::moe::moe_align_block_size_kernel; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( (void*)kernel, shared_mem)); - kernel<<<1, num_experts, shared_mem, stream>>>( + kernel<<<1, num_thread, shared_mem, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, topk_ids.numel()); }); } + +void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] + torch::Tensor& output) // [num_tokens, hidden_size] +{ + const int hidden_size = input.size(-1); + const int num_tokens = output.numel() / hidden_size; + const int topk = input.size(1); + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(output)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (topk) { + case 2: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + case 3: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + case 4: + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "moe_sum_kernel", [&] { + vllm::moe::moe_sum_kernel<<>>( + output.data_ptr(), input.data_ptr(), + hidden_size); + }); + break; + + default: + at::sum_out(output, input, 1); + break; + } +} diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index a251730aa765a..596cc0aa6c855 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -5,3 +5,10 @@ void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices, torch::Tensor& token_expert_indices, torch::Tensor& gating_output); + +void moe_sum(torch::Tensor& input, torch::Tensor& output); + +void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, + int64_t block_size, torch::Tensor sorted_token_ids, + torch::Tensor experts_ids, + torch::Tensor num_tokens_post_pad); diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index 019c6cedd3d80..f3a558c14ab93 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -8,6 +8,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { "token_expert_indices, Tensor gating_output) -> ()"); m.impl("topk_softmax", torch::kCUDA, &topk_softmax); + // Calculate the result of moe by summing up the partial results + // from all selected experts. + m.def("moe_sum(Tensor! input, Tensor output) -> ()"); + m.impl("moe_sum", torch::kCUDA, &moe_sum); + + // Aligning the number of tokens to be processed by each expert such + // that it is divisible by the block size. + m.def( + "moe_align_block_size(Tensor topk_ids, int num_experts," + " int block_size, Tensor! sorted_token_ids," + " Tensor! experts_ids," + " Tensor! num_tokens_post_pad) -> ()"); + m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); + #ifndef USE_ROCM m.def( "marlin_gemm_moe(Tensor! a, Tensor! b_q_weights, Tensor! sorted_ids, " diff --git a/csrc/ops.h b/csrc/ops.h index 40ceefc76b088..f98c817fda646 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -5,6 +5,30 @@ #include "core/scalar_type.hpp" +#include + +torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { + // Ensure tensor is on CUDA + if (!tensor.is_cuda()) { + throw std::runtime_error("Tensor must be on CUDA device"); + } + + // Get the raw data pointer + void* data_ptr = tensor.data_ptr(); + + // Get tensor sizes and strides + std::vector sizes = tensor.sizes().vec(); + std::vector strides = tensor.strides().vec(); + + // Get tensor options (dtype, device) + auto options = tensor.options(); + + // Create a new tensor from the raw data pointer + auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options); + + return new_tensor; +} + void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, @@ -59,6 +83,9 @@ void gelu_and_mul(torch::Tensor& out, torch::Tensor& input); void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); +void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input, + double threshold); + void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); @@ -153,11 +180,6 @@ void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, c10::optional const& scale_ub); -void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, - int64_t block_size, torch::Tensor sorted_token_ids, - torch::Tensor experts_ids, - torch::Tensor num_tokens_post_pad); - void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C, diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 1657f7d0b16e8..97a969cf5e3e0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -137,9 +137,11 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, return; } - // Turing - TORCH_CHECK(version_num >= 75); - cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + if (version_num >= 75) { + // Turing + cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + return; + } #endif TORCH_CHECK_NOT_IMPLEMENTED( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 6f511c0f6430f..6da7c1fa30257 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -18,6 +18,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); + ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); + // Attention ops // Compute the attention between an input query and the cached // keys/values using PagedAttention. @@ -64,6 +67,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul); + // FATReLU implementation. + ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()"); + ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul); + // GELU implementation used in GPT-2. ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_new", torch::kCUDA, &gelu_new); @@ -352,15 +359,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, &dynamic_per_token_scaled_fp8_quant); - // Aligning the number of tokens to be processed by each expert such - // that it is divisible by the block size. - ops.def( - "moe_align_block_size(Tensor topk_ids, int num_experts," - " int block_size, Tensor! sorted_token_ids," - " Tensor! experts_ids," - " Tensor! num_tokens_post_pad) -> ()"); - ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size); - // Compute int8 quantized tensor for given scaling factor. ops.def( "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index f544325a0776c..d12aeebbbc184 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,7 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel (``-tp = N``) +- Quantization (``INT8 W8A8, AWQ``) + +.. note:: + FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. Table of contents: @@ -141,5 +147,20 @@ Performance tips - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. +CPU Backend Considerations +-------------------------- + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. + +- On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the `topology `_. For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. + + * Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: + + .. code-block:: console + + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + * Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. Anyscale Ray project provides the feature on LLM `serving `_. Here is the example to setup a scalable LLM serving with `Ray Serve `_. \ No newline at end of file diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index cfd2dcb3bd5d3..91978065faf42 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -107,15 +107,15 @@ If GPU/CPU communication cannot be established, you can use the following Python If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: -.. code-block:: shell +.. code-block:: console - NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py + $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: -.. code-block:: shell +.. code-block:: console - NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py + $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py If the script runs successfully, you should see the message ``sanity check is successful!``. diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 99c695ac4ddb1..a706b285edede 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -7,14 +7,14 @@ Installation vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. Requirements -=========================== +============ * OS: Linux -* Python: 3.8 -- 3.12 +* Python: 3.8 - 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions -=========================== +========================= You can install vLLM using pip: @@ -51,9 +51,9 @@ You can install vLLM using pip: .. _install-the-latest-code: Install the latest code -========================= +======================= -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on x86 platform with cuda 12 for every commit since v0.5.3. You can download and install the latest one with the following command: +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on a x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: .. code-block:: console @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 abi (see `PEP 425 `_ for more details about abi), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. +Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Another way to access the latest code is to use the docker images: @@ -77,17 +77,17 @@ Another way to access the latest code is to use the docker images: These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. -Latest code can contain bugs and may not be stable. Please use it with caution. +The latest code can contain bugs and may not be stable. Please use it with caution. .. _build_from_source: Build from source -================== +================= .. _python-only-build: Python-only build (without compilation) ----------------------------------------- +--------------------------------------- If you only need to change Python code, you can simply build vLLM without compilation. @@ -116,28 +116,28 @@ The script will: Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. -Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev``(or ``-q`` for short) flag: +Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: .. code-block:: console $ python python_only_dev.py --quit-dev -The script with ``--quit-dev`` flag will: +The ``--quit-dev`` flag will: * Remove the symbolic link from the current directory to the vLLM package. * Restore the original vLLM package from the backup. -If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to start `all above <#python-only-build>`_ over again. +If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again. .. note:: There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. - It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. Full build (with compilation) ---------------------------------- +----------------------------- -If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: .. code-block:: console @@ -153,7 +153,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T Use an existing PyTorch installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: * Building vLLM with PyTorch nightly or a custom PyTorch build. @@ -171,7 +171,7 @@ To build vLLM using an existing PyTorch installation: Troubleshooting -~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~ To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: @@ -207,7 +207,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build ----------------------- +-------------------- vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 80b19ac672936..f0e6cddf09ef7 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -1,38 +1,50 @@ .. _quickstart: +========== Quickstart ========== -This guide shows how to use vLLM to: +This guide will help you quickly get started with vLLM to: -* run offline batched inference on a dataset; -* build an API server for a large language model; -* start an OpenAI-compatible API server. +* :ref:`Run offline batched inference ` +* :ref:`Run OpenAI-compatible inference ` -Be sure to complete the :ref:`installation instructions ` before continuing with this guide. +Prerequisites +-------------- +- OS: Linux +- Python: 3.8 - 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -.. note:: +Installation +-------------- + +You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. + +.. code-block:: console - By default, vLLM downloads model from `HuggingFace `_. If you would like to use models from `ModelScope `_ in the following examples, please set the environment variable: + $ conda create -n myenv python=3.10 -y + $ conda activate myenv + $ pip install vllm - .. code-block:: shell +Please refer to the :ref:`installation documentation ` for more details on installing vLLM. - export VLLM_USE_MODELSCOPE=True +.. _offline_batched_inference: Offline Batched Inference ------------------------- -We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__. + +The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: -Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM. -The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine. -The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process. +- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. .. code-block:: python from vllm import LLM, SamplingParams -Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. +The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. .. code-block:: python @@ -44,46 +56,46 @@ Define the list of input prompts and the sampling parameters for generation. The ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. +The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. .. code-block:: python llm = LLM(model="facebook/opt-125m") -Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. +.. note:: + + By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. + +Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. .. code-block:: python outputs = llm.generate(prompts, sampling_params) - # Print the outputs. for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -The code example can also be found in `examples/offline_inference.py `_. +.. _openai_compatible_server: OpenAI-Compatible Server ------------------------ vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models `_, `create chat completion `_, and `create completion `_ endpoints. We are actively adding support for more endpoints. +By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. -Start the server: +Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: .. code-block:: console - $ vllm serve facebook/opt-125m + $ vllm serve Qwen/Qwen2.5-1.5B-Instruct -By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument: - -.. code-block:: console +.. note:: - $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja + By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. -This server can be queried in the same format as OpenAI API. For example, list the models: +This server can be queried in the same format as OpenAI API. For example, to list the models: .. code-block:: console @@ -91,17 +103,17 @@ This server can be queried in the same format as OpenAI API. For example, list t You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. -Using OpenAI Completions API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OpenAI Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Query the model with input prompts: +Once your server is started, you can query the model with input prompts: .. code-block:: console $ curl http://localhost:8000/v1/completions \ $ -H "Content-Type: application/json" \ $ -d '{ - $ "model": "facebook/opt-125m", + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", $ "prompt": "San Francisco is a", $ "max_tokens": 7, $ "temperature": 0 @@ -120,36 +132,32 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="facebook/opt-125m", + completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", prompt="San Francisco is a") print("Completion result:", completion) -For a more detailed client example, refer to `examples/openai_completion_client.py `_. - -Using OpenAI Chat API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A more detailed client example can be found `here `__. -The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +OpenAI Chat API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~ -Querying the model using OpenAI Chat API: +vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. -You can use the `create chat completion `_ endpoint to communicate with the model in a chat-like interface: +You can use the `create chat completion `_ endpoint to interact with the model: .. code-block:: console $ curl http://localhost:8000/v1/chat/completions \ $ -H "Content-Type: application/json" \ $ -d '{ - $ "model": "facebook/opt-125m", + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", $ "messages": [ $ {"role": "system", "content": "You are a helpful assistant."}, $ {"role": "user", "content": "Who won the world series in 2020?"} $ ] $ }' -Python Client Example: - -Using the `openai` python package, you can also communicate with the model in a chat-like manner: +Alternatively, you can use the `openai` python package: .. code-block:: python @@ -164,12 +172,10 @@ Using the `openai` python package, you can also communicate with the model in a ) chat_response = client.chat.completions.create( - model="facebook/opt-125m", + model="Qwen/Qwen2.5-1.5B-Instruct", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, ] ) print("Chat response:", chat_response) - -For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation. diff --git a/docs/source/index.rst b/docs/source/index.rst index d20e46b4a3656..c328c049b430c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -80,6 +80,7 @@ Documentation serving/openai_compatible_server serving/deploying_with_docker serving/deploying_with_k8s + serving/deploying_with_nginx serving/distributed_serving serving/metrics serving/env_vars diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index ae09259c0756c..c6d88cc38e99b 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -133,7 +133,9 @@ If you are running api server with :code:`vllm serve `, you can wrap the e from vllm import ModelRegistry from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') + + if __name__ == '__main__': + import runpy + runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') Save the above code in a file and run it with :code:`python your_file.py `. diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 62ab8c067f5d0..ff893b613f150 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -3,10 +3,47 @@ Supported Models ================ -vLLM supports a variety of generative Transformer models in `HuggingFace (HF) Transformers `_. -The following is the list of model architectures that are currently supported by vLLM. +vLLM supports a variety of generative and embedding models from `HuggingFace (HF) Transformers `_. +This page lists the model architectures that are currently supported by vLLM. Alongside each architecture, we include some popular models that use it. +For other models, you can check the :code:`config.json` file inside the model repository. +If the :code:`"architectures"` field contains a model architecture listed below, then it should be supported in theory. + +.. tip:: + The easiest way to check if your model is really supported at runtime is to run the program below: + + .. code-block:: python + + from vllm import LLM + + llm = LLM(model=...) # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + + If vLLM successfully generates text, it indicates that your model is supported. + +Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` +for instructions on how to implement your model in vLLM. +Alternatively, you can `open an issue on GitHub `_ to request vLLM support. + +.. note:: + To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: + + .. code-block:: shell + + $ export VLLM_USE_MODELSCOPE=True + + And use with :code:`trust_remote_code=True`. + + .. code-block:: python + + from vllm import LLM + + llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model + output = llm.generate("Hello, my name is") + print(output) + Text-only Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -144,7 +181,7 @@ Text Generation - ✅︎ * - :code:`JAISLMHeadModel` - Jais - - :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc. + - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. - - ✅︎ * - :code:`JambaForCausalLM` @@ -324,6 +361,28 @@ Reward Modeling .. note:: As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. +Classification +--------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForSequenceClassification` + - Qwen2-based + - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. + - + - ✅︎ + +.. note:: + As an interim measure, these models are supported via Embeddings API. It will be supported via Classification API in the future (no reference APIs exist now). + + Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -334,6 +393,14 @@ The following modalities are supported depending on the model: - **V**\ ideo - **A**\ udio +Any combination of modalities joined by :code:`+` are supported. + +- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by :code:`/` are mutually exclusive. + +- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + .. _supported_vlms: Text Generation @@ -376,7 +443,7 @@ Text Generation * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` - - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. - - ✅︎ * - :code:`LlavaForConditionalGeneration` @@ -417,7 +484,7 @@ Text Generation - * - :code:`MolmoForCausalLM` - Molmo - - Image + - T + I - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. - - ✅︎ @@ -451,6 +518,12 @@ Text Generation - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - - ✅︎ + * - :code:`Qwen2AudioForConditionalGeneration` + - Qwen2-Audio + - T + A\ :sup:`+` + - :code:`Qwen/Qwen2-Audio-7B-Instruct` + - + - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`+` @@ -484,6 +557,12 @@ Multimodal Embedding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - :code:`royokong/e5-v` + - + - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based - T + I @@ -495,44 +574,6 @@ Multimodal Embedding Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. ----- - -If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. -Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` -for instructions on how to implement support for your model. -Alternatively, you can raise an issue on our `GitHub `_ project. - -.. tip:: - The easiest way to check if your model is supported is to run the program below: - - .. code-block:: python - - from vllm import LLM - - llm = LLM(model=...) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) - - If vLLM successfully generates text, it indicates that your model is supported. - -.. tip:: - To use models from `ModelScope `_ instead of HuggingFace Hub, set an environment variable: - - .. code-block:: shell - - $ export VLLM_USE_MODELSCOPE=True - - And use with :code:`trust_remote_code=True`. - - .. code-block:: python - - from vllm import LLM - - llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model - output = llm.generate("Hello, my name is") - print(output) - - Model Support Policy ===================== diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index a7b55d1c0c1ff..a47902ab4fc9d 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -247,9 +247,9 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: - .. code-block:: shell + .. code-block:: console - export VLLM_IMAGE_FETCH_TIMEOUT= + $ export VLLM_IMAGE_FETCH_TIMEOUT= .. note:: There is no need to format the prompt in the API request since it will be handled by the server. diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst new file mode 100644 index 0000000000000..b5dff02b6bae6 --- /dev/null +++ b/docs/source/serving/deploying_with_nginx.rst @@ -0,0 +1,142 @@ +.. _nginxloadbalancer: + +Deploying with Nginx Loadbalancer +================================= + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +#. :ref:`Build Nginx Container ` +#. :ref:`Create Simple Nginx Config file ` +#. :ref:`Build vLLM Container ` +#. :ref:`Create Docker Network ` +#. :ref:`Launch vLLM Containers ` +#. :ref:`Launch Nginx ` +#. :ref:`Verify That vLLM Servers Are Ready ` + +.. _nginxloadbalancer_nginx_build: + +Build Nginx Container +--------------------- + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. + +.. code-block:: console + + export vllm_root=`pwd` + +Create a file named ``Dockerfile.nginx``: + +.. code-block:: console + + FROM nginx:latest + RUN rm /etc/nginx/conf.d/default.conf + EXPOSE 80 + CMD ["nginx", "-g", "daemon off;"] + +Build the container: + +.. code-block:: console + + docker build . -f Dockerfile.nginx --tag nginx-lb + +.. _nginxloadbalancer_nginx_conf: + +Create Simple Nginx Config file +------------------------------- + +Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. + +.. code-block:: console + + upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; + } + server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + +.. _nginxloadbalancer_nginx_vllm_container: + +Build vLLM Container +-------------------- + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm + + +If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy + +.. _nginxloadbalancer_nginx_docker_network: + +Create Docker Network +--------------------- + +.. code-block:: console + + docker network create vllm_nginx + + +.. _nginxloadbalancer_nginx_launch_container: + +Launch vLLM Containers +---------------------- + +Notes: + +* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. +* If you don't have an existing HuggingFace cache you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again. +* The below example assumes GPU backend used. If you are using CPU backend, remove ``--gpus all``, add ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. +* Adjust the model name that you want to use in your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. + +.. code-block:: console + + mkdir -p ~/.cache/huggingface/hub/ + hf_cache_dir=~/.cache/huggingface/ + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf + +.. note:: + If you are behind proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. + +.. _nginxloadbalancer_nginx_launch_nginx: + +Launch Nginx +------------ + +.. code-block:: console + + docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest + +.. _nginxloadbalancer_nginx_verify_nginx: + +Verify That vLLM Servers Are Ready +---------------------------------- + +.. code-block:: console + + docker logs vllm0 | grep Uvicorn + docker logs vllm1 | grep Uvicorn + +Both outputs should look like this: + +.. code-block:: console + + INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index cc8e539a8a6d3..413c87ab28755 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -103,6 +103,23 @@ vllm serve --chat-template ./path-to-chat-template.jinja vLLM community provides a set of chat templates for popular models. You can find them in the examples directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +both a `type` and a `text` field. An example is provided below: +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} + ] +) +``` +Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which +format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify +between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match +this, unless explicitly specified. + + ## Command line arguments for the server ```{argparse} diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst index a44696507fb9a..96a93db94871b 100644 --- a/docs/source/serving/tensorizer.rst +++ b/docs/source/serving/tensorizer.rst @@ -9,4 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to `CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the `vLLM example script `_. \ No newline at end of file +the `vLLM example script `_. + +.. note:: + Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/examples/florence2_inference.py b/examples/florence2_inference.py new file mode 100644 index 0000000000000..b58ac2e1f7ed4 --- /dev/null +++ b/examples/florence2_inference.py @@ -0,0 +1,44 @@ +''' +Demonstrate prompting of text-to-text +encoder/decoder models, specifically Florence-2 +''' +# TODO(Isotr0py): +# Move to offline_inference_vision_language.py after porting vision backbone +from vllm import LLM, SamplingParams + +dtype = "float" + +# Create a Florence-2 encoder/decoder model instance +llm = LLM( + model="microsoft/Florence-2-base", + tokenizer="facebook/bart-base", + dtype=dtype, + trust_remote_code=True, +) + +prompts = [ + "", "", "", + "", "", "", + "", "", "" +] +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, +) + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Encoder prompt: {encoder_prompt!r}, " + f"Decoder prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 1c6ac06123bbb..37ec667d96a77 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -12,14 +12,15 @@ from vllm.utils import FlexibleArgumentParser audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] -question_per_audio_count = [ - "What is recited in the audio?", - "What sport and what nursery rhyme are referenced?" -] +question_per_audio_count = { + 0: "What is 1+1?", + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" +} # Ultravox 0.3 -def run_ultravox(question, audio_count): +def run_ultravox(question: str, audio_count: int): model_name = "fixie-ai/ultravox-v0_3" tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -42,9 +43,29 @@ def run_ultravox(question, audio_count): return llm, prompt, stop_token_ids -model_example_map = { - "ultravox": run_ultravox, -} +# Qwen2-Audio +def run_qwen2_audio(question: str, audio_count: int): + model_name = "Qwen/Qwen2-Audio-7B-Instruct" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + audio_in_prompt = "".join([ + f"Audio {idx+1}: " + f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + +model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio} def main(args): @@ -54,7 +75,7 @@ def main(args): audio_count = args.num_audios llm, prompt, stop_token_ids = model_example_map[model]( - question_per_audio_count[audio_count - 1], audio_count) + question_per_audio_count[audio_count], audio_count) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. @@ -62,16 +83,17 @@ def main(args): max_tokens=64, stop_token_ids=stop_token_ids) - assert args.num_prompts > 0 - inputs = { - "prompt": prompt, - "multi_modal_data": { + mm_data = {} + if audio_count > 0: + mm_data = { "audio": [ asset.audio_and_sample_rate for asset in audio_assets[:audio_count] ] - }, - } + } + + assert args.num_prompts > 0 + inputs = {"prompt": prompt, "multi_modal_data": mm_data} if args.num_prompts > 1: # Batch inference inputs = [inputs] * args.num_prompts @@ -100,7 +122,7 @@ def main(args): parser.add_argument("--num-audios", type=int, default=1, - choices=[1, 2], + choices=[0, 1, 2], help="Number of audio items per prompt.") args = parser.parse_args() diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 06b424abd50b5..83d2548a506e4 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -1,6 +1,6 @@ """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. @@ -267,6 +267,11 @@ def run_qwen2_vl(question: str, modality: str): model=model_name, max_model_len=8192, max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, ) prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" @@ -450,7 +455,7 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models') + 'vision language models for text generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index cfedd145a015d..e1732d045f949 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -1,22 +1,170 @@ +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args + +from PIL.Image import Image + from vllm import LLM -from vllm.assets.image import ImageAsset - -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") -prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501 - -# Create an LLM. -llm = LLM( - model="TIGER-Lab/VLM2Vec-Full", - task="embedding", - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=2, - mm_processor_kwargs={"num_crops": 16}, -) - -# Generate embedding. The output is a list of EmbeddingRequestOutputs. -outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}}) - -# Print the outputs. -for output in outputs: - print(output.outputs.embedding) # list of 3072 floats +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser + + +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + image: Optional[Image] + + +def run_e5_v(query: Query): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if query["modality"] == "text": + text = query["text"] + prompt = llama3_template.format( + f"{text}\nSummary above sentence in one word: ") + image = None + elif query["modality"] == "image": + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="royokong/e5-v", + task="embedding", + max_model_len=4096, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def run_vlm2vec(query: Query): + if query["modality"] == "text": + text = query["text"] + prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 + image = None + elif query["modality"] == "image": + prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501 + image = query["image"] + elif query["modality"] == "text+image": + text = query["text"] + prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="TIGER-Lab/VLM2Vec-Full", + task="embedding", + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def get_query(modality: QueryModality): + if modality == "text": + return TextQuery(modality="text", text="A dog sitting in the grass") + + if modality == "image": + return ImageQuery( + modality="image", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + ), + ) + + if modality == "text+image": + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + ), + ) + + msg = f"Modality {modality} is not supported." + raise ValueError(msg) + + +def run_encode(model: str, modality: QueryModality): + query = get_query(modality) + req_data = model_example_map[model](query) + + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.encode({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) + + for output in outputs: + print(output.outputs.embedding) + + +def main(args: Namespace): + run_encode(args.model_name, args.modality) + + +model_example_map = { + "e5_v": run_e5_v, + "vlm2vec": run_vlm2vec, +} + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--model-name', + '-m', + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help='The name of the embedding model.') + parser.add_argument('--modality', + type=str, + default="image", + choices=get_args(QueryModality), + help='Modality of the input.') + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 69f590fb7950d..e28514bf403f7 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -1,7 +1,7 @@ """ This example shows how to use vLLM for running offline inference with -multi-image input on vision language models, using the chat template defined -by the model. +multi-image input on vision language models for text generation, +using the chat template defined by the model. """ from argparse import Namespace from typing import List, NamedTuple, Optional @@ -334,7 +334,8 @@ def main(args: Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input') + 'vision language models that support multi-image input for text ' + 'generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/format.sh b/format.sh index 1ac028d00e3a4..be6ee0ce46dcb 100755 --- a/format.sh +++ b/format.sh @@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")" ROOT="$(git rev-parse --show-toplevel)" builtin cd "$ROOT" || exit 1 +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" + exit 1 + fi +} + +check_command yapf +check_command ruff +check_command mypy +check_command codespell +check_command isort +check_command clang-format + YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') @@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { if [[ $2 != $3 ]]; then - echo "Wrong $1 version installed: $3 is required, not $2." + echo "❓❓Wrong $1 version installed: $3 is required, not $2." exit 1 fi } @@ -281,10 +295,12 @@ tools/actionlint.sh -color echo 'vLLM actionlint: Done' if ! git diff --quiet &>/dev/null; then - echo 'Reformatted files. Please review and stage the changes.' - echo 'Changes not staged for commit:' - echo + echo + echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" git --no-pager diff --name-only + echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." exit 1 +else + echo "✨🎉 Format check passed! Congratulations! 🎉✨" fi diff --git a/pyproject.toml b/pyproject.toml index 11c78e2406b80..83a941cca2c62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.4.0", + "torch == 2.5.0", "wheel", "jinja2", ] diff --git a/requirements-build.txt b/requirements-build.txt index 6144a56da8c47..ea2b688bb3108 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.4.0 +torch==2.5.0 wheel jinja2 diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 3b3c2f876919e..92fa303d687a2 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py # for pynvml package -torch == 2.4.0 +torch == 2.5.0 # These must be updated alongside torch -torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0 +torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0 diff --git a/requirements-openvino.txt b/requirements-openvino.txt index ac54cf0c3288f..7ad0d1e7f704b 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,7 +1,7 @@ # Common dependencies -r requirements-common.txt -torch == 2.4.0 # should be aligned with "common" vLLM torch version +torch == 2.5.0 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version diff --git a/setup.py b/setup.py index d1f4b7f1c1119..8abeb0ba739db 100644 --- a/setup.py +++ b/setup.py @@ -157,6 +157,14 @@ def configure(self, ext: CMakeExtension) -> None: # on subsequent calls to python. cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] + # Override the base directory for FetchContent downloads to $ROOT/.deps + # This allows sharing dependencies between profiles, + # and plays more nicely with sccache. + # To override this, set the FETCHCONTENT_BASE_DIR environment variable. + fc_base_dir = os.path.join(ROOT_DIR, ".deps") + fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir) + cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)] + # # Setup parallelism and build tool # diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0fe88e792520a..79647589d5204 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -11,7 +11,7 @@ import pytest from vllm import LLM -from vllm.utils import is_hip +from vllm.platforms import current_platform from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..models.utils import check_outputs_equal @@ -19,7 +19,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") @@ -51,7 +51,7 @@ def test_models( enforce_eager: bool, ) -> None: - if backend == "FLASHINFER" and is_hip(): + if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index c3e3835aff0af..51aec8c873d12 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -16,7 +16,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index a5df5639cf948..d7f36a7812802 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -2,5 +2,5 @@ def test_cpu_offload(): - compare_two_settings("meta-llama/Llama-2-7b-hf", [], - ["--cpu-offload-gb", "4"]) + compare_two_settings("meta-llama/Llama-3.2-1B", [], + ["--cpu-offload-gb", "1"]) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b6ec7413978f4..6aa27b24b4a6e 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -13,12 +13,11 @@ @pytest.mark.parametrize( "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", [ - ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", - True), + ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True), ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", ["--quantization", "compressed-tensors" ], 1, 1, "FLASH_ATTN", "generate", True), - ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True), + ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True), # TODO: add multi-modality test for llava ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) ]) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index c69343b51ae02..64fc08e80de3b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -5,7 +5,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.compilation.levels import CompilationLevel -from vllm.utils import is_hip +from vllm.platforms import current_platform TEST_MODELS = [ ("facebook/opt-125m", {}), @@ -55,7 +55,7 @@ "quantization": "marlin" })) -if not is_hip() and is_quant_method_supported("awq"): +if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" })) diff --git a/tests/conftest.py b/tests/conftest.py index 4c9180415da32..2fce2d772c6ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -32,9 +32,10 @@ to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput +from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity, is_cpu) + identity) logger = init_logger(__name__) @@ -42,10 +43,12 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] -PromptAudioInput = Union[List[Tuple[np.ndarray, int]], - List[List[Tuple[np.ndarray, int]]]] -PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]] +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] def _read_prompts(filename: str) -> List[str]: @@ -229,19 +232,22 @@ def video_assets() -> _VideoAssets: return VIDEO_ASSETS -_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature) +_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) class HfRunner: - def wrap_device(self, input: _T, device: Optional[str] = None) -> _T: + def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: if device is None: - return self.wrap_device(input, "cpu" if is_cpu() else "cuda") + device = "cpu" if current_platform.is_cpu() else "cuda" + + if isinstance(x, dict): + return {k: self.wrap_device(v, device) for k, v in x.items()} - if hasattr(input, "device") and input.device.type == device: - return input + if hasattr(x, "device") and x.device.type == device: + return x - return input.to(device) + return x.to(device) def __init__( self, @@ -249,7 +255,9 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, + is_embedding_model: bool = False, is_sentence_transformer: bool = False, + skip_tokenizer_init: bool = False, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding] = identity, @@ -277,11 +285,12 @@ def __init__( **model_kwargs, )) - self.tokenizer = AutoTokenizer.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ) + if not skip_tokenizer_init: + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) # don't put this import at the top level # it will call torch.cuda.device_count() @@ -291,6 +300,8 @@ def __init__( torch_dtype=torch_dtype, trust_remote_code=True, ) + if skip_tokenizer_init: + self.tokenizer = self.processor.tokenizer self.postprocess_inputs = postprocess_inputs @@ -316,12 +327,12 @@ def get_inputs( "text": prompt, "return_tensors": "pt", } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - if videos is not None and videos[i] is not None: - processor_kwargs["videos"] = videos[i] - if audios is not None and audios[i] is not None: - audio, sr = audios[i] + if images is not None and (image := images[i]) is not None: + processor_kwargs["images"] = image + if videos is not None and (video := videos[i]) is not None: + processor_kwargs["videos"] = video + if audios is not None and (audio_tuple := audios[i]) is not None: + audio, sr = audio_tuple processor_kwargs["audio"] = audio processor_kwargs["sampling_rate"] = sr @@ -332,11 +343,22 @@ def get_inputs( return all_inputs + def classify(self, prompts: List[str]) -> List[str]: + # output is final logits + all_inputs = self.get_inputs(prompts) + outputs = [] + for inputs in all_inputs: + output = self.model(**self.wrap_device(inputs)) + logits = output.logits.softmax(dim=-1)[0].tolist() + outputs.append(logits) + + return outputs + def generate( self, prompts: List[str], images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: @@ -366,7 +388,7 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str]]: @@ -407,7 +429,7 @@ def generate_greedy_logprobs( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[List[torch.Tensor]]: @@ -486,7 +508,7 @@ def generate_greedy_logprobs_limit( num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: all_inputs = self.get_inputs(prompts, @@ -531,6 +553,7 @@ def generate_encoder_decoder_greedy_logprobs_limit( encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, + images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: ''' @@ -541,11 +564,17 @@ def generate_encoder_decoder_greedy_logprobs_limit( all_output_ids: List[List[int]] = [] all_output_strs: List[str] = [] - for (encoder_prompt, - decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts): + for i, (encoder_prompt, decoder_prompt) in enumerate( + to_enc_dec_tuple_list(encoder_decoder_prompts)): + processor_kwargs: Dict[str, Any] = { + "text": encoder_prompt, + "return_tensors": "pt", + } + if images is not None and images[i] is not None: + processor_kwargs["images"] = images[i] encoder_input_ids = self.wrap_device( - self.tokenizer(encoder_prompt, return_tensors="pt").input_ids, + self.processor(**processor_kwargs).input_ids, device=self.model.device.type, ) @@ -655,18 +684,29 @@ def get_inputs( inputs = [TextPrompt(prompt=prompt) for prompt in prompts] if images is not None: for i, image in enumerate(images): - inputs[i]["multi_modal_data"] = {"image": image} + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} if videos is not None: for i, video in enumerate(videos): - inputs[i]["multi_modal_data"] = {"video": video} + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} if audios is not None: for i, audio in enumerate(audios): - inputs[i]["multi_modal_data"] = {"audio": audio} + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} return inputs + def classify(self, prompts: List[str]) -> List[str]: + req_outputs = self.model.encode(prompts) + outputs = [] + for req_output in req_outputs: + embedding = req_output.outputs.embedding + outputs.append(embedding) + return outputs + def generate( self, prompts: List[str], @@ -835,13 +875,20 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs - def encode(self, prompts: List[str]) -> List[List[float]]: - req_outputs = self.model.encode(prompts) - outputs = [] - for req_output in req_outputs: - embedding = req_output.outputs.embedding - outputs.append(embedding) - return outputs + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.encode(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] def __enter__(self): return self diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index 308dad1850c9a..acd82065ae457 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -4,7 +4,6 @@ import pytest # noqa from vllm.config import CacheConfig, SchedulerConfig -from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler from vllm.sequence import Logprob, SequenceGroup @@ -347,158 +346,6 @@ def test_prompt_limit_exceed(): assert out.ignored_seq_groups[0] == seq_group -def test_swap(): - """Verify swapping works with chunked prefill requests""" - block_size = 4 - max_seqs = 30 - max_model_len = 200 - max_num_batched_tokens = 30 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 16 - cache_config.num_gpu_blocks = 16 - scheduler = Scheduler(scheduler_config, cache_config, None) - - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - # The request is chunked. - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - - # The last request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - # The running prefill is now swapped. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 0 - assert out.num_batched_tokens == 0 - assert out.blocks_to_swap_out != [] - assert out.blocks_to_swap_in == [] - - # Add 1 more task. Swap should be prioritized over new prefill. - _, seq_group = create_dummy_prompt("2", prompt_length=60) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in != [] - assert out.blocks_to_swap_out == [] - - -def test_running_prefill_prioritized_over_swap(): - block_size = 4 - max_seqs = 30 - max_model_len = 200 - max_num_batched_tokens = 30 - scheduler_config = SchedulerConfig( - "generate", - max_num_batched_tokens, - max_seqs, - max_model_len, - enable_chunked_prefill=True, - ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") - cache_config.num_cpu_blocks = 32 - cache_config.num_gpu_blocks = 32 - scheduler = Scheduler(scheduler_config, cache_config, None) - - _, seq_group = create_dummy_prompt("1", - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - _, out = schedule_and_update_computed_tokens(scheduler) - # The request is chunked. - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 1 - assert out.num_prefill_groups == 1 - assert seq_group.is_prefill() - assert out.num_batched_tokens == max_num_batched_tokens - - # The request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "1" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - # The running prefill is now swapped. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 0 - assert out.num_batched_tokens == 0 - assert out.blocks_to_swap_out != [] - assert out.blocks_to_swap_in == [] - - # Add 1 more task. Swap is not possible, so prefill is running. - scheduler.block_manager.can_swap_in = MagicMock() - scheduler.block_manager.can_swap_in.return_value = AllocStatus.LATER - - _, seq_group2 = create_dummy_prompt("2", - prompt_length=60, - block_size=block_size) - scheduler.add_seq_group(seq_group2) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in == [] - assert out.blocks_to_swap_out == [] - assert out.scheduled_seq_groups[0].seq_group == seq_group2 - - # Now although swap is possible, running prefill is prioritized. - scheduler.block_manager.can_swap_in.return_value = AllocStatus.OK - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in == [] - assert out.blocks_to_swap_out == [] - assert not seq_group2.is_prefill() - assert out.scheduled_seq_groups[0].seq_group == seq_group2 - append_new_token(seq_group2, 1) - - # Decoding is prioritized. - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 1 - assert out.blocks_to_swap_in == [] - assert out.blocks_to_swap_out == [] - assert not seq_group2.is_prefill() - assert out.scheduled_seq_groups[0].seq_group == seq_group2 - append_new_token(seq_group2, 1) - - # Since we abort the sequence group, we can finally swap. - scheduler.abort_seq_group(seq_group2.request_id) - _, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 1 - assert out.num_batched_tokens == 30 - assert out.blocks_to_swap_in != [] - assert out.blocks_to_swap_out == [] - - def test_chunked_prefill_preempt(): """Verify preempt works with chunked prefill requests""" block_size = 4 diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index 00b6349b9f8c5..5ff32be611592 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -10,7 +10,7 @@ from vllm.core.interfaces import AllocStatus from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.lora.request import LoRARequest -from vllm.sequence import SequenceGroup, SequenceStatus +from vllm.sequence import SequenceGroup from .utils import (append_new_token, append_new_token_seq_group, create_dummy_prompt, get_sequence_groups, @@ -296,55 +296,6 @@ def test_scheduler_delay_factor(): append_new_token(out, 1) -def test_swapped_out_prioritized(): - block_size = 4 - scheduler = initialize_scheduler(max_num_seqs=6, - block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - # best_of=2 * 3 == 6 sequences. - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - # prefill scheduled now. - assert len(out.scheduled_seq_groups) == 3 - append_new_token(out, 1) - - # The last request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "2" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - assert len(out.scheduled_seq_groups) == 2 - assert out.num_batched_tokens == 2 - assert out.blocks_to_swap_out != [] - assert out.blocks_to_swap_in == [] - append_new_token(out, 1) - - # Add 1 more task. Swap should be prioritized over prefill. - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler.add_seq_group(seq_group) - seq_group_meta, out = schedule_and_update_computed_tokens(scheduler) - append_new_token(out, 1) - assert len(out.scheduled_seq_groups) == 3 - # 3 decodes. It is swapped in. - assert out.num_batched_tokens == 3 - assert out.blocks_to_swap_in != [] - assert out.blocks_to_swap_out == [] - - def initialize_scheduler( *, max_num_seqs=1000, @@ -646,60 +597,6 @@ def cannot_append_second_group(seq_group, num_lookahead_slots): assert output.blocks_to_copy == [] -def test_decode_swap_beam_search(): - """ - Test best_of > 1 swap out blocks - """ - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_gpu_blocks=64, - num_cpu_blocks=64) - curr_loras = None - budget = create_token_budget() - for i in range(3): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - best_of=2, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - scheduler._add_seq_group_to_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - budget.add_num_seqs(seq_group.request_id, - seq_group.get_max_num_running_seqs()) - budget.add_num_batched_tokens( - seq_group.request_id, seq_group.num_seqs(SequenceStatus.RUNNING)) - - # The last request should be swapped out. - scheduler.block_manager.can_append_slots = MagicMock() - - def cannot_append_second_group(seq_group, num_lookahead_slots): - return seq_group.request_id != "2" - - scheduler.block_manager.can_append_slots.side_effect = ( - cannot_append_second_group) - scheduler.block_manager.swap_out = MagicMock() - expected_swap_mapping = [("5", "7")] - scheduler.block_manager.swap_out.return_value = expected_swap_mapping - - output = scheduler._schedule_running(budget, curr_loras) - remainig_running = scheduler.running - assert len(remainig_running) == 0 - assert len(output.decode_seq_groups) == 2 - assert len(output.prefill_seq_groups) == 0 - assert output.decode_seq_groups[0].seq_group.request_id == "0" - assert output.decode_seq_groups[1].seq_group.request_id == "1" - assert len(output.preempted) == 0 - assert len(output.swapped_out) == 1 - # Budget should refledct preempted requests. - assert budget.num_batched_tokens == 2 - # since there are 2 sequences, 2 should be subtracted. - assert budget.num_curr_seqs == 4 - # Both should be preempted, not swapped. - assert output.blocks_to_swap_out == expected_swap_mapping - # Nothing is copied. - assert output.blocks_to_copy == [] - - def test_schedule_decode_blocks_to_copy_update(): """ Verify blocks_to_copy is updated. @@ -736,105 +633,6 @@ def test_schedule_decode_blocks_to_copy_update(): assert output.blocks_to_copy == [(2, 3)] -def test_schedule_swapped_simple(): - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size) - curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] - _, seq_group = create_dummy_prompt("1", - prompt_length=4, - best_of=2, - block_size=block_size) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(4, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget() - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 0 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - # swap in is the reverse of swap out - blocks_to_swap_in_reverse = [] - for swapin, swapout in output.blocks_to_swap_in: - blocks_to_swap_in_reverse.append((swapout, swapin)) - assert blocks_to_swap_out == blocks_to_swap_in_reverse - - -def test_schedule_swapped_max_token_budget(): - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=32, - num_gpu_blocks=32) - curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] - for i in range(2): - _, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget(token_budget=1) - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 1 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 1 - assert len(output.prefill_seq_groups) == 0 - - # Verify num_batched_tokens are respected. - budget = create_token_budget(token_budget=1) - add_token_budget(budget, 1, 0) - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 1 - assert budget.num_batched_tokens == 1 - assert budget.num_curr_seqs == 0 - assert len(output.decode_seq_groups) == 0 - assert len(output.prefill_seq_groups) == 0 - - -def test_schedule_swapped_max_seqs(): - block_size = 4 - scheduler = initialize_scheduler(block_size=block_size, - num_cpu_blocks=64, - num_gpu_blocks=64) - curr_loras = None - blocks_to_swap_out: List[Tuple[int, int]] = [] - for i in range(4): - _, seq_group = create_dummy_prompt(str(i), - prompt_length=60, - block_size=4) - scheduler._allocate_and_set_running(seq_group) - append_new_token_seq_group(60, seq_group, 1) - scheduler._swap_out(seq_group, blocks_to_swap_out) - scheduler._add_seq_group_to_swapped(seq_group) - - budget = create_token_budget(max_num_seqs=2) - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 2 - assert budget.num_batched_tokens == 2 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 2 - assert len(output.prefill_seq_groups) == 0 - - # Verify num_curr_seqs are respected. - output = scheduler._schedule_swapped(budget, curr_loras) - remaining_swapped = scheduler.swapped - assert len(remaining_swapped) == 2 - assert budget.num_batched_tokens == 2 - assert budget.num_curr_seqs == 2 - assert len(output.decode_seq_groups) == 0 - assert len(output.prefill_seq_groups) == 0 - - def test_schedule_swapped_max_loras(): block_size = 4 lora_config = LoRAConfig(max_lora_rank=8, max_loras=1) diff --git a/tests/data/test_config.yaml b/tests/data/test_config.yaml index 42f4f6f7bb992..5090e8f357bb8 100644 --- a/tests/data/test_config.yaml +++ b/tests/data/test_config.yaml @@ -1,3 +1,5 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index fee201850f203..ed6360f9d6148 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -28,19 +28,25 @@ class ParallelSetup(NamedTuple): chunked_prefill: bool +class PPTestOptions(NamedTuple): + multi_node_only: bool + trust_remote_code: bool + tokenizer_mode: Optional[str] + + @dataclass class PPTestSettings: parallel_setups: List[ParallelSetup] distributed_backends: List[str] task: TaskOption - trust_remote_code: bool - tokenizer_mode: Optional[str] + test_options: PPTestOptions @staticmethod def detailed( *, tp_base: int = 1, pp_base: int = 2, + multi_node_only: bool = False, task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, @@ -70,8 +76,9 @@ def detailed( ], distributed_backends=["mp", "ray"], task=task, - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, + test_options=PPTestOptions(multi_node_only=multi_node_only, + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode), ) @staticmethod @@ -80,6 +87,7 @@ def fast( tp_base: int = 1, pp_base: int = 2, task: TaskOption = "auto", + multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -92,25 +100,26 @@ def fast( ], distributed_backends=["mp"], task=task, - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, + test_options=PPTestOptions(multi_node_only=multi_node_only, + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode), ) def iter_params(self, model_name: str): + opts = self.test_options + for parallel_setup in self.parallel_setups: for distributed_backend in self.distributed_backends: yield (model_name, parallel_setup, distributed_backend, - self.task, self.trust_remote_code, self.tokenizer_mode) + self.task, opts) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU # The values displayed here are only a rough indicator of the size of the model # yapf: disable -GENERATION_MODEL_SETTINGS = { - # [DETAILED TESTS] - "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), - # [FAST TESTS] +TEXT_GENERATION_MODELS = { + # [Decoder-only] # Uses Llama # "BAAI/AquilaChat-7B": PPTestSettings.fast(), "Snowflake/snowflake-arctic-instruct": PPTestSettings.fast(tp_base=8, trust_remote_code=True), # noqa: E501 @@ -136,9 +145,10 @@ def iter_params(self, model_name: str): # Uses Llama # "internlm/internlm-chat-7b": PPTestSettings.fast(), "internlm/internlm2-chat-7b": PPTestSettings.fast(trust_remote_code=True), - "core42/jais-13b-chat": PPTestSettings.fast(), + "inceptionai/jais-13b-chat": PPTestSettings.fast(), # TODO: Implement PP # "ai21labs/AI21-Jamba-1.5-Mini": PPTestSettings.fast(), + "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), "openbmb/MiniCPM-2B-sft-bf16": PPTestSettings.fast(trust_remote_code=True), "openbmb/MiniCPM3-4B": PPTestSettings.fast(trust_remote_code=True), # Uses Llama @@ -151,10 +161,9 @@ def iter_params(self, model_name: str): "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(), + "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - # FIXME: https://github.com/vllm-project/vllm/issues/8553 - # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), @@ -162,41 +171,43 @@ def iter_params(self, model_name: str): "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), "upstage/solar-pro-preview-instruct": PPTestSettings.fast(tp_base=2), - # FIXME: Cannot load tokenizer in latest transformers version + # FIXME: Cannot load tokenizer in latest transformers version. + # Need to use tokenizer from `meta-llama/Llama-2-7b-chat-hf` # "xverse/XVERSE-7B-Chat": PPTestSettings.fast(trust_remote_code=True), + # [Encoder-only] + # TODO: Implement PP + # "facebook/bart-base": PPTestSettings.fast(), } -EMBEDDING_MODEL_SETTINGS = { # type: ignore[var-annotated] - # [FAST TESTS] +EMBEDDING_MODELS = { # type: ignore[var-annotated] + # [Text-only] "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(), "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(), "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(tp_base=4, trust_remote_code=True), # noqa: E501 } -MULTIMODAL_MODEL_SETTINGS = { - # [FAST TESTS] +MULTIMODAL_MODELS = { + # [Decoder-only] "Salesforce/blip2-opt-2.7b": PPTestSettings.fast(), "facebook/chameleon-7b": PPTestSettings.fast(), "adept/fuyu-8b": PPTestSettings.fast(), + "THUDM/glm-4v-9b": PPTestSettings.fast(trust_remote_code=True), "OpenGVLab/InternVL2-1B": PPTestSettings.fast(trust_remote_code=True), "llava-hf/llava-1.5-7b-hf": PPTestSettings.fast(), "llava-hf/llava-v1.6-mistral-7b-hf": PPTestSettings.fast(), "llava-hf/LLaVA-NeXT-Video-7B-hf": PPTestSettings.fast(), "llava-hf/llava-onevision-qwen2-0.5b-ov-hf": PPTestSettings.fast(), "openbmb/MiniCPM-Llama3-V-2_5": PPTestSettings.fast(trust_remote_code=True), - # TODO: Implement PP - # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), + "allenai/Molmo-7B-D-0924": PPTestSettings.fast(trust_remote_code=True), "microsoft/Phi-3-vision-128k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "mistralai/Pixtral-12B-2409": PPTestSettings.fast(tp_base=2, tokenizer_mode="mistral"), # noqa: E501 "Qwen/Qwen-VL-Chat": PPTestSettings.fast(trust_remote_code=True), + "Qwen/Qwen2-Audio-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen2-VL-2B-Instruct": PPTestSettings.fast(), "fixie-ai/ultravox-v0_3": PPTestSettings.fast(), -} - -CONDITIONAL_GENERATION_MODEL_SETTINGS = { # type: ignore[var-annotated] - # [FAST TESTS] + # [Encoder-decoder] # TODO: Implement PP - # "facebook/bart-base": PPTestSettings.fast(), + # "meta-llama/Llama-3.2-11B-Vision-Instruct": PPTestSettings.fast(), } # yapf: enable @@ -205,6 +216,7 @@ def iter_params(self, model_name: str): # [LANGUAGE GENERATION] "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", + "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -220,19 +232,21 @@ def _compare_tp( parallel_setup: ParallelSetup, distributed_backend: str, task: TaskOption, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + test_options: PPTestOptions, num_gpus_available: int, *, - method: Literal["generate", "encode"] = "encode", + method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup + multi_node_only, trust_remote_code, tokenizer_mode = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") if VLLM_MULTI_NODE and distributed_backend == "mp": pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") + if multi_node_only and not VLLM_MULTI_NODE: + pytest.skip("Not in multi-node setting") common_args = [ # use half precision for speed and memory savings in CI environment @@ -307,9 +321,9 @@ def _compare_tp( @pytest.mark.parametrize( ("model_name", "parallel_setup", "distributed_backend", "task", - "trust_remote_code", "tokenizer_mode"), + "test_options"), [ - params for model_name, settings in GENERATION_MODEL_SETTINGS.items() + params for model_name, settings in TEXT_GENERATION_MODELS.items() for params in settings.iter_params(model_name) if model_name in TEST_MODELS ], @@ -320,25 +334,23 @@ def test_tp_language_generation( parallel_setup: ParallelSetup, distributed_backend: str, task: TaskOption, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, task, - trust_remote_code, - tokenizer_mode, + test_options, num_gpus_available, method="generate") @pytest.mark.parametrize( ("model_name", "parallel_setup", "distributed_backend", "task", - "trust_remote_code", "tokenizer_mode"), + "test_options"), [ - params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items() + params for model_name, settings in EMBEDDING_MODELS.items() for params in settings.iter_params(model_name) if model_name in TEST_MODELS ], @@ -349,25 +361,23 @@ def test_tp_language_embedding( parallel_setup: ParallelSetup, distributed_backend: str, task: TaskOption, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, task, - trust_remote_code, - tokenizer_mode, + test_options, num_gpus_available, method="encode") @pytest.mark.parametrize( ("model_name", "parallel_setup", "distributed_backend", "task", - "trust_remote_code", "tokenizer_mode"), + "test_options"), [ - params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items() + params for model_name, settings in MULTIMODAL_MODELS.items() for params in settings.iter_params(model_name) if model_name in TEST_MODELS ], @@ -378,15 +388,13 @@ def test_tp_multimodal_generation( parallel_setup: ParallelSetup, distributed_backend: str, task: TaskOption, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, task, - trust_remote_code, - tokenizer_mode, + test_options, num_gpus_available, method="generate") diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 9324a737a779c..bef0c515b9073 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,8 +7,8 @@ import pytest from transformers import AutoModelForSeq2SeqLM +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close @@ -35,7 +35,7 @@ def vllm_to_hf_output( @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.skipif( - is_cpu(), + current_platform.is_cpu(), reason="CPU backend is not currently supported with encoder/decoder models" ) def test_encoder_decoder_e2e( @@ -50,7 +50,7 @@ def test_encoder_decoder_e2e( enforce_eager: bool, ) -> None: ''' - End-to-End (E2E) test for the encoder-decoder framework. + End-to-End (E2E) test for the encoder-decoder framework. This test evaluates the encoder-decoder functionality using the BART model. We compare the outputs of the Hugging Face and vLLM implementations to ensure that both implementations produce consistent diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py index b57348a4d9a58..fc66386fd2d2a 100644 --- a/tests/entrypoints/llm/test_chat.py +++ b/tests/entrypoints/llm/test_chat.py @@ -8,7 +8,7 @@ def test_chat(): - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") prompt1 = "Explain the concept of entropy." messages = [ @@ -26,7 +26,7 @@ def test_chat(): def test_multi_chat(): - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") prompt1 = "Explain the concept of entropy." prompt2 = "Explain what among us is." diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index a29747603622b..d1aebbd70d256 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -16,9 +16,6 @@ # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, but we're not testing -# generation quality here -LORA_NAME = "typeof/zephyr-7b-beta-lora" @pytest.fixture(scope="module") diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index cc72a49ebbbda..f03bdb045f640 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -340,6 +340,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI, assert "".join(chunks) == single_output +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME, "zephyr-lora", "zephyr-pa"], +) +async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str): + """Streaming for parallel sampling. + The tokens from multiple samples, are flattened into a single stream, + with an index to indicate which sample the token belongs to. + """ + + prompt = "What is an LLM?" + n = 3 + max_tokens = 5 + + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=max_tokens, + n=n, + stream=True) + chunks: List[List[str]] = [[] for i in range(n)] + finish_reason_count = 0 + async for chunk in stream: + index = chunk.choices[0].index + text = chunk.choices[0].text + chunks[index].append(text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert finish_reason_count == n + for chunk in chunks: + assert len(chunk) == max_tokens + print("".join(chunk)) + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index d9342fad9f018..e969d33775d86 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -26,6 +26,7 @@ class MockModelConfig: tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" + chat_template_text_format = "string" max_model_len = 100 tokenizer_revision = None multimodal_config = MultiModalConfig() diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 25ab91ef69333..6fcc92022855b 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -6,7 +6,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "meta-llama/Llama-3.2-1B" @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 8311a5cb3c2d4..68804d6833c73 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -107,6 +107,42 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -162,6 +198,41 @@ async def test_single_chat_session_image_base64encoded( assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, image_url: str, + base64_encoded_image: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 1d8c328b73259..5fa466f8f041f 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -8,14 +8,16 @@ from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import (parse_chat_messages, parse_chat_messages_futures) +from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def phi3v_model_config(): return ModelConfig(PHI3V_MODEL_ID, task="generate", @@ -24,6 +26,7 @@ def phi3v_model_config(): trust_remote_code=True, dtype="bfloat16", seed=0, + chat_template_text_format="string", limit_mm_per_prompt={ "image": 2, }) @@ -39,6 +42,30 @@ def phi3v_tokenizer(): ) +@pytest.fixture(scope="module") +def mllama_model_config(): + return ModelConfig(MLLAMA_MODEL_ID, + task="generate", + tokenizer=MLLAMA_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="bfloat16", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + +@pytest.fixture(scope="module") +def mllama_tokenizer(): + return TokenizerGroup( + MLLAMA_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + + @pytest.fixture(scope="module") def image_url(): image = ImageAsset('cherry_blossom') @@ -304,6 +331,51 @@ def test_parse_chat_messages_multiple_images_across_messages( _assert_mm_data_is_image_input(mm_data, 2) +def test_parse_chat_messages_context_text_format( + phi3v_model_config, + phi3v_tokenizer, +): + phi3v_model_config.chat_template_text_format = "openai" + conversation, mm_data = parse_chat_messages( + [{ + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }] + }, { + "role": "assistant", + "content": "Some stuff." + }, { + "role": "user", + "content": "What about this one?" + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [ + { + "role": "user", + "content": [{ + "type": "text", + "text": "What's in this text?" + }] + }, + { + "role": "assistant", + "content": [{ + "type": "text", + "text": "Some stuff." + }] + }, + { + "role": "user", + "content": [{ + "type": "text", + "text": "What about this one?" + }] + }, + ] + + def test_parse_chat_messages_rejects_too_many_images_in_one_message( phi3v_model_config, phi3v_tokenizer, @@ -414,3 +486,153 @@ def test_parse_chat_messages_multiple_images_uncommon_input( "<|image_1|>\n<|image_2|>\nWhat's in these images?" }] _assert_mm_data_is_image_input(mm_data, 2) + + +### Mllama currently wraps images / texts as interleaved dictionaries +def test_mllama_single_image( + mllama_model_config, + mllama_tokenizer, + image_url, +): + """Ensures that a single image is parsed correctly mllama.""" + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + "image_url": image_url + }] + }], mllama_model_config, mllama_tokenizer) + _assert_mm_data_is_image_input(mm_data, 1) + assert conversation == [{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + 'type': 'image' + }] + }] + + +def test_mllama_interleaved_images( + mllama_model_config, + mllama_tokenizer, + image_url, +): + """Ensures that multiple image are parsed as interleaved dicts.""" + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + { + "image_url": image_url + }, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + { + "image_url": image_url + }, + ] + }], mllama_model_config, mllama_tokenizer) + _assert_mm_data_is_image_input(mm_data, 2) + assert conversation == [{ + 'role': + 'user', + 'content': [{ + 'type': 'text', + 'text': 'The content of the first image is:' + }, { + 'type': 'image' + }, { + 'type': 'text', + 'text': 'The content of the second image is:' + }, { + 'type': 'image' + }] + }] + + +@pytest.mark.parametrize("model", [MLLAMA_MODEL_ID]) +def test_multimodal_image_parsing_matches_hf(model, image_url): + """Checks end to end hf alignment for multimodal [image] parsing.""" + + def get_conversation(is_hf: bool): + img_part = {"type": "image_url", "image_url": {"url": image_url}} + if is_hf: + img_part = {'type': 'image'} + return [{ + 'role': + 'user', + 'content': [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + img_part, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + img_part, + { + 'type': 'text', + 'text': 'What animal is in the first image?' + }, + ] + }] + + # Build a config for the model + model_config = ModelConfig(model, + task="generate", + tokenizer=MLLAMA_MODEL_ID, + tokenizer_mode="auto", + trust_remote_code=True, + dtype="bfloat16", + seed=0, + limit_mm_per_prompt={ + "image": 2, + }) + + # Build the tokenizer group and grab the underlying tokenizer + tokenizer_group = TokenizerGroup( + MLLAMA_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + tokenizer = tokenizer_group.tokenizer + + # Build and parse a conversation with {"type": "image"} using the tokenizer + hf_conversation = get_conversation(is_hf=True) + hf_result = tokenizer.apply_chat_template( + hf_conversation, + tokenize=False, + add_generation_prompt=True, + ) + + # Now parse with vLLMs chat utils & apply the template + vllm_conversation = get_conversation(is_hf=False) + conversation, _ = parse_chat_messages( + vllm_conversation, + model_config, + tokenizer_group, + ) + + vllm_result = apply_hf_chat_template( + tokenizer, + conversation=conversation, + chat_template=None, + add_generation_prompt=True, + ) + + assert hf_result == vllm_result diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 8f6a54ff5979c..f2358940fc7b8 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -2,12 +2,13 @@ import torch -from vllm.utils import is_hip +from vllm.platforms import current_platform # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. ROCM_FP8_MAX = 224.0 -FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \ + else torch.float8_e4m3fn def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: @@ -24,8 +25,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor, qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max - qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min + qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.max + qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) @@ -66,8 +69,10 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> Tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max - fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min + fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.max + fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.min fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 9b476585fa19e..057a11746014c 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -1,13 +1,14 @@ +import random from typing import Type import pytest import torch from tests.kernels.utils import opcheck -from vllm.model_executor.layers.activation import (FastGELU, GeluAndMul, - NewGELU, QuickGELU, - SiluAndMul) -from vllm.utils import seed_everything +from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, + GeluAndMul, NewGELU, + QuickGELU, SiluAndMul) +from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol @@ -20,7 +21,8 @@ ] -@pytest.mark.parametrize("activation", ["silu", "gelu", "gelu_tanh"]) +@pytest.mark.parametrize("activation", + ["silu", "gelu", "gelu_tanh", "fatrelu"]) @pytest.mark.parametrize("num_tokens", NUM_TOKENS) @pytest.mark.parametrize("d", D) @pytest.mark.parametrize("dtype", DTYPES) @@ -35,7 +37,7 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -47,16 +49,23 @@ def test_act_and_mul( elif activation == "gelu_tanh": layer = GeluAndMul(approximate="tanh") fn = torch.ops._C.gelu_tanh_and_mul + elif activation == "fatrelu": + threshold = random.uniform(0, 1) + layer = FatreluAndMul(threshold) + fn = torch.ops._C.fatrelu_and_mul out = layer(x) ref_out = layer.forward_native(x) - # The SiLU and GELU implementations are equivalent to the native PyTorch - # implementations, so we can do exact comparison. + # The SiLU, GELU and FatReLU implementations are equivalent to the native + # PyTorch implementations, so we can do exact comparison. torch.testing.assert_close(out, ref_out, atol=0.0, rtol=0.0) d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - opcheck(fn, (out, x)) + if activation == "fatrelu": + opcheck(fn, (out, x, threshold)) + else: + opcheck(fn, (out, x)) @pytest.mark.parametrize("activation", [(FastGELU, torch.ops._C.gelu_fast), @@ -76,7 +85,7 @@ def test_activation( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation[0]() diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 57981e7572c94..9983aa80d20fa 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,12 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol -if not is_hip(): +if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask @@ -23,8 +24,9 @@ NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [torch.half, torch.bfloat16, torch.float - ] if not is_hip() else [torch.half, torch.bfloat16] +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing @@ -114,7 +116,8 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize( - "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"]) + "version", + ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -141,7 +144,7 @@ def test_paged_attention( or (version == "rocm" and head_size not in (64, 128))): pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -209,7 +212,7 @@ def test_paged_attention( and block_size == BLOCK_SIZES[0])) elif version in ("v2", "rocm"): - if is_hip(): + if current_platform.is_rocm(): PARTITION_SIZE = 1024 if version == "v2" else 512 num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) assert PARTITION_SIZE % block_size == 0 @@ -320,8 +323,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -371,7 +374,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(is_hip(), +@pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @torch.inference_mode() def test_multi_query_kv_attention( @@ -382,7 +385,7 @@ def test_multi_query_kv_attention( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use @@ -428,6 +431,6 @@ def test_multi_query_kv_attention( scale, dtype, ) - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 5671207ac847e..3fe9ca0b0450f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -19,17 +19,20 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.is_cpu", return_value=True): + with patch("vllm.attention.selector.current_platform.is_cpu", + return_value=True): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.is_hip", return_value=True): + with patch("vllm.attention.selector.current_platform.is_rocm", + return_value=True): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": - with patch("vllm.attention.selector.is_openvino", return_value=True): + with patch("vllm.attention.selector.current_platform.is_openvino", + return_value=True): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "OPENVINO" diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 0f0a2b24563fd..59917dd2c58ad 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -5,11 +5,10 @@ import pytest import torch +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( awq_marlin_quantize) @@ -81,7 +80,7 @@ def test_fused_marlin_moe_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids = fused_topk(a, score, topk, False) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -150,14 +149,14 @@ def test_single_marlin_moe_multiply_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe(a, - qweight, - scales, - score, - topk, - renormalize=False, - w_zeros=zp, - num_bits=num_bits) + marlin_output = torch.ops.vllm.single_marlin_moe(a, + qweight, + scales, + score, + topk, + renormalize=False, + w_zeros=zp, + num_bits=num_bits) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) diff --git a/tests/kernels/test_awq_triton.py b/tests/kernels/test_awq_triton.py index e95e5bd948212..406a0c8dd8080 100644 --- a/tests/kernels/test_awq_triton.py +++ b/tests/kernels/test_awq_triton.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton) -from vllm.utils import seed_everything +from vllm.platforms import current_platform device = "cuda" @@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): zeros_cols = qweight_cols zeros_dtype = torch.int32 - seed_everything(0) + current_platform.seed_everything(0) qweight = torch.randint(0, torch.iinfo(torch.int32).max, @@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size): qzeros_rows = scales_rows qzeros_cols = qweight_cols - seed_everything(0) + current_platform.seed_everything(0) input = torch.rand((input_rows, input_cols), dtype=input_dtype, diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index 1a5989c2ccb42..76b90760b6ef6 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol @@ -20,7 +21,7 @@ # There may not be enough gpu memory due to large NUM_BLOCKS. # Reduce NUM_BLOCKS when it happens. NUM_BLOCKS = 4321 # Arbitrary values for testing -PARTITION_SIZE = 512 if not is_hip() else 1024 +PARTITION_SIZE = 512 if not current_platform.is_rocm() else 1024 DTYPES = [torch.half, torch.bfloat16] NUM_GEN_SEQS = [3] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing @@ -172,7 +173,7 @@ def test_paged_attention( blocksparse_block_size: int, blocksparse_head_sliding_step: int, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -316,8 +317,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -383,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index b0e7097fdfbd4..5b8311a33c361 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -6,7 +6,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] @@ -56,7 +56,7 @@ def test_copy_blocks( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. @@ -132,7 +132,7 @@ def test_reshape_and_cache( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks @@ -224,7 +224,7 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. @@ -339,7 +339,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) src_device = device if direction[0] == "cuda" else 'cpu' dst_device = device if direction[1] == "cuda" else 'cpu' @@ -408,7 +408,7 @@ def test_fp8_e4m3_conversion( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) low = -224.0 high = 224.0 diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py index 277d7e4977d73..96bfe06d74ae5 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -9,7 +9,7 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def causal_conv1d_ref( @@ -70,7 +70,7 @@ def causal_conv1d_update_ref(x, bias: (dim,) cache_seqlens: (batch,), dtype int32. If not None, the conv_state is treated as a circular buffer. - The conv_state will be updated by copying x to the + The conv_state will be updated by copying x to the conv_state starting at the index @cache_seqlens % state_len before performing the convolution. @@ -161,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) x = torch.randn(batch, dim, seqlen, device=device, dtype=itype).contiguous() @@ -223,7 +223,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch = 2 x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) x_ref = x.clone() @@ -270,7 +270,7 @@ def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width, rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 3 padding = 5 if with_padding else 0 @@ -343,7 +343,7 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) seqlens = [] batch_size = 4 if seqlen < 10: diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index c0f1ec04964fd..4239afb70ff85 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -18,11 +18,12 @@ from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, global_force_attn_backend_context_manager) -from vllm.utils import is_hip +from vllm.platforms import current_platform # List of support backends for encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] if not is_hip() \ - else [_Backend.ROCM_FLASH] +LIST_ENC_DEC_SUPPORTED_BACKENDS = ([ + _Backend.XFORMERS +] if not current_platform.is_rocm() else [_Backend.ROCM_FLASH]) HEAD_SIZES = [64, 256] @@ -83,7 +84,7 @@ class TestResources(NamedTuple): will leverage attn_backend for the purpose of constructing backend-compatible attention metadata instances - + Attributes: * scale: 1/sqrt(d) scale factor for attn @@ -106,10 +107,10 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: Build key components for performing encoder/decoder attention test. Note that - (1) The Attention instance constructed here, automatically selects + (1) The Attention instance constructed here, automatically selects an attention backend class based on platform info & a set of canned heuristics, so - (2) The attention backend instance constructed here is thus *not + (2) The attention backend instance constructed here is thus *not the same backend instance* used by attn, but rather it is intended to be a *different instance* of the *same backend class*; therefore, @@ -157,7 +158,7 @@ def _encoder_attn_setup( ''' Set up test vectors & data structures for encoder attention test. - A triplet of synthetic query/key/value tensors are constructed. + A triplet of synthetic query/key/value tensors are constructed. Given this is an encoder attention test, the key & value sequences will have the same length as the corresponding queries. @@ -170,14 +171,14 @@ def _encoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field - + Returns: - + * PhaseTestParameters data structure comprising (1) packed query/key/value tensors, (2) the ideal output of attention computed using a naive implementation, and (3) KVCache field set to None @@ -266,7 +267,7 @@ def _decoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -276,14 +277,14 @@ def _decoder_attn_setup( * qkv: Unpacked (batch_size x padded_seq_len x num_heads x head_size) query/key/value tensors * Prefill-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) + including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + * Decode-phase decoder self-attention PhaseTestParameters data structure, + including (1) packed (number_of_tokens x num_heads x head_size) + query/key/value tensors along with (2) ideal attention output + computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. * max_block_idx: max physical address in decoder self-attention block-table (intended to be used as the base address for the encoder/ @@ -437,12 +438,12 @@ def _enc_dec_cross_attn_setup_reuses_query( This function also constructs the cross-attention KV cache memory mapping (slot mapping and block table), ensuring that the block table starts at - block_base_addr. + block_base_addr. Arguments: * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x - num_heads x head_size) decoder self-attention inputs; + num_heads x head_size) decoder self-attention inputs; this function relies on the query and q_seq_lens fields * encoder_test_params: PhaseTestParameters data structure which was @@ -453,7 +454,7 @@ def _enc_dec_cross_attn_setup_reuses_query( self-attention; all fields including KV cache required * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -461,16 +462,16 @@ def _enc_dec_cross_attn_setup_reuses_query( Returns: - * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed + * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data + structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase encoder/decoder cross-attention PhaseTestParameters data + * Decode-phase encoder/decoder cross-attention PhaseTestParameters data structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. ''' @@ -597,7 +598,7 @@ def _run_encoder_attention_test( ''' Run encoder attention. - attn.forward() is passed attn_type=AttentionType.ENCODER in order + attn.forward() is passed attn_type=AttentionType.ENCODER in order to configure the kernel invocation for encoder attention Requires attn_metadata.num_decode_tokens == 0 @@ -608,7 +609,7 @@ def _run_encoder_attention_test( * attn: Attention wrapper instance * encoder_test_params: encoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention @@ -647,7 +648,7 @@ def _run_decoder_self_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) @@ -695,11 +696,11 @@ def _run_encoder_decoder_cross_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query field * cross_test_params: encoder/decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention @@ -727,7 +728,8 @@ def _run_encoder_decoder_cross_attention_test( attn_type=attn_type) -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -756,7 +758,8 @@ def test_encoder_only( No KV cache is required for encoder-only attention. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). This test globally forces an override of the usual backend auto-selection process, forcing the specific backend-under-test @@ -837,14 +840,14 @@ def test_e2e_enc_dec_attn( attributes for prefill-phase, and (2) an analogous attention metadata structure but for decode-phase * Test attention steps in the following order - + * Encoder attention * Prefill self-attention * Prefill cross-attention * Decode self-attention * Decode cross-attention - * Besides being reflective of realistic use-cases, this order would - exacerbate any accidental overlap in the self-/cross-attention + * Besides being reflective of realistic use-cases, this order would + exacerbate any accidental overlap in the self-/cross-attention block tables, which one hopes to avoid @@ -864,10 +867,11 @@ def test_e2e_enc_dec_attn( to be utilized. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). Note on metadata: there is a single attention metadata structure shared by - all prefill-phase attention operations (encoder, decoder, enc/dec cross), + all prefill-phase attention operations (encoder, decoder, enc/dec cross), and a single one shared by all decode-phase attention operations (decoder & enc/dec cross.) This is intended to reflect the behavior of EncoderDecoderModelRunner, which constructs a single attention metadata diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 35c29c5bd1028..a20c73345218f 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) @@ -91,7 +91,7 @@ def test_flash_attn_with_paged_kv( sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -161,7 +161,7 @@ def test_varlen_with_paged_kv( num_blocks: int, ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index 80a388db6530e..a2c8f71665737 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -4,7 +4,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)] HEAD_SIZES = [128, 256] @@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv( soft_cap: Optional[float], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] diff --git a/tests/kernels/test_fp8_quant.py b/tests/kernels/test_fp8_quant.py index c18f5f468dc5a..ebaaae2321885 100644 --- a/tests/kernels/test_fp8_quant.py +++ b/tests/kernels/test_fp8_quant.py @@ -6,7 +6,7 @@ ref_dynamic_per_tensor_fp8_quant, ref_dynamic_per_token_quant) from tests.kernels.utils import opcheck -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, @@ -46,7 +46,7 @@ def opcheck_fp8_quant(output, def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 # avoid nans @@ -76,7 +76,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -95,7 +95,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() @pytest.mark.parametrize("seed", SEEDS) def test_fp8_quant_large(seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings hidden_size = 1152 # Smallest hidden_size to reproduce the error diff --git a/tests/kernels/test_gguf.py b/tests/kernels/test_gguf.py index 1513fc196153c..893af99ba4977 100644 --- a/tests/kernels/test_gguf.py +++ b/tests/kernels/test_gguf.py @@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download import vllm._custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") @@ -75,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") @@ -111,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 41e103e1d09f9..8db6a0d0d9fa4 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -4,7 +4,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck from vllm._custom_ops import scaled_int8_quant -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, @@ -45,7 +45,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): @torch.inference_mode() def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -68,7 +68,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, @@ -112,7 +112,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -138,7 +138,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float, azp: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 382079d472ee9..9dfa2cbe45e94 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -3,7 +3,7 @@ from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -31,7 +31,7 @@ def test_rms_norm( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index e92d401368a7b..bf7ff3b5c59b8 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -8,7 +8,7 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def selective_state_update_ref(state, @@ -235,7 +235,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, rtolw = max(rtolw, rtol) atolw = max(atolw, atol) # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 dim = 4 dstate = 8 @@ -358,7 +358,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): if torch.version.hip: atol *= 2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 5c5d2a76bc1d1..7b1346a1841c1 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -10,19 +10,18 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock import vllm.envs as envs +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import is_hip, seed_everything @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @@ -115,7 +114,7 @@ def test_mixtral_moe(dtype: torch.dtype): atol=mixtral_moe_tol[dtype]) -@pytest.mark.skipif(is_hip(), +@pytest.mark.skipif(current_platform.is_rocm(), reason="Make this test work with MoE padding on HIP") @pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) @pytest.mark.parametrize("n", [128, 2048, 256, 1024]) @@ -126,6 +125,7 @@ def test_mixtral_moe(dtype: torch.dtype): @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( m: int, n: int, @@ -137,7 +137,7 @@ def test_fused_marlin_moe( num_bits: int, is_k_full: bool, ): - seed_everything(7) + current_platform.seed_everything(7) # Filter act_order if act_order: @@ -214,7 +214,7 @@ def test_fused_marlin_moe( topk, renormalize=False, ) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -278,6 +278,7 @@ def test_fused_marlin_moe( @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_single_marlin_moe_multiply( m: int, n: int, @@ -329,7 +330,7 @@ def test_single_marlin_moe_multiply( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe( + marlin_output = torch.ops.vllm.single_marlin_moe( a, qweight, scales, @@ -368,6 +369,6 @@ def test_moe_align_block_size_opcheck(): dtype=torch.int32, device=topk_ids.device) - opcheck(torch.ops._C.moe_align_block_size, + opcheck(torch.ops._moe_C.moe_align_block_size, (topk_ids, num_experts, block_size, sorted_ids, expert_ids, num_tokens_post_pad)) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index 94da00915d40e..b408559cc0b07 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -5,7 +5,7 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol @@ -48,7 +48,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -100,7 +100,7 @@ def test_batched_rotary_embedding( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -160,7 +160,7 @@ def test_batched_rotary_embedding_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3181d92562399..a8a187ebaede4 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -9,7 +9,8 @@ from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, seed_everything +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -39,7 +40,7 @@ def test_contexted_kv_attention( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -234,7 +235,7 @@ def test_contexted_kv_attention_alibi( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index f7c1d4f041c12..15ec66b0f5502 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -4,7 +4,7 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform MODEL_PATH = "google/gemma-7b" @@ -31,7 +31,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") +@pytest.mark.xfail(current_platform.is_rocm(), + reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index db877219a285c..eb882faf3974a 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .utils import DummyLoRAManager @@ -923,7 +923,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, seq_len) -> None: dtype = torch.float16 seed = 0 - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 389a3ccbc17ec..c8edb02a88d4b 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -28,9 +28,15 @@ def _create_lora_request(lora_id, long_context_infos): context_len = long_context_infos[lora_id]["context_length"] scaling_factor = context_len_to_scaling_factor[context_len] - return LoRARequest(context_len, lora_id, - long_context_infos[lora_id]["lora"], None, - 4096 * scaling_factor) + return LoRARequest( + # There are 2 LoRAs for 16K, we need to add lora_id to indicate + # they are different LoRAs. + context_len + str(lora_id), + lora_id, + long_context_infos[lora_id]["lora"], + None, + 4096 * scaling_factor, + ) def evaluate_json_response(model_response, golden_response): @@ -108,14 +114,17 @@ def lora_llm(long_context_infos): for info in long_context_infos.values() ] - llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf", - enable_lora=True, - max_num_seqs=16, - max_loras=2, - long_lora_scaling_factors=tuple(scaling_factors), - max_num_batched_tokens=4096 * 8, - tensor_parallel_size=4, - distributed_executor_backend="mp") + llm = vllm.LLM( + "meta-llama/Llama-2-13b-chat-hf", + enable_lora=True, + max_num_seqs=16, + max_loras=2, + long_lora_scaling_factors=tuple(scaling_factors), + max_num_batched_tokens=4096 * 8, + tensor_parallel_size=4, + # FIXME enable async output processor + disable_async_output_proc=True, + distributed_executor_backend="mp") yield llm del llm diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 81b8188e638c9..be040060d02b2 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -61,6 +61,7 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, + gpu_memory_utilization=0.97 # This model is pretty big for CI gpus ) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py index 41c37a4813c68..e756544d96e98 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_sizes.py @@ -1,5 +1,5 @@ """ -This script is mainly used to tests various hidden_sizes. We have collected the +This script is mainly used to tests various hidden_sizes. We have collected the hidden_sizes included in the LoRA models currently supported by vLLM. It tests whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. @@ -15,8 +15,8 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.platforms import current_platform from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -146,7 +146,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -239,7 +239,7 @@ def test_punica_bgmv( from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -327,7 +327,7 @@ def test_punica_expand_nslices( from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index 185da6399a06a..dc0edeb10ef46 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -1,6 +1,6 @@ """ -This script is mainly used to test whether trtion kernels can run normally -under different conditions, including various batches, numbers of LoRA , and +This script is mainly used to test whether trtion kernels can run normally +under different conditions, including various batches, numbers of LoRA , and maximum ranks. """ from unittest.mock import patch @@ -14,8 +14,8 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink +from vllm.platforms import current_platform from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -61,7 +61,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -154,7 +154,7 @@ def test_punica_bgmv( from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -242,7 +242,7 @@ def test_punica_expand_nslices( from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index d004c65929418..5432fa4ad0d3a 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -8,7 +8,7 @@ import vllm from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform @dataclass @@ -19,7 +19,7 @@ class ModelWithQuantization: MODELS: List[ModelWithQuantization] #AWQ quantization is currently not supported in ROCm. -if is_hip(): +if current_platform.is_rocm(): MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 92e6086e312f7..7a361ef320810 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -84,6 +84,45 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [128, 129]) +@pytest.mark.parametrize("disable_async_output_proc", [True, False]) +def test_metric_counter_generation_tokens_multi_step( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + disable_async_output_proc: bool, +) -> None: + num_scheduler_steps = 8 + with vllm_runner( + model, + disable_log_stats=False, + gpu_memory_utilization=0.4, + num_scheduler_steps=num_scheduler_steps, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. + vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + + # The multi-step scheduling will continue to execute forward even when + # encountering EOS, leading to slightly imprecise metrics. + assert abs(vllm_generation_count - metric_count) <\ + len(example_prompts) * num_scheduler_steps, \ + (f"generation token count: {vllm_generation_count!r}\n" + f"metric: {metric_count!r}") + + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize( diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index 75625b35209ce..fcfc159e4f5a0 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -8,7 +8,7 @@ from vllm.platforms import current_platform -from ...utils import check_outputs_equal +from ...utils import check_logprobs_close, check_outputs_equal MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -43,18 +43,40 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if model == "openbmb/MiniCPM3-4B": + # the output becomes slightly different when upgrading to + # pytorch 2.5 . Changing to logprobs checks instead of exact + # output checks. + NUM_LOG_PROBS = 8 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index 89afbcf1c03ac..c997359a2781e 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -5,7 +5,7 @@ import pytest import torch -from vllm.utils import is_cpu +from vllm.platforms import current_platform from ....utils import large_gpu_test from ...utils import check_logprobs_close @@ -70,7 +70,7 @@ def test_phimoe_routing_function(): assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) -@pytest.mark.skipif(condition=is_cpu(), +@pytest.mark.skipif(condition=current_platform.is_cpu(), reason="This test takes a lot time to run on CPU, " "and vllm CI's disk space is not enough for this model.") @large_gpu_test(min_gb=80) diff --git a/tests/models/decoder_only/vision_language/test_broadcast.py b/tests/models/decoder_only/vision_language/test_broadcast.py index d01490d74bd4d..fd7af4a8b0b29 100644 --- a/tests/models/decoder_only/vision_language/test_broadcast.py +++ b/tests/models/decoder_only/vision_language/test_broadcast.py @@ -1,4 +1,5 @@ import pytest +import transformers from ....utils import multi_gpu_test @@ -23,6 +24,9 @@ def test_models(hf_runner, vllm_runner, image_assets, elif model.startswith("llava-hf/llava-v1.6"): from .test_llava_next import models, run_test # type: ignore[no-redef] elif model.startswith("facebook/chameleon"): + if transformers.__version__.startswith("4.46.0"): + pytest.skip("Model broken in HF, " + "see huggingface/transformers#34379") from .test_chameleon import models, run_test # type: ignore[no-redef] else: raise NotImplementedError(f"Unsupported model: {model}") diff --git a/tests/models/decoder_only/vision_language/test_chameleon.py b/tests/models/decoder_only/vision_language/test_chameleon.py index 8334451970a4f..4bd678b9f21c4 100644 --- a/tests/models/decoder_only/vision_language/test_chameleon.py +++ b/tests/models/decoder_only/vision_language/test_chameleon.py @@ -1,6 +1,7 @@ from typing import List, Optional, Type import pytest +import transformers from transformers import AutoModelForVision2Seq, BatchEncoding from vllm.multimodal.utils import rescale_image_size @@ -93,6 +94,10 @@ def process(hf_inputs: BatchEncoding): ) +@pytest.mark.skipif( + transformers.__version__.startswith("4.46.0"), + reason="Model broken in HF, see huggingface/transformers#34379", +) @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py index 7827ecb19a744..1affcd10ee72d 100644 --- a/tests/models/decoder_only/vision_language/test_fuyu.py +++ b/tests/models/decoder_only/vision_language/test_fuyu.py @@ -3,8 +3,8 @@ import pytest from vllm.multimodal.utils import rescale_image_size +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ...utils import check_logprobs_close @@ -46,7 +46,7 @@ def run_test( All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. @@ -103,7 +103,7 @@ def run_test( target_dtype = "half" -if is_cpu(): +if current_platform.is_cpu(): target_dtype = "bfloat16" diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py index 49cab75d8ea53..fc842ec4a6171 100644 --- a/tests/models/decoder_only/vision_language/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_internvl.py @@ -7,7 +7,6 @@ from transformers import AutoConfig from vllm.multimodal.utils import rescale_image_size -from vllm.utils import is_cpu from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, _ImageAssets) @@ -19,15 +18,20 @@ "cherry_blossom": "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 }) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 +HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 models = [ "OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-2B", + # NOTE: Mono-InternVL-2B doesn't work with fp16, + # it will result NaN during inference. + # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9 + "OpenGVLab/Mono-InternVL-2B", # Broken due to outdated implementation of Phi-3 # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 # "OpenGVLab/InternVL2-4B", ] +target_dtype = "bfloat16" # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py @@ -52,9 +56,15 @@ def generate( input_embeds = input_embeds.reshape(B, N, C) - outputs = self.language_model.generate( + forward_kwargs = dict( inputs_embeds=input_embeds, attention_mask=attention_mask, + ) + if getattr(self, "use_visual_token_mask", False): + visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype) + forward_kwargs["visual_token_mask"] = visual_token_mask + outputs = self.language_model.generate( + **forward_kwargs, **generate_kwargs, ) @@ -78,7 +88,7 @@ def run_test( All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. @@ -243,11 +253,6 @@ def run_awq_test( ) -target_dtype = "half" -if is_cpu(): - target_dtype = "bfloat16" - - @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", diff --git a/tests/models/decoder_only/vision_language/test_llava_next.py b/tests/models/decoder_only/vision_language/test_llava_next.py index f833fe0c8bbb4..aa9b297c5dd4e 100644 --- a/tests/models/decoder_only/vision_language/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/test_llava_next.py @@ -3,12 +3,13 @@ import pytest from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer +from vllm.inputs import InputContext from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, _ImageAssets) -from ...utils import check_logprobs_close +from ...utils import build_model_context, check_logprobs_close _LIMIT_IMAGE_PER_PROMPT = 4 @@ -22,6 +23,19 @@ models = ["llava-hf/llava-v1.6-mistral-7b-hf"] +@pytest.fixture() +def get_max_llava_next_image_tokens(): + from vllm.model_executor.models.llava_next import ( + get_max_llava_next_image_tokens) + return get_max_llava_next_image_tokens + + +@pytest.fixture() +def dummy_data_for_llava_next(): + from vllm.model_executor.models.llava_next import dummy_data_for_llava_next + return dummy_data_for_llava_next + + def vllm_to_hf_output(vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], model: str): @@ -281,3 +295,53 @@ def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, num_logprobs=num_logprobs, tensor_parallel_size=1, ) + + +@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ + ([[336, 336]], 1176), + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), +]) +def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, + get_max_llava_next_image_tokens): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + # and calculate the resulting max tokens + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + + actual_max_tokens = get_max_llava_next_image_tokens( + InputContext(ctx.model_config)) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize( + "gridpoints,expected_size", + [ + # One point; it has to be the largest + ([[336, 336]], (336, 336)), + # Default for most llava next models; the 2x2 tile is the largest + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], + (672, 672)), + # If two rectangular gridpoints are the same, the more vertical + # one has the higher feature count due to newline features + ([[336, 672], [672, 336]], (672, 336)) + ]) +def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, + gridpoints, expected_size): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + seq_len = 5000 # bigger than the max feature size for any image + + seq_data, mm_data = dummy_data_for_llava_next( + ctx, + seq_len=seq_len, + mm_counts={"image": 1}, + ) + + # The dummy data dims should match the gridpoint with the biggest feat size + assert mm_data["image"].height == expected_size[0] + assert mm_data["image"].width == expected_size[1] + assert len(seq_data.get_token_ids()) >= seq_len diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 367f25f446279..1616fd299b9aa 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type, overload +from typing import List, Optional, Tuple, Type import pytest from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, @@ -9,9 +9,8 @@ from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _VideoAssets) -from ....utils import large_gpu_test +from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, + PromptVideoInput, VllmRunner) from ...utils import check_logprobs_close # Video test @@ -20,7 +19,7 @@ "<|im_start|>user\n