diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml new file mode 100644 index 0000000000000..78347f63fa793 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1 +model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.356 + - name: "exact_match,flexible-extract" + value: 0.358 +limit: 1000 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt index 64a0f428587af..6057229ac50f3 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -1,6 +1,6 @@ Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml -Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c4fc43dc0abb8..8c98aa36ac0ff 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -310,13 +310,22 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test # 1h36min +- label: Decoder-only Language Models Test (Standard) # 35min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language + - pytest -v -s models/decoder_only/language/test_models.py + - pytest -v -s models/decoder_only/language/test_big_models.py + +- label: Decoder-only Language Models Test (Extended) # 1h20min + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + commands: + - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py - label: Decoder-only Multi-Modal Models Test # 1h31min #mirror_hardwares: [amd] diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 2a0e3239f58da..b80749aaa8fec 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -34,4 +34,5 @@ jobs: - name: "Run actionlint" run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" tools/actionlint.sh -color diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml index 2e7c7f7f087af..c9d6d4259df99 100644 --- a/.github/workflows/add_label_automerge.yml +++ b/.github/workflows/add_label_automerge.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add label - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | github.rest.issues.addLabels({ diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 064af291009fa..68d60d7365ed1 100644 --- 
a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -38,4 +38,4 @@ jobs: ) find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror \ No newline at end of file + | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000000000..f048fce528941 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 0000000000000..f6d4479ee1996 --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e3564779ad9..5f1e5f8eeaf7d 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -32,4 +32,5 @@ jobs: pip install types-setuptools - name: Mypy run: | - tools/mypy.sh + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 96549b3f99181..f959a1cacf866 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Extract branch info shell: bash @@ -30,7 +30,7 @@ jobs: - name: Create Release id: create_release - uses: "actions/github-script@v7" + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 env: RELEASE_TAG: ${{ env.release_tag }} with: @@ -54,10 +54,10 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Setup ccache - uses: hendrikmuhs/ccache-action@v1.2 + uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 with: create-symlink: true key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up 
Python - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} @@ -92,7 +92,7 @@ jobs: echo "asset_name=${asset_name}" >> "$GITHUB_ENV" - name: Upload Release Asset - uses: actions/upload-release-asset@v1 + uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml index d1791c3bc865a..df62539c0b3d9 100644 --- a/.github/workflows/reminder_comment.yml +++ b/.github/workflows/reminder_comment.yml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Remind to run full CI on PR - uses: actions/github-script@v7 + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 with: script: | github.rest.issues.createComment({ diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index be73fb85ed1fa..9cc8a9e914474 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -17,9 +17,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -28,7 +28,8 @@ jobs: pip install -r requirements-lint.txt - name: Analysing the code with ruff run: | - ruff check . + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . - name: Spelling check with codespell run: | codespell --toml pyproject.toml diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index eb728ae04dfc1..9f06b35c19e32 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -16,9 +16,9 @@ jobs: matrix: python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f6d1c66b2cf7..d1956f3d409b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,12 +169,12 @@ endif() # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. -# Configure it to place files in vllm/.deps, in order to play nicely with sccache. +# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. +# Each dependency that produces build artifacts should override its BINARY_DIR to avoid +# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/. 
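As context for the comment above: the FETCHCONTENT_BASE_DIR override lives on the Python side of the build. A minimal sketch of what such a setup.py hook could look like — the helper name and directory layout here are illustrative assumptions, not vLLM's actual setup.py code:

```python
# Illustrative sketch only: a setup.py-style helper that pins FetchContent
# output to a stable, per-build directory and hands it to CMake. The helper
# name and directory layout are hypothetical, not vLLM's actual setup.py.
import os


def fetchcontent_cmake_args(build_temp):
    deps_dir = os.path.join(build_temp, ".deps")
    os.makedirs(deps_dir, exist_ok=True)
    # Stable dependency paths keep sccache/ccache hit rates high across builds.
    return ["-DFETCHCONTENT_BASE_DIR=" + deps_dir]


# Example use when composing the configure command:
# cmake_cmd = ["cmake", source_dir] + fetchcontent_cmake_args(build_temp)
```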
# include(FetchContent) -get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) -file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}") -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps") +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") # @@ -252,7 +252,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() # @@ -432,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() endif() @@ -509,6 +509,8 @@ else() GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() diff --git a/Dockerfile.openvino b/Dockerfile.openvino index c89864da91180..a05ff452cd36e 100644 --- a/Dockerfile.openvino +++ b/Dockerfile.openvino @@ -15,11 +15,11 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi # install build requirements -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt # build vLLM with OpenVINO backend -RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace -COPY examples/ /workspace/vllm/examples -COPY benchmarks/ /workspace/vllm/benchmarks +COPY examples/ /workspace/examples +COPY benchmarks/ /workspace/benchmarks CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index a84e00fd5677f..cd5fcf481f07c 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -33,4 +33,4 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks -ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index ea1a7788f621d..0a14aedd5feba 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,5 +1,6 @@ """Benchmark the latency of processing a single batch of requests.""" import argparse +import dataclasses import json import time from pathlib import Path @@ -10,43 +11,19 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): print(args) + engine_args = EngineArgs.from_cli_args(args) + # NOTE(woosuk): If the request cannot be 
processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams( n=args.n, @@ -125,19 +102,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--input-len', type=int, default=32) parser.add_argument('--output-len', type=int, default=128) parser.add_argument('--batch-size', type=int, default=8) @@ -154,45 +118,6 @@ def run_to_completion(profile_dir: Optional[str] = None): type=int, default=30, help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. 
' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', @@ -203,78 +128,12 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. 
When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index a354358e43aa3..1aac029992dbf 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -25,6 +25,7 @@ --input-length-range 128:256 """ +import dataclasses import json import random import time @@ -33,6 +34,7 @@ from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.utils import FlexibleArgumentParser try: @@ -129,12 +131,9 @@ def main(args): filtered_datasets = [(PROMPT, prompt_len, args.output_len) ] * args.num_prompts - llm = LLM(model=args.model, - tokenizer_mode='auto', - trust_remote_code=True, - enforce_eager=True, - tensor_parallel_size=args.tensor_parallel_size, - enable_prefix_caching=args.enable_prefix_caching) + engine_args = EngineArgs.from_cli_args(args) + + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) @@ -162,18 +161,11 @@ def main(args): parser = FlexibleArgumentParser( description= 'Benchmark the performance with or without automatic prefix caching.') - parser.add_argument('--model', - type=str, - default='baichuan-inc/Baichuan2-13B-Chat') parser.add_argument("--dataset-path", type=str, default=None, help="Path to the dataset.") - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='enable prefix caching') parser.add_argument('--num-prompts', type=int, default=1, @@ -190,9 +182,7 @@ def main(args): default='128:256', help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') - parser.add_argument("--seed", - type=int, - default=0, - help='Random seed for reproducibility') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 8843e3a927a01..e0c9e6a6db502 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,5 +1,6 @@ """Benchmark offline prioritization.""" import argparse +import dataclasses import json import random import time @@ -7,7 +8,8 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -62,46 +64,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - gpu_memory_utilization: float = 0.9, - download_dir: Optional[str] = None, + engine_args: EngineArgs, ) 
-> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - disable_log_stats=False, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts = [] @@ -142,16 +109,8 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.trust_remote_code, - args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, - args.enable_chunked_prefill, - args.max_num_batched_tokens, - args.gpu_memory_utilization, args.download_dir) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -173,7 +132,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -191,13 +150,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -206,81 +158,13 @@ def main(args: argparse.Namespace): type=int, default=200, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' 
- 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 1381004c9f02b..0d205014b15bf 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -53,6 +53,8 @@ except ImportError: from argparse import ArgumentParser as FlexibleArgumentParser +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + @dataclass class BenchmarkMetrics: @@ -60,6 +62,7 @@ class BenchmarkMetrics: total_input: int total_output: int request_throughput: float + request_goodput: float output_throughput: float total_token_throughput: float mean_ttft_ms: float @@ -316,12 +319,15 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], + gootput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 completed = 0 + good_completed = 0 itls: List[float] = [] tpots: List[float] = [] + all_tpots: List[float] = [] ttfts: List[float] = [] e2els: List[float] = [] for i in range(len(outputs)): @@ -335,9 +341,13 @@ def calculate_metrics( add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] + tpot = 0 if output_len > 1: - tpots.append( - (outputs[i].latency - outputs[i].ttft) / (output_len - 1)) + tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - + 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) itls += outputs[i].itl ttfts.append(outputs[i].ttft) e2els.append(outputs[i].latency) @@ -345,6 +355,28 @@ def 
calculate_metrics( else: actual_output_lens.append(0) + if gootput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in gootput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(gootput_config_dict["ttft"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in gootput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(gootput_config_dict["tpot"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in gootput_config_dict: + valid_metrics.append(e2els) + slo_values.append(gootput_config_dict["e2el"] / + MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + if completed == 0: warnings.warn( "All requests failed. This is likely due to a misconfiguration " @@ -355,6 +387,7 @@ def calculate_metrics( total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * @@ -398,6 +431,8 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, + gootput_config_dict: Dict[str, float], + max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: request_func = ASYNC_REQUEST_FUNCS[backend] @@ -446,9 +481,25 @@ async def benchmark( print("Profiler started") print(f"Traffic request rate: {request_rate}") + print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. 
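As an aside, the optional-semaphore idiom that `limited_request_func` applies just below can be reproduced in isolation. A self-contained sketch, with a sleep standing in for the real request coroutine:

```python
# Self-contained sketch of the optional-semaphore pattern: cap in-flight
# tasks at max_concurrency, or run unbounded when no cap is given.
import asyncio
from typing import Optional


async def worker(i: int) -> None:
    await asyncio.sleep(0.01)  # stand-in for sending one benchmark request


async def run_all(n_tasks: int, max_concurrency: Optional[int]) -> None:
    semaphore = (asyncio.Semaphore(max_concurrency)
                 if max_concurrency else None)

    async def limited(i: int) -> None:
        if semaphore is None:
            return await worker(i)
        async with semaphore:  # at most max_concurrency workers at a time
            return await worker(i)

    await asyncio.gather(*(limited(i) for i in range(n_tasks)))


asyncio.run(run_all(100, max_concurrency=8))
```

Keeping `semaphore` as `None` when no cap is given avoids acquiring a lock on the unlimited path; once Python 3.10 is the minimum version, `contextlib.nullcontext()` collapses both branches, as the comment below notes.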
+ # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): @@ -464,8 +515,8 @@ async def benchmark( ignore_eos=ignore_eos) tasks.append( asyncio.create_task( - request_func(request_func_input=request_func_input, - pbar=pbar))) + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -495,6 +546,7 @@ async def benchmark( tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, + gootput_config_dict=gootput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -506,6 +558,9 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) + if gootput_config_dict: + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", + metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", @@ -517,6 +572,8 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, + "request_goodput:": + metrics.request_goodput if gootput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -570,6 +627,41 @@ def process_one_metric( return result +def check_goodput_args(args): + # Check and parse goodput arguments + gootput_config_dict = {} + VALID_NAMES = ["ttft", "tpot", "e2el"] + if args.goodput: + gootput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in gootput_config_dict.items(): + if slo_name not in VALID_NAMES: + raise ValueError( + f"Invalid metric name found, {slo_name}: {slo_val}. " + "The service level objective name should be one of " + f"{str(VALID_NAMES)}. ") + if slo_val < 0: + raise ValueError( + f"Invalid value found, {slo_name}: {slo_val}. " + "The service level objective value should be " + "non-negative.") + return gootput_config_dict + + +def parse_goodput(slo_pairs): + gootput_config_dict = {} + try: + for slo_pair in slo_pairs: + slo_name, slo_val = slo_pair.split(":") + gootput_config_dict[slo_name] = float(slo_val) + except ValueError as err: + raise argparse.ArgumentTypeError( + "Invalid format found for service level objectives. 
" + "Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is a " + "number in milliseconds.") from err + return gootput_config_dict + + def main(args: argparse.Namespace): print(args) random.seed(args.seed) @@ -664,6 +756,8 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") + gootput_config_dict = check_goodput_args(args) + benchmark_result = asyncio.run( benchmark( backend=backend, @@ -682,6 +776,8 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, + gootput_config_dict=gootput_config_dict, + max_concurrency=args.max_concurrency, )) # Save config and results to json @@ -711,13 +807,16 @@ def main(args: argparse.Namespace): # Traffic result_json["request_rate"] = ( args.request_rate if args.request_rate < float("inf") else "inf") + result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result result_json = {**result_json, **benchmark_result} # Save to file base_model_id = model_id.split("/")[-1] - file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = (f"-concurrency{args.max_concurrency}" + if args.max_concurrency is not None else "") + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa if args.result_filename: file_name = args.result_filename if args.result_dir: @@ -768,6 +867,19 @@ def main(args: argparse.Namespace): default=None, help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( "--model", type=str, @@ -881,6 +993,17 @@ def main(args: argparse.Namespace): "Default value is \"99\". " "Use \"--percentile-metrics\" to select metrics.", ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help="Specify service level objectives for goodput as \"KEY:VALUE\" " + "pairs, where the key is a metric name, and the value is in " + "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + "separated by spaces. Allowed request level metric names are " + "\"ttft\", \"tpot\", \"e2el\". 
For more context on the definition of " + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve") # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index e26706af606b0..24eb54e7b73bc 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,5 +1,6 @@ """Benchmark offline inference throughput.""" import argparse +import dataclasses import json import random import time @@ -11,10 +12,9 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -67,53 +67,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. 
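The `LLM(**dataclasses.asdict(engine_args))` call above is the crux of this refactor, shared by all four benchmark scripts in this diff: engine flags are registered once by `EngineArgs.add_cli_args`, parsed, rebuilt into an `EngineArgs` dataclass, and expanded into `LLM` keyword arguments. A condensed sketch of the round-trip, using only APIs that appear in this diff (the sample model flag is arbitrary):

```python
# Condensed sketch of the EngineArgs round-trip these benchmarks now share.
import dataclasses

from vllm import LLM
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser(description="Example benchmark")
parser.add_argument("--n", type=int, default=1)  # benchmark-specific flag
parser = EngineArgs.add_cli_args(parser)         # adds --model, -tp, --dtype, ...
args = parser.parse_args(["--model", "facebook/opt-125m"])

engine_args = EngineArgs.from_cli_args(args)     # dataclass built from the namespace
llm = LLM(**dataclasses.asdict(engine_args))     # every EngineArgs field is an LLM kwarg
```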
prompts: List[str] = [] @@ -155,56 +113,11 @@ def run_vllm( async def run_vllm_async( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, ) -> float: from vllm import SamplingParams - engine_args = AsyncEngineArgs( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - worker_use_ray=False, - disable_log_requests=True, - ) async with build_async_engine_client_from_engine_args( engine_args, disable_frontend_multiprocessing) as llm: @@ -320,7 +233,16 @@ def main(args: argparse.Namespace): args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: # Synthesize a prompt with the given input length. - prompt = "hi" * (args.input_len - 1) + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. 
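To motivate the search loop that follows: token count does not track character repetitions one-for-one, so an exact target length has to be found by trial. A hedged illustration using a tokenizer directly (the model name is only an example; exact counts vary by tokenizer):

```python
# Token count is not proportional to character repetitions: special tokens
# (e.g. BOS) and merge rules shift it, so hitting an exact target length
# requires searching. The model name is only an example.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
print(len(tokenizer("hi " * 32).input_ids))  # typically != 32
```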
+ for i in range(-10, 10): + prompt = "hi " * (args.input_len + i) + tokenized_prompt = tokenizer(prompt).input_ids + if len(tokenized_prompt) == args.input_len: + break + else: + raise ValueError( + f"Failed to synthesize a prompt with {args.input_len} tokens.") requests = [(prompt, args.input_len, args.output_len) for _ in range(args.num_prompts)] else: @@ -328,23 +250,17 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - run_args = [ - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.num_scheduler_steps, - args.download_dir, args.load_format, args.disable_async_output_proc - ] - if args.async_engine: - run_args.append(args.disable_frontend_multiprocessing) - elapsed_time = uvloop.run(run_vllm_async(*run_args)) + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + )) else: - elapsed_time = run_vllm(*run_args) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -391,13 +307,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -406,123 +315,15 @@ def main(args: argparse.Namespace): type=int, default=1000, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.") - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument( - "--num-scheduler-steps", - type=int, - default=1, - help="Maximum number of forward steps per scheduler call.") - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - "--disable-async-output-proc", - action='store_true', - default=False, - help="Disable async output processor for vLLM backend.") parser.add_argument("--async-engine", action='store_true', default=False, @@ -531,6 +332,7 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 1657f7d0b16e8..97a969cf5e3e0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -137,9 +137,11 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, return; } - // Turing - TORCH_CHECK(version_num >= 75); - cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + if (version_num >= 75) { + // Turing + cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + return; + } #endif TORCH_CHECK_NOT_IMPLEMENTED( diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index f544325a0776c..d12aeebbbc184 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,7 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. The vLLM CPU backend supports the following vLLM features: + +- Tensor Parallel (``-tp = N``) +- Quantization (``INT8 W8A8, AWQ``) + +.. note:: + The FP16 data type and more advanced features (`chunked-prefill`, `prefix-caching` and `FP8 KV cache`) are under development and will be available soon. Table of contents: @@ -141,5 +147,20 @@ Performance tips - If using vLLM CPU backend on a multi-socket machine with NUMA, be aware to set CPU cores using ``VLLM_CPU_OMP_THREADS_BIND`` to avoid cross NUMA node memory access. +CPU Backend Considerations +-------------------------- + +- The CPU backend significantly differs from the GPU backend since the vLLM architecture was originally optimized for GPU use. A number of optimizations are needed to enhance its performance. + +- Decouple the HTTP serving components from the inference components. In a GPU backend configuration, the HTTP serving and tokenization tasks operate on the CPU, while inference runs on the GPU, which typically does not pose a problem. However, in a CPU-based setup, the HTTP serving and tokenization can cause significant context switching and reduced cache efficiency. Therefore, it is strongly recommended to segregate these two components for improved performance. + +- On a CPU-based setup with NUMA enabled, memory access performance may be largely impacted by the `topology `_. For NUMA architectures, two optimizations are recommended: Tensor Parallel or Data Parallel. + + * Using Tensor Parallel for a latency-constrained deployment: following the GPU backend design, Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system).
With the `TP feature on CPU `_ merged, Tensor Parallel is supported for serving and offline inferencing. In general, each NUMA node is treated as one GPU card. Below is an example command to enable Tensor Parallel = 2 for serving: + + .. code-block:: console + + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + * Using Data Parallel for maximum throughput: launch an LLM serving endpoint on each NUMA node, along with an additional load balancer to dispatch requests to those endpoints. Common solutions like `Nginx <../serving/deploying_with_nginx.html>`_ or HAProxy are recommended. The Anyscale Ray project provides this feature for LLM `serving `_. Here is an example of setting up scalable LLM serving with `Ray Serve `_. \ No newline at end of file diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index cfd2dcb3bd5d3..91978065faf42 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -107,15 +107,15 @@ If GPU/CPU communication cannot be established, you can use the following Python If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use: -.. code-block:: shell +.. code-block:: console - NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py + $ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run: -.. code-block:: shell +.. code-block:: console - NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py + $ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py If the script runs successfully, you should see the message ``sanity check is successful!``. diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 99c695ac4ddb1..a706b285edede 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -7,14 +7,14 @@ Installation vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. Requirements -=========================== +============ * OS: Linux -* Python: 3.8 -- 3.12 +* Python: 3.8 - 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions -=========================== +========================= You can install vLLM using pip: .. code-block:: console .. _install-the-latest-code: Install the latest code -========================= +======================= -LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet.
To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on an x86 platform with CUDA 12 for every commit since ``v0.5.3``. You can download and install it with the following command: .. code-block:: console @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 abi (see `PEP 425 `_ for more details about abi), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. +Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Another way to access the latest code is to use the docker images: @@ -77,17 +77,17 @@ Another way to access the latest code is to use the docker images: These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. -Latest code can contain bugs and may not be stable. Please use it with caution. +The latest code can contain bugs and may not be stable. Please use it with caution. .. _build_from_source: Build from source -================== +================= .. _python-only-build: Python-only build (without compilation) ----------------------------------------- +--------------------------------------- If you only need to change Python code, you can simply build vLLM without compilation. @@ -116,28 +116,28 @@ The script will: Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. -Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev``(or ``-q`` for short) flag: +Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: .. code-block:: console $ python python_only_dev.py --quit-dev -The script with ``--quit-dev`` flag will: +The ``--quit-dev`` flag will: * Remove the symbolic link from the current directory to the vLLM package. * Restore the original vLLM package from the backup. -If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to start `all above <#python-only-build>`_ over again. +If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps. .. note:: There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. - It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed.
Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel. + It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the section above <#install-the-latest-code>`_ for instructions on how to install a specified wheel. Full build (with compilation) ---------------------------------- +----------------------------- -If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: +If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: .. code-block:: console @@ -153,7 +153,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T Use an existing PyTorch installation -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.: * Building vLLM with PyTorch nightly or a custom PyTorch build. @@ -171,7 +171,7 @@ To build vLLM using an existing PyTorch installation: Troubleshooting -~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~ To avoid your system being overloaded, you can limit the number of compilation jobs to be run simultaneously, via the environment variable ``MAX_JOBS``. For example: @@ -207,7 +207,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build ----------------------- +-------------------- vLLM can fully run only on Linux, but for development purposes you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. diff --git a/docs/source/index.rst b/docs/source/index.rst index d20e46b4a3656..c328c049b430c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -80,6 +80,7 @@ Documentation serving/openai_compatible_server serving/deploying_with_docker serving/deploying_with_k8s + serving/deploying_with_nginx serving/distributed_serving serving/metrics serving/env_vars diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 358b87b3000b5..456269261300e 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -87,6 +87,11 @@ Text Generation - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - ✅︎ + * - :code:`FalconMambaForCausalLM` + - FalconMamba + - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. + - ✅︎ + - * - :code:`GemmaForCausalLM` - Gemma - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. @@ -294,6 +299,10 @@ Text Embedding - - ✅︎ +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + Reward Modeling --------------- @@ -325,6 +334,14 @@ The following modalities are supported depending on the model: - **V**\ ideo - **A**\ udio +Any combination of modalities joined by :code:`+` is supported. + +- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by :code:`/` are mutually exclusive. + +- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + ..
_supported_vlms: Text Generation @@ -367,7 +384,7 @@ Text Generation * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` - - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. + - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. - - ✅︎ * - :code:`LlavaForConditionalGeneration` @@ -433,7 +450,7 @@ Text Generation * - :code:`PixtralForConditionalGeneration` - Pixtral - T + I\ :sup:`+` - - :code:`mistralai/Pixtral-12B-2409` + - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - ✅︎ * - :code:`QWenLMHeadModel` @@ -481,6 +498,12 @@ Multimodal Embedding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - :code:`royokong/e5-v` + - + - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based - T + I @@ -488,6 +511,10 @@ Multimodal Embedding - 🚧 - ✅︎ +.. important:: + Some model architectures support both generation and embedding tasks. + In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. + ---- If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 7dd42ec1bb9c9..a47902ab4fc9d 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -181,8 +181,8 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc .. code-block:: bash - vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ - --trust-remote-code --limit-mm-per-prompt image=2 + vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 .. important:: Since OpenAI Vision API is based on `Chat Completions `_ API, @@ -247,9 +247,9 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: - .. code-block:: shell + .. code-block:: console - export VLLM_IMAGE_FETCH_TIMEOUT= + $ export VLLM_IMAGE_FETCH_TIMEOUT= .. note:: There is no need to format the prompt in the API request since it will be handled by the server. diff --git a/docs/source/serving/deploying_with_nginx.rst b/docs/source/serving/deploying_with_nginx.rst new file mode 100644 index 0000000000000..b5dff02b6bae6 --- /dev/null +++ b/docs/source/serving/deploying_with_nginx.rst @@ -0,0 +1,142 @@ +.. _nginxloadbalancer: + +Deploying with Nginx Loadbalancer +================================= + +This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers. + +Table of contents: + +#. :ref:`Build Nginx Container ` +#. :ref:`Create Simple Nginx Config file ` +#. :ref:`Build vLLM Container ` +#. :ref:`Create Docker Network ` +#. :ref:`Launch vLLM Containers ` +#. :ref:`Launch Nginx ` +#. :ref:`Verify That vLLM Servers Are Ready ` + +.. _nginxloadbalancer_nginx_build: + +Build Nginx Container +--------------------- + +This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. + +.. code-block:: console + + export vllm_root=`pwd` + +Create a file named ``Dockerfile.nginx``: + +.. code-block:: console + + FROM nginx:latest + RUN rm /etc/nginx/conf.d/default.conf + EXPOSE 80 + CMD ["nginx", "-g", "daemon off;"] + +Build the container: + +.. 
code-block:: console + + docker build . -f Dockerfile.nginx --tag nginx-lb + +.. _nginxloadbalancer_nginx_conf: + +Create Simple Nginx Config file +------------------------------- + +Create a file named ``nginx_conf/nginx.conf``. Note that you can add as many servers as you'd like. In the example below, we'll start with two. To add more, add another ``server vllmN:8000 max_fails=3 fail_timeout=10000s;`` entry to ``upstream backend``. + +.. code-block:: console + + upstream backend { + least_conn; + server vllm0:8000 max_fails=3 fail_timeout=10000s; + server vllm1:8000 max_fails=3 fail_timeout=10000s; + } + server { + listen 80; + location / { + proxy_pass http://backend; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } + +.. _nginxloadbalancer_nginx_vllm_container: + +Build vLLM Container +-------------------- + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm + + +If you are behind a proxy, you can pass the proxy settings to the docker build command as shown below: + +.. code-block:: console + + cd $vllm_root + docker build -f Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy + +.. _nginxloadbalancer_nginx_docker_network: + +Create Docker Network +--------------------- + +.. code-block:: console + + docker network create vllm_nginx + + +.. _nginxloadbalancer_nginx_launch_container: + +Launch vLLM Containers +---------------------- + +Notes: + +* If you have your HuggingFace models cached somewhere else, update ``hf_cache_dir`` below. +* If you don't have an existing HuggingFace cache, you will want to start ``vllm0`` and wait for the model to complete downloading and the server to be ready. This will ensure that ``vllm1`` can leverage the model you just downloaded and it won't have to be downloaded again. +* The example below assumes a GPU backend. If you are using a CPU backend, remove ``--gpus all`` and add the ``VLLM_CPU_KVCACHE_SPACE`` and ``VLLM_CPU_OMP_THREADS_BIND`` environment variables to the docker run command. +* Adjust the model name used by your vLLM servers if you don't want to use ``Llama-2-7b-chat-hf``. + +.. code-block:: console + + mkdir -p ~/.cache/huggingface/hub/ + hf_cache_dir=~/.cache/huggingface/ + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf + docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf + +.. note:: + If you are behind a proxy, you can pass the proxy settings to the docker run command via ``-e http_proxy=$http_proxy -e https_proxy=$https_proxy``. + +.. _nginxloadbalancer_nginx_launch_nginx: + +Launch Nginx +------------ + +.. code-block:: console + + docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest + +.. _nginxloadbalancer_nginx_verify_nginx: + +Verify That vLLM Servers Are Ready +---------------------------------- + +.. code-block:: console + + docker logs vllm0 | grep Uvicorn + docker logs vllm1 | grep Uvicorn + +Both outputs should look like this: + + ..
code-block:: console + + INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 9132e12a36ba5..cc8e539a8a6d3 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -157,7 +157,7 @@ vLLM will use guided decoding to ensure the response matches the tool parameter To enable this feature, you should set the following flags: * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use - currently either `hermes` or `mistral` or `llama3_json` or `internlm`. Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. * `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. * `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages @@ -168,7 +168,7 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! -#### Hermes Models +#### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` @@ -180,7 +180,7 @@ step in their creation_. Flags: `--tool-call-parser hermes` -#### Mistral Models +#### Mistral Models (`mistral`) Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. @@ -199,7 +199,7 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` -#### Llama Models +#### Llama Models (`llama3_json`) Supported models: * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` @@ -219,16 +219,24 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` -#### Internlm Models +#### InternLM Models (`internlm`) Supported models: * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: -* Although this implementation also supports Internlm2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. +* Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` +#### Jamba Models (`jamba`) +AI21's Jamba-1.5 models are supported. 
+* `ai21labs/AI21-Jamba-1.5-Mini` +* `ai21labs/AI21-Jamba-1.5-Large` + + +Flags: `--tool-call-parser jamba` + ### How to write a tool parser plugin diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst index a44696507fb9a..96a93db94871b 100644 --- a/docs/source/serving/tensorizer.rst +++ b/docs/source/serving/tensorizer.rst @@ -9,4 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to `CoreWeave's Tensorizer documentation `_. For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the `vLLM example script `_. \ No newline at end of file +the `vLLM example script `_. + +.. note:: + Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. diff --git a/examples/florence2_inference.py b/examples/florence2_inference.py new file mode 100644 index 0000000000000..b58ac2e1f7ed4 --- /dev/null +++ b/examples/florence2_inference.py @@ -0,0 +1,44 @@ +''' +Demonstrate prompting of text-to-text +encoder/decoder models, specifically Florence-2 +''' +# TODO(Isotr0py): +# Move to offline_inference_vision_language.py after porting vision backbone +from vllm import LLM, SamplingParams + +dtype = "float" + +# Create a Florence-2 encoder/decoder model instance +llm = LLM( + model="microsoft/Florence-2-base", + tokenizer="facebook/bart-base", + dtype=dtype, + trust_remote_code=True, +) + +prompts = [ + "", "", "", + "", "", "", + "", "", "" +] +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, +) + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Encoder prompt: {encoder_prompt!r}, " + f"Decoder prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4c88dcc2f087b..83d2548a506e4 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -1,6 +1,6 @@ """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. 
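Tying together the tool-calling flags documented above, a minimal launch command for one of the listed Jamba models might look like the sketch below (the flags are taken from this document; whether an additional chat template is needed depends on the model):

.. code-block:: console

   $ vllm serve ai21labs/AI21-Jamba-1.5-Mini --enable-auto-tool-choice --tool-call-parser jamba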
@@ -267,6 +267,11 @@ def run_qwen2_vl(question: str, modality: str): model=model_name, max_model_len=8192, max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, ) prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" @@ -277,6 +282,22 @@ def run_qwen2_vl(question: str, modality: str): return llm, prompt, stop_token_ids +# Pixtral HF-format +def run_pixtral_hf(question: str, modality: str): + assert modality == "image" + + model_name = "mistral-community/pixtral-12b" + + llm = LLM( + model=model_name, + max_model_len=8192, + ) + + prompt = f"[INST]{question}\n[IMG][/INST]" + stop_token_ids = None + return llm, prompt, stop_token_ids + + # LLama 3.2 def run_mllama(question: str, modality: str): assert modality == "image" @@ -347,6 +368,7 @@ def run_glm4v(question: str, modality: str): "NVLM_D": run_nvlm_d, "qwen_vl": run_qwen_vl, "qwen2_vl": run_qwen2_vl, + "pixtral_hf": run_pixtral_hf, "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, @@ -433,7 +455,7 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models') + 'vision language models for text generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index 8e62199e1db7b..e1732d045f949 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -1,21 +1,170 @@ +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args + +from PIL.Image import Image + from vllm import LLM -from vllm.assets.image import ImageAsset - -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") -prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501 - -# Create an LLM. -llm = LLM( - model="TIGER-Lab/VLM2Vec-Full", - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=2, - mm_processor_kwargs={"num_crops": 16}, -) - -# Generate embedding. The output is a list of EmbeddingRequestOutputs. -outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}}) - -# Print the outputs. 
-for output in outputs: - print(output.outputs.embedding) # list of 3072 floats +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser + + +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + image: Optional[Image] + + +def run_e5_v(query: Query): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if query["modality"] == "text": + text = query["text"] + prompt = llama3_template.format( + f"{text}\nSummary above sentence in one word: ") + image = None + elif query["modality"] == "image": + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="royokong/e5-v", + task="embedding", + max_model_len=4096, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def run_vlm2vec(query: Query): + if query["modality"] == "text": + text = query["text"] + prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 + image = None + elif query["modality"] == "image": + prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501 + image = query["image"] + elif query["modality"] == "text+image": + text = query["text"] + prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="TIGER-Lab/VLM2Vec-Full", + task="embedding", + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def get_query(modality: QueryModality): + if modality == "text": + return TextQuery(modality="text", text="A dog sitting in the grass") + + if modality == "image": + return ImageQuery( + modality="image", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + ), + ) + + if modality == "text+image": + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + ), + ) + + msg = f"Modality {modality} is not supported." 
+ raise ValueError(msg) + + +def run_encode(model: str, modality: QueryModality): + query = get_query(modality) + req_data = model_example_map[model](query) + + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.encode({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) + + for output in outputs: + print(output.outputs.embedding) + + +def main(args: Namespace): + run_encode(args.model_name, args.modality) + + +model_example_map = { + "e5_v": run_e5_v, + "vlm2vec": run_vlm2vec, +} + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--model-name', + '-m', + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help='The name of the embedding model.') + parser.add_argument('--modality', + type=str, + default="image", + choices=get_args(QueryModality), + help='Modality of the input.') + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 69f590fb7950d..e28514bf403f7 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -1,7 +1,7 @@ """ This example shows how to use vLLM for running offline inference with -multi-image input on vision language models, using the chat template defined -by the model. +multi-image input on vision language models for text generation, +using the chat template defined by the model. """ from argparse import Namespace from typing import List, NamedTuple, Optional @@ -334,7 +334,8 @@ def main(args: Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input') + 'vision language models that support multi-image input for text ' + 'generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py index 3b3e0ae64a037..67b755a155966 100644 --- a/examples/offline_inference_with_prefix.py +++ b/examples/offline_inference_with_prefix.py @@ -1,4 +1,5 @@ from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory # NOTE: This is just a running example. For benchmarking purpose, # please see benchmarks/benchmark_prefix_caching.py @@ -28,12 +29,9 @@ # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0) -# Create an LLM. +# Create an LLM without prefix caching as a baseline. regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) -prefix_cached_llm = LLM(model="facebook/opt-125m", - enable_prefix_caching=True, - gpu_memory_utilization=0.4) print("Results without `enable_prefix_caching`") # Generate texts from the prompts. The output is a list of RequestOutput objects @@ -50,6 +48,15 @@ print("-" * 80) +# Destroy the LLM object and free up the GPU memory. +del regular_llm +cleanup_dist_env_and_memory() + +# Create an LLM with prefix caching enabled. +prefix_cached_llm = LLM(model="facebook/opt-125m", + enable_prefix_caching=True, + gpu_memory_utilization=0.4) + # Warmup so that the shared prompt's KV cache is computed. 
prefix_cached_llm.generate(generating_prompts[0], sampling_params) diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_api_client_for_multimodal.py index 704236be72d03..beb83e494ed0b 100644 --- a/examples/openai_api_client_for_multimodal.py +++ b/examples/openai_api_client_for_multimodal.py @@ -7,8 +7,8 @@ vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja (multi-image inference with Phi-3.5-vision-instruct) -vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ - --trust-remote-code --limit-mm-per-prompt image=2 +vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 (audio inference with Ultravox) vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 diff --git a/format.sh b/format.sh index 1ac028d00e3a4..be6ee0ce46dcb 100755 --- a/format.sh +++ b/format.sh @@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")" ROOT="$(git rev-parse --show-toplevel)" builtin cd "$ROOT" || exit 1 +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" + exit 1 + fi +} + +check_command yapf +check_command ruff +check_command mypy +check_command codespell +check_command isort +check_command clang-format + YAPF_VERSION=$(yapf --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}') @@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { if [[ $2 != $3 ]]; then - echo "Wrong $1 version installed: $3 is required, not $2." + echo "❓❓Wrong $1 version installed: $3 is required, not $2." exit 1 fi } @@ -281,10 +295,12 @@ tools/actionlint.sh -color echo 'vLLM actionlint: Done' if ! git diff --quiet &>/dev/null; then - echo 'Reformatted files. Please review and stage the changes.' - echo 'Changes not staged for commit:' - echo + echo + echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" git --no-pager diff --name-only + echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." exit 1 +else + echo "✨🎉 Format check passed! Congratulations! 🎉✨" fi diff --git a/setup.py b/setup.py index d1f4b7f1c1119..8abeb0ba739db 100644 --- a/setup.py +++ b/setup.py @@ -157,6 +157,14 @@ def configure(self, ext: CMakeExtension) -> None: # on subsequent calls to python. cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] + # Override the base directory for FetchContent downloads to $ROOT/.deps + # This allows sharing dependencies between profiles, + # and plays more nicely with sccache. + # To override this, set the FETCHCONTENT_BASE_DIR environment variable. 
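+        # For example, a shared dependency cache could be selected at build time
+        # (the path here is illustrative only):
+        #   FETCHCONTENT_BASE_DIR=/mnt/cache/vllm-deps pip install -e .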
+ fc_base_dir = os.path.join(ROOT_DIR, ".deps") + fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir) + cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)] + # # Setup parallelism and build tool # diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 1903a7582dc89..8a04693ba676d 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -12,11 +12,11 @@ from vllm import SamplingParams from vllm.config import ParallelConfig +from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine from vllm.outputs import RequestOutput as RealRequestOutput from vllm.sampling_params import RequestOutputKind -from ..conftest import cleanup from ..utils import wait_for_gpu_memory_to_clear @@ -157,7 +157,7 @@ async def async_engine(): engine.shutdown_background_loop() del engine await asyncio.sleep(0.1) - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture() diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 0fe88e792520a..3c2ca1bddd906 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -19,7 +19,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index c3e3835aff0af..51aec8c873d12 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -16,7 +16,7 @@ MODELS = [ "facebook/opt-125m", - "meta-llama/Llama-2-7b-hf", + "meta-llama/Llama-3.2-1B", ] diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py index a5df5639cf948..d7f36a7812802 100644 --- a/tests/basic_correctness/test_cpu_offload.py +++ b/tests/basic_correctness/test_cpu_offload.py @@ -2,5 +2,5 @@ def test_cpu_offload(): - compare_two_settings("meta-llama/Llama-2-7b-hf", [], - ["--cpu-offload-gb", "4"]) + compare_two_settings("meta-llama/Llama-3.2-1B", [], + ["--cpu-offload-gb", "1"]) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b6ec7413978f4..77c56d91d0a8b 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -13,8 +13,7 @@ @pytest.mark.parametrize( "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", [ - ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", - True), + ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True), ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", ["--quantization", "compressed-tensors" ], 1, 1, "FLASH_ATTN", "generate", True), diff --git a/tests/conftest.py b/tests/conftest.py index 5df7da9ee64e2..b11bbcb4ab7d1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,3 @@ -import contextlib -import gc import json import os import sys @@ -25,19 +23,19 @@ from vllm import LLM, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import TokenizerPoolConfig +from vllm.config import TaskOption, TokenizerPoolConfig from vllm.connections import global_http_connection -from vllm.distributed import 
(destroy_distributed_environment, - destroy_model_parallel, +from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput +from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, - identity, is_cpu) + identity) logger = init_logger(__name__) @@ -45,10 +43,12 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] -PromptAudioInput = Union[List[Tuple[np.ndarray, int]], - List[List[Tuple[np.ndarray, int]]]] -PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]] +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] def _read_prompts(filename: str) -> List[str]: @@ -140,17 +140,7 @@ def dist_init(): ) initialize_model_parallel(1, 1) yield - cleanup() - - -def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - if not is_cpu(): - torch.cuda.empty_cache() + cleanup_dist_env_and_memory() @pytest.fixture() @@ -167,7 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool: def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield if should_do_global_cleanup_after_test: - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(autouse=True) @@ -249,7 +239,8 @@ class HfRunner: def wrap_device(self, input: _T, device: Optional[str] = None) -> _T: if device is None: - return self.wrap_device(input, "cpu" if is_cpu() else "cuda") + return self.wrap_device( + input, "cpu" if current_platform.is_cpu() else "cuda") if hasattr(input, "device") and input.device.type == device: return input @@ -262,7 +253,9 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, + is_embedding_model: bool = False, is_sentence_transformer: bool = False, + skip_tokenizer_init: bool = False, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding] = identity, @@ -290,11 +283,12 @@ def __init__( **model_kwargs, )) - self.tokenizer = AutoTokenizer.from_pretrained( - model_name, - torch_dtype=torch_dtype, - trust_remote_code=True, - ) + if not skip_tokenizer_init: + self.tokenizer = AutoTokenizer.from_pretrained( + model_name, + torch_dtype=torch_dtype, + trust_remote_code=True, + ) # don't put this import at the top level # it will call torch.cuda.device_count() @@ -304,6 +298,8 @@ def __init__( torch_dtype=torch_dtype, trust_remote_code=True, ) + if skip_tokenizer_init: + self.tokenizer = self.processor.tokenizer self.postprocess_inputs = postprocess_inputs @@ -329,12 +325,12 @@ def get_inputs( "text": prompt, "return_tensors": "pt", } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - if videos is not None and videos[i] is not None: - processor_kwargs["videos"] = videos[i] - if audios is not None and audios[i] is not None: - 
audio, sr = audios[i] + if images is not None and (image := images[i]) is not None: + processor_kwargs["images"] = image + if videos is not None and (video := videos[i]) is not None: + processor_kwargs["videos"] = video + if audios is not None and (audio_tuple := audios[i]) is not None: + audio, sr = audio_tuple processor_kwargs["audio"] = audio processor_kwargs["sampling_rate"] = sr @@ -349,7 +345,7 @@ def generate( self, prompts: List[str], images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: @@ -379,7 +375,7 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str]]: @@ -420,7 +416,7 @@ def generate_greedy_logprobs( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[List[torch.Tensor]]: @@ -499,7 +495,7 @@ def generate_greedy_logprobs_limit( num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: all_inputs = self.get_inputs(prompts, @@ -544,6 +540,7 @@ def generate_encoder_decoder_greedy_logprobs_limit( encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], max_tokens: int, num_logprobs: int, + images: Optional[PromptImageInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: ''' @@ -554,11 +551,17 @@ def generate_encoder_decoder_greedy_logprobs_limit( all_output_ids: List[List[int]] = [] all_output_strs: List[str] = [] - for (encoder_prompt, - decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts): + for i, (encoder_prompt, decoder_prompt) in enumerate( + to_enc_dec_tuple_list(encoder_decoder_prompts)): + processor_kwargs: Dict[str, Any] = { + "text": encoder_prompt, + "return_tensors": "pt", + } + if images is not None and images[i] is not None: + processor_kwargs["images"] = images[i] encoder_input_ids = self.wrap_device( - self.tokenizer(encoder_prompt, return_tensors="pt").input_ids, + self.processor(**processor_kwargs).input_ids, device=self.model.device.type, ) @@ -606,7 +609,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): del self.model - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="session") @@ -619,6 +622,7 @@ class VllmRunner: def __init__( self, model_name: str, + task: TaskOption = "auto", tokenizer_name: Optional[str] = None, # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. 
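The ``_PromptMultiModalInput`` alias added to ``tests/conftest.py`` above is a small generics trick: a single ``TypeVar`` covers both the flat shape (one item per prompt) and the nested shape (a list of items per prompt). Below is a minimal self-contained sketch of the same pattern, with illustrative names that are not part of the diff:

.. code-block:: python

   from typing import List, Tuple, TypeVar, Union

   _M = TypeVar("_M")

   # Either one item per prompt, or a list of items per prompt.
   _MultiModalInput = Union[List[_M], List[List[_M]]]

   # Subscripting the alias pins down the item type:
   ImageInput = _MultiModalInput[bytes]              # illustrative: raw image bytes
   AudioInput = _MultiModalInput[Tuple[bytes, int]]  # illustrative: (samples, rate)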
@@ -634,6 +638,7 @@ def __init__( ) -> None: self.model = LLM( model=model_name, + task=task, tokenizer=tokenizer_name, trust_remote_code=True, dtype=dtype, @@ -666,15 +671,18 @@ def get_inputs( inputs = [TextPrompt(prompt=prompt) for prompt in prompts] if images is not None: for i, image in enumerate(images): - inputs[i]["multi_modal_data"] = {"image": image} + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} if videos is not None: for i, video in enumerate(videos): - inputs[i]["multi_modal_data"] = {"video": video} + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} if audios is not None: for i, audio in enumerate(audios): - inputs[i]["multi_modal_data"] = {"audio": audio} + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} return inputs @@ -846,20 +854,27 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs - def encode(self, prompts: List[str]) -> List[List[float]]: - req_outputs = self.model.encode(prompts) - outputs = [] - for req_output in req_outputs: - embedding = req_output.outputs.embedding - outputs.append(embedding) - return outputs + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.encode(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): del self.model - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="session") diff --git a/tests/core/block/e2e/conftest.py b/tests/core/block/e2e/conftest.py index e870597b7a011..70577ec052a2c 100644 --- a/tests/core/block/e2e/conftest.py +++ b/tests/core/block/e2e/conftest.py @@ -3,10 +3,9 @@ import pytest from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.utils import set_random_seed -from ....conftest import cleanup - @pytest.fixture def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, @@ -37,7 +36,7 @@ def generator_inner(): yield llm del llm - cleanup() + cleanup_dist_env_and_memory() for llm in generator_inner(): yield llm diff --git a/tests/core/test_chunked_prefill_scheduler.py b/tests/core/test_chunked_prefill_scheduler.py index f97caa06ff02d..308dad1850c9a 100644 --- a/tests/core/test_chunked_prefill_scheduler.py +++ b/tests/core/test_chunked_prefill_scheduler.py @@ -33,7 +33,8 @@ def test_simple(): num_seq_group = 4 max_model_len = 16 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig(max_num_batched_tokens, + scheduler_config = SchedulerConfig("generate", + max_num_batched_tokens, num_seq_group, max_model_len, enable_chunked_prefill=True) @@ -78,6 +79,7 @@ def test_chunk(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -126,6 +128,7 @@ def test_complex(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -196,6 +199,7 @@ def test_maximal_decoding(): max_model_len = 8 max_num_batched_tokens = 2 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -289,6 +293,7 @@ def test_prompt_limit(): 
max_model_len = 64 max_num_batched_tokens = 32 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -321,7 +326,8 @@ def test_prompt_limit_exceed(): max_seqs = 64 max_model_len = 32 max_num_batched_tokens = 64 - scheduler_config = SchedulerConfig(max_num_batched_tokens, + scheduler_config = SchedulerConfig("generate", + max_num_batched_tokens, max_seqs, max_model_len, enable_chunked_prefill=True) @@ -348,6 +354,7 @@ def test_swap(): max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap(): max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -498,6 +506,7 @@ def test_chunked_prefill_preempt(): max_model_len = 200 max_num_batched_tokens = 30 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, @@ -617,6 +627,7 @@ def test_perfix_caching(): max_model_len = 80 max_num_batched_tokens = 64 scheduler_config = SchedulerConfig( + "generate", max_num_batched_tokens, max_seqs, max_model_len, diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py index defa6c1bdaf78..00b6349b9f8c5 100644 --- a/tests/core/test_scheduler.py +++ b/tests/core/test_scheduler.py @@ -20,9 +20,10 @@ def test_scheduler_add_seq_group(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 1, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=1, ) cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config.num_cpu_blocks = 4 @@ -42,9 +43,10 @@ def test_scheduler_add_seq_group(): def test_scheduler_abort_seq_group(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 1, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=1, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 4 @@ -70,9 +72,10 @@ def test_scheduler_schedule_simple(): num_seq_group = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - num_seq_group, - max_model_len, + "generate", + max_num_batched_tokens=64, + max_num_seqs=num_seq_group, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 @@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized(): max_model_len = 30 max_batched_num_tokens = 30 scheduler_config = SchedulerConfig( - max_batched_num_tokens, - 2, - max_model_len, + "generate", + max_num_batched_tokens=max_batched_num_tokens, + max_num_seqs=2, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 @@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort(): block_size = 4 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - 2, - max_model_len, + "generate", + max_num_batched_tokens=64, + max_num_seqs=2, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 2 @@ -204,9 +209,10 @@ def test_scheduler_max_seqs(): max_seq_group = 2 max_model_len = 16 scheduler_config = SchedulerConfig( - 64, - max_seq_group, - max_model_len, + 
"generate", + max_num_batched_tokens=64, + max_num_seqs=max_seq_group, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 8 @@ -248,9 +254,10 @@ def test_scheduler_max_seqs(): def test_scheduler_delay_factor(): block_size = 4 scheduler_config = SchedulerConfig( - 100, - 64, - 16, + "generate", + max_num_batched_tokens=100, + max_num_seqs=64, + max_model_len=16, delay_factor=0.5, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") @@ -350,9 +357,10 @@ def initialize_scheduler( ): block_size = block_size scheduler_config = SchedulerConfig( - max_token_budget, - max_num_seqs, - max_model_len, + "generate", + max_num_batched_tokens=max_token_budget, + max_num_seqs=max_num_seqs, + max_model_len=max_model_len, ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = num_cpu_blocks diff --git a/tests/core/test_scheduler_encoder_decoder.py b/tests/core/test_scheduler_encoder_decoder.py index 50c047f30b80d..7cd0416d321ef 100644 --- a/tests/core/test_scheduler_encoder_decoder.py +++ b/tests/core/test_scheduler_encoder_decoder.py @@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder(): block_size = 4 num_seq_group = 4 max_model_len = 16 - scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len) + scheduler_config = SchedulerConfig( + task="generate", + max_num_batched_tokens=64, + max_num_seqs=num_seq_group, + max_model_len=max_model_len, + ) cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 09718e3d69aa5..a93cdbe1cf2a2 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -11,6 +11,7 @@ import pytest +from vllm.config import TaskOption from vllm.logger import init_logger from ..utils import compare_two_settings, fork_new_process_for_each_test @@ -27,18 +28,26 @@ class ParallelSetup(NamedTuple): chunked_prefill: bool +class PPTestOptions(NamedTuple): + multi_node_only: bool + trust_remote_code: bool + tokenizer_mode: Optional[str] + + @dataclass class PPTestSettings: parallel_setups: List[ParallelSetup] distributed_backends: List[str] - trust_remote_code: bool - tokenizer_mode: Optional[str] + task: TaskOption + test_options: PPTestOptions @staticmethod def detailed( *, tp_base: int = 1, pp_base: int = 2, + multi_node_only: bool = False, + task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -66,8 +75,10 @@ def detailed( chunked_prefill=False), ], distributed_backends=["mp", "ray"], - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, + task=task, + test_options=PPTestOptions(multi_node_only=multi_node_only, + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode), ) @staticmethod @@ -75,6 +86,8 @@ def fast( *, tp_base: int = 1, pp_base: int = 2, + task: TaskOption = "auto", + multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, ): @@ -86,15 +99,19 @@ def fast( chunked_prefill=False), ], distributed_backends=["mp"], - trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode, + task=task, + test_options=PPTestOptions(multi_node_only=multi_node_only, + trust_remote_code=trust_remote_code, + tokenizer_mode=tokenizer_mode), ) def 
iter_params(self, model_name: str): + opts = self.test_options + for parallel_setup in self.parallel_setups: for distributed_backend in self.distributed_backends: yield (model_name, parallel_setup, distributed_backend, - self.trust_remote_code, self.tokenizer_mode) + self.task, opts) # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU @@ -104,6 +121,7 @@ def iter_params(self, model_name: str): GENERATION_MODEL_SETTINGS = { # [DETAILED TESTS] "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), + "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 # [FAST TESTS] # Uses Llama # "BAAI/AquilaChat-7B": PPTestSettings.fast(), @@ -145,10 +163,8 @@ def iter_params(self, model_name: str): "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(), "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - # FIXME: https://github.com/vllm-project/vllm/issues/8553 - # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), @@ -200,6 +216,7 @@ def iter_params(self, model_name: str): # [LANGUAGE GENERATION] "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", + "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -214,19 +231,22 @@ def _compare_tp( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available: int, *, - method: Literal["generate", "encode"] = "encode", + method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup + multi_node_only, trust_remote_code, tokenizer_mode = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") if VLLM_MULTI_NODE and distributed_backend == "mp": pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") + if multi_node_only and not VLLM_MULTI_NODE: + pytest.skip("Not in multi-node setting") common_args = [ # use half precision for speed and memory savings in CI environment @@ -241,6 +261,8 @@ def _compare_tp( common_args.append("--enable-chunked-prefill") if eager_mode: common_args.append("--enforce-eager") + if task != "auto": + common_args.extend(["--task", task]) if trust_remote_code: common_args.append("--trust-remote-code") if tokenizer_mode: @@ -298,8 +320,8 @@ def _compare_tp( @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "trust_remote_code", "tokenizer_mode"), + ("model_name", "parallel_setup", "distributed_backend", "task", + "test_options"), [ params for model_name, settings in GENERATION_MODEL_SETTINGS.items() for params in settings.iter_params(model_name) @@ -311,22 +333,22 @@ def test_tp_language_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + 
test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, - trust_remote_code, - tokenizer_mode, + task, + test_options, num_gpus_available, method="generate") @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "trust_remote_code", "tokenizer_mode"), + ("model_name", "parallel_setup", "distributed_backend", "task", + "test_options"), [ params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items() for params in settings.iter_params(model_name) @@ -338,22 +360,22 @@ def test_tp_language_embedding( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, - trust_remote_code, - tokenizer_mode, + task, + test_options, num_gpus_available, method="encode") @pytest.mark.parametrize( - ("model_name", "parallel_setup", "distributed_backend", - "trust_remote_code", "tokenizer_mode"), + ("model_name", "parallel_setup", "distributed_backend", "task", + "test_options"), [ params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items() for params in settings.iter_params(model_name) @@ -365,14 +387,14 @@ def test_tp_multimodal_generation( model_name: str, parallel_setup: ParallelSetup, distributed_backend: str, - trust_remote_code: bool, - tokenizer_mode: Optional[str], + task: TaskOption, + test_options: PPTestOptions, num_gpus_available, ): _compare_tp(model_name, parallel_setup, distributed_backend, - trust_remote_code, - tokenizer_mode, + task, + test_options, num_gpus_available, method="generate") diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index 9324a737a779c..bef0c515b9073 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,8 +7,8 @@ import pytest from transformers import AutoModelForSeq2SeqLM +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close @@ -35,7 +35,7 @@ def vllm_to_hf_output( @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @pytest.mark.parametrize("enforce_eager", [True, False]) @pytest.mark.skipif( - is_cpu(), + current_platform.is_cpu(), reason="CPU backend is not currently supported with encoder/decoder models" ) def test_encoder_decoder_e2e( @@ -50,7 +50,7 @@ def test_encoder_decoder_e2e( enforce_eager: bool, ) -> None: ''' - End-to-End (E2E) test for the encoder-decoder framework. + End-to-End (E2E) test for the encoder-decoder framework. This test evaluates the encoder-decoder functionality using the BART model. We compare the outputs of the Hugging Face and vLLM implementations to ensure that both implementations produce consistent diff --git a/tests/entrypoints/llm/test_chat.py b/tests/entrypoints/llm/test_chat.py new file mode 100644 index 0000000000000..fc66386fd2d2a --- /dev/null +++ b/tests/entrypoints/llm/test_chat.py @@ -0,0 +1,92 @@ +from typing import List + +import pytest + +from vllm import LLM + +from ..openai.test_vision import TEST_IMAGE_URLS + + +def test_chat(): + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") + + prompt1 = "Explain the concept of entropy." 
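+    # Build an OpenAI-style conversation; llm.chat applies the model's chat
+    # template to these messages before generating a response.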
+ messages = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt1 + }, + ] + outputs = llm.chat(messages) + assert len(outputs) == 1 + + +def test_multi_chat(): + llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct") + + prompt1 = "Explain the concept of entropy." + prompt2 = "Explain what among us is." + + conversation1 = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt1 + }, + ] + + conversation2 = [ + { + "role": "system", + "content": "You are a helpful assistant" + }, + { + "role": "user", + "content": prompt2 + }, + ] + + messages = [conversation1, conversation2] + + outputs = llm.chat(messages) + assert len(outputs) == 2 + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(image_urls: List[str]): + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + dtype="bfloat16", + max_model_len=4096, + max_num_seqs=5, + enforce_eager=True, + trust_remote_code=True, + limit_mm_per_prompt={"image": 2}, + ) + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "image_url", + "image_url": { + "url": image_url + } + } for image_url in image_urls), + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + outputs = llm.chat(messages) + assert len(outputs) >= 0 diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index 1885f2e168d80..4c9f796e5ed71 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -4,8 +4,7 @@ import pytest from vllm import LLM, EmbeddingRequestOutput, PoolingParams - -from ...conftest import cleanup +from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -41,7 +40,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() def assert_outputs_equal(o1: List[EmbeddingRequestOutput], diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index 6543c4bb1b58e..7d2b377752725 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -4,9 +4,7 @@ import pytest from vllm import LLM, RequestOutput, SamplingParams - -from ...conftest import cleanup -from ..openai.test_vision import TEST_IMAGE_URLS +from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "facebook/opt-125m" @@ -40,7 +38,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): @@ -104,90 +102,3 @@ def test_multiple_sampling_params(llm: LLM): # sampling_params is None, default params should be applied outputs = llm.generate(PROMPTS, sampling_params=None) assert len(PROMPTS) == len(outputs) - - -def test_chat(): - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - - prompt1 = "Explain the concept of entropy." - messages = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, - ] - outputs = llm.chat(messages) - assert len(outputs) == 1 - - -def test_multi_chat(): - - llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") - - prompt1 = "Explain the concept of entropy." - prompt2 = "Explain what among us is." 
- - conversation1 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt1 - }, - ] - - conversation2 = [ - { - "role": "system", - "content": "You are a helpful assistant" - }, - { - "role": "user", - "content": prompt2 - }, - ] - - messages = [conversation1, conversation2] - - outputs = llm.chat(messages) - assert len(outputs) == 2 - - -@pytest.mark.parametrize("image_urls", - [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) -def test_chat_multi_image(image_urls: List[str]): - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - dtype="bfloat16", - max_model_len=4096, - max_num_seqs=5, - enforce_eager=True, - trust_remote_code=True, - limit_mm_per_prompt={"image": 2}, - ) - - messages = [{ - "role": - "user", - "content": [ - *({ - "type": "image_url", - "image_url": { - "url": image_url - } - } for image_url in image_urls), - { - "type": "text", - "text": "What's in this image?" - }, - ], - }] - outputs = llm.chat(messages) - assert len(outputs) >= 0 diff --git a/tests/entrypoints/llm/test_generate_multiple_loras.py b/tests/entrypoints/llm/test_generate_multiple_loras.py index 9f5727ecd0406..eb2113692e7b4 100644 --- a/tests/entrypoints/llm/test_generate_multiple_loras.py +++ b/tests/entrypoints/llm/test_generate_multiple_loras.py @@ -5,10 +5,9 @@ from huggingface_hub import snapshot_download from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from ...conftest import cleanup - MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" PROMPTS = [ @@ -39,7 +38,7 @@ def llm(): del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.fixture(scope="module") diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 2841dfc6bd9c2..67c79415f322a 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -5,12 +5,11 @@ import jsonschema import pytest +from vllm.distributed import cleanup_dist_env_and_memory from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.sampling_params import GuidedDecodingParams, SamplingParams -from ...conftest import cleanup - MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" @@ -23,7 +22,7 @@ def llm(): with llm.deprecate_legacy_api(): yield weakref.proxy(llm) del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.mark.skip_global_cleanup diff --git a/tests/entrypoints/llm/test_init.py b/tests/entrypoints/llm/test_init.py new file mode 100644 index 0000000000000..c9a4ad44fea30 --- /dev/null +++ b/tests/entrypoints/llm/test_init.py @@ -0,0 +1,22 @@ +import pytest + +from vllm import LLM + +from ...utils import error_on_warning + +MODEL_NAME = "facebook/opt-125m" + + +def test_pos_args_deprecated(): + with error_on_warning(DeprecationWarning): + LLM(model=MODEL_NAME, tokenizer=MODEL_NAME) + + with error_on_warning(DeprecationWarning): + LLM(MODEL_NAME, tokenizer=MODEL_NAME) + + with pytest.warns(DeprecationWarning, match="'tokenizer'"): + LLM(MODEL_NAME, MODEL_NAME) + + with pytest.warns(DeprecationWarning, + match="'tokenizer', 'tokenizer_mode'"): + LLM(MODEL_NAME, MODEL_NAME, "auto") diff --git a/tests/entrypoints/llm/test_lazy_outlines.py b/tests/entrypoints/llm/test_lazy_outlines.py index 010969ad4750d..cbfb0cc32c1ce 100644 --- a/tests/entrypoints/llm/test_lazy_outlines.py +++ b/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,6 +1,7 @@ import sys from vllm import LLM, SamplingParams +from vllm.distributed 
import cleanup_dist_env_and_memory def test_lazy_outlines(sample_regex): @@ -14,6 +15,7 @@ def test_lazy_outlines(sample_regex): ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + # Create an LLM without guided decoding as a baseline. llm = LLM(model="facebook/opt-125m", enforce_eager=True, gpu_memory_utilization=0.3) @@ -26,8 +28,11 @@ def test_lazy_outlines(sample_regex): # make sure outlines is not imported assert 'outlines' not in sys.modules - # The second LLM needs to request a higher gpu_memory_utilization because - # the first LLM has already allocated a full 30% of the gpu memory. + # Destroy the LLM object and free up the GPU memory. + del llm + cleanup_dist_env_and_memory() + + # Create an LLM with guided decoding enabled. llm = LLM(model="facebook/opt-125m", enforce_eager=True, guided_decoding_backend="lm-format-enforcer", diff --git a/tests/entrypoints/offline_mode/test_offline_mode.py b/tests/entrypoints/offline_mode/test_offline_mode.py index fe40af271c1cd..65699e609e4a8 100644 --- a/tests/entrypoints/offline_mode/test_offline_mode.py +++ b/tests/entrypoints/offline_mode/test_offline_mode.py @@ -1,51 +1,56 @@ """Tests for HF_HUB_OFFLINE mode""" import importlib import sys -import weakref import pytest from vllm import LLM - -from ...conftest import cleanup - -MODEL_NAME = "facebook/opt-125m" +from vllm.distributed import cleanup_dist_env_and_memory + +MODEL_CONFIGS = [ + { + "model": "facebook/opt-125m", + "enforce_eager": True, + "gpu_memory_utilization": 0.20, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + }, + { + "model": "mistralai/Mistral-7B-Instruct-v0.1", + "enforce_eager": True, + "gpu_memory_utilization": 0.95, + "max_model_len": 64, + "max_num_batched_tokens": 64, + "max_num_seqs": 64, + "tensor_parallel_size": 1, + "tokenizer_mode": "mistral", + }, +] @pytest.fixture(scope="module") -def llm(): - # pytest caches the fixture so we use weakref.proxy to - # enable garbage collection - llm = LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.10, - enforce_eager=True) - - with llm.deprecate_legacy_api(): - yield weakref.proxy(llm) +def cache_models(): + # Cache model files first + for model_config in MODEL_CONFIGS: + LLM(**model_config) + cleanup_dist_env_and_memory() - del llm - - cleanup() + yield @pytest.mark.skip_global_cleanup -def test_offline_mode(llm: LLM, monkeypatch): - # we use the llm fixture to ensure the model files are in-cache - del llm - +@pytest.mark.usefixtures("cache_models") +def test_offline_mode(monkeypatch): # Set HF to offline mode and ensure we can still construct an LLM try: monkeypatch.setenv("HF_HUB_OFFLINE", "1") # Need to re-import huggingface_hub and friends to setup offline mode _re_import_modules() # Cached model files should be used in offline mode - LLM(model=MODEL_NAME, - max_num_batched_tokens=4096, - tensor_parallel_size=1, - gpu_memory_utilization=0.20, - enforce_eager=True) + for model_config in MODEL_CONFIGS: + LLM(**model_config) finally: # Reset the environment after the test # NB: Assuming tests are run in online mode diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py index 3af0032fd2fb0..d1aebbd70d256 100644 --- a/tests/entrypoints/openai/test_chat.py +++ b/tests/entrypoints/openai/test_chat.py @@ -16,9 +16,6 @@ # any model with a chat template should work here MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" -# technically this needs Mistral-7B-v0.1 as base, 
but we're not testing
-# generation quality here
-LORA_NAME = "typeof/zephyr-7b-beta-lora"


@pytest.fixture(scope="module")
@@ -851,14 +848,28 @@ async def test_response_format_json_object(client: openai.AsyncOpenAI):

@pytest.mark.asyncio
async def test_response_format_json_schema(client: openai.AsyncOpenAI):
+    prompt = 'what is 1+1? The format is "result": 2'
+    # Check that this prompt does not produce valid JSON without json_schema
    for _ in range(2):
        resp = await client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{
-                "role":
-                "user",
-                "content": ('what is 1+1? please respond with a JSON object, '
-                            'the format is {"result": 2}')
+                "role": "user",
+                "content": prompt
            }],
+        )
+        content = resp.choices[0].message.content
+        assert content is not None
+        with pytest.raises((json.JSONDecodeError, AssertionError)):
+            loaded = json.loads(content)
+            assert loaded == {"result": 2}, loaded
+
+    for _ in range(2):
+        resp = await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=[{
+                "role": "user",
+                "content": prompt
            }],
            response_format={
                "type": "json_schema",
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index cc72a49ebbbda..f03bdb045f640 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -340,6 +340,40 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
    assert "".join(chunks) == single_output


+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME, "zephyr-lora", "zephyr-pa"],
+)
+async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
+    """Streaming for parallel sampling.
+    The tokens from multiple samples are flattened into a single stream,
+    with an index to indicate which sample each token belongs to.
+    """
+
+    prompt = "What is an LLM?"
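+    # Each streamed chunk carries choices[0].index in [0, n); the loop below
+    # uses it to demultiplex the flattened stream into per-sample buffers.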
+ n = 3 + max_tokens = 5 + + stream = await client.completions.create(model=model_name, + prompt=prompt, + max_tokens=max_tokens, + n=n, + stream=True) + chunks: List[List[str]] = [[] for i in range(n)] + finish_reason_count = 0 + async for chunk in stream: + index = chunk.choices[0].index + text = chunk.choices[0].text + chunks[index].append(text) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert finish_reason_count == n + for chunk in chunks: + assert len(chunk) == max_tokens + print("".join(chunk)) + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name", diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index ec550fe82c70f..d9342fad9f018 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -22,12 +22,12 @@ class MockHFConfig: @dataclass class MockModelConfig: + task = "generate" tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" max_model_len = 100 tokenizer_revision = None - embedding_mode = False multimodal_config = MultiModalConfig() hf_config = MockHFConfig() diff --git a/tests/entrypoints/openai/test_shutdown.py b/tests/entrypoints/openai/test_shutdown.py index 25ab91ef69333..6fcc92022855b 100644 --- a/tests/entrypoints/openai/test_shutdown.py +++ b/tests/entrypoints/openai/test_shutdown.py @@ -6,7 +6,7 @@ from ...utils import RemoteOpenAIServer -MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" +MODEL_NAME = "meta-llama/Llama-3.2-1B" @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 81d79601124a7..8311a5cb3c2d4 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -23,6 +23,8 @@ @pytest.fixture(scope="module") def server(): args = [ + "--task", + "generate", "--dtype", "bfloat16", "--max-model-len", diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 6ded5102c9314..1d8c328b73259 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -18,7 +18,8 @@ @pytest.fixture(scope="module") def phi3v_model_config(): return ModelConfig(PHI3V_MODEL_ID, - PHI3V_MODEL_ID, + task="generate", + tokenizer=PHI3V_MODEL_ID, tokenizer_mode="auto", trust_remote_code=True, dtype="bfloat16", @@ -387,3 +388,29 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( "text": "What about these two?" }] }], phi3v_model_config, phi3v_tokenizer) + + +def test_parse_chat_messages_multiple_images_uncommon_input( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" 
+ }] + _assert_mm_data_is_image_input(mm_data, 2) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index f471dcee938be..8bcee98403775 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -19,22 +19,23 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.is_cpu", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + with patch("vllm.attention.selector.current_platform.is_cpu", + return_value=True): + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "TORCH_SDPA" elif device == "hip": with patch("vllm.attention.selector.is_hip", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.is_openvino", return_value=True): - backend = which_attn_to_use(16, None, torch.float16, torch.float16, - 16, False) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == name @@ -46,37 +47,32 @@ def test_flash_attn(monkeypatch): # Unsupported CUDA arch with patch("torch.cuda.get_device_capability", return_value=(7, 5)): - backend = which_attn_to_use(16, None, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported data type - backend = which_attn_to_use(16, None, torch.float8_e4m3fn, None, 16, False) + backend = which_attn_to_use(16, torch.float8_e4m3fn, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported kv cache data type - backend = which_attn_to_use(16, None, torch.float16, "fp8", 16, False) + backend = which_attn_to_use(16, torch.float16, "fp8", 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported block size - backend = which_attn_to_use(16, None, torch.float16, None, 8, False) - assert backend.name != STR_FLASH_ATTN_VAL - - # Unsupported sliding window - backend = which_attn_to_use(16, 1, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 8, False) assert backend.name != STR_FLASH_ATTN_VAL # flash-attn is not installed with patch.dict('sys.modules', {'vllm_flash_attn': None}): - backend = which_attn_to_use(16, None, torch.float16, None, 16, False) + backend = which_attn_to_use(16, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Unsupported head size - backend = which_attn_to_use(17, None, torch.float16, None, 16, False) + backend = which_attn_to_use(17, torch.float16, None, 16, False) assert backend.name != STR_FLASH_ATTN_VAL # Attention-free models should bypass env and use PlaceholderAttention - backend = which_attn_to_use(16, None, torch.float16, torch.float16, 16, - True) + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, True) assert backend.name != STR_FLASH_ATTN_VAL @@ -84,4 +80,4 @@ def test_invalid_env(monkeypatch): """Throw an exception if the backend name is invalid.""" override_backend_env_variable(monkeypatch, STR_INVALID_VAL) with 
pytest.raises(ValueError): - which_attn_to_use(16, None, torch.float16, None, 16, False) + which_attn_to_use(16, torch.float16, None, 16, False) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 3e9b4d9a4f8a0..35c29c5bd1028 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -78,6 +78,7 @@ def ref_paged_attn( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("sliding_window", [None, 256]) @torch.inference_mode() def test_flash_attn_with_paged_kv( kv_lens: List[int], @@ -87,6 +88,7 @@ def test_flash_attn_with_paged_kv( block_size: int, soft_cap: Optional[float], num_blocks: int, + sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") seed_everything(0) @@ -96,6 +98,8 @@ def test_flash_attn_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_kv_len = max(kv_lens) scale = head_size**-0.5 + window_size = ((sliding_window - 1, 0) if sliding_window is not None else + (-1, -1)) query = torch.randn(num_seqs, num_query_heads, head_size, dtype=dtype) key_cache = torch.randn(num_blocks, @@ -121,18 +125,18 @@ def test_flash_attn_with_paged_kv( block_table=block_tables, cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, + window_size=window_size, ).squeeze(1) - ref_output = ref_paged_attn( - query=query, - key_cache=key_cache, - value_cache=value_cache, - query_lens=[1] * num_seqs, - kv_lens=kv_lens, - block_tables=block_tables, - scale=scale, - soft_cap=soft_cap, - ) + ref_output = ref_paged_attn(query=query, + key_cache=key_cache, + value_cache=value_cache, + query_lens=[1] * num_seqs, + kv_lens=kv_lens, + block_tables=block_tables, + scale=scale, + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=2e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -141,7 +145,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("sliding_window", [None]) +@pytest.mark.parametrize("sliding_window", [None, 256]) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @@ -166,8 +170,7 @@ def test_varlen_with_paged_kv( assert num_query_heads % num_kv_heads == 0 max_query_len = max(query_lens) max_kv_len = max(kv_lens) - window_size = ((sliding_window, - sliding_window) if sliding_window is not None else + window_size = ((sliding_window - 1, 0) if sliding_window is not None else (-1, -1)) scale = head_size**-0.5 diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 405c0d0efad65..e40f0dd74602e 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -1,20 +1,16 @@ -import contextlib -import gc import tempfile from collections import OrderedDict from typing import Dict, List, TypedDict from unittest.mock import MagicMock, patch import pytest -import ray import torch import torch.nn as nn from huggingface_hub import snapshot_download import vllm from vllm.config import LoRAConfig -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel, +from vllm.distributed import (cleanup_dist_env_and_memory, init_distributed_environment, initialize_model_parallel) from vllm.model_executor.layers.linear 
import (ColumnParallelLinear, @@ -48,16 +44,6 @@ class ContextInfo(TypedDict): }] -def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - gc.collect() - torch.cuda.empty_cache() - ray.shutdown() - - @pytest.fixture() def should_do_global_cleanup_after_test(request) -> bool: """Allow subdirectories to skip global cleanup by overriding this fixture. @@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool: def cleanup_fixture(should_do_global_cleanup_after_test: bool): yield if should_do_global_cleanup_after_test: - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture @@ -87,7 +73,7 @@ def dist_init(): ) initialize_model_parallel(1, 1) yield - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture @@ -238,7 +224,7 @@ def long_context_lora_files_32k(): def long_context_infos(long_context_lora_files_16k_1, long_context_lora_files_16k_2, long_context_lora_files_32k): - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) infos: Dict[int, ContextInfo] = {} for lora_checkpoint_info in LONG_LORA_INFOS: lora_id = lora_checkpoint_info["lora_id"] @@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1, @pytest.fixture def llama_2_7b_engine_extra_embeddings(): - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) get_model_old = get_model def get_model_patched(*, model_config, device_config, **kwargs): @@ -272,7 +258,7 @@ def get_model_patched(*, model_config, device_config, **kwargs): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) yield engine.llm_engine del engine - cleanup() + cleanup_dist_env_and_memory(shutdown_ray=True) @pytest.fixture diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py index cbc3668997817..0ba2ce3617b67 100644 --- a/tests/lora/test_baichuan.py +++ b/tests/lora/test_baichuan.py @@ -3,10 +3,9 @@ import pytest import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from .conftest import cleanup - MODEL_PATH = "baichuan-inc/Baichuan-7B" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 @@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 @@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) del llm_tp4 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp4 diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py index ad8490353998f..e2a4f1ed0496a 100644 --- a/tests/lora/test_llama.py +++ b/tests/lora/test_llama.py @@ -4,10 +4,9 @@ import ray import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from .conftest import cleanup - MODEL_PATH = "meta-llama/Llama-2-7b-hf" @@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM(MODEL_PATH, enable_lora=True, @@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 @@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) del llm_tp4 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp4 diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index 389a3ccbc17ec..c8edb02a88d4b 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -28,9 +28,15 @@ def _create_lora_request(lora_id, long_context_infos): context_len = long_context_infos[lora_id]["context_length"] scaling_factor = context_len_to_scaling_factor[context_len] - return LoRARequest(context_len, lora_id, - long_context_infos[lora_id]["lora"], None, - 4096 * scaling_factor) + return LoRARequest( + # There are 2 LoRAs for 16K, we need to add lora_id to indicate + # they are different LoRAs. 
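+            # (e.g. two 16k LoRAs get distinct names like "16k1" and "16k2";
+            # the exact strings depend on LONG_LORA_INFOS)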
+ context_len + str(lora_id), + lora_id, + long_context_infos[lora_id]["lora"], + None, + 4096 * scaling_factor, + ) def evaluate_json_response(model_response, golden_response): @@ -108,14 +114,17 @@ def lora_llm(long_context_infos): for info in long_context_infos.values() ] - llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf", - enable_lora=True, - max_num_seqs=16, - max_loras=2, - long_lora_scaling_factors=tuple(scaling_factors), - max_num_batched_tokens=4096 * 8, - tensor_parallel_size=4, - distributed_executor_backend="mp") + llm = vllm.LLM( + "meta-llama/Llama-2-13b-chat-hf", + enable_lora=True, + max_num_seqs=16, + max_loras=2, + long_lora_scaling_factors=tuple(scaling_factors), + max_num_batched_tokens=4096 * 8, + tensor_parallel_size=4, + # FIXME enable async output processor + disable_async_output_proc=True, + distributed_executor_backend="mp") yield llm del llm diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index 81b8188e638c9..be040060d02b2 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -61,6 +61,7 @@ def test_minicpmv_lora(minicpmv_lora_files): max_loras=4, max_lora_rank=64, trust_remote_code=True, + gpu_memory_utilization=0.97 # This model is pretty big for CI gpus ) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 5636c96435024..d004c65929418 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -6,11 +6,10 @@ import pytest import vllm +from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest from vllm.utils import is_hip -from .conftest import cleanup - @dataclass class ModelWithQuantization: @@ -160,7 +159,7 @@ def expect_match(output, expected_output): print("removing lora") del llm - cleanup() + cleanup_dist_env_and_memory() @pytest.mark.parametrize("model", MODELS) @@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) del llm_tp1 - cleanup() + cleanup_dist_env_and_memory() llm_tp2 = vllm.LLM( model=model.model_path, @@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) del llm_tp2 - cleanup() + cleanup_dist_env_and_memory() assert output_tp1 == output_tp2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 732e91a52c0a9..2f7ac85507425 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files): worker = Worker( model_config=ModelConfig( "meta-llama/Llama-2-7b-hf", - "meta-llama/Llama-2-7b-hf", + task="auto", + tokenizer="meta-llama/Llama-2-7b-hf", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files): load_format="dummy", ), parallel_config=ParallelConfig(1, 1, False), - scheduler_config=SchedulerConfig(32, 32, 32), + scheduler_config=SchedulerConfig("generate", 32, 32, 32), device_config=DeviceConfig("cuda"), cache_config=CacheConfig(block_size=16, gpu_memory_utilization=1., diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 8798ff078843a..7a361ef320810 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -6,13 +6,12 @@ from prometheus_client import REGISTRY from vllm import EngineArgs, LLMEngine +from vllm.distributed import 
cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams

-from ..conftest import cleanup
-
MODELS = [
    "facebook/opt-125m",
]
@@ -85,6 +84,45 @@ def test_metric_counter_generation_tokens(
                                    f"metric: {metric_count!r}")


+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [128, 129])
+@pytest.mark.parametrize("disable_async_output_proc", [True, False])
+def test_metric_counter_generation_tokens_multi_step(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    disable_async_output_proc: bool,
+) -> None:
+    num_scheduler_steps = 8
+    with vllm_runner(
+            model,
+            disable_log_stats=False,
+            gpu_memory_utilization=0.4,
+            num_scheduler_steps=num_scheduler_steps,
+            disable_async_output_proc=disable_async_output_proc,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        tokenizer = vllm_model.model.get_tokenizer()
+        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
+            **stat_logger.labels)._value.get()
+        vllm_generation_count = 0
+        for i in range(len(example_prompts)):
+            vllm_output_ids, vllm_output_str = vllm_outputs[i]
+            prompt_ids = tokenizer.encode(example_prompts[i])
+            # vllm_output_ids contains both prompt tokens and generation tokens.
+            # We're interested only in the count of the generation tokens.
+            vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
+
+        # Multi-step scheduling continues to execute forward passes even after
+        # encountering EOS, so the metrics are slightly imprecise.
+        assert abs(vllm_generation_count - metric_count) <\
+            len(example_prompts) * num_scheduler_steps, \
+            (f"generation token count: {vllm_generation_count!r}\n"
+             f"metric: {metric_count!r}")
+
+
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
@@ -307,7 +345,7 @@ def test_metric_spec_decode_interval(
    finally:
        del engine
-        cleanup()
+        cleanup_dist_env_and_memory()


def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py
index fcc158639748d..75625b35209ce 100644
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -21,10 +21,14 @@
]

if not current_platform.is_cpu():
-    # MiniCPM requires fused_moe which is not supported by CPU
-    MODELS.append("openbmb/MiniCPM3-4B")
+    MODELS += [
+        # MiniCPM requires fused_moe, which is not supported on CPU
+        "openbmb/MiniCPM3-4B",
+        # Head size isn't supported on CPU
+        "h2oai/h2o-danube3-4b-base",
+    ]

-#TODO: remove this after CPU float16 support ready
+# TODO: remove this after CPU float16 support ready
target_dtype = "float" if current_platform.is_cpu() else "half"
diff --git a/tests/models/decoder_only/language/test_danube3_4b.py b/tests/models/decoder_only/language/test_danube3_4b.py
deleted file mode 100644
index bdd498edc293d..0000000000000
--- a/tests/models/decoder_only/language/test_danube3_4b.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests danube3 separately because its head size isn't supported on CPU yet.
-
-Run `pytest tests/models/test_danube3_4b.py`.
-""" -import pytest - -from ...utils import check_outputs_equal - -MODELS = ["h2oai/h2o-danube3-4b-base"] - -target_dtype = "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index c27bf6a60a4f4..2dc231c595ffa 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -10,7 +10,7 @@ from ...utils import check_outputs_equal -MODELS = ["state-spaces/mamba-130m-hf"] +MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"] # Use lower-level interfaces to create this greedy generator, as mamba will diff --git a/tests/models/decoder_only/language/test_phimoe.py b/tests/models/decoder_only/language/test_phimoe.py index 89afbcf1c03ac..c997359a2781e 100644 --- a/tests/models/decoder_only/language/test_phimoe.py +++ b/tests/models/decoder_only/language/test_phimoe.py @@ -5,7 +5,7 @@ import pytest import torch -from vllm.utils import is_cpu +from vllm.platforms import current_platform from ....utils import large_gpu_test from ...utils import check_logprobs_close @@ -70,7 +70,7 @@ def test_phimoe_routing_function(): assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) -@pytest.mark.skipif(condition=is_cpu(), +@pytest.mark.skipif(condition=current_platform.is_cpu(), reason="This test takes a lot time to run on CPU, " "and vllm CI's disk space is not enough for this model.") @large_gpu_test(min_gb=80) diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py index 7827ecb19a744..1affcd10ee72d 100644 --- a/tests/models/decoder_only/vision_language/test_fuyu.py +++ b/tests/models/decoder_only/vision_language/test_fuyu.py @@ -3,8 +3,8 @@ import pytest from vllm.multimodal.utils import rescale_image_size +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ...utils import check_logprobs_close @@ -46,7 +46,7 @@ def run_test( All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. 
@@ -103,7 +103,7 @@ def run_test(

target_dtype = "half"
-if is_cpu():
+if current_platform.is_cpu():
    target_dtype = "bfloat16"

diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py
index 3c3b95b38baac..98f313eb9b9af 100644
--- a/tests/models/decoder_only/vision_language/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
@@ -6,7 +6,7 @@
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor

-from ....conftest import _ImageAssets, cleanup
+from ....conftest import _ImageAssets

# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
@@ -45,12 +45,13 @@ def run_intern_vit_test(
        for pixel_value in pixel_values
    ]

+    from vllm.distributed import cleanup_dist_env_and_memory
    from vllm.model_executor.models.intern_vit import InternVisionModel
    vllm_model = InternVisionModel(config)
    vllm_model.load_weights(hf_model.state_dict().items())

    del hf_model
-    cleanup()
+    cleanup_dist_env_and_memory()

    vllm_model = vllm_model.to("cuda", dtype)
    vllm_outputs_per_image = [
@@ -58,7 +59,7 @@ def run_intern_vit_test(
        for pixel_value in pixel_values
    ]
    del vllm_model
-    cleanup()
+    cleanup_dist_env_and_memory()

    cos_similar = nn.CosineSimilarity(dim=-1)
    for vllm_output, hf_output in zip(vllm_outputs_per_image,
diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_internvl.py
index 49cab75d8ea53..fc842ec4a6171 100644
--- a/tests/models/decoder_only/vision_language/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/test_internvl.py
@@ -7,7 +7,6 @@
from transformers import AutoConfig

from vllm.multimodal.utils import rescale_image_size
-from vllm.utils import is_cpu

from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
@@ -19,15 +18,20 @@
    "cherry_blossom":
    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})

-HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501

models = [
    "OpenGVLab/InternVL2-1B",
    "OpenGVLab/InternVL2-2B",
+    # NOTE: Mono-InternVL-2B doesn't work with fp16;
+    # it will result in NaN during inference.
+    # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
+    "OpenGVLab/Mono-InternVL-2B",
    # Broken due to outdated implementation of Phi-3
    # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
    # "OpenGVLab/InternVL2-4B",
]
+target_dtype = "bfloat16"


# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
@@ -52,9 +56,15 @@ def generate(

        input_embeds = input_embeds.reshape(B, N, C)

-        outputs = self.language_model.generate(
+        forward_kwargs = dict(
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
+        )
+        if getattr(self, "use_visual_token_mask", False):
+            visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
+            forward_kwargs["visual_token_mask"] = visual_token_mask
+        outputs = self.language_model.generate(
+            **forward_kwargs,
            **generate_kwargs,
        )

@@ -78,7 +88,7 @@ def run_test(
    All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. @@ -243,11 +253,6 @@ def run_awq_test( ) -target_dtype = "half" -if is_cpu(): - target_dtype = "bfloat16" - - @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index 12e8a961877cd..dfe10629f1c66 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -10,8 +10,9 @@ from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from vllm.multimodal import MultiModalRegistry from vllm.multimodal.utils import rescale_image_size +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_cpu, is_hip +from vllm.utils import is_hip from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, _ImageAssets) @@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, target_dtype = "half" -if is_cpu(): +if current_platform.is_cpu(): target_dtype = "bfloat16" # ROCm Triton FA can run into shared memory issues with these models, @@ -89,6 +90,7 @@ def run_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, + task="generate", max_model_len=4096, max_num_seqs=2, dtype=dtype, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new file mode 100644 index 0000000000000..d3de5fb26d4b8 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,160 @@ +from typing import Any, Dict, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from ....conftest import _ImageAssets +from ...utils import build_model_context + +MODEL = "Qwen/Qwen2-VL-2B-Instruct" +MIN_PIXELS = "min_pixels" +MAX_PIXELS = "max_pixels" + + +# Fixtures lazy import to avoid initializing CUDA during test collection +# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple +# input mappers. 
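+# Each fixture below therefore defers the vllm.model_executor.models.qwen2_vl
+# import into the test body, so test collection never touches CUDA.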
+@pytest.fixture() +def image_input_mapper_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + image_input_mapper_for_qwen2_vl) + return image_input_mapper_for_qwen2_vl + + +@pytest.fixture() +def input_processor_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + input_processor_for_qwen2_vl) + return input_processor_for_qwen2_vl + + +@pytest.fixture() +def qwen2_vl_context() -> InputContext: + return build_model_context(model_name=MODEL) + + +@pytest.fixture() +def get_max_qwen2_vl_image_tokens(): + from vllm.model_executor.models.qwen2_vl import ( + get_max_qwen2_vl_image_tokens) + return get_max_qwen2_vl_image_tokens + + +@pytest.fixture() +def dummy_data_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl + return dummy_data_for_qwen2_vl + + +@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ + ({}, 1225), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324), +]) +def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + expected_max_tokens: int): + """Ensure that the max token calc handles min/max pixels properly.""" + actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context, + **mm_processor_kwargs) + assert actual_max_tokens == expected_max_tokens + + +@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [ + [{}, 1225, (980, 980)], + [{ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324, (504, 504)], +]) +def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + token_count: int, img_size: Tuple[int, int]): + """Ensure that the dummy data handles min/max pixels properly.""" + seq_len = 3000 + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + # NOTE: video value is required, but isn't actually used + # when making the dummy data except for error handling currently + seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, { + "image": 1, + "video": 0 + }, **mm_processor_kwargs) + + # Ensure we have the right number of placeholders for min/max pixel values + assert seq_data.get_token_ids().count(image_token_id) == token_count + + # Ensure the images were resized correctly + image = mm_data["image"] + assert isinstance(image, Image) + assert image.size == img_size + + +@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [ + ({}, 1426), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 330), +]) +def test_input_processor(input_processor_for_qwen2_vl, + qwen2_vl_context: InputContext, + image_assets: _ImageAssets, num_placeholders: int, + mm_processor_kwargs: Dict[str, Any]): + """Ensure that the image processor handles min/max pixels properly.""" + tokenizer = AutoTokenizer.from_pretrained(MODEL) + prompt = "<|vision_start|><|image_pad|><|vision_end|>" + + image = image_assets[0].pil_image + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": [image]}) + + processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs, + **mm_processor_kwargs) + assert processed_inputs["prompt_token_ids"].count( + image_token_id) == num_placeholders + assert len(processed_inputs["multi_modal_data"]["image"]) == 1 + + +@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [ + ({}, [5704, 1176]), 
+    ({
+        MIN_PIXELS: 64**2,
+        MAX_PIXELS: 512**2
+    }, [1320, 1176]),
+])
+def test_image_mapper_override(qwen2_vl_context: InputContext,
+                               image_assets: _ImageAssets,
+                               mm_processor_kwargs: Dict[str, Any],
+                               pixels_shape: Tuple[int, int]):
+    """Ensure that the image mapper handles min/max pixels properly."""
+    mm_registry = MultiModalRegistry()
+    mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config)
+
+    image = image_assets[0].pil_image
+
+    mapped_output = mm_registry.map_input(
+        qwen2_vl_context.model_config,
+        {"image": image},
+        mm_processor_kwargs=mm_processor_kwargs,
+    )
+
+    # Dimension 0 of pixel values should match the product of image_grid_thw
+    actual_pixels_shape = mapped_output["pixel_values"].shape
+    assert list(actual_pixels_shape) == pixels_shape
+    assert actual_pixels_shape[0] == torch.prod(
+        mapped_output["image_grid_thw"])
diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py
index 2fcc2013d91ef..fd1c44d9c117e 100644
--- a/tests/models/embedding/utils.py
+++ b/tests/models/embedding/utils.py
@@ -16,7 +16,8 @@ def check_embeddings_close(

    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
-        assert len(embeddings_0) == len(embeddings_1)
+        assert len(embeddings_0) == len(embeddings_1), (
+            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

        sim = F.cosine_similarity(torch.tensor(embeddings_0),
                                  torch.tensor(embeddings_1),
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
new file mode 100644
index 0000000000000..52aef8c34d6f3
--- /dev/null
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -0,0 +1,135 @@
+from typing import List, Type
+
+import pytest
+import torch.nn.functional as F
+from transformers import AutoModelForVision2Seq
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ..utils import check_embeddings_close
+
+llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
+
+HF_TEXT_PROMPTS = [
+    # T -> X
+    llama3_template.format(
+        "The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
+    ),
+    # T -> X
+    llama3_template.format(
+        "cherry blossom\nSummary above sentence in one word: "),
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    # I -> X
+    "stop_sign":
+    llama3_template.format("<image>\nSummary above image in one word: "),
+    # I -> X
+    "cherry_blossom":
+    llama3_template.format("<image>\nSummary above image in one word: "),
+})
+
+MODELS = ["royokong/e5-v"]
+
+
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    input_texts: List[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
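+    # (CUDA cannot be re-initialized in a process forked after CUDA init,
+    # which is why the order matters here.)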
+ with vllm_runner(model, + task="embedding", + dtype=dtype, + max_model_len=4096, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.encode(input_texts, images=input_images) + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForVision2Seq) as hf_model: + # Patch the issue where image_token_id + # exceeds the maximum allowed vocab size + hf_model.model.resize_token_embeddings( + hf_model.model.language_model.vocab_size + 1) + + all_inputs = hf_model.get_inputs(input_texts, images=input_images) + + all_outputs = [] + for inputs in all_inputs: + # Based on: https://huggingface.co/royokong/e5-v + outputs = hf_model.model( + **hf_model.wrap_device(inputs, + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], + dim=-1) + + all_outputs.append(pooled_output.tolist()) + + hf_outputs = all_outputs + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + ) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index ea6b56cd02625..ee411472ba284 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,41 +1,53 @@ +from typing import List, Type + import pytest import torch.nn.functional as F -from ....conftest import IMAGE_ASSETS +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test from ..utils import check_embeddings_close +HF_TEXT_PROMPTS = [ + # T -> X + "Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501 + # T -> X + "Retrieve an image of this caption: cherry blossom", +] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + # T + I -> X "stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501 + # I -> X "cherry_blossom": - "<|image_1|> Represent the given image with the following question: What is in the image", # noqa: E501 + "<|image_1|> Represent the given image for classification", # noqa: E501 }) MODELS = ["TIGER-Lab/VLM2Vec-Full"] -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, +def _run_test( + hf_runner: 
Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, model: str, + *, dtype: str, ) -> None: # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - max_model_len=4096, - max_num_seqs=2, - dtype=dtype, + with vllm_runner(model, task="embedding", dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.encode(input_texts, images=input_images) - with hf_runner(model, dtype=dtype) as hf_model: - all_inputs = hf_model.get_inputs(example_prompts) + # use eager mode for hf runner, since phi3_v didn't work with flash_attn + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: + all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_outputs = [] for inputs in all_inputs: @@ -60,3 +72,53 @@ def test_models( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + ) diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py new file mode 100644 index 0000000000000..483773f069133 --- /dev/null +++ b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -0,0 +1,102 @@ +from functools import partial +from typing import List, Optional, Tuple, Type + +import pytest +from PIL import Image + +from vllm.inputs.data import ExplicitEncoderDecoderPrompt +from vllm.sequence import SampleLogprobs + +from ....conftest import HfRunner, VllmRunner +from ...utils import check_logprobs_close + +Florence2Prompt = partial(ExplicitEncoderDecoderPrompt, + decoder_prompt=None, + mm_processor_kwargs=None) + +MODELS = ["microsoft/Florence-2-base"] +# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer +# Therefore, we borrow the BartTokenizer from the original Bart model +TOKENIZER = "facebook/bart-base" +PROMPTS = [ + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + Florence2Prompt(encoder_prompt=""), + 
Florence2Prompt(encoder_prompt=""), +] + + +def vllm_to_hf_output(vllm_output: Tuple[List[int], str, + Optional[SampleLogprobs]], ): + """Sanitize vllm output to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = "" + output_str + "" + + return output_ids, hf_output_str, out_logprobs + + +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + prompts: List[ExplicitEncoderDecoderPrompt], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + with vllm_runner(model, + tokenizer_name=TOKENIZER, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + prompts, max_tokens, num_logprobs) + + # Florence-2 processors require image inputs + dummy_image = Image.new(mode="RGB", size=(2, 2)) + with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model: + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.lm_head + hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs, + images=[dummy_image] * len(prompts), + )) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models(hf_runner, vllm_runner, model, dtype, max_tokens, + num_logprobs) -> None: + run_test( + hf_runner, + vllm_runner, + PROMPTS, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/utils.py b/tests/models/utils.py index 86a624483c58a..f7802d98ad678 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -3,10 +3,10 @@ import torch -from vllm.config import ModelConfig +from vllm.config import ModelConfig, TaskOption from vllm.inputs import InputContext +from vllm.platforms import current_platform from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs -from vllm.utils import is_cpu TokensText = Tuple[List[int], str] @@ -19,7 +19,7 @@ def check_outputs_equal( name_1: str, ): """ - Compare the two sequences generated by different models, + Compare the two sequences generated by different models, which should be equal. """ assert len(outputs_0_lst) == len(outputs_1_lst) @@ -248,13 +248,14 @@ def check_logprobs_close( def build_model_context(model_name: str, + task: TaskOption = "auto", tokenizer_name: Optional[str] = None, trust_remote_code: bool = False, dtype: Optional[Union[str, torch.dtype]] = None, mm_processor_kwargs: Optional[Dict] = None, limit_mm_per_prompt: Optional[Dict] = None): """Creates an InputContext for a given model. - + Args: model_name: Name of the model being considered. tokenizer_name: Name of the tokenizer being considered. 
@@ -269,11 +270,12 @@ def build_model_context(model_name: str, if tokenizer_name is None: tokenizer_name = model_name if dtype is None: - dtype = "bfloat16" if is_cpu() else "half" + dtype = "bfloat16" if current_platform.is_cpu() else "half" model_config = ModelConfig( model_name, - tokenizer_name, + task=task, + tokenizer=tokenizer_name, tokenizer_mode="auto", trust_remote_code=trust_remote_code, dtype=dtype, diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 616a15a1328de..205ab00aa6b17 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -59,15 +59,7 @@ async def test_evil_forward(tmp_socket): await asyncio.sleep(2.0) await client.check_health() - # Throws an error in first forward pass. - with pytest.raises(RAISED_ERROR): - async for _ in client.generate(prompt="Hello my name is", - sampling_params=SamplingParams(), - request_id=uuid.uuid4()): - pass - assert client.errored - - # Engine is errored, should get ENGINE_DEAD_ERROR. + # Throws an error; the request should get ENGINE_DEAD_ERROR. with pytest.raises(MQEngineDeadError): async for _ in client.generate(prompt="Hello my name is", sampling_params=SamplingParams(), @@ -149,7 +141,7 @@ async def test_failed_abort(tmp_socket): client = await engine.make_client() assert client.is_running - # Firsh check health should work. + # First check health should work. await client.check_health() # Trigger an abort on the client side. @@ -174,6 +166,45 @@ async def test_failed_abort(tmp_socket): client.close() +@pytest.mark.asyncio +async def test_batch_error(tmp_socket): + with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, + ipc_path=tmp_socket, + run_fn=run_with_evil_abort) as engine: + + client = await engine.make_client() + assert client.is_running + + # First check health should work. + await client.check_health() + + # Batch of requests + async def do_generate(client): + # min_tokens=2048 to keep the engine busy long enough + # to process a request that will crash the engine + params = SamplingParams(min_tokens=2048, max_tokens=2048) + async for _ in client.generate(prompt="Hello my name is", + sampling_params=params, + request_id=uuid.uuid4()): + pass + + tasks = [asyncio.create_task(do_generate(client)) for _ in range(10)] + + # This request will force the batch being processed to raise + # an exception, after which the engine becomes errored + await client.abort(request_id="foo") + + # The whole batch of requests then fails, and each one + # should see the underlying error wrapped as a MQEngineDeadError.
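+ # (asyncio.gather with return_exceptions=True collects each task's exception in order instead of raising the first one, so every request's failure can be asserted on individually.)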
+ errors = await asyncio.gather(*tasks, return_exceptions=True) + for e in errors: + assert isinstance(e, MQEngineDeadError) + assert "KeyError" in repr(e) + + client.close() + + @pytest.mark.asyncio async def test_bad_request(tmp_socket): with RemoteMQLLMEngine(engine_args=ENGINE_ARGS, diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 7d09b81060efd..13ad4a7966b9d 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -24,6 +24,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, @@ -67,6 +68,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype, model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, @@ -109,6 +111,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, @@ -139,6 +142,7 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images): model_config = ModelConfig( model=MODEL_NAME, + task="auto", tokenizer=MODEL_NAME, tokenizer_mode="auto", trust_remote_code=False, diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 7b9e0b6e5234b..5044740c3e734 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -221,6 +221,7 @@ def test_max_tokens_kwarg_overrides(num_crops): expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": 1}) @@ -256,6 +257,7 @@ def test_max_tokens_kwarg_overrides(num_crops): def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs): """Ensure that max token calcs filters out invalid mm_processor_kwargs""" ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": 1}) @@ -278,12 +280,13 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs): ### Test overrides for the mapper @pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE]) -def test_default_mapper_with_processer_kwargs(image_assets, num_crops): +def test_default_mapper_with_processor_kwargs(image_assets, num_crops): """Ensure that the mapper processor kwargs can fall back to HF models.""" # NOTE - we don't validate bad inputs for the default mapper, because it's # through the automodel interface in transformers, so we can't easily # inspect what kwargs are or are not allowed. 
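# With no validation here, bad kwargs are simply forwarded, and any errors would surface from the HF processor itself.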
ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs={"num_crops": num_crops}, limit_mm_per_prompt={"image": 1}) @@ -311,6 +314,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops, init_num_crops, inference_num_crops) ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=init_kwargs, limit_mm_per_prompt={"image": 1}) @@ -348,6 +352,7 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets, """Ensure that custom mappers filters out invalid mm_processor_kwargs""" # Should filter out the init time kwargs ctx = build_model_context(MULTIMODAL_MODEL_ID, + task="generate", trust_remote_code=True, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt={"image": 1}) diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index eeac6ab43c05f..5a28943b7ecbc 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -4,8 +4,8 @@ """ import pytest -from tests.conftest import cleanup from vllm import LLM +from vllm.distributed import cleanup_dist_env_and_memory MODEL_LEN_LEN = [ # Example models with sliding window. @@ -31,7 +31,7 @@ def test_disable_sliding_window(model_len_len, ): model_config.max_model_len) del vllm_disabled_model - cleanup() + cleanup_dist_env_and_memory() vllm_enabled_model = LLM(model, disable_sliding_window=False) vllm_enabled_model.generate("Hi my name is") @@ -41,4 +41,4 @@ def test_disable_sliding_window(model_len_len, ): model_config.max_model_len) del vllm_enabled_model - cleanup() + cleanup_dist_env_and_memory() diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index f2acf0d70afef..0f01f5f819ea4 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -107,8 +107,7 @@ def validate_generated_texts(hf_runner, quantization='bitsandbytes', load_format='bitsandbytes', tensor_parallel_size=vllm_tp_size, - enforce_eager=False, - gpu_memory_utilization=0.8) as llm: + enforce_eager=False) as llm: vllm_outputs = llm.generate_greedy(prompts, 8) vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner") diff --git a/tests/quantization/test_configs.py b/tests/quantization/test_configs.py index d18233fe1aeae..cf77ccec7a191 100644 --- a/tests/quantization/test_configs.py +++ b/tests/quantization/test_configs.py @@ -57,7 +57,8 @@ def test_auto_gptq(model_arg_exptype: Tuple[str, None, str]) -> None: try: model_config = ModelConfig(model_path, - model_path, + task="auto", + tokenizer=model_path, tokenizer_mode="auto", trust_remote_code=False, seed=0, diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b450ef97c89d4..b9cb3858c0068 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -4,10 +4,10 @@ import pytest from vllm import LLM, SamplingParams +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.utils import set_random_seed from vllm.sequence import PromptLogprobs, SampleLogprobs -from ...conftest import cleanup from ...models.utils import (TokensTextLogprobs, TokensTextLogprobsPromptLogprobs, check_logprobs_close, check_outputs_equal) @@ -44,7 +44,7 @@ def generate(): yield llm del llm - cleanup() + cleanup_dist_env_and_memory() return generate diff --git a/tests/tensorizer_loader/conftest.py 
b/tests/tensorizer_loader/conftest.py index 07b9c6b3c6be6..2a45653622448 100644 --- a/tests/tensorizer_loader/conftest.py +++ b/tests/tensorizer_loader/conftest.py @@ -1,27 +1,18 @@ -import contextlib import functools import gc from typing import Callable, TypeVar import pytest -import ray import torch from typing_extensions import ParamSpec -from vllm.distributed import (destroy_distributed_environment, - destroy_model_parallel) +from vllm.distributed import cleanup_dist_env_and_memory from vllm.model_executor.model_loader.tensorizer import TensorizerConfig @pytest.fixture(autouse=True) def cleanup(): - destroy_model_parallel() - destroy_distributed_environment() - with contextlib.suppress(AssertionError): - torch.distributed.destroy_process_group() - ray.shutdown() - gc.collect() - torch.cuda.empty_cache() + cleanup_dist_env_and_memory(shutdown_ray=True) _P = ParamSpec("_P") diff --git a/tests/test_config.py b/tests/test_config.py index b89429005e1d0..69918b67607d9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -2,6 +2,42 @@ from vllm.config import ModelConfig + +@pytest.mark.parametrize(("model_id", "expected_task"), [ + ("facebook/opt-125m", "generate"), + ("intfloat/e5-mistral-7b-instruct", "embedding"), +]) +def test_auto_task(model_id, expected_task): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + ) + + assert config.task == expected_task + + +@pytest.mark.parametrize(("model_id", "bad_task"), [ + ("facebook/opt-125m", "embedding"), + ("intfloat/e5-mistral-7b-instruct", "generate"), +]) +def test_incorrect_task(model_id, bad_task): + with pytest.raises(ValueError, match=r"does not support the .* task"): + ModelConfig( + model_id, + task=bad_task, + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + ) + + MODEL_IDS_EXPECTED = [ ("Qwen/Qwen1.5-7B", 32768), ("mistralai/Mistral-7B-v0.1", 4096), @@ -14,7 +50,8 @@ def test_disable_sliding_window(model_id_expected): model_id, expected = model_id_expected model_config = ModelConfig( model_id, - model_id, + task="auto", + tokenizer=model_id, tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -32,7 +69,8 @@ def test_get_sliding_window(): # when use_sliding_window is False. 
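# (For Qwen2, setting use_sliding_window=False in the HF config should make get_sliding_window() return None even though sliding_window is present in the config.)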
qwen2_model_config = ModelConfig( "Qwen/Qwen1.5-7B", - "Qwen/Qwen1.5-7B", + task="auto", + tokenizer="Qwen/Qwen1.5-7B", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -49,7 +87,8 @@ def test_get_sliding_window(): mistral_model_config = ModelConfig( "mistralai/Mistral-7B-v0.1", - "mistralai/Mistral-7B-v0.1", + task="auto", + tokenizer="mistralai/Mistral-7B-v0.1", tokenizer_mode="auto", trust_remote_code=False, seed=0, @@ -70,7 +109,8 @@ def test_rope_customization(): llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", + task="auto", + tokenizer="meta-llama/Meta-Llama-3-8B-Instruct", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", @@ -82,7 +122,8 @@ def test_rope_customization(): llama_model_config = ModelConfig( "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", + task="auto", + tokenizer="meta-llama/Meta-Llama-3-8B-Instruct", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", @@ -98,7 +139,8 @@ def test_rope_customization(): longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", + task="auto", + tokenizer="lmsys/longchat-13b-16k", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", @@ -112,7 +154,8 @@ def test_rope_customization(): longchat_model_config = ModelConfig( "lmsys/longchat-13b-16k", - "lmsys/longchat-13b-16k", + task="auto", + tokenizer="lmsys/longchat-13b-16k", tokenizer_mode="auto", trust_remote_code=False, dtype="float16", diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py index f5d9569046a63..2412da5037ece 100644 --- a/tests/test_sharded_state_loader.py +++ b/tests/test_sharded_state_loader.py @@ -46,9 +46,10 @@ def test_filter_subtensors(): @pytest.fixture(scope="module") def llama_2_7b_files(): with TemporaryDirectory() as cache_dir: - input_dir = snapshot_download("meta-llama/Llama-2-7b-hf", + input_dir = snapshot_download("meta-llama/Llama-3.2-1B", cache_dir=cache_dir, - ignore_patterns="*.bin*") + ignore_patterns=["*.bin*", "original/*"]) + yield input_dir @@ -58,9 +59,12 @@ def _run_writer(input_dir, output_dir, weights_patterns, **kwargs): # Dump worker states to output directory llm_sharded_writer.llm_engine.model_executor.save_sharded_state( path=output_dir) + # Copy metadata files to output directory for file in os.listdir(input_dir): - if not any(file.endswith(ext) for ext in weights_patterns): + if not any( + file.endswith(ext) and not os.path.isdir(file) + for ext in weights_patterns): shutil.copy(f"{input_dir}/{file}", output_dir) diff --git a/tests/test_utils.py b/tests/test_utils.py index 268e6f8194abb..0fed8e678fc76 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -59,7 +59,7 @@ def dummy(*, old_arg: object = None, new_arg: object = None): with pytest.warns(DeprecationWarning, match="'old_arg'"): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) @@ -69,10 +69,10 @@ def test_deprecate_kwargs_never(): def dummy(*, old_arg: object = None, new_arg: object = None): pass - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) @@ -86,15 +86,15 @@ def dummy(*, old_arg: object = None, new_arg: object = None): with pytest.warns(DeprecationWarning, match="'old_arg'"): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): 
dummy(new_arg=1) is_deprecated = False - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(old_arg=1) - with error_on_warning(): + with error_on_warning(DeprecationWarning): dummy(new_arg=1) diff --git a/tests/tool_use/test_jamba_tool_parser.py b/tests/tool_use/test_jamba_tool_parser.py new file mode 100644 index 0000000000000..3095ef4516796 --- /dev/null +++ b/tests/tool_use/test_jamba_tool_parser.py @@ -0,0 +1,275 @@ +import json +from typing import Generator, List, Optional + +import partial_json_parser +import pytest +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaMessage, FunctionCall, + ToolCall) +from vllm.entrypoints.openai.tool_parsers import JambaToolParser +from vllm.transformers_utils.detokenizer import detokenize_incrementally +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer + +MODEL = "ai21labs/Jamba-tiny-dev" + + +@pytest.fixture(scope="module") +def jamba_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) + + +@pytest.fixture +def jamba_tool_parser(jamba_tokenizer): + return JambaToolParser(jamba_tokenizer) + + +def assert_tool_calls(actual_tool_calls: List[ToolCall], + expected_tool_calls: List[ToolCall]): + assert len(actual_tool_calls) == len(expected_tool_calls) + + for actual_tool_call, expected_tool_call in zip(actual_tool_calls, + expected_tool_calls): + assert isinstance(actual_tool_call.id, str) + assert len(actual_tool_call.id) > 16 + + assert actual_tool_call.type == "function" + assert actual_tool_call.function == expected_tool_call.function + + +def stream_delta_message_generator( + jamba_tool_parser: JambaToolParser, jamba_tokenizer: AnyTokenizer, + model_output: str) -> Generator[DeltaMessage, None, None]: + all_token_ids = jamba_tokenizer.encode(model_output, + add_special_tokens=False) + + previous_text = "" + previous_tokens = None + prefix_offset = 0 + read_offset = 0 + for i, delta_token in enumerate(all_token_ids): + delta_token_ids = [delta_token] + previous_token_ids = all_token_ids[:i] + current_token_ids = all_token_ids[:i + 1] + + (new_tokens, delta_text, new_prefix_offset, + new_read_offset) = detokenize_incrementally( + tokenizer=jamba_tokenizer, + all_input_ids=current_token_ids, + prev_tokens=previous_tokens, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=False, + spaces_between_special_tokens=True, + ) + + current_text = previous_text + delta_text + + delta_message = jamba_tool_parser.extract_tool_calls_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + delta_token_ids, + request=None, # type: ignore[arg-type] + ) + if delta_message: + yield delta_message + + previous_text = current_text + previous_tokens = previous_tokens + new_tokens if previous_tokens\ + else new_tokens + prefix_offset = new_prefix_offset + read_offset = new_read_offset + + +def test_extract_tool_calls_no_tools(jamba_tool_parser): + model_output = "This is a test" + extracted_tool_calls = jamba_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output + + +@pytest.mark.parametrize( + ids=[ + "single_tool", + "single_tool_with_content", + "parallel_tools", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + ( + ''' [\n {"name": "get_current_weather", "arguments": 
{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + None), + ( + ''' Sure! let me call the tool for you.[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + " Sure! let me call the tool for you."), + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))), + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit" + }))) + ], + None) + ], +) +def test_extract_tool_calls(jamba_tool_parser, model_output, + expected_tool_calls, expected_content): + extracted_tool_calls = jamba_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert extracted_tool_calls.tools_called + + assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls) + + assert extracted_tool_calls.content == expected_content + + +@pytest.mark.parametrize( + ids=[ + "no_tools", + "single_tool", + "single_tool_with_content", + "parallel_tools", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + ('''This is a test''', [], '''This is a test'''), + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + " "), + ( + ''' Sure! let me call the tool for you.[\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))) + ], + " Sure! 
let me call the tool for you."), + ( + ''' [\n {"name": "get_current_weather", "arguments": {"city": "Dallas", "state": "TX", "unit": "fahrenheit"}},\n {"name": "get_current_weather", "arguments": {"city": "Orlando", "state": "FL", "unit": "fahrenheit"}}\n]''', # noqa: E501 + [ + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit" + }))), + ToolCall(function=FunctionCall(name="get_current_weather", + arguments=json.dumps( + { + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit" + }))) + ], + " ") + ], +) +def test_extract_tool_calls_streaming(jamba_tool_parser, jamba_tokenizer, + model_output, expected_tool_calls, + expected_content): + other_content: str = '' + function_names: List[str] = [] + function_args_strs: List[str] = [] + tool_call_idx: int = -1 + tool_call_ids: List[Optional[str]] = [] + + for delta_message in stream_delta_message_generator( + jamba_tool_parser, jamba_tokenizer, model_output): + # role should never be streamed from tool parser + assert not delta_message.role + + if delta_message.content: + other_content += delta_message.content + + streamed_tool_calls = delta_message.tool_calls + + if streamed_tool_calls and len(streamed_tool_calls) > 0: + # make sure only one diff is present - correct even for parallel + assert len(streamed_tool_calls) == 1 + tool_call = streamed_tool_calls[0] + + # if a new tool is being called, set up empty arguments + if tool_call.index != tool_call_idx: + tool_call_idx = tool_call.index + function_args_strs.append("") + tool_call_ids.append(None) + + # if a tool call ID is streamed, make sure one hasn't been already + if tool_call.id and not tool_call_ids[tool_call.index]: + tool_call_ids[tool_call.index] = tool_call.id + + # if parts of the function start being streamed + if tool_call.function: + # if the function name is defined, set it. it should be streamed + # IN ENTIRETY, exactly one time. + if tool_call.function.name: + assert isinstance(tool_call.function.name, str) + function_names.append(tool_call.function.name) + + if tool_call.function.arguments: + # make sure they're a string and then add them to the list + assert isinstance(tool_call.function.arguments, str) + + function_args_strs[ + tool_call.index] += tool_call.function.arguments + + assert other_content == expected_content + + actual_tool_calls = [ + ToolCall(id=tool_call_id, + function=FunctionCall( + name=function_name, + arguments=partial_json_parser.ensure_json( + function_args_str, Allow.OBJ | Allow.STR))) + for tool_call_id, function_name, function_args_str in zip( + tool_call_ids, function_names, function_args_strs) + ] + assert_tool_calls(actual_tool_calls, expected_tool_calls) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index 64ed8e26f38ed..fe5fc979c66a3 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -87,8 +87,19 @@ def test_traces(trace_service): f"The fake trace service didn't receive a trace within " f"the {timeout} seconds timeout") - attributes = decode_attributes(trace_service.request.resource_spans[0]. 
- scope_spans[0].spans[0].attributes) + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model assert attributes.get( SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id @@ -142,8 +153,19 @@ def test_traces_with_detailed_steps(trace_service): f"The fake trace service didn't receive a trace within " f"the {timeout} seconds timeout") - attributes = decode_attributes(trace_service.request.resource_spans[0]. - scope_spans[0].spans[0].attributes) + request = trace_service.request + assert len(request.resource_spans) == 1, ( + f"Expected 1 resource span, " + f"but got {len(request.resource_spans)}") + assert len(request.resource_spans[0].scope_spans) == 1, ( + f"Expected 1 scope span, " + f"but got {len(request.resource_spans[0].scope_spans)}") + assert len(request.resource_spans[0].scope_spans[0].spans) == 1, ( + f"Expected 1 span, " + f"but got {len(request.resource_spans[0].scope_spans[0].spans)}") + + attributes = decode_attributes( + request.resource_spans[0].scope_spans[0].spans[0].attributes) assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model assert attributes.get( SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id diff --git a/tests/utils.py b/tests/utils.py index 115cab80691f0..e983104e3cb0c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,7 +8,7 @@ import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union import openai import pytest @@ -454,13 +454,13 @@ def multi_process_parallel( @contextmanager -def error_on_warning(): +def error_on_warning(category: Type[Warning] = Warning): """ Within the scope of this context manager, tests will fail if any warning - is emitted. + of the given category is emitted. 
""" with warnings.catch_warnings(): - warnings.simplefilter("error") + warnings.filterwarnings("error", category=category) yield @@ -587,7 +587,7 @@ def large_gpu_test(*, min_gb: int): ) def wrapper(f: Callable[_P, None]) -> Callable[_P, None]: - return test_skipif(fork_new_process_for_each_test(f)) + return test_skipif(f) return wrapper diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 3dccc1b325d95..e75884a7395e2 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -5,8 +5,9 @@ import torch from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata -from vllm.utils import is_cpu, make_tensor_with_pad +from vllm.utils import make_tensor_with_pad from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import _get_graph_batch_size @@ -31,7 +32,7 @@ def _create_model_runner(model: str, *args, return model_runner -@pytest.mark.skipif(condition=is_cpu(), +@pytest.mark.skipif(condition=current_platform.is_cpu(), reason="CPU backend is currently " "unsupported for encoder/ " "decoder models") @@ -74,7 +75,7 @@ def test_empty_seq_group(): assert return_seq_lens is None -@pytest.mark.skipif(condition=is_cpu(), +@pytest.mark.skipif(condition=current_platform.is_cpu(), reason="CPU backend is currently " "unsupported for encoder/ " "decoder models") @@ -264,7 +265,7 @@ def test_prepare_prompt(batch_size): assert torch.equal(actual, expected) -@pytest.mark.skipif(condition=is_cpu(), +@pytest.mark.skipif(condition=current_platform.is_cpu(), reason="CPU backend is currently " "unsupported for encoder/ " "decoder models") @@ -490,7 +491,7 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group): def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): """ Tests that for encoder-decoder models with CUDA Graph capture and replay - enabled, the tensors used during the decode phase are correctly padded + enabled, the tensors used during the decode phase are correctly padded for varying input batch sizes. """ model_runner = _create_model_runner( diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index 7e9138dc8d779..acd2ed6836365 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -54,16 +54,17 @@ def mock_mem_info(): gpu_blocks, _ = worker.determine_num_available_blocks() # Peak vram usage by torch should be 0.7077 GiB - # Non-torch allocations should be 0.0079 GiB + # No memory should be allocated outside of torch # 9.0 GiB should be the utilization target - # 8.2843 GiB should be available for the KV cache + # 8.2923 GiB should be available for the KV cache block_size = CacheEngine.get_cache_block_size( engine_config.cache_config, engine_config.model_config, engine_config.parallel_config) - expected_blocks = (8.2843 * 1024**3) // block_size + expected_blocks = (8.2923 * 1024**3) // block_size # Check within a small tolerance for portability # Hardware, kernel, or dependency changes could all affect memory - # utilization - assert abs(gpu_blocks - expected_blocks) < 5 + # utilization. + # A 10 block tolerance here should be about 6MB of wiggle room. 
+ assert abs(gpu_blocks - expected_blocks) < 10 diff --git a/tools/mypy.sh b/tools/mypy.sh index d69b61c7f34fc..14b0976a27da5 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -2,6 +2,10 @@ CI=${1:-0} +if [ $CI -eq 1 ]; then + set -e +fi + run_mypy() { echo "Running mypy on $1" if [ $CI -eq 1 ] && [ -z "$1" ]; then diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index b2952bbfa917c..a25f7abca5498 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -26,7 +26,8 @@ import vllm._moe_C # noqa: F401 supports_moe_ops = True -if TYPE_CHECKING: +# neuron has torch version that doesn't even have impl_abstract +if TYPE_CHECKING or current_platform.is_neuron(): def register_fake(fn): return lambda name: fn diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index d54dbdcb19495..ffa05e80623ac 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -32,7 +32,7 @@ def get_supported_head_sizes() -> List[int]: @staticmethod def get_name() -> str: - return "flash-attn" + return "FLASH_ATTN" @staticmethod def get_impl_cls() -> Type["FlashAttentionImpl"]: @@ -524,8 +524,8 @@ def __init__( if alibi_slopes is not None: alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) self.alibi_slopes = alibi_slopes - self.sliding_window = ((sliding_window, sliding_window) - if sliding_window is not None else (-1, -1)) + self.sliding_window = ((sliding_window - 1, + 0) if sliding_window is not None else (-1, -1)) self.kv_cache_dtype = kv_cache_dtype if logits_soft_cap is None: # In flash-attn, setting logits_soft_cap as 0 means no soft cap. @@ -535,12 +535,6 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if sliding_window is not None: - # NOTE(woosuk): flash-attn's sliding window does not work with - # paged KV cache. 
- raise ValueError( - "Sliding window is not supported in FlashAttention.") - support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() if head_size not in support_head_sizes: raise ValueError( @@ -704,6 +698,7 @@ def unified_flash_attention( max_seqlen_k=max_seq_len, softmax_scale=softmax_scale, causal=True, + window_size=window_size, alibi_slopes=alibi_slopes, block_table=prefill_meta.block_tables, softcap=logits_soft_cap, @@ -725,6 +720,7 @@ def unified_flash_attention( max_seqlen_k=decode_meta.max_decode_seq_len, softmax_scale=softmax_scale, causal=True, + window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, block_table=decode_meta.block_tables, @@ -739,6 +735,7 @@ def unified_flash_attention( cache_seqlens=decode_meta.seq_lens_tensor, softmax_scale=softmax_scale, causal=True, + window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, ).squeeze(1) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index dd9a0fb9d94df..e43fb134a6a5a 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -17,6 +17,7 @@ import torch +import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, @@ -39,7 +40,7 @@ class FlashInferBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "flashinfer" + return "FLASHINFER" @staticmethod def get_impl_cls() -> Type["FlashInferImpl"]: @@ -124,7 +125,8 @@ def _get_decode_wrapper(self): self.runner.parallel_config)) num_kv_heads = self.runner.model_config.get_num_kv_heads( self.runner.parallel_config) - use_tensor_cores = num_qo_heads // num_kv_heads > 4 + use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( + num_qo_heads // num_kv_heads > 4) self._decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( self._get_workspace_buffer(), "NHD", @@ -183,7 +185,8 @@ def graph_capture_get_metadata_for_batch( self.runner.parallel_config)) num_kv_heads = self.runner.model_config.get_num_kv_heads( self.runner.parallel_config) - use_tensor_cores = num_qo_heads // num_kv_heads > 4 + use_tensor_cores = envs.VLLM_FLASHINFER_FORCE_TENSOR_CORES or ( + num_qo_heads // num_kv_heads > 4) self._graph_decode_wrapper = \ CUDAGraphBatchDecodeWithPagedKVCacheWrapper( self._graph_decode_workspace_buffer, _indptr_buffer, diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 7398732ddfc92..1eb5fe10d76db 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -19,7 +19,7 @@ class IpexAttnBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "ipex-attn" + return "IPEX" @staticmethod def get_impl_cls() -> Type["IpexAttnBackendImpl"]: diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 8b36230730380..6fddfc2002120 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -38,7 +38,7 @@ class OpenVINOAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "openvino" + return "OPENVINO" @staticmethod def get_impl_cls(): diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 56d3d3b482e58..6fee81de14420 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -11,6 +11,10 @@ class PallasAttentionBackend(AttentionBackend): + @staticmethod + def get_name() -> str: + return 
"PALLAS" + @staticmethod def get_impl_cls() -> Type["PallasAttentionBackendImpl"]: return PallasAttentionBackendImpl diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 3987986f1786b..4116fbf00020c 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -20,7 +20,7 @@ class PlaceholderAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "placeholder-attn" + return "NO_ATTENTION" @staticmethod def get_impl_cls() -> Type["PlaceholderAttentionImpl"]: diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 682eac50126ad..c2aec4aaa74e7 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -28,7 +28,7 @@ class ROCmFlashAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "rocm-flash-attn" + return "ROCM_FLASH" @staticmethod def get_impl_cls() -> Type["ROCmFlashAttentionImpl"]: diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index ef8d576616838..f985f70728a60 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -10,9 +10,9 @@ AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.paged_attn import PagedAttentionMetadata -from vllm.utils import is_cpu +from vllm.platforms import current_platform -if is_cpu(): +if current_platform.is_cpu(): try: from vllm.attention.ops.ipex_attn import PagedAttention except ImportError: @@ -25,7 +25,7 @@ class TorchSDPABackend(AttentionBackend): @staticmethod def get_name() -> str: - return "torch-sdpa" + return "TORCH_SDPA" @staticmethod def get_impl_cls() -> Type["TorchSDPABackendImpl"]: @@ -234,10 +234,10 @@ def get_seq_len_block_table_args( on the type of attention operation. Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths & + Encoder/decoder cross-attn -> select encoder sequence lengths & cross-attn block-tables fields Encoder attn -> select encoder sequence lengths fields & no block tables - + Arguments: * attn_metadata: Attention metadata structure associated with attention diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 358a223e7ed0e..d1a44f3e8bfa6 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -317,8 +317,8 @@ def graph_capture_get_metadata_for_batch( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ + assert self.runner.attn_backend.get_name() == "XFORMERS", \ + f"Expected attn_backend name to be 'XFORMERS', but "\ f" got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -337,8 +337,8 @@ def get_graph_input_buffers( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. 
- assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ + assert self.runner.attn_backend.get_name() == "XFORMERS", \ + f"Expected attn_backend name to be 'XFORMERS', but "\ f" got '{self.runner.attn_backend.get_name()}'" self._add_additonal_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) @@ -356,8 +356,8 @@ def prepare_graph_input_buffers( if is_encoder_decoder_model: # The encoder decoder model works only with XFormers backend. # Assert the same. - assert self.runner.attn_backend.get_name() == "xformers", \ - f"Expected attn_backend name to be 'xformers', but "\ + assert self.runner.attn_backend.get_name() == "XFORMERS", \ + f"Expected attn_backend name to be 'XFORMERS', but "\ f" got '{self.runner.attn_backend.get_name()}'" self._prepare_input_buffers_for_enc_dec_model( attn_metadata, input_buffers) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 650bc6ec7750a..5aaf13d8ea744 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -24,7 +24,7 @@ class XFormersBackend(AttentionBackend): @staticmethod def get_name() -> str: - return "xformers" + return "XFORMERS" @staticmethod def get_impl_cls() -> Type["XFormersImpl"]: diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b46f0721d0caf..33d05cbd3fe01 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -78,10 +78,9 @@ def __init__( # During model initialization, the default dtype is set as the model # weight and activation dtype. dtype = torch.get_default_dtype() - attn_backend = get_attn_backend(head_size, sliding_window, dtype, - kv_cache_dtype, block_size, - is_attention_free, blocksparse_params - is not None) + attn_backend = get_attn_backend(head_size, dtype, kv_cache_dtype, + block_size, is_attention_free, + blocksparse_params is not None) impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index 1ead541f391b5..e4dc576d27932 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -3,7 +3,7 @@ import torch from vllm.platforms import current_platform -from vllm.utils import is_cpu, is_hip +from vllm.utils import is_hip from .utils import (dense_to_crow_col, get_head_sliding_step, get_sparse_attn_mask) @@ -32,7 +32,7 @@ def __init__( ): super().__init__() if use_spda is None: - use_spda = is_hip() or is_cpu() or not \ + use_spda = is_hip() or current_platform.is_cpu() or not \ IS_COMPUTE_8_OR_ABOVE device = device or (torch.cuda.current_device() if current_platform.is_cuda_alike() else "cpu") @@ -109,13 +109,13 @@ def varlen_attn(self, q, k, v: shape = (num_tokens, num_heads_q/kv, head_size). Support grouped attention, with `q[:, i*r:(i*r + r)]` is correspondent to `k[:, i]`, where `r` is the q/k ratio. - cu_seqlens_k: shape=(batch_size + 1,), - indicating segment of samples, + cu_seqlens_k: shape=(batch_size + 1,), + indicating segment of samples, e.g., `k[cu_seqlen[i]:cu_seqlne[i+1]]` is q of sample i cu_seqlens_q: shape=(batch_size + 1, ). Default None: same as cu_seqlens_k for prefilling or [0, 1, .., batch_size] for decoding. 
- The only case you need to specify is when q is a mix of + The only case you need to specify is when q is a mix of prefilling and decoding. sm_scale: softmax scale, default to 1/sqrt(head_size). @@ -171,7 +171,7 @@ def transpose_and_unpad(x_padded, cu_seqlens): def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): """For CPU, V100 or other older GPUs. - NOTE: torch SPDA supports nested tensor, + NOTE: torch SPDA supports nested tensor, but seems extremely slow. Choose to pad instead. """ assert (cu_seqlens_q is None or @@ -201,8 +201,8 @@ def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): return self.transpose_and_unpad(spda_output, cu_seqlens) def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): - """Dispatch to `varlen_attn` (Ampere or newer) or - `self.spda`(cpu, Volta, Turing or older)based on + """Dispatch to `varlen_attn` (Ampere or newer) or + `self.spda`(cpu, Volta, Turing or older)based on the type of device used and cuda compute capability. q, k, v: shape = (num_tokens, num_heads_q/kv, head_size). @@ -213,8 +213,8 @@ def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): cu_seqlens_q: shape=(batch_size + 1, ). Default None: same as cu_seqlens_k for prefilling or [0, 1, .., batch_size] for decoding. - The only case you need to specify - is when q is a mix of prefilling + The only case you need to specify + is when q is a mix of prefilling and decoding. sm_scale: softmax scale, default to 1/sqrt(head_size). diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 7edb7676ea2cd..cd3c642b8c8a2 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -10,13 +10,14 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR, is_cpu, is_hip, is_openvino, is_xpu +from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino logger = init_logger(__name__) class _Backend(enum.Enum): FLASH_ATTN = enum.auto() + FLASH_ATTN_VLLM_V1 = enum.auto() XFORMERS = enum.auto() ROCM_FLASH = enum.auto() TORCH_SDPA = enum.auto() @@ -90,7 +91,6 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: @lru_cache(maxsize=None) def get_attn_backend( head_size: int, - sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, @@ -105,12 +105,16 @@ def get_attn_backend( BlocksparseFlashAttentionBackend) return BlocksparseFlashAttentionBackend - backend = which_attn_to_use(head_size, sliding_window, dtype, - kv_cache_dtype, block_size, is_attention_free) + backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, + is_attention_free) if backend == _Backend.FLASH_ATTN: from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) return FlashAttentionBackend + if backend == _Backend.FLASH_ATTN_VLLM_V1: + from vllm.v1.attention.backends.flash_attn import ( # noqa: F401 + FlashAttentionBackend as FlashAttentionBackendV1) + return FlashAttentionBackendV1 if backend == _Backend.XFORMERS: logger.info("Using XFormers backend.") from vllm.attention.backends.xformers import ( # noqa: F401 @@ -122,7 +126,7 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: - assert is_cpu(), RuntimeError( + assert current_platform.is_cpu(), RuntimeError( "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch 
SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend @@ -132,7 +136,7 @@ def get_attn_backend( from vllm.attention.backends.openvino import OpenVINOAttentionBackend return OpenVINOAttentionBackend elif backend == _Backend.IPEX: - assert is_xpu(), RuntimeError( + assert current_platform.is_xpu(), RuntimeError( "IPEX attention backend is only used for the XPU device.") logger.info("Using IPEX attention backend.") from vllm.attention.backends.ipex_attn import IpexAttnBackend @@ -155,7 +159,6 @@ def get_attn_backend( def which_attn_to_use( head_size: int, - sliding_window: Optional[int], dtype: torch.dtype, kv_cache_dtype: Optional[str], block_size: int, @@ -185,7 +188,7 @@ def which_attn_to_use( if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) - if is_cpu(): + if current_platform.is_cpu(): if selected_backend != _Backend.TORCH_SDPA: logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA @@ -195,7 +198,7 @@ def which_attn_to_use( logger.info("Cannot use %s backend on OpenVINO.", selected_backend) return _Backend.OPENVINO - if is_xpu(): + if current_platform.is_xpu(): if selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) return _Backend.IPEX @@ -217,6 +220,9 @@ def which_attn_to_use( logger.info("%s is not supported in AMD GPUs.", selected_backend) return _Backend.ROCM_FLASH + if envs.VLLM_USE_V1: + return _Backend.FLASH_ATTN_VLLM_V1 + # FlashAttn in NVIDIA GPUs. if selected_backend == _Backend.FLASH_ATTN: if not current_platform.has_device_capability(80): @@ -243,10 +249,6 @@ def which_attn_to_use( "Cannot use FlashAttention-2 backend for block size not " "divisible by 16.") selected_backend = _Backend.XFORMERS - elif sliding_window is not None: - logger.info( - "Cannot use FlashAttention-2 backend due to sliding window.") - selected_backend = _Backend.XFORMERS # FlashAttn is valid for the model, checking if the package is installed. if selected_backend == _Backend.FLASH_ATTN: diff --git a/vllm/beam_search.py b/vllm/beam_search.py index 04624b8b94432..1b48538734dae 100644 --- a/vllm/beam_search.py +++ b/vllm/beam_search.py @@ -1,5 +1,7 @@ from dataclasses import dataclass -from typing import List, Optional +from typing import Dict, List, Optional + +from vllm.sequence import Logprob @dataclass @@ -11,6 +13,7 @@ class BeamSearchSequence: """ # The tokens includes the prompt. 
tokens: List[int] + logprobs: List[Dict[int, Logprob]] cum_logprob: float = 0.0 text: Optional[str] = None @@ -28,7 +31,7 @@ class BeamSearchInstance: def __init__(self, prompt_tokens: List[int]): self.beams: List[BeamSearchSequence] = [ - BeamSearchSequence(tokens=prompt_tokens) + BeamSearchSequence(tokens=prompt_tokens, logprobs=[]) ] self.completed: List[BeamSearchSequence] = [] diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 3ae74cc5cb7dd..0449f9354d0a2 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,24 +1,58 @@ import inspect -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import torch import vllm.envs as envs from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo +logger = init_logger(__name__) -def support_torch_compile(dynamic_arg_dims: Dict[str, Union[int, List[int]]]): + +def support_torch_compile( + cls: Optional[type] = None, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None): """ A decorator to add support for compiling the forward method of a class. + Usage 1: use directly as a decorator without arguments: + + ```python + @support_torch_compile + class MyModel(nn.Module): + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): + ... + ``` + + Usage 2: use as a decorator with arguments: + + ```python + @support_torch_compile(dynamic_arg_dims={"x": 0, "y": 0}) + class MyModel(nn.Module): + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): + ... + ``` + `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic dimensions of the argument. The dynamic dimensions can be either a single integer or a list of integers. - Depending on the value of arguments: + if `dynamic_arg_dims` is `None`, it is inferred from the type annotation + of the `forward` method, based on the following default rules: + + - if the argument is annotated as `torch.Tensor` or + `Optional[torch.Tensor]`, the first dimension will be + marked as dynamic. + - if the argument is annotated as `IntermediateTensors`, the first + dimension of all the tensors in the intermediate tensors + will be marked as dynamic. + + During runtime, when we actually mark dimensions of tensors, + it depends on the value of arguments: - if it is a single integer, the corresponding dimension of the argument will be marked as dynamic. @@ -38,11 +72,35 @@ def cls_decorator_helper(cls: type): if not hasattr(cls, 'forward'): raise TypeError("decorated class should have a forward method.") sig = inspect.signature(cls.forward) - for k in dynamic_arg_dims: + inferred_dynamic_arg_dims = dynamic_arg_dims + if inferred_dynamic_arg_dims is None: + inferred_dynamic_arg_dims = {} + for k, v in sig.parameters.items(): + if v.annotation in [ + torch.Tensor, Optional[torch.Tensor], + IntermediateTensors, Optional[IntermediateTensors] + ]: + inferred_dynamic_arg_dims[k] = 0 + + logger.debug(("Inferred dynamic dimensions for " + "forward method of %s: %s"), cls, + list(inferred_dynamic_arg_dims.keys())) + + if len(inferred_dynamic_arg_dims) == 0: + raise ValueError( + "No dynamic dimensions found in the forward method of " + f"{cls}. 
Please provide dynamic_arg_dims explicitly.") + + for k in inferred_dynamic_arg_dims: if k not in sig.parameters: raise ValueError( f"Argument {k} not found in the forward method of {cls}") - return _support_torch_compile(cls, dynamic_arg_dims) + return _support_torch_compile(cls, inferred_dynamic_arg_dims) + + if cls is not None: + # use `support_torch_compile` as a decorator without arguments + assert isinstance(cls, type) + return cls_decorator_helper(cls) return cls_decorator_helper diff --git a/vllm/config.py b/vllm/config.py index 4533fb017188c..c569789c650ab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,8 +1,8 @@ import enum import json from dataclasses import dataclass, field, fields -from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping, - Optional, Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, + Mapping, Optional, Set, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -17,8 +17,7 @@ get_hf_image_processor_config, get_hf_text_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - is_hip, is_neuron, is_openvino, is_xpu, - print_warning_once) + is_hip, is_openvino, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -33,6 +32,11 @@ _EMBEDDING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768 _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120 +TaskOption = Literal["auto", "generate", "embedding"] + +# "draft" is only used internally for speculative decoding +_Task = Literal["generate", "embedding", "draft"] + class ModelConfig: """Configuration for the model. @@ -40,7 +44,11 @@ class ModelConfig: Args: model: Name or path of the huggingface model to use. It is also used as the content for `model_name` tag in metrics - output when `served_model_name` is not specified. + output when `served_model_name` is not specified. + task: The task to use the model for. Each vLLM instance only supports + one task, even if the same model can be used for multiple tasks. + When the model only supports one task, "auto" can be used to select + it; otherwise, you must specify explicitly which task to use. tokenizer: Name or path of the huggingface tokenizer to use. tokenizer_mode: Tokenizer mode. 
"auto" will use the fast tokenizer if available, "slow" will always use the slow tokenizer, and @@ -108,6 +116,7 @@ class ModelConfig: def __init__(self, model: str, + task: Union[TaskOption, _Task], tokenizer: str, tokenizer_mode: str, trust_remote_code: bool, @@ -205,9 +214,15 @@ def __init__(self, self.is_attention_free = self._init_attention_free() self.has_inner_state = self._init_has_inner_state() - self.override_neuron_config = override_neuron_config if is_neuron( - ) else None - self._verify_embedding_mode() + if current_platform.is_neuron(): + self.override_neuron_config = override_neuron_config + else: + self.override_neuron_config = None + + supported_tasks, task = self._resolve_task(task, self.hf_config) + self.supported_tasks = supported_tasks + self.task: Final = task + self._verify_quantization() self._verify_cuda_graph() self._verify_bnb_config() @@ -241,18 +256,44 @@ def _verify_tokenizer_mode(self) -> None: "either 'auto', 'slow' or 'mistral'.") self.tokenizer_mode = tokenizer_mode - def _verify_embedding_mode(self) -> None: - architectures = getattr(self.hf_config, "architectures", []) + def _resolve_task( + self, + task_option: Union[TaskOption, _Task], + hf_config: PretrainedConfig, + ) -> Tuple[Set[_Task], _Task]: + if task_option == "draft": + return {"draft"}, "draft" + + architectures = getattr(hf_config, "architectures", []) + + task_support: Dict[_Task, bool] = { + # NOTE: Listed from highest to lowest priority, + # in case the model supports multiple of them + "generate": ModelRegistry.is_text_generation_model(architectures), + "embedding": ModelRegistry.is_embedding_model(architectures), + } + supported_tasks_lst: List[_Task] = [ + task for task, is_supported in task_support.items() if is_supported + ] + supported_tasks = set(supported_tasks_lst) + + if task_option == "auto": + selected_task = next(iter(supported_tasks_lst)) - # TODO: Allow the same model architecture to be specified as either - # generation or embedding model - if "Phi3VForCausalLM" in architectures: - # Match both remote and local names - embedding_mode = "/VLM2Vec" in self.model + if len(supported_tasks) > 1: + logger.info( + "This model supports multiple tasks: %s. " + "Defaulting to '%s'.", supported_tasks, selected_task) else: - embedding_mode = ModelRegistry.is_embedding_model(architectures) + if task_option not in supported_tasks: + msg = ( + f"This model does not support the '{task_option}' task. 
" + f"Supported tasks: {supported_tasks}") + raise ValueError(msg) - self.embedding_mode = embedding_mode + selected_task = task_option + + return supported_tasks, selected_task def _parse_quant_hf_config(self): quant_cfg = getattr(self.hf_config, "quantization_config", None) @@ -328,7 +369,7 @@ def _verify_quantization(self) -> None: "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" " is not set, enabling VLLM_USE_TRITON_AWQ.") envs.VLLM_USE_TRITON_AWQ = True - if is_neuron( + if current_platform.is_neuron( ) and self.quantization not in neuron_supported_quantization: raise ValueError( f"{self.quantization} quantization is currently not " @@ -401,7 +442,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Async postprocessor is not necessary with embedding mode # since there is no token generation - if self.embedding_mode: + if self.task == "embedding": self.use_async_output_proc = False # Reminder: Please update docs/source/serving/compatibility_matrix.rst @@ -582,11 +623,6 @@ def is_encoder_decoder_model(self) -> bool: (hasattr(self.hf_config, "text_config") and getattr( self.hf_config.text_config, "is_encoder_decoder", False))) - @property - def is_embedding_model(self) -> bool: - """Extract the embedding model flag.""" - return self.embedding_mode - @property def is_multimodal_model(self) -> bool: return self.multimodal_config is not None @@ -943,6 +979,7 @@ class SchedulerConfig: """Scheduler configuration. Args: + task: The task to use the model for. max_num_batched_tokens: Maximum number of tokens to be processed in a single iteration. max_num_seqs: Maximum number of sequences to be processed in a single @@ -957,7 +994,6 @@ class SchedulerConfig: prompt latency) before scheduling next prompt. enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. - embedding_mode: Whether the running model is for embedding. preemption_mode: Whether to perform preemption by swapping or recomputation. If not specified, we determine the mode as follows: We use recomputation by default since it incurs lower overhead than @@ -972,13 +1008,13 @@ class SchedulerConfig: """ def __init__(self, + task: _Task, max_num_batched_tokens: Optional[int], max_num_seqs: int, max_model_len: int, num_lookahead_slots: int = 0, delay_factor: float = 0.0, enable_chunked_prefill: bool = False, - embedding_mode: bool = False, is_multimodal_model: bool = False, preemption_mode: Optional[str] = None, num_scheduler_steps: int = 1, @@ -1002,7 +1038,7 @@ def __init__(self, # for higher throughput. 
max_num_batched_tokens = max(max_model_len, 2048)

-        if embedding_mode:
+        if task == "embedding":
            # For embedding, choose specific value for higher throughput
            max_num_batched_tokens = max(
                max_num_batched_tokens,
@@ -1022,12 +1058,12 @@ def __init__(self,
                "Chunked prefill is enabled with max_num_batched_tokens=%d.",
                self.max_num_batched_tokens)

+        self.task: Final = task
         self.max_num_seqs = max_num_seqs
         self.max_model_len = max_model_len
         self.num_lookahead_slots = num_lookahead_slots
         self.delay_factor = delay_factor
         self.chunked_prefill_enabled = enable_chunked_prefill
-        self.embedding_mode = embedding_mode
         self.preemption_mode = preemption_mode
         self.num_scheduler_steps = num_scheduler_steps
         self.multi_step_stream_outputs = multi_step_stream_outputs
@@ -1077,7 +1113,7 @@ def __init__(self, device: str = "auto") -> None:
             # Automated device type detection
             if current_platform.is_cuda_alike():
                 self.device_type = "cuda"
-            elif is_neuron():
+            elif current_platform.is_neuron():
                 self.device_type = "neuron"
             elif is_openvino():
                 self.device_type = "openvino"
@@ -1085,7 +1121,7 @@ def __init__(self, device: str = "auto") -> None:
                 self.device_type = "tpu"
             elif current_platform.is_cpu():
                 self.device_type = "cpu"
-            elif is_xpu():
+            elif current_platform.is_xpu():
                 self.device_type = "xpu"
             else:
                 raise RuntimeError("Failed to infer device type")
@@ -1239,6 +1275,7 @@ def maybe_create_spec_config(
                 ngram_prompt_lookup_min = 0
             draft_model_config = ModelConfig(
                 model=speculative_model,
+                task="draft",
                 tokenizer=target_model_config.tokenizer,
                 tokenizer_mode=target_model_config.tokenizer_mode,
                 trust_remote_code=target_model_config.trust_remote_code,
@@ -1372,11 +1409,11 @@ def create_draft_parallel_config(
         else:
             speculative_draft_tensor_parallel_size = \
                 target_parallel_config.tensor_parallel_size
-    elif speculative_draft_tensor_parallel_size != 1:
-        # TODO(wooyeon): allow tp values larger than 1
+    elif speculative_draft_tensor_parallel_size not in (
+            1, target_parallel_config.tensor_parallel_size):
        raise ValueError(
            f"{speculative_draft_tensor_parallel_size=} cannot be "
-            f"other value than 1")
+            f"any value other than 1 or the target model's "
+            f"tensor_parallel_size")

    draft_parallel_config = ParallelConfig(
        pipeline_parallel_size=target_parallel_config.
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 7c8a2bc493513..57527e39b9bdd 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -7,7 +7,7 @@
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
                                          NaiveBlockAllocator)
-from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
+from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor

 PrefixHash = int
diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor.py
similarity index 100%
rename from vllm/core/evictor_v2.py
rename to vllm/core/evictor.py
diff --git a/vllm/core/evictor_v1.py b/vllm/core/evictor_v1.py
deleted file mode 100644
index 5db5a08a5bb67..0000000000000
--- a/vllm/core/evictor_v1.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import enum
-from abc import ABC, abstractmethod
-from typing import OrderedDict
-
-from vllm.block import PhysicalTokenBlock
-
-
-class EvictionPolicy(enum.Enum):
-    """Enum for eviction policy used by make_evictor to instantiate the correct
-       Evictor subclass.
- """ - LRU = enum.auto() - - -class Evictor(ABC): - """The Evictor subclasses should be used by the BlockAllocator class to - handle eviction of freed PhysicalTokenBlocks. - """ - - @abstractmethod - def __init__(self): - pass - - @abstractmethod - def __contains__(self, block_hash: int) -> bool: - pass - - @abstractmethod - def evict(self) -> PhysicalTokenBlock: - """Runs the eviction algorithm and returns the evicted block""" - pass - - @abstractmethod - def add(self, block: PhysicalTokenBlock): - """Adds block to the evictor, making it a candidate for eviction""" - pass - - @abstractmethod - def remove(self, block_hash: int) -> PhysicalTokenBlock: - """Simply removes the block with the hash value block_hash from the - evictor. Caller is responsible for making sure that block_hash is - contained in the evictor before calling remove. Should be used to - "bring back" blocks that have been freed but not evicted yet. - """ - pass - - @property - @abstractmethod - def num_blocks(self) -> int: - pass - - -class LRUEvictor(Evictor): - """Evicts in a least-recently-used order using the last_accessed timestamp - that's recorded in the PhysicalTokenBlock. If there are multiple blocks with - the same last_accessed time, then the one with the largest num_hashed_tokens - will be evicted. If two blocks each have the lowest last_accessed time and - highest num_hashed_tokens value, then one will be chose arbitrarily - """ - - def __init__(self): - self.free_table: OrderedDict[int, PhysicalTokenBlock] = OrderedDict() - - def __contains__(self, block_hash: int) -> bool: - return block_hash in self.free_table - - def evict(self) -> PhysicalTokenBlock: - if len(self.free_table) == 0: - raise ValueError("No usable cache memory left") - - evicted_block = next(iter(self.free_table.values())) - # The blocks with the lowest timestamps should be placed consecutively - # at the start of OrderedDict. Loop through all these blocks to - # find the one with maximum number of hashed tokens. 
- for _, block in self.free_table.items(): - if evicted_block.last_accessed < block.last_accessed: - break - if evicted_block.num_hashed_tokens < block.num_hashed_tokens: - evicted_block = block - - self.free_table.pop(evicted_block.block_hash) - - evicted_block.computed = False - return evicted_block - - def add(self, block: PhysicalTokenBlock): - self.free_table[block.block_hash] = block - - def remove(self, block_hash: int) -> PhysicalTokenBlock: - if block_hash not in self.free_table: - raise ValueError( - "Attempting to remove block that's not in the evictor") - block: PhysicalTokenBlock = self.free_table[block_hash] - self.free_table.pop(block_hash) - return block - - @property - def num_blocks(self) -> int: - return len(self.free_table) - - -def make_evictor(eviction_policy: EvictionPolicy) -> Evictor: - if eviction_policy == EvictionPolicy.LRU: - return LRUEvictor() - else: - raise ValueError(f"Unknown cache eviction policy: {eviction_policy}") diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index f0c8e6bab4862..8d3fce106dd2c 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -313,7 +313,7 @@ def __init__( self.lora_config = lora_config version = "selfattn" - if (self.scheduler_config.embedding_mode + if (self.scheduler_config.task == "embedding" or self.cache_config.is_attention_free): version = "placeholder" diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6e1970bfed98a..ab47d62921d2c 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -7,7 +7,7 @@ The typical workflow is: - call `init_distributed_environment` to initialize the distributed environment. -- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to +- call `initialize_model_parallel` or `ensure_model_parallel_initialized` to initialize the model parallel groups. - any code dealing with the distributed stuff @@ -20,6 +20,7 @@ steps. 
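For a full teardown (e.g. between tests), the `cleanup_dist_env_and_memory`
helper added below destroys the model parallel groups and the distributed
environment, forces garbage collection, and empties the CUDA cache on
non-CPU platforms.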
""" import contextlib +import gc import pickle import weakref from collections import namedtuple @@ -1129,6 +1130,19 @@ def destroy_distributed_environment(): torch.distributed.destroy_process_group() +def cleanup_dist_env_and_memory(shutdown_ray: bool = False): + destroy_model_parallel() + destroy_distributed_environment() + with contextlib.suppress(AssertionError): + torch.distributed.destroy_process_group() + if shutdown_ray: + import ray # Lazy import Ray + ray.shutdown() + gc.collect() + if not current_platform.is_cpu(): + torch.cuda.empty_cache() + + def in_the_same_node_as(pg: ProcessGroup, source_rank: int = 0) -> List[bool]: """ This is a collective operation that returns if each rank is in the same node diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 41963dcb16922..a5cfaf3977a4f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,7 +3,7 @@ import json from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Mapping, Optional, - Tuple, Type, Union, cast) + Tuple, Type, Union, cast, get_args) import torch @@ -12,10 +12,12 @@ DeviceConfig, EngineConfig, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TokenizerPoolConfig) + SpeculativeConfig, TaskOption, TokenizerPoolConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser @@ -84,6 +86,7 @@ class EngineArgs: model: str = 'facebook/opt-125m' served_model_name: Optional[Union[str, List[str]]] = None tokenizer: Optional[str] = None + task: TaskOption = "auto" skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' trust_remote_code: bool = False @@ -198,6 +201,15 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=str, default=EngineArgs.model, help='Name or path of the huggingface model to use.') + parser.add_argument( + '--task', + default=EngineArgs.task, + choices=get_args(TaskOption), + help='The task to use the model for. Each vLLM instance only ' + 'supports one task, even if the same model can be used for ' + 'multiple tasks. When the model only supports one task, "auto" ' + 'can be used to select it; otherwise, you must specify explicitly ' + 'which task to use.') parser.add_argument( '--tokenizer', type=nullable_str, @@ -418,7 +430,11 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='The fraction of GPU memory to be used for the model ' 'executor, which can range from 0 to 1. For example, a value of ' '0.5 would imply 50%% GPU memory utilization. If unspecified, ' - 'will use the default value of 0.9.') + 'will use the default value of 0.9. 
This is a global gpu memory '
        'utilization limit: for example, if 50%% of the gpu memory is '
        'already used before vLLM starts and --gpu-memory-utilization is '
        'set to 0.9, then only 40%% of the gpu memory will be allocated '
        'to the model executor.')
    parser.add_argument(
        '--num-gpu-blocks-override',
        type=int,
@@ -838,6 +854,7 @@ def from_cli_args(cls, args: argparse.Namespace):
     def create_model_config(self) -> ModelConfig:
         return ModelConfig(
             model=self.model,
+            task=self.task,
             # We know this is not None because we set it in __post_init__
             tokenizer=cast(str, self.tokenizer),
             tokenizer_mode=self.tokenizer_mode,
@@ -909,6 +926,8 @@ def create_engine_config(self) -> EngineConfig:
                 "supported for multimodal models and has been disabled.")
             self.enable_prefix_caching = False

+        maybe_register_config_serialize_by_value(self.trust_remote_code)
+
         cache_config = CacheConfig(
             # neuron needs block_size = max_model_len
             block_size=self.block_size if self.device != "neuron" else
@@ -1026,13 +1045,13 @@ def create_engine_config(self) -> EngineConfig:
                 " please file an issue with detailed information.")

         scheduler_config = SchedulerConfig(
+            task=model_config.task,
             max_num_batched_tokens=self.max_num_batched_tokens,
             max_num_seqs=self.max_num_seqs,
             max_model_len=model_config.max_model_len,
             num_lookahead_slots=num_lookahead_slots,
             delay_factor=self.scheduler_delay_factor,
             enable_chunked_prefill=self.enable_chunked_prefill,
-            embedding_mode=model_config.embedding_mode,
             is_multimodal_model=model_config.is_multimodal_model,
             preemption_mode=self.preemption_mode,
             num_scheduler_steps=self.num_scheduler_steps,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 61c21887e6816..99beea932882d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,4 +1,5 @@
 import time
+from collections import Counter as collectionsCounter
 from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
@@ -43,8 +44,10 @@
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
-                           Sequence, SequenceGroup, SequenceGroupMetadata,
-                           SequenceGroupOutput, SequenceStatus)
+                           ParallelSampleSequenceGroup, Sequence,
+                           SequenceGroup, SequenceGroupBase,
+                           SequenceGroupMetadata, SequenceGroupOutput,
+                           SequenceStatus)
 from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
                           init_tracer)
 from vllm.transformers_utils.config import try_get_generation_config
@@ -344,7 +347,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
             observability_config=self.observability_config,
         )

-        if not self.model_config.embedding_mode:
+        if self.model_config.task != "embedding":
             self._initialize_kv_caches()

         # If usage stat is enabled, collect relevant info.
@@ -473,6 +476,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
                 ),
             ))

+        self.seq_id_to_seq_group: Dict[str, SequenceGroupBase] = {}
+
     def _initialize_kv_caches(self) -> None:
         """Initialize the KV cache in the worker(s).

@@ -641,7 +646,10 @@ def _add_processed_request(
         prompt_adapter_request: Optional[PromptAdapterRequest],
         trace_headers: Optional[Mapping[str, str]] = None,
         priority: int = 0,
-    ) -> None:
+    ) -> SequenceGroup:
+        """Add a processed request to the engine's request pool.
+        Return the created sequence group.
+        """
         self._validate_model_inputs(processed_inputs)
         # Create the sequences.
block_size = self.cache_config.block_size @@ -695,6 +703,8 @@ def _add_processed_request( min_cost_scheduler = self.scheduler[costs.index(min(costs))] min_cost_scheduler.add_seq_group(seq_group) + return seq_group + def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() @@ -710,7 +720,7 @@ def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> None: + ) -> Optional[SequenceGroup]: ... @overload @@ -724,7 +734,7 @@ def add_request( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> None: + ) -> Optional[SequenceGroup]: ... @deprecate_kwargs( @@ -743,7 +753,7 @@ def add_request( priority: int = 0, *, inputs: Optional[PromptType] = None, # DEPRECATED - ) -> None: + ) -> Optional[SequenceGroup]: """Add a request to the engine's request pool. The request is added to the request pool and will be processed by the @@ -787,6 +797,22 @@ def add_request( >>> # continue the request processing >>> ... """ + + if isinstance(params, SamplingParams) and params.n > 1: + ParallelSampleSequenceGroup.add_request( + request_id, + self, + params, + prompt=prompt, + arrival_time=arrival_time, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + inputs=inputs, + ) + return None + if inputs is not None: prompt = inputs assert prompt is not None and params is not None @@ -817,7 +843,7 @@ def add_request( processed_inputs["mm_processor_kwargs"] = preprocessed_inputs.get( "mm_processor_kwargs") - self._add_processed_request( + return self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, params=params, @@ -1116,7 +1142,7 @@ def _process_model_outputs(self, seq_group.metrics.model_execute_time = ( o.model_execute_time) - if self.model_config.embedding_mode: + if self.model_config.task == "embedding": self._process_sequence_group_outputs(seq_group, output) else: self.output_processor.process_prompt_logprob(seq_group, output) @@ -1134,7 +1160,9 @@ def _process_model_outputs(self, seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) request_output = RequestOutputFactory.create( - seq_group, use_cache=self.use_cached_outputs) + seq_group, + self.seq_id_to_seq_group, + use_cache=self.use_cached_outputs) if request_output: ctx.request_outputs.append(request_output) @@ -1174,7 +1202,9 @@ def _process_model_outputs(self, seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) request_output = RequestOutputFactory.create( - seq_group, use_cache=self.use_cached_outputs) + seq_group, + self.seq_id_to_seq_group, + use_cache=self.use_cached_outputs) if request_output: ctx.request_outputs.append(request_output) @@ -1193,7 +1223,10 @@ def _process_model_outputs(self, continue request_output = RequestOutputFactory.create( - seq_group, use_cache=self.use_cached_outputs) + seq_group, + self.seq_id_to_seq_group, + use_cache=self.use_cached_outputs, + ) if request_output: ctx.request_outputs.append(request_output) @@ -1212,7 +1245,7 @@ def _process_model_outputs(self, skip) # Tracing - self.do_tracing(scheduler_outputs) + self.do_tracing(scheduler_outputs, finished_before) return None @@ -1617,6 +1650,25 @@ def _get_stats(self, n_requests: List[int] = [] finished_reason_requests: List[str] = [] + # Lora requests + 
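+        # For example (adapter names here are illustrative): two running
+        # requests on "sql-lora" and one on "chat-lora" produce
+        # {"sql-lora": 2, "chat-lora": 1}.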
running_lora_adapters = dict( + collectionsCounter([ + running_request.lora_request.lora_name + for scheduler in self.scheduler + for running_request in scheduler.running + if running_request.lora_request + ])) + waiting_lora_adapters = dict( + collectionsCounter([ + waiting_request.lora_request.lora_name + for scheduler in self.scheduler + for waiting_request in scheduler.waiting + if waiting_request.lora_request + ])) + max_lora_stat = "0" + if self.lora_config: + max_lora_stat = str(self.lora_config.max_loras) + # NOTE: This loop assumes prefill seq_groups are before # decode seq_groups in scheduled_seq_groups. if scheduler_outputs is not None: @@ -1666,6 +1718,15 @@ def _get_stats(self, # TPOTs. latency = seq_group.get_last_latency(now) time_per_output_tokens_iter.append(latency) + if seq_group.state.current_step == 0: + # For async_output_proc, the do_log_stats() + # is called following init_multi_step(), which + # sets the current_step to zero. + actual_num_batched_tokens +=\ + seq_group.state.num_steps - 1 + else: + actual_num_batched_tokens +=\ + seq_group.state.current_step - 1 # Because of chunked prefill, we can have a single sequence # group that does multiple prompt_runs. To prevent logging @@ -1738,7 +1799,9 @@ def _get_stats(self, num_generation_tokens_requests=num_generation_tokens_requests, n_requests=n_requests, finished_reason_requests=finished_reason_requests, - ) + max_lora=str(max_lora_stat), + waiting_lora_adapters=list(waiting_lora_adapters.keys()), + running_lora_adapters=list(running_lora_adapters.keys())) def add_lora(self, lora_request: LoRARequest) -> bool: return self.model_executor.add_lora(lora_request) @@ -1786,11 +1849,18 @@ def stop_profile(self) -> None: def is_tracing_enabled(self) -> bool: return self.tracer is not None - def do_tracing(self, scheduler_outputs: SchedulerOutputs) -> None: + def do_tracing(self, + scheduler_outputs: SchedulerOutputs, + finished_before: Optional[List[int]] = None) -> None: if self.tracer is None: return - for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups: + for idx, scheduled_seq_group in enumerate( + scheduler_outputs.scheduled_seq_groups): + # Skip double tracing when using async output proc + if finished_before and idx in finished_before: + continue + seq_group = scheduled_seq_group.seq_group if seq_group.is_finished(): self.create_trace_span(seq_group) @@ -1855,9 +1925,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: def is_encoder_decoder_model(self): return self.input_preprocessor.is_encoder_decoder_model() - def is_embedding_model(self): - return self.model_config.is_embedding_model - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs]): if self.model_config.is_multimodal_model: diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 98bf59be3469d..a46625eff1e4a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -34,7 +34,11 @@ class Metrics: See https://prometheus.github.io/client_python/multiprocess/ for more details on limitations. 
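    The vllm:lora_requests_info gauge added below is an info-style metric:
    the running/waiting adapter names and max_lora are exported as label
    values, and the gauge value itself is always set to 1.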
""" + labelname_finish_reason = "finished_reason" + labelname_waiting_lora_adapters = "waiting_lora_adapters" + labelname_running_lora_adapters = "running_lora_adapters" + labelname_max_lora = "max_lora" _gauge_cls = prometheus_client.Gauge _counter_cls = prometheus_client.Counter _histogram_cls = prometheus_client.Histogram @@ -55,6 +59,16 @@ def __init__(self, labelnames: List[str], max_model_len: int): documentation="Number of requests waiting to be processed.", labelnames=labelnames, multiprocess_mode="sum") + self.gauge_lora_info = self._gauge_cls( + name="vllm:lora_requests_info", + documentation="Running stats on lora requests.", + labelnames=[ + self.labelname_running_lora_adapters, + self.labelname_max_lora, + self.labelname_waiting_lora_adapters, + ], + multiprocess_mode="livemostrecent", + ) self.gauge_scheduler_swapped = self._gauge_cls( name="vllm:num_requests_swapped", documentation="Number of requests swapped to CPU.", @@ -426,6 +440,9 @@ def _log_histogram(self, histogram, data: Union[List[int], for datum in data: histogram.labels(**self.labels).observe(datum) + def _log_gauge_string(self, gauge, data: Dict[str, str]) -> None: + gauge.labels(**data).set(1) + def _log_prometheus(self, stats: Stats) -> None: # System state data self._log_gauge(self.metrics.gauge_scheduler_running, @@ -442,7 +459,17 @@ def _log_prometheus(self, stats: Stats) -> None: stats.cpu_prefix_cache_hit_rate) self._log_gauge(self.metrics.gauge_gpu_prefix_cache_hit_rate, stats.gpu_prefix_cache_hit_rate) - + # Including max-lora in metric, in future this property of lora + # config maybe extended to be dynamic. + lora_info = { + self.metrics.labelname_running_lora_adapters: + ",".join(stats.running_lora_adapters), + self.metrics.labelname_waiting_lora_adapters: + ",".join(stats.waiting_lora_adapters), + self.metrics.labelname_max_lora: + stats.max_lora, + } + self._log_gauge_string(self.metrics.gauge_lora_info, lora_info) # Iteration level data self._log_counter(self.metrics.counter_num_preemption, stats.num_preemption_iter) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index bafd5fa1a8a82..e9a5bd3b586be 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -51,6 +51,9 @@ class Stats: num_generation_tokens_requests: List[int] n_requests: List[int] finished_reason_requests: List[str] + waiting_lora_adapters: List[str] + running_lora_adapters: List[str] + max_lora: str spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 9732c7098e160..9e5a6b21f4c18 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -204,8 +204,20 @@ async def run_output_handler_loop(self): # (and record only the first one) if is_engine_errored and not self._errored_with: self._errored_with = exception + # If engine is errored, no matter the type of exception + # it will no longer be able to receive new requests, + # therefore we have to inform that the current + # processed requests failed as well. Send back a dead + # engine error give this feedback and also give a + # 'hint' to the server to shutdown next. + exception = self.dead_error if request_id is None: + # If request_id is None, then the engine raised an + # exception for a batch, and we may not know the + # request that caused it, neither if it was actually + # caused by any of them (e.g. CUDA OOM). Therefore we + # broadcast the same exception for all requests. 
for queue_i in tuple(self.output_queues.values()): queue_i.put_nowait(exception) else: diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index ad0e970f36ff5..f67acdf660759 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -8,7 +8,7 @@ import cloudpickle import zmq -from vllm import AsyncEngineArgs, LLMEngine, SamplingParams +from vllm import AsyncEngineArgs, SamplingParams from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig) # yapf conflicts with isort for this block @@ -21,12 +21,17 @@ RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable -from vllm.envs import VLLM_RPC_TIMEOUT +from vllm.envs import VLLM_RPC_TIMEOUT, VLLM_USE_V1 from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext +if VLLM_USE_V1: + from vllm.v1.engine.llm_engine import LLMEngine +else: + from vllm.engine.llm_engine import LLMEngine + CONFIG_TYPE = Union[ModelConfig, DecodingConfig, ParallelConfig, SchedulerConfig, LoRAConfig] @@ -136,14 +141,16 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, executor_class = LLMEngine._get_executor_cls(engine_config) - return cls( - ipc_path=ipc_path, - use_async_sockets=engine_config.model_config.use_async_output_proc, - **engine_config.to_dict(), - executor_class=executor_class, - log_requests=not engine_args.disable_log_requests, - log_stats=not engine_args.disable_log_stats, - usage_context=usage_context) + use_async_sockets = (engine_config.model_config.use_async_output_proc + and not VLLM_USE_V1) + + return cls(ipc_path=ipc_path, + use_async_sockets=use_async_sockets, + **engine_config.to_dict(), + executor_class=executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + usage_context=usage_context) def start(self): try: diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 16ceddf13511c..5c504e0f0217d 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -59,7 +59,7 @@ def generate( async def beam_search( self, - prompt: Union[PromptType, List[int]], + prompt: Union[str, List[int]], request_id: str, params: BeamSearchParams, ) -> AsyncGenerator[RequestOutput, None]: @@ -71,9 +71,13 @@ async def beam_search( length_penalty = params.length_penalty tokenizer = await self.get_tokenizer(lora_request=None) - tokenizedPrompt = prompt if isinstance( - prompt, list) else tokenizer.encode(prompt) - tokenizedLength = len(tokenizedPrompt) + if isinstance(prompt, str): + tokenized_prompt = tokenizer.encode(prompt) + prompt_text = prompt + else: + tokenized_prompt = prompt + prompt_text = None + tokenized_length = len(tokenized_prompt) sort_beams_key = create_sort_beams_key_function( tokenizer.eos_token_id, length_penalty) @@ -81,7 +85,11 @@ async def beam_search( beam_search_params = SamplingParams(logprobs=2 * beam_width, max_tokens=1, temperature=temperature) - all_beams = [BeamSearchSequence(tokens=tokenizedPrompt, cum_logprob=0)] + all_beams = [ + BeamSearchSequence(tokens=tokenized_prompt, + logprobs=[], + cum_logprob=0) + ] completed = [] for _ in range(max_tokens): @@ -114,6 +122,7 @@ async def beam_search( for token_id, logprob_obj in logprobs.items(): new_beam = BeamSearchSequence( tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs], cum_logprob=current_beam.cum_logprob + 
logprob_obj.logprob) @@ -131,22 +140,22 @@ async def beam_search( best_beams = sorted_completed[:beam_width] for beam in best_beams: - beam.text = tokenizer.decode(beam.tokens[tokenizedLength:]) + beam.text = tokenizer.decode(beam.tokens[tokenized_length:]) beam_search_output = RequestOutput( request_id=request_id, - prompt=prompt, + prompt=prompt_text, outputs=[ CompletionOutput( text=beam.text, cumulative_logprob=beam.cum_logprob, - token_ids=beam.tokens, + token_ids=beam.tokens[tokenized_length:], index=i, - logprobs=beam.cum_logprob, + logprobs=beam.logprobs, ) for (i, beam) in enumerate(best_beams) ], finished=True, - prompt_token_ids=tokenizedPrompt, + prompt_token_ids=tokenized_prompt, prompt_logprobs=None) yield beam_search_output diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index a0eccca54acc4..140ee6df8cc52 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -5,8 +5,8 @@ from collections import defaultdict from functools import lru_cache, partial from pathlib import Path -from typing import (Any, Awaitable, Dict, Generic, Iterable, List, Literal, - Mapping, Optional, Tuple, TypeVar, Union, cast) +from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, + Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) # yapf conflicts with isort for this block # yapf: disable @@ -59,10 +59,35 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): """The type of the content part.""" +class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain image_url. + This is supported by OpenAI API, although it is not documented. + + Example: + { + "image_url": "https://example.com/image.jpg" + } + """ + image_url: Required[str] + + +class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain audio_url. + + Example: + { + "audio_url": "https://example.com/audio.mp3" + } + """ + audio_url: Required[str] + + ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam] + CustomChatCompletionContentPartParam, + CustomChatCompletionContentSimpleImageParam, + CustomChatCompletionContentSimpleAudioParam, str] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -390,6 +415,71 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} +# Define a mapping from part types to their corresponding parsing functions. +MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { + "text": + lambda part: _TextParser(part).get("text", ""), + "image_url": + lambda part: _ImageParser(part).get("image_url", {}).get("url", ""), + "audio_url": + lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), + "refusal": + lambda part: _RefusalParser(part).get("refusal", ""), +} + + +def _parse_chat_message_content_mm_part( + part: ChatCompletionContentPartParam) -> Tuple[str, str]: + """ + Parses a given multi modal content part based on its type. + + Args: + part: A dict containing the content part, with a potential 'type' field. + + Returns: + A tuple (part_type, content) where: + - part_type: Type of the part (e.g., 'text', 'image_url'). 
+          - content: Parsed content (e.g., text, image URL).
+
+    Raises:
+        ValueError: If the 'type' field is missing and no direct URL is found.
+    """
+    assert isinstance(
+        part, dict)  # This is needed to avoid mypy errors: part.get() from str
+    part_type = part.get("type", None)
+
+    if isinstance(part_type, str) and part_type in MM_PARSER_MAP:
+        content = MM_PARSER_MAP[part_type](part)
+
+        # Special case for 'image_url.detail'
+        if part_type == "image_url" and part.get("detail") != "auto":
+            logger.warning("'image_url.detail' is currently not supported "
+                           "and will be ignored.")
+
+        return part_type, content
+
+    # Handle missing 'type' but provided direct URL fields.
+    if part_type is None:
+        if part.get("image_url") is not None:
+            image_params = cast(CustomChatCompletionContentSimpleImageParam,
+                                part)
+            return "image_url", image_params.get("image_url", "")
+        if part.get("audio_url") is not None:
+            audio_params = cast(CustomChatCompletionContentSimpleAudioParam,
+                                part)
+            return "audio_url", audio_params.get("audio_url", "")
+
+        # Raise an error if no 'type' or direct URL is found.
+        raise ValueError("Missing 'type' field in multimodal part.")
+
+    if not isinstance(part_type, str):
+        raise ValueError("Invalid 'type' field in multimodal part.")
+    return part_type, "unknown part_type content"
+
+
+VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
+                                       "audio_url")
+
 def _parse_chat_message_content_parts(
     role: str,
@@ -405,29 +495,28 @@ def _parse_chat_message_content_parts(
     has_image = False

     for part in parts:
-        part_type = part["type"]
-        if part_type == "text":
-            text = _TextParser(part)["text"]
+        if isinstance(part, str):  # Handle plain text parts
+            text = _TextParser(part)
             texts.append(text)
-        elif part_type == "image_url":
-            image_url = _ImageParser(part)["image_url"]
-
-            if image_url.get("detail", "auto") != "auto":
-                logger.warning(
-                    "'image_url.detail' is currently not supported and "
-                    "will be ignored.")
-
-            mm_parser.parse_image(image_url["url"])
-            has_image = True
-        elif part_type == "audio_url":
-            audio_url = _AudioParser(part)["audio_url"]
-
-            mm_parser.parse_audio(audio_url["url"])
-        elif part_type == "refusal":
-            text = _RefusalParser(part)["refusal"]
-            texts.append(text)
-        else:
-            raise NotImplementedError(f"Unknown part type: {part_type}")
+        else:  # Handle structured dictionary parts
+            part_type, content = _parse_chat_message_content_mm_part(part)
+
+            # if part_type is text/refusal/image_url/audio_url but
+            # content is empty, log a warning and skip it
+            if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
+                logger.warning("Skipping multimodal part "
+                               "with empty / unparsable content.")
+                continue
+
+            if part_type in ("text", "refusal"):
+                texts.append(content)
+            elif part_type == "image_url":
+                mm_parser.parse_image(content)
+                has_image = True
+            elif part_type == "audio_url":
+                mm_parser.parse_audio(content)
+            else:
+                raise NotImplementedError(f"Unknown part type: {part_type}")

     text_prompt = "\n".join(texts)
     if keep_multimodal_content:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 2010381076c7d..db97fe0a0285b 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -6,10 +6,10 @@
 from tqdm import tqdm

+from vllm import envs
 from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput,
                               BeamSearchSequence, get_beam_search_score)
-from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
+from vllm.engine.arg_utils import EngineArgs, TaskOption
 from
vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, apply_mistral_chat_template, @@ -29,7 +29,12 @@ get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, deprecate_kwargs, is_list_of +from vllm.utils import Counter, deprecate_args, deprecate_kwargs, is_list_of + +if envs.VLLM_USE_V1: + from vllm.v1.engine.llm_engine import LLMEngine # type: ignore +else: + from vllm.engine.llm_engine import LLMEngine # type: ignore logger = init_logger(__name__) @@ -108,6 +113,12 @@ class LLM: DEPRECATE_LEGACY: ClassVar[bool] = False """A flag to toggle whether to deprecate the legacy generate/encode API.""" + DEPRECATE_INIT_POSARGS: ClassVar[bool] = True + """ + A flag to toggle whether to deprecate positional arguments in + :meth:`LLM.__init__`. + """ + @classmethod @contextmanager def deprecate_legacy_api(cls): @@ -117,6 +128,13 @@ def deprecate_legacy_api(cls): cls.DEPRECATE_LEGACY = False + @deprecate_args( + start_index=2, # Ignore self and model + is_deprecated=lambda: LLM.DEPRECATE_INIT_POSARGS, + additional_message=( + "All positional arguments other than `model` will be " + "replaced with keyword arguments in an upcoming version."), + ) def __init__( self, model: str, @@ -139,6 +157,8 @@ def __init__( disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, mm_processor_kwargs: Optional[Dict[str, Any]] = None, + # After positional args are removed, move this right below `model` + task: TaskOption = "auto", **kwargs, ) -> None: ''' @@ -153,6 +173,7 @@ def __init__( engine_args = EngineArgs( model=model, + task=task, tokenizer=tokenizer, tokenizer_mode=tokenizer_mode, skip_tokenizer_init=skip_tokenizer_init, @@ -316,10 +337,21 @@ def generate( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - if self.llm_engine.model_config.embedding_mode: - raise ValueError( + task = self.llm_engine.model_config.task + if task != "generate": + messages = [ "LLM.generate() is only supported for (conditional) generation " - "models (XForCausalLM, XForConditionalGeneration).") + "models (XForCausalLM, XForConditionalGeneration).", + ] + + supported_tasks = self.llm_engine.model_config.supported_tasks + if "generate" in supported_tasks: + messages.append( + "Your model supports the 'generate' task, but is " + f"currently initialized for the '{task}' task. Please " + "initialize the model using `--task generate`.") + + raise ValueError(" ".join(messages)) if prompt_token_ids is not None: parsed_prompts = self._convert_v1_inputs( @@ -433,6 +465,7 @@ def sort_beams_key(x: BeamSearchSequence) -> float: for token_id, logprob_obj in logprobs.items(): new_beam = BeamSearchSequence( tokens=current_beam.tokens + [token_id], + logprobs=current_beam.logprobs + [logprobs], cum_logprob=current_beam.cum_logprob + logprob_obj.logprob) @@ -691,10 +724,18 @@ def encode( considered legacy and may be deprecated in the future. You should instead pass them via the ``inputs`` parameter. """ - if not self.llm_engine.model_config.embedding_mode: - raise ValueError( - "LLM.encode() is only supported for embedding models (XModel)." 
-                )
+        task = self.llm_engine.model_config.task
+        if task != "embedding":
+            messages = ["LLM.encode() is only supported for embedding models."]
+
+            supported_tasks = self.llm_engine.model_config.supported_tasks
+            if "embedding" in supported_tasks:
+                messages.append(
+                    "Your model supports the 'embedding' task, but is "
+                    f"currently initialized for the '{task}' task. Please "
+                    "initialize the model using `--task embedding`.")
+
+            raise ValueError(" ".join(messages))

         if prompt_token_ids is not None:
             parsed_prompts = self._convert_v1_inputs(
@@ -904,6 +945,3 @@ def _run_engine(

     def _is_encoder_decoder_model(self):
         return self.llm_engine.is_encoder_decoder_model()
-
-    def _is_embedding_model(self):
-        return self.llm_engine.is_embedding_model()
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 6f1135f8093ba..733decf80a711 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -284,6 +284,12 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "The priority of the request (lower means earlier handling; "
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."))
+    request_id: str = Field(
+        default_factory=lambda: f"{random_uuid()}",
+        description=(
+            "The request_id related to this request. If the caller does "
+            "not set it, a random_uuid will be generated. This id is used "
+            "throughout the inference process and returned in the response."))

     # doc: end-chat-completion-extra-params

@@ -314,9 +320,15 @@ def to_sampling_params(self, default_max_tokens: int) -> SamplingParams:
             prompt_logprobs = self.top_logprobs

         guided_json_object = None
-        if (self.response_format is not None
-                and self.response_format.type == "json_object"):
-            guided_json_object = True
+        if self.response_format is not None:
+            if self.response_format.type == "json_object":
+                guided_json_object = True
+            elif self.response_format.type == "json_schema":
+                json_schema = self.response_format.json_schema
+                assert json_schema is not None
+                self.guided_json = json_schema.json_schema
+                if self.guided_decoding_backend is None:
+                    self.guided_decoding_backend = "lm-format-enforcer"

         guided_decoding = GuidedDecodingParams.from_optional(
             json=self._get_guided_json_from_tool() or self.guided_json,
@@ -537,8 +549,8 @@ class CompletionRequest(OpenAIBaseModel):
         default=None,
         description=
         ("Similar to chat completion, this parameter specifies the format of "
-         "output. Only {'type': 'json_object'} or {'type': 'text' } is "
-         "supported."),
+         "output.
Only {'type': 'json_object'}, {'type': 'json_schema'} or " + "{'type': 'text' } is supported."), ) guided_json: Optional[Union[str, dict, BaseModel]] = Field( default=None, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index c3fa0e44e5e8d..b9b240b64850e 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -38,7 +38,7 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import iterate_with_cancellation, random_uuid +from vllm.utils import iterate_with_cancellation logger = init_logger(__name__) @@ -176,7 +176,7 @@ async def create_chat_completion( "\"auto\" tool choice requires " "--enable-auto-tool-choice and --tool-call-parser to be set") - request_id = f"chat-{random_uuid()}" + request_id = f"chat-{request.request_id}" request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 1e08cd9712bc0..56e35950410a0 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -258,6 +258,14 @@ async def completion_stream_generator( has_echoed = [False] * num_choices * num_prompts num_prompt_tokens = [0] * num_prompts + stream_options = request.stream_options + if stream_options: + include_usage = stream_options.include_usage + include_continuous_usage = include_usage and \ + stream_options.continuous_usage_stats + else: + include_usage, include_continuous_usage = False, False + try: async for prompt_idx, res in result_generator: prompt_token_ids = res.prompt_token_ids @@ -276,28 +284,25 @@ async def completion_stream_generator( i = output.index + prompt_idx * num_choices assert request.max_tokens is not None - if request.echo and request.max_tokens == 0: + if request.echo and not has_echoed[i]: assert prompt_token_ids is not None assert prompt_text is not None - # only return the prompt - delta_text = prompt_text - delta_token_ids = prompt_token_ids - out_logprobs = prompt_logprobs - has_echoed[i] = True - elif (request.echo and request.max_tokens > 0 - and not has_echoed[i]): - assert prompt_token_ids is not None - assert prompt_text is not None - assert prompt_logprobs is not None - # echo the prompt and first token - delta_text = prompt_text + output.text - delta_token_ids = [ - *prompt_token_ids, *output.token_ids - ] - out_logprobs = [ - *prompt_logprobs, - *(output.logprobs or []), - ] + if request.max_tokens == 0: + # only return the prompt + delta_text = prompt_text + delta_token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + else: + assert prompt_logprobs is not None + # echo the prompt and first token + delta_text = prompt_text + output.text + delta_token_ids = [ + *prompt_token_ids, *output.token_ids + ] + out_logprobs = [ + *prompt_logprobs, + *(output.logprobs or []), + ] has_echoed[i] = True else: # return just the delta @@ -341,45 +346,39 @@ async def completion_stream_generator( stop_reason=stop_reason, ) ]) - if (request.stream_options - and request.stream_options.include_usage): - if (request.stream_options.continuous_usage_stats - or output.finish_reason is not None): - prompt_tokens = num_prompt_tokens[prompt_idx] - completion_tokens = previous_num_tokens[i] - usage = UsageInfo( - prompt_tokens=prompt_tokens, - 
completion_tokens=completion_tokens, - total_tokens=prompt_tokens + completion_tokens, - ) - if request.stream_options.continuous_usage_stats: - chunk.usage = usage - else: - chunk.usage = None + if include_continuous_usage: + prompt_tokens = num_prompt_tokens[prompt_idx] + completion_tokens = previous_num_tokens[i] + chunk.usage = UsageInfo( + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=prompt_tokens + completion_tokens, + ) response_json = chunk.model_dump_json(exclude_unset=False) yield f"data: {response_json}\n\n" - if (request.stream_options - and request.stream_options.include_usage): + total_prompt_tokens = sum(num_prompt_tokens) + total_completion_tokens = sum(previous_num_tokens) + final_usage_info = UsageInfo( + prompt_tokens=total_prompt_tokens, + completion_tokens=total_completion_tokens, + total_tokens=total_prompt_tokens + total_completion_tokens) + + if include_usage: final_usage_chunk = CompletionStreamResponse( id=request_id, created=created_time, model=model_name, choices=[], - usage=usage, + usage=final_usage_info, ) final_usage_data = (final_usage_chunk.model_dump_json( exclude_unset=False, exclude_none=True)) yield f"data: {final_usage_data}\n\n" # report to FastAPI middleware aggregate usage across all choices - total_prompt_tokens = sum(num_prompt_tokens) - total_completion_tokens = sum(previous_num_tokens) - request_metadata.final_usage_info = UsageInfo( - prompt_tokens=total_prompt_tokens, - completion_tokens=total_completion_tokens, - total_tokens=total_prompt_tokens + total_completion_tokens) + request_metadata.final_usage_info = final_usage_info except ValueError as e: # TODO: Use a vllm-specific Validation Error @@ -413,26 +412,26 @@ def request_output_to_completion_response( for output in final_res.outputs: assert request.max_tokens is not None - if request.echo and request.max_tokens == 0: - assert prompt_text is not None - token_ids = prompt_token_ids - out_logprobs = prompt_logprobs - output_text = prompt_text - elif request.echo and request.max_tokens > 0: + if request.echo: assert prompt_text is not None - token_ids = [*prompt_token_ids, *output.token_ids] - - if request.logprobs is None: - out_logprobs = None + if request.max_tokens == 0: + token_ids = prompt_token_ids + out_logprobs = prompt_logprobs + output_text = prompt_text else: - assert prompt_logprobs is not None - assert output.logprobs is not None - out_logprobs = [ - *prompt_logprobs, - *output.logprobs, - ] - - output_text = prompt_text + output.text + token_ids = [*prompt_token_ids, *output.token_ids] + + if request.logprobs is None: + out_logprobs = None + else: + assert prompt_logprobs is not None + assert output.logprobs is not None + out_logprobs = [ + *prompt_logprobs, + *output.logprobs, + ] + + output_text = prompt_text + output.text else: token_ids = output.token_ids out_logprobs = output.logprobs diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index e9504cfa64b65..6c46aae2838f6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -83,7 +83,8 @@ def __init__( lora_modules=None, prompt_adapters=None, request_logger=request_logger) - self._enabled = self._check_embedding_mode(model_config.embedding_mode) + self._enabled = self._check_embedding_mode( + model_config.task == "embedding") async def create_embedding( self, diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py 
index 309d9bede489b..0e88bb21ca75f 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,10 +1,12 @@ from .abstract_tool_parser import ToolParser, ToolParserManager from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser +from .jamba_tool_parser import JambaToolParser from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser __all__ = [ "ToolParser", "ToolParserManager", "Hermes2ProToolParser", - "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser" + "MistralToolParser", "Internlm2ToolParser", "Llama3JsonToolParser", + "JambaToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index e7ea82ebd5411..faa6f653b835c 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -53,7 +53,8 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_call_start_token_id = self.vocab.get( self.tool_call_start_token) self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) - if not self.tool_call_start_token_id or not self.tool_call_end_token_id: + if (self.tool_call_start_token_id is None + or self.tool_call_end_token_id is None): raise RuntimeError( "Hermes 2 Pro Tool parser could not locate tool call start/end " "tokens in the tokenizer!") diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py new file mode 100644 index 0000000000000..cfd024853f887 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -0,0 +1,300 @@ +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizers import MistralTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("jamba") +class JambaToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + raise ValueError( + "Detected a MistralTokenizer tokenizer when using a Jamba model" + ) + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + + self.tool_calls_start_token: str = "" + self.tool_calls_end_token: str = "" + + self.tool_calls_regex = re.compile( + rf"{self.tool_calls_start_token}(.*?){self.tool_calls_end_token}", + re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction.") + self.tool_calls_start_token_id = self.vocab.get( + self.tool_calls_start_token) + self.tool_calls_end_token_id = 
self.vocab.get(
+            self.tool_calls_end_token)
+        if (self.tool_calls_start_token_id is None
+                or self.tool_calls_end_token_id is None):
+            raise RuntimeError(
+                "Jamba Tool parser could not locate tool calls start/end "
+                "tokens in the tokenizer!")
+
+    def adjust_request(
+            self, request: ChatCompletionRequest) -> ChatCompletionRequest:
+        if request.tools and request.tool_choice != 'none':
+            # do not skip special tokens because Jamba uses the special
+            # tokens to indicate the start and end of the tool calls
+            # information.
+            request.skip_special_tokens = False
+        return request
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+
+        # sanity check; avoid unnecessary processing
+        if self.tool_calls_start_token not in model_output:
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        else:
+
+            try:
+                # use a regex to find the tool call between the tags
+                function_calls = self.tool_calls_regex.findall(model_output)[0]
+
+                # load the JSON, and then use it to build the Function and
+                # Tool Call
+                raw_function_calls = json.loads(function_calls)
+                tool_calls = [
+                    ToolCall(
+                        type="function",
+                        function=FunctionCall(
+                            name=function_call["name"],
+                            # function call args are JSON but as a string
+                            arguments=json.dumps(function_call["arguments"])))
+                    for function_call in raw_function_calls
+                ]
+
+                content = model_output[:model_output.
+                                       find(self.tool_calls_start_token)]
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=tool_calls,
+                    content=content if
+                    (len(content) > 0 and content != " ") else None)
+
+            except Exception:
+                logger.exception(
+                    "Error in extracting tool call from response.")
+                return ExtractedToolCallInformation(tools_called=False,
+                                                    tool_calls=[],
+                                                    content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+
+        # if the tool call token is not in the tokens generated so far, append
+        # output to contents since it's not a tool call
+        if self.tool_calls_start_token not in current_text:
+            return DeltaMessage(content=delta_text)
+
+        # if the tool call token ID IS in the tokens generated so far, that
+        # means we're parsing tool calls now
+
+        # handle the case where we detected the start of tool calls token,
+        # which means tool calling has started
+        if (self.tool_calls_start_token_id in delta_token_ids
+                and len(delta_token_ids) == 1):
+            # if it's the only token, return None, so we don't send a chat
+            # completion and don't send a control token
+            return None
+
+        # bit mask flags for partial JSON parsing. If the name hasn't been
+        # sent yet, don't allow sending
+        # an incomplete string since OpenAI only ever (as far as I have
+        # seen) allows sending the entire tool/function name at once.
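+        # As a rough illustration (assumed partial_json_parser behavior):
+        # with Allow.ALL & ~Allow.STR, a prefix like
+        #   '[{"name": "get_weather", "arguments": {"city": "Da'
+        # parses to [{"name": "get_weather", "arguments": {}}], holding
+        # back the still-open string until it is closed.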
+        flags = Allow.ALL if self.current_tool_name_sent \
+            else Allow.ALL & ~Allow.STR
+        try:
+
+            # Extract the tool calls between the special tool call tokens
+            parsable_arr = current_text.split(
+                self.tool_calls_start_token)[-1].split(
+                    self.tool_calls_end_token)[0]
+
+            # tool calls are generated in an array, so do partial JSON
+            # parsing on the entire array
+            try:
+                tool_call_arr: List[Dict] = partial_json_parser.loads(
+                    parsable_arr, flags)
+            except partial_json_parser.core.exceptions.MalformedJSON:
+                logger.debug('not enough tokens to parse into JSON yet')
+                return None
+
+            # select as the current tool call the one we are currently
+            # working on
+            current_tool_call: Dict = tool_call_arr[self.current_tool_id] \
+                if len(tool_call_arr) > 0 else {}
+
+            # case -- if no tokens have been streamed for the tool, e.g.
+            #   only the array brackets, stream nothing
+            if len(tool_call_arr) == 0:
+                return None
+
+            # case: we are starting a new tool in the array
+            #   -> array has > 0 length AND length has moved past cursor
+            elif (len(tool_call_arr) > 0
+                  and len(tool_call_arr) > self.current_tool_id + 1):
+
+                # if we're moving on to a new call, first make sure we
+                # haven't missed anything in the previous one that was
+                # auto-generated due to JSON completions, but wasn't
+                # streamed to the client yet.
+                if self.current_tool_id >= 0:
+                    diff: Union[str, None] = current_tool_call.get("arguments")
+
+                    if diff:
+                        diff = json.dumps(diff).replace(
+                            self.streamed_args_for_tool[self.current_tool_id],
+                            "")
+                        delta = DeltaMessage(tool_calls=[
+                            DeltaToolCall(index=self.current_tool_id,
+                                          function=DeltaFunctionCall(
+                                              arguments=diff).model_dump(
+                                                  exclude_none=True))
+                        ])
+                        self.streamed_args_for_tool[
+                            self.current_tool_id] += diff
+                    else:
+                        delta = None
+                else:
+                    delta = None
+                # reset state pertaining to progress in the current tool
+                self.current_tool_id = len(tool_call_arr) - 1
+                self.current_tool_name_sent = False
+                self.streamed_args_for_tool.append("")
+                logger.debug("starting on new tool %d", self.current_tool_id)
+                return delta
+
+            # case: update an existing tool - this is handled below
+
+            # if the current tool name hasn't been sent, send if available
+            # - otherwise send nothing
+            if not self.current_tool_name_sent:
+                function_name = current_tool_call.get("name")
+                if function_name:
+
+                    delta = DeltaMessage(tool_calls=[
+                        DeltaToolCall(index=self.current_tool_id,
+                                      type="function",
+                                      id=f"chatcmpl-tool-{random_uuid()}",
+                                      function=DeltaFunctionCall(
+                                          name=function_name).model_dump(
+                                              exclude_none=True))
+                    ])
+                    self.current_tool_name_sent = True
+                else:
+                    delta = None
+
+            # now we know we're on the same tool call and we're streaming
+            # arguments
+            else:
+
+                prev_arguments = self.prev_tool_call_arr[
+                    self.current_tool_id].get("arguments")
+                cur_arguments = current_tool_call.get("arguments")
+
+                new_text = delta_text.replace("\'", "\"")
+
+                if not cur_arguments and not prev_arguments:
+
+                    delta = None
+                elif not cur_arguments and prev_arguments:
+                    logger.error(
+                        "INVARIANT - impossible to have arguments reset "
+                        "mid-arguments")
+                    delta = None
+                elif cur_arguments and not prev_arguments:
+                    cur_arguments_json = json.dumps(cur_arguments)
+                    logger.debug("finding %s in %s", new_text,
+                                 cur_arguments_json)
+
+                    arguments_delta = cur_arguments_json[:cur_arguments_json.
+ index(new_text) + + len(new_text)] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += arguments_delta + + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments) + prev_args_json = json.dumps(prev_arguments) + logger.debug("Searching for diff between \n%s\n%s", + cur_args_json, prev_args_json) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json) + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + # try parsing it with regular JSON - if it works we're + # at the end, and we need to send the difference between + # tokens streamed so far and the valid JSON + delta = None + + # check to see if the name is defined and has been sent. if so, + # stream the name - otherwise keep waiting + # finish by setting old and returning None as base case + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index ff4e88f29d39e..f5c0d92f3f9bd 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -63,7 +63,7 @@ def __init__(self, tokenizer: AnyTokenizer): self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) - if not self.bot_token_id: + if self.bot_token_id is None: raise RuntimeError( "Mistral Tool Parser could not locate the tool call token in " "the tokenizer!") diff --git a/vllm/envs.py b/vllm/envs.py index 2396e87e20c39..ae6825f280073 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -27,11 +27,13 @@ VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 VLLM_LOGGING_LEVEL: str = "INFO" + VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_USE_FLASHINFER_SAMPLER: bool = False VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False + VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" @@ -67,6 +69,7 @@ VLLM_TORCH_COMPILE_LEVEL: int = 0 VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] + VLLM_USE_V1: bool = False def get_default_cache_root(): @@ -266,6 +269,10 @@ def get_default_config_root(): "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"), + # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages + "VLLM_LOGGING_PREFIX": + lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging @@ -286,6 +293,11 @@ def get_default_config_root(): "VLLM_USE_FLASHINFER_SAMPLER": lambda: 
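Editor's note: the three new variables above (`VLLM_LOGGING_PREFIX`, `VLLM_FLASHINFER_FORCE_TENSOR_CORES`, `VLLM_USE_V1`) all follow the `envs.py` convention of zero-argument lambdas, so the environment is read lazily at access time rather than once at import. A small sketch of the pattern as used in the diff:

```python
import os

# Sketch of the envs.py convention: each entry is a thunk evaluated on access.
environment_variables = {
    "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))),
    "VLLM_LOGGING_PREFIX": lambda: os.getenv("VLLM_LOGGING_PREFIX", ""),
}

os.environ["VLLM_USE_V1"] = "1"
print(environment_variables["VLLM_USE_V1"]())         # True
print(environment_variables["VLLM_LOGGING_PREFIX"]())  # "" when unset

# Note: the bool(int(...)) pattern expects a literal "0" or "1"; setting a
# flag to e.g. "true" would raise ValueError inside int().
```

The logging change below then prepends `VLLM_LOGGING_PREFIX` to the format string once, at module import, so the prefix applies uniformly to every log record.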
bool(int(os.getenv("VLLM_USE_FLASHINFER_SAMPLER", "0"))), + # If set, vllm will force flashinfer to use tensor cores; + # otherwise will use heuristic based on model architecture. + "VLLM_FLASHINFER_FORCE_TENSOR_CORES": + lambda: bool(int(os.getenv("VLLM_FLASHINFER_FORCE_TENSOR_CORES", "0"))), + # Pipeline stage partition strategy "VLLM_PP_LAYER_PARTITION": lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), @@ -444,6 +456,10 @@ def get_default_config_root(): "VLLM_DISABLED_KERNELS": lambda: [] if "VLLM_DISABLED_KERNELS" not in os.environ else os.environ[ "VLLM_DISABLED_KERNELS"].split(","), + + # If set, use the V1 code path. + "VLLM_USE_V1": + lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), } # end-env-vars-definition diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7e46acefc5b0e..0af7b3386d895 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -231,7 +231,7 @@ def initialize_ray_cluster( assert_ray_available() # Connect to a ray cluster. - if is_hip() or is_xpu(): + if is_hip() or current_platform.is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/logger.py b/vllm/logger.py index 77dddbfb60965..ccf09691a052a 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -15,8 +15,10 @@ VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL +VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX -_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + "%(filename)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" DEFAULT_LOGGING_CONFIG = { diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 549be116772c9..71eed6eb68d78 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -7,7 +7,7 @@ from vllm.compilation.levels import CompilationLevel from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import is_cpu, is_hip, is_xpu, print_warning_once +from vllm.utils import is_hip, print_warning_once logger = init_logger(__name__) @@ -74,11 +74,11 @@ def dispatch_forward(self): if is_hip(): return self.forward_hip - elif is_cpu(): + elif current_platform.is_cpu(): return self.forward_cpu elif current_platform.is_tpu(): return self.forward_tpu - elif is_xpu(): + elif current_platform.is_xpu(): return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index cf99306c9caef..8de3385a257f8 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -264,6 +264,8 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): lambda: nn.ReLU(), "relu2": lambda: ReLUSquaredActivation(), + "silu": + lambda: nn.SiLU(), "quick_gelu": lambda: QuickGELU(), }) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 10fae84dab723..30b43f375dd5c 100644 --- 
a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -27,7 +27,6 @@ def __init__( self.variance_epsilon = eps self.variance_size_override = (None if var_hidden_size == hidden_size else var_hidden_size) - self.weight = nn.Parameter(torch.ones(hidden_size)) def forward_native( diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 1d5b6fad2e160..288f5a1134b6b 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -48,14 +48,15 @@ def forward( self, lm_head: VocabParallelEmbedding, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, + sampling_metadata: Optional[SamplingMetadata] = None, embedding_bias: Optional[torch.Tensor] = None, ) -> Optional[torch.Tensor]: if self.logits_as_input: logits = hidden_states else: - hidden_states = _prune_hidden_states(hidden_states, - sampling_metadata) + if sampling_metadata is not None: + hidden_states = _prune_hidden_states(hidden_states, + sampling_metadata) # Get the logits for the next tokens. logits = self._get_logits(hidden_states, lm_head, embedding_bias) @@ -69,7 +70,8 @@ def forward( logits *= self.scale # Apply logits processors (if any). - logits = _apply_logits_processors(logits, sampling_metadata) + if sampling_metadata is not None: + logits = _apply_logits_processors(logits, sampling_metadata) return logits diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 410b3cb5321cb..38dd1f2e10fcd 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import (GroupQuantScaleParameter, @@ -21,10 +22,12 @@ def __init__( weight_bits: int, group_size: int, zero_point: bool, + modules_to_not_convert: Optional[List[str]] = None, ) -> None: self.weight_bits = weight_bits self.group_size = group_size self.zero_point = zero_point + self.modules_to_not_convert = modules_to_not_convert or [] if self.weight_bits != 4: raise ValueError( @@ -35,7 +38,8 @@ def __init__( def __repr__(self) -> str: return (f"AWQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " - f"zero_point={self.zero_point})") + f"zero_point={self.zero_point}, " + f"modules_to_not_convert={self.modules_to_not_convert})") def get_name(self) -> str: return "awq" @@ -61,11 +65,15 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) - return cls(weight_bits, group_size, zero_point) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) + return cls(weight_bits, group_size, zero_point, modules_to_not_convert) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AWQLinearMethod"]: + prefix: str) -> Optional["LinearMethodBase"]: if isinstance(layer, LinearBase): + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() return 
AWQLinearMethod(self) return None @@ -73,6 +81,10 @@ def get_scaled_act_names(self) -> List[str]: return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) + + class AWQLinearMethod(LinearMethodBase): """Linear method for AWQ. diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 778162dd63ca6..1f2d7384076ed 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -122,7 +122,7 @@ def input_processor_for_blip( # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): - def __init__(self, config: BlipVisionConfig): + def __init__(self, config: Union[BlipVisionConfig, Blip2VisionConfig]): super().__init__() self.config = config @@ -167,9 +167,10 @@ class BlipParallelAttention(nn.Module): def __init__( self, - config: BlipVisionConfig, + config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -189,11 +190,13 @@ def __init__( self.num_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv", ) self.projection = RowParallelLinear( self.embed_dim, self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.projection", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -235,9 +238,12 @@ def forward( class BlipMLP(nn.Module): - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -246,11 +252,13 @@ def __init__(self, self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -262,24 +270,32 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class BlipEncoderLayer(nn.Module): - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() # fallback to sdpa attention if tp unavailable num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = BlipParallelAttention(config, - quant_config=quant_config) + self.self_attn = BlipParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: # Blip doesn't have SDPA attention implemented in transformers # use eager attention instead for cpu backend self.self_attn = BlipAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = BlipMLP(config, quant_config=quant_config) + self.mlp = BlipMLP(config, + 
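Editor's note: `is_layer_skipped_awq` above is a plain substring match over the layer's dotted prefix, so a single entry such as `"vision_model"` excludes every linear layer under that subtree from quantization. A standalone copy with usage:

```python
from typing import List


def is_layer_skipped_awq(prefix: str,
                         modules_to_not_convert: List[str]) -> bool:
    # Substring match: any listed module name appearing in the prefix skips it.
    return any(module_name in prefix for module_name in modules_to_not_convert)


print(is_layer_skipped_awq("vision_model.encoder.layers.0.mlp.fc1",
                           ["vision_model"]))   # True  -> UnquantizedLinearMethod
print(is_layer_skipped_awq("language_model.layers.0.mlp.gate_up_proj",
                           ["vision_model"]))   # False -> AWQLinearMethod
```

This is also why the `prefix=` plumbing in the vision-model files below matters: without accurate dotted prefixes reaching each linear layer, the skip test has nothing meaningful to match against.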
quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -307,10 +323,13 @@ class BlipEncoder(nn.Module): config: BlipConfig """ - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -321,8 +340,10 @@ def __init__(self, num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - BlipEncoderLayer(config=config, quant_config=quant_config) - for _ in range(num_hidden_layers) + BlipEncoderLayer(config=config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -337,10 +358,15 @@ class BlipVisionModel(nn.Module): config_class = BlipVisionConfig main_input_name = "pixel_values" - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -354,19 +380,24 @@ def __init__(self, config=config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder", ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." ) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index d6fe7d150336a..cd2013e91514d 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -490,7 +490,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. 
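Editor's note: a minimal restatement of the `require_post_norm` gating introduced above for `BlipVisionModel` (and mirrored for CLIP below), assuming the same semantics: when the caller does not say whether the final norm is needed, it is kept only for a full-depth tower, since a truncated tower is used for intermediate features and the norm would be dead weight.

```python
from typing import Optional


def needs_post_norm(num_built_layers: int, num_config_layers: int,
                    require_post_norm: Optional[bool] = None) -> bool:
    if num_built_layers > num_config_layers:
        raise ValueError("requested more layers than the original encoder has")
    if require_post_norm is None:
        # Default: keep post_layernorm only when the tower is full depth.
        require_post_norm = num_built_layers == num_config_layers
    return require_post_norm


print(needs_post_norm(24, 24))         # True: full tower keeps the norm
print(needs_post_norm(23, 24))         # False: truncated tower drops it
print(needs_post_norm(23, 24, True))   # the caller can still force it on
```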
- self.vision_model = BlipVisionModel(config.vision_config) + self.vision_model = BlipVisionModel(config.vision_config, quant_config) self.query_tokens = nn.Parameter( torch.zeros(1, config.num_query_tokens, diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 7b0981d611b25..6b45cb384d4a0 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -192,6 +192,7 @@ def __init__( self, config: CLIPVisionConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -211,12 +212,14 @@ def __init__( head_size=self.head_dim, total_num_heads=self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -259,20 +262,25 @@ def forward( class CLIPMLP(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -284,21 +292,29 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class CLIPEncoderLayer(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = CLIPParallelAttention(config, - quant_config=quant_config) + self.self_attn = CLIPParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: self.self_attn = CLIPSdpaAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = CLIPMLP(config, quant_config=quant_config) + self.mlp = CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -327,11 +343,15 @@ class CLIPEncoder(nn.Module): config: CLIPConfig """ - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config if num_hidden_layers_override is None: @@ -339,8 +359,10 @@ def __init__(self, else: num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - CLIPEncoderLayer(config=config, quant_config=quant_config) - for _ in 
range(num_hidden_layers) + CLIPEncoderLayer(config=config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -354,11 +376,17 @@ def forward(self, inputs_embeds: torch.Tensor): class CLIPVisionTransformer(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config embed_dim = config.hidden_size @@ -370,19 +398,25 @@ def __init__(self, self.encoder = CLIPEncoder( config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder", + ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." ) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None def forward( @@ -405,10 +439,15 @@ class CLIPVisionModel(nn.Module): config_class = CLIPVisionConfig main_input_name = "pixel_values" - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -418,7 +457,10 @@ def __init__(self, self.vision_model = CLIPVisionTransformer( config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + require_post_norm=require_post_norm, + prefix=f"{prefix}.vision_model", + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.vision_model(pixel_values) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 13811d33768a6..a87e1c0228627 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -44,7 +44,7 @@ def __init__(self, config: EAGLEConfig, *args, **kwargs) -> None: self.model = model_cls(self.config.model, *args, **kwargs) self.fc = nn.Linear(config.model.hidden_size * 2, config.model.hidden_size, - bias=getattr(self.config, "bias", False)) + bias=getattr(self.config, "eagle_fc_bias", False)) self.orig_vocab_size = config.vocab_size self.truncated_vocab_size = config.truncated_vocab_size diff --git a/vllm/model_executor/models/florence2.py 
b/vllm/model_executor/models/florence2.py new file mode 100644 index 0000000000000..6840ac8b9e303 --- /dev/null +++ b/vllm/model_executor/models/florence2.py @@ -0,0 +1,261 @@ +import math +from typing import Iterable, List, Optional, Tuple + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, + BartParallelLMHead, + BartScaledWordEmbedding) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .utils import AutoWeightsLoader + + +class Florence2LanguageModel(nn.Module): + + def __init__(self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.shared = BartScaledWordEmbedding(self.vocab_size, config.d_model) + self.encoder = BartEncoder(config, + cache_config=cache_config, + quant_config=quant_config) + self.decoder = BartDecoder(config, + cache_config=cache_config, + quant_config=quant_config) + + if self.config.tie_word_embeddings: + self.encoder.embed_tokens.weight = self.shared.weight + self.decoder.embed_tokens.weight = self.shared.weight + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *decoder* input sequence tokens. + encoder_input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + encoder_positions: + Positions of *encoder* input sequence tokens. 
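Editor's note: the `tie_word_embeddings` branch above works by parameter aliasing. Assigning the same `nn.Parameter` object makes the encoder table, decoder table, and shared table literal aliases, so one update (or one loaded checkpoint tensor) serves all three. A small sketch:

```python
import torch
import torch.nn as nn

shared = nn.Embedding(10, 4)
encoder_tok = nn.Embedding(10, 4)
decoder_tok = nn.Embedding(10, 4)

# Tie: all three modules now hold the *same* Parameter object.
encoder_tok.weight = shared.weight
decoder_tok.weight = shared.weight

assert encoder_tok.weight is decoder_tok.weight is shared.weight
with torch.no_grad():
    shared.weight[0, 0] = 42.0
print(encoder_tok.weight[0, 0].item())  # 42.0 -- all views move together
```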
+ kv_caches: + Layer-wise list of KV cache tensors + attn_metadata: + vLLM Attention metadata structure + Returns: + Model output torch.Tensor + """ + + encoder_hidden_states = None + + if encoder_input_ids.numel() > 0: + # Run encoder attention if a non-zero number of encoder tokens + # are provided as input + encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, + positions=encoder_positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata) + + # decoder outputs consists of + # (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, + decoder_positions=positions, + encoder_hidden_states=encoder_hidden_states, + kv_caches=kv_caches, + attn_metadata=attn_metadata) + + return decoder_outputs + + +class Florence2LanguageForConditionalGeneration(nn.Module): + + def __init__(self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.model = Florence2LanguageModel(config, + cache_config=cache_config, + quant_config=quant_config) + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.vocab_size = config.vocab_size + self.lm_head = BartParallelLMHead(self.vocab_size, + config.d_model, + embed_scale=embed_scale) + + self.logits_processor = LogitsProcessor(self.vocab_size, + config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ) -> torch.Tensor: + r""" + Args: + input_ids + torch.Tensor of *decoder* input token ids. + positions + torch.Tensor of *decoder* position indices. + encoder_input_ids + torch.Tensor of *encoder* input token ids. 
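Editor's note: the `encoder_input_ids.numel() > 0` guard above encodes the prefill/decode split for encoder-decoder models: the encoder runs only when encoder tokens are actually provided, and on decode steps the decoder is called with `encoder_hidden_states=None`. A tiny illustration of the control flow:

```python
import torch


def maybe_encode(encoder_input_ids: torch.Tensor):
    # Prefill: encoder tokens present, run encoder attention.
    if encoder_input_ids.numel() > 0:
        return f"encoded {encoder_input_ids.numel()} tokens"
    # Decode: nothing new to encode; the decoder sees None.
    return None


print(maybe_encode(torch.tensor([101, 2023, 102])))      # prefill step
print(maybe_encode(torch.empty(0, dtype=torch.long)))    # decode step
```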
+ encoder_positions + torch.Tensor of *encoder* position indices + kv_caches: + Layer-wise list of KV cache tensors + attn_metadata: + vLLM Attention metadata structure + Returns: + Output torch.Tensor + """ + return self.model(input_ids, positions, encoder_input_ids, + encoder_positions, kv_caches, attn_metadata) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata) -> SamplerOutput: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "final_logits_bias" in name: + continue + if self.config.tie_word_embeddings and "embed_tokens" in name: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + +class Florence2ForConditionalGeneration(nn.Module): + + def __init__(self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + # TODO(Isotr0py): Add vision backbone + self.language_model = Florence2LanguageForConditionalGeneration( + config=config.text_config, + cache_config=cache_config, + quant_config=quant_config) + + @property + def sampler(self): + return self.language_model.sampler + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + r""" + Args: + input_ids + torch.Tensor of *decoder* input token ids. + positions + torch.Tensor of *decoder* position indices. + encoder_input_ids + torch.Tensor of *encoder* input token ids. 
+ encoder_positions + torch.Tensor of *encoder* position indices + kv_caches: + Layer-wise list of KV cache tensors + attn_metadata: + vLLM Attention metadata structure + Returns: + Output torch.Tensor + """ + return self.language_model(input_ids, positions, encoder_input_ids, + encoder_positions, kv_caches, attn_metadata) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + skip_prefixes = [ + 'image_projection', "vision_tower", "image_proj_norm", + "image_pos_embed", "visual_temporal_embed" + ] + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index f958268741cd5..d79248f93f5ae 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -241,13 +241,7 @@ def forward( return hidden_states, residual -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - "positions": 0, - "inputs_embeds": 0, - "intermediate_tensors": 0, - }) +@support_torch_compile class Gemma2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 3b0b6febaa48c..43f4f29814e6d 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -113,7 +113,8 @@ def __init__( self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -130,12 +131,14 @@ def __init__( self.head_dim, self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( self.embed_dim, self.embed_dim, bias=True, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) @@ -178,7 +181,8 @@ def __init__( self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) @@ -187,12 +191,14 @@ def __init__( config.intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -204,13 +210,22 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics2EncoderLayer(nn.Module): - def __init__(self, config: Idefics2Config): + def __init__( + self, + config: Idefics2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.embed_dim = config.hidden_size - self.self_attn = Idefics2VisionAttention(config) + self.self_attn = Idefics2VisionAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.layer_norm1 = nn.LayerNorm(self.embed_dim, 
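Editor's note: a toy model of the two weight-loading conventions used above, under illustrative names (`row_offset`, `checkpoint` are not vLLM APIs). First, the `skip_prefixes` passed to `AutoWeightsLoader` drop tensors the model does not implement yet (here, Florence-2's vision pieces); the real loader's matching may be more structured than this `startswith` sketch. Second, `stacked_params_mapping` routes separate q/k/v checkpoint shards into disjoint rows of one fused `qkv_proj` parameter.

```python
import torch

hidden = 4
qkv_proj = torch.zeros(3 * hidden, hidden)      # fused destination parameter
row_offset = {"q": 0, "k": hidden, "v": 2 * hidden}

stacked_params_mapping = [("qkv_proj", "q_proj", "q"),
                          ("qkv_proj", "k_proj", "k"),
                          ("qkv_proj", "v_proj", "v")]
skip_prefixes = ["vision_tower", "image_projection"]

checkpoint = {
    "vision_tower.patch_embed.weight": torch.ones(hidden, hidden),
    "layers.0.attn.q_proj.weight": torch.full((hidden, hidden), 1.0),
    "layers.0.attn.k_proj.weight": torch.full((hidden, hidden), 2.0),
    "layers.0.attn.v_proj.weight": torch.full((hidden, hidden), 3.0),
}

for name, weight in checkpoint.items():
    if any(name.startswith(p) for p in skip_prefixes):
        continue  # tensor belongs to a subtree this model skips
    for param_name, weight_name, shard_id in stacked_params_mapping:
        if weight_name in name:
            off = row_offset[shard_id]
            qkv_proj[off:off + hidden].copy_(weight)  # stand-in weight_loader
            break

print(qkv_proj[:, 0].tolist())  # rows: four 1.0 (q), four 2.0 (k), four 3.0 (v)
```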
eps=config.layer_norm_eps) - self.mlp = Idefics2VisionMLP(config) + self.mlp = Idefics2VisionMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -245,12 +260,20 @@ class Idefics2Encoder(nn.Module): config: Idefics2Config """ - def __init__(self, config: Idefics2Config): + def __init__( + self, + config: Idefics2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.layers = nn.ModuleList([ - Idefics2EncoderLayer(config) - for _ in range(config.num_hidden_layers) + Idefics2EncoderLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) ]) def forward( @@ -275,12 +298,20 @@ def forward( class Idefics2VisionTransformer(nn.Module): - def __init__(self, config: Idefics2VisionConfig): + def __init__( + self, + config: Idefics2VisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + embed_dim = config.hidden_size self.config = config self.embeddings = Idefics2VisionEmbeddings(config) - self.encoder = Idefics2Encoder(config) + self.encoder = Idefics2Encoder(config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 35be1cec3d434..9761635d2a6c2 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -97,6 +97,37 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: return embeddings +class InternVisionPatchModel(nn.Module): + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embeddings = InternVisionEmbeddings(config) + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + pixel_embeds: Optional[torch.Tensor] = None, + ) -> torch.FloatTensor: + if pixel_values is None and pixel_embeds is None: + raise ValueError( + 'You have to specify pixel_values or pixel_embeds') + + if pixel_embeds is not None: + hidden_states = pixel_embeds + elif pixel_values is not None: + if pixel_values.ndim == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError( + f'wrong pixel_values size: {pixel_values.shape}') + + return hidden_states + + class InternParallelAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -106,6 +137,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, *, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: super().__init__() @@ -134,6 +166,7 @@ def __init__( num_dummy_heads + self.num_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv", ) self.qk_normalization = config.qk_normalization @@ -150,6 +183,7 @@ def __init__( self.dummy_dim, self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.proj", ) def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): @@ -253,20 +287,26 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class InternMLP(nn.Module): - def __init__(self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> 
None: super().__init__() + self.config = config self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -284,6 +324,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, *, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: super().__init__() @@ -293,9 +334,12 @@ def __init__( self.attn = self._init_attn(config, quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.attn") - self.mlp = InternMLP(config, quant_config=quant_config) + self.mlp = InternMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) self.norm2 = NORM2FN[self.norm_type](self.embed_dim, @@ -312,6 +356,7 @@ def _init_attn( quant_config: Optional[QuantizationConfig], *, num_dummy_heads: int, + prefix: str = "", ): # fallback to sdpa attention if tp unavailable tp_size = get_tensor_model_parallel_world_size() @@ -320,7 +365,8 @@ def _init_attn( if USE_XFORMERS_OPS and (num_heads + num_dummy_heads) % tp_size == 0: return InternParallelAttention(config, quant_config=quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=prefix) return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) @@ -346,6 +392,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, + prefix: str = "", ): super().__init__() @@ -359,8 +406,9 @@ def __init__( self.layers = nn.ModuleList([ InternVisionEncoderLayer(config, quant_config, - num_dummy_heads=num_dummy_heads) - for _ in range(num_hidden_layers) + num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -381,7 +429,8 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -392,6 +441,7 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.encoder", ) def get_input_embeddings(self): diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py new file mode 100644 index 0000000000000..6effd70b75da3 --- /dev/null +++ b/vllm/model_executor/models/internlm2_ve.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig +from vllm.distributed import get_pp_group +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.models.internlm2 import (InternLM2Attention, + InternLM2ForCausalLM, + InternLM2MLP, InternLM2Model) +from vllm.sequence import IntermediateTensors + +from .utils import make_layers + + +class InternLM2VEDecoderLayer(nn.Module): + + def 
__init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 8192) + self.attention = InternLM2Attention( + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + cache_config=cache_config, + quant_config=quant_config, + ) + self.feed_forward = InternLM2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + ) + self.feed_forward_ve = InternLM2MLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + ) + self.attention_norm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.ffn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + visual_token_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Self Attention + if residual is None: + residual = hidden_states + hidden_states = self.attention_norm(hidden_states) + else: + hidden_states, residual = self.attention_norm( + hidden_states, residual) + hidden_states = self.attention( + positions=positions, + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + + # Fully Connected + hidden_states, residual = self.ffn_norm(hidden_states, residual) + if visual_token_mask is not None and visual_token_mask.any(): + visual_token_mask = visual_token_mask.repeat( + 1, self.hidden_size).bool() + text_token_mask = ~visual_token_mask + hidden_states[visual_token_mask] = self.feed_forward_ve( + hidden_states[visual_token_mask].reshape( + -1, self.hidden_size)).flatten() + if text_token_mask.any(): + hidden_states[text_token_mask] = self.feed_forward( + hidden_states[text_token_mask].reshape( + -1, self.hidden_size)).flatten() + else: + hidden_states = self.feed_forward(hidden_states) + return hidden_states, residual + + +class InternLM2VEModel(InternLM2Model): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__(config, cache_config, quant_config) + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: InternLM2VEDecoderLayer(config, cache_config, + quant_config), + prefix=f"{prefix}.layers") + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + visual_token_mask: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.tok_embeddings(input_ids) + residual = None + else: + 
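Editor's note: the visual-expert routing above is the interesting part of `InternLM2VEDecoderLayer`: tokens flagged as visual go through `feed_forward_ve`, the rest through `feed_forward`, and the results are written back in place through boolean masks. A minimal sketch with illustrative shapes:

```python
import torch
import torch.nn as nn

hidden_size = 8
feed_forward = nn.Linear(hidden_size, hidden_size)     # text expert
feed_forward_ve = nn.Linear(hidden_size, hidden_size)  # visual expert

hidden_states = torch.randn(5, hidden_size)            # 5 tokens
visual_token_mask = torch.tensor([[1], [0], [1], [0], [0]], dtype=torch.bool)

mask = visual_token_mask.repeat(1, hidden_size)        # per-element row mask
text_mask = ~mask
out = hidden_states.clone()
# Route each group of whole rows through its expert, then scatter back.
out[mask] = feed_forward_ve(
    hidden_states[mask].reshape(-1, hidden_size)).flatten()
if text_mask.any():
    out[text_mask] = feed_forward(
        hidden_states[text_mask].reshape(-1, hidden_size)).flatten()
print(out.shape)  # torch.Size([5, 8]) -- same layout, per-token expert choice
```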
assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer( + positions, + hidden_states, + kv_caches[i - self.start_layer], + attn_metadata, + residual, + visual_token_mask=visual_token_mask, + ) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + +class InternLM2VEForCausalLM(InternLM2ForCausalLM): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__(config, cache_config, quant_config) + self.model = InternLM2VEModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index aada92cdf2456..3ae37d9fe5d85 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -19,9 +19,11 @@ from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, token_inputs) -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization import (AWQConfig, + QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput -from vllm.model_executor.models.intern_vit import InternVisionModel +from vllm.model_executor.models.intern_vit import (InternVisionModel, + InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.base import MultiModalInputs @@ -417,23 +419,24 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config + self._patch_quant_config(config, quant_config) image_size = config.force_image_size or config.vision_config.image_size patch_size = config.vision_config.patch_size self.patch_size = patch_size - self.select_layer = config.select_layer self.num_image_token = int( (image_size // patch_size)**2 * (config.downsample_ratio**2)) self.downsample_ratio = config.downsample_ratio self.ps_version = config.ps_version - vision_feature_layer = self.select_layer - if vision_feature_layer < 0: - num_hidden_layers = config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - self.vision_model = self._init_vision_model(config, num_hidden_layers) + self.llm_arch_name = config.text_config.architectures[0] + self.is_mono = self.llm_arch_name == 'InternLM2VEForCausalLM' + self.vision_model = self._init_vision_model( + config, + quant_config=quant_config, + is_mono=self.is_mono, + prefix="vision_model", + ) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) @@ -444,6 +447,18 @@ def __init__(self, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + def _patch_quant_config(self, config: PretrainedConfig, + quant_config: QuantizationConfig): + # the awq models from OpenGVLab missing `modules_to_not_convert` + # patch the quant_config to add `modules_to_not_convert` back + if isinstance(quant_config, AWQConfig): + text_config = config.text_config + llm_quant_config = getattr(text_config, 
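Editor's note: the mask that drives the routing above comes from the `_get_visual_token_mask` helper added to InternVL just below: on the mono path, every position holding the image-context token id is flagged visual. A sketch with an illustrative token id:

```python
import torch

img_context_token_id = 92546                   # illustrative id, not InternVL's
input_ids = torch.tensor([1, 92546, 92546, 7, 2])

visual_token_mask = (input_ids == img_context_token_id).reshape(-1, 1)
print(visual_token_mask.squeeze(1).tolist())   # [False, True, True, False, False]
```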
"quantization_config", + None) + if (not quant_config.modules_to_not_convert) and \ + (llm_quant_config is not None): + quant_config.modules_to_not_convert.append("vision_model") + @cached_property def sampler(self): if hasattr(self.language_model, "sampler"): @@ -451,10 +466,30 @@ def sampler(self): return Sampler() - def _init_vision_model(self, config: PretrainedConfig, - num_hidden_layers: int): - return InternVisionModel(config.vision_config, - num_hidden_layers_override=num_hidden_layers) + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): + if not is_mono: + vision_feature_layer = config.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + + return InternVisionModel( + config.vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + prefix=prefix, + ) + else: + return InternVisionPatchModel(config.vision_config) def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: vit_hidden_size = config.vision_config.hidden_size @@ -562,6 +597,14 @@ def _process_image_input( return image_embeds + def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: + if self.is_mono: + visual_token_mask = ( + input_ids == self.img_context_token_id).reshape(-1, 1) + else: + visual_token_mask = None + return visual_token_mask + def forward( self, input_ids: torch.Tensor, @@ -574,6 +617,7 @@ def forward( if intermediate_tensors is not None: input_ids = None inputs_embeds = None + visual_token_mask = None else: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: @@ -583,16 +627,24 @@ def forward( inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, vision_embeddings, self.img_context_token_id) + visual_token_mask = self._get_visual_token_mask(input_ids) input_ids = None else: inputs_embeds = None - - hidden_states = self.language_model.model(input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors, - inputs_embeds=inputs_embeds) + visual_token_mask = None + + forward_kwargs = { + "input_ids": input_ids, + "positions": positions, + "kv_caches": kv_caches, + "attn_metadata": attn_metadata, + "intermediate_tensors": intermediate_tensors, + "inputs_embeds": inputs_embeds, + } + if self.is_mono: + forward_kwargs.update({"visual_token_mask": visual_token_mask}) + + hidden_states = self.language_model.model(**forward_kwargs) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fd88ae8b50402..c346e3e808e3f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -268,13 +268,7 @@ def forward( return hidden_states, residual -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - "positions": 0, - "inputs_embeds": 0, - "intermediate_tensors": 0, - }) +@support_torch_compile class LlamaModel(nn.Module): def __init__( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index fd2827c0eff09..83e869efa4712 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,11 +1,12 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Literal, Mapping, 
Optional, Protocol, + Tuple, TypedDict, Union) import torch import torch.nn as nn from PIL import Image -from transformers import CLIPVisionConfig, LlavaConfig, SiglipVisionConfig +from transformers import (CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, + PretrainedConfig, SiglipVisionConfig) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -22,6 +23,10 @@ dummy_seq_data_for_clip, get_max_clip_image_tokens, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP +from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, + dummy_seq_data_for_pixtral_hf, + get_max_pixtral_hf_image_tokens, + input_processor_for_pixtral_hf) from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens, input_processor_for_siglip) @@ -31,8 +36,13 @@ class LlavaImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: torch.Tensor - """Shape: `(batch_size * num_images, num_channels, height, width)`""" + data: Union[torch.Tensor, List[torch.Tensor]] + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + + Note that `height` or `width` may be different per batch and image, + in which case the data is passed as a list instead of a batched tensor. + """ class LlavaImageEmbeddingInputs(TypedDict): @@ -77,6 +87,8 @@ def get_max_llava_image_tokens(ctx: InputContext): num_image_tokens = get_max_clip_image_tokens(vision_config) elif isinstance(vision_config, SiglipVisionConfig): num_image_tokens = get_max_siglip_image_tokens(vision_config) + elif isinstance(vision_config, PixtralVisionConfig): + num_image_tokens = get_max_pixtral_hf_image_tokens(vision_config) else: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -120,6 +132,17 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, mm_data = dummy_image_for_siglip(vision_config, num_images) return seq_data, mm_data + elif isinstance(vision_config, PixtralVisionConfig): + seq_data = dummy_seq_data_for_pixtral_hf( + vision_config, + seq_len, + num_images, + image_token_id=hf_config.image_token_index, + image_feature_size_override=image_feature_size, + ) + + mm_data = dummy_image_for_pixtral_hf(vision_config, num_images) + return seq_data, mm_data msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -163,12 +186,31 @@ def input_processor_for_llava(ctx: InputContext, inputs: DecoderOnlyInputs): image_token_id=hf_config.image_token_index, image_feature_size_override=image_feature_size, ) + elif isinstance(vision_config, PixtralVisionConfig): + # We ignore image_feature_size_override since we have non-uniform + # image sizes for Pixtral + return input_processor_for_pixtral_hf( + model_config, + vision_config, + inputs, + image_token_id=hf_config.image_token_index, + ) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaConfig): +class LlavaLikeConfig(Protocol): + vision_config: PretrainedConfig + vision_feature_layer: int + + +def init_vision_tower_for_llava( + hf_config: LlavaLikeConfig, + quant_config: Optional[QuantizationConfig], + *, + require_post_norm: Optional[bool] = None, +): vision_config = hf_config.vision_config # Initialize the vision tower only up to the required feature layer @@ -182,12 +224,23 @@ def _init_vision_tower(hf_config: LlavaConfig): if isinstance(vision_config, CLIPVisionConfig): return CLIPVisionModel( 
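Editor's note: `LlavaLikeConfig` above is a `typing.Protocol` because the LLaVA, LLaVA-NeXT, and HF-Pixtral configs share the two attributes the tower initializer reads without sharing a base class; structural typing describes exactly that contract. A sketch restating the feature-layer arithmetic from `init_vision_tower_for_llava` (the helper name `num_tower_layers` is mine):

```python
from typing import Protocol


class LlavaLikeConfig(Protocol):
    vision_config: object
    vision_feature_layer: int


def num_tower_layers(hf_config: LlavaLikeConfig, total: int) -> int:
    vfl = hf_config.vision_feature_layer
    # Negative feature layers count from the end, as in the diff.
    return total + vfl + 1 if vfl < 0 else vfl + 1


class FakeConfig:                  # satisfies the protocol by shape alone,
    vision_config = None           # no inheritance required
    vision_feature_layer = -2


print(num_tower_layers(FakeConfig(), 24))  # 23
```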
vision_config, + quant_config, num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) elif isinstance(vision_config, SiglipVisionConfig): return SiglipVisionModel( vision_config, + quant_config, num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, + ) + elif isinstance(vision_config, PixtralVisionConfig): + return PixtralHFVisionModel( + vision_config, + quant_config, + num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) msg = f"Unsupported vision config: {type(vision_config)}" @@ -210,8 +263,17 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config + # NOTE: These are special cases for Pixtral-12B in the HF-format + # https://huggingface.co/mistral-community/pixtral-12b/blob/main/config.json # noqa + if (config.text_config.architectures is None + and config.text_config.model_type == "mistral"): + config.text_config.architectures = ["MistralForCausalLM"] + if (config.projector_hidden_act is None + and config.vision_config.hidden_act == "gelu"): + config.projector_hidden_act = "gelu" + # TODO: Optionally initializes this for supporting embeddings. - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, @@ -243,9 +305,38 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: return data + def _validate_image_sizes(self, images: List[torch.Tensor], + sizes: List[torch.Tensor]) -> List[torch.Tensor]: + if not isinstance(sizes, list): + sizes = [sizes] + + total_images = sum(size.numel() // 2 for size in sizes) + if total_images != len(images): + raise ValueError("Mismatch in number of images. " + f"Expected {total_images}, got {len(images)}") + img_idx = 0 + for size in sizes: + # Flatten the size tensor to a list of (height, width) pairs + size = size.view(-1, 2).tolist() + for expected_h, expected_w in size: + if img_idx >= len(images): + raise ValueError("Ran out of images before sizes. " + f"{img_idx} >= {len(images)}") + img = images[img_idx] + if img.shape[-2:] != (expected_h, expected_w): + raise ValueError( + "Image size mismatch. Expected " + f"{(expected_h, expected_w)}, got {img.shape[-2:]}") + if img.shape[-3] != 3: + raise ValueError("Image channel mismatch. Expected 3, " + f"got {img.shape[-3]}") + img_idx += 1 + return images + def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) + image_sizes = kwargs.pop("image_sizes", None) image_embeds = kwargs.pop("image_embeds", None) if pixel_values is None and image_embeds is None: @@ -256,6 +347,34 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. 
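Editor's note: a standalone exercise of the variable-size handling above for Pixtral-style inputs, copying `flatten_to_3d_tensors` from the diff and pairing it with a simplified version of the `_validate_image_sizes` check: arbitrarily nested inputs come out as a flat list of `(C, H, W)` tensors, each checked against its reported size.

```python
import torch


def flatten_to_3d_tensors(item):
    if isinstance(item, torch.Tensor):
        if item.dim() >= 3:
            return [t for t in item.view(-1, *item.shape[-3:])]
        raise ValueError(f"Unexpected tensor dimension: {item.dim()}")
    if isinstance(item, list):
        return [t for sub in item for t in flatten_to_3d_tensors(sub)]
    raise ValueError(f"Unexpected type: {type(item)}")


images = flatten_to_3d_tensors([torch.zeros(2, 3, 32, 32),   # batched pair
                                [torch.zeros(3, 16, 48)]])   # odd-sized image
sizes = torch.tensor([[32, 32], [32, 32], [16, 48]])

for img, (h, w) in zip(images, sizes.view(-1, 2).tolist()):
    assert img.shape[-2:] == (h, w) and img.shape[-3] == 3

print([tuple(t.shape) for t in images])
# [(3, 32, 32), (3, 32, 32), (3, 16, 48)]
```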
" f"Got type: {type(pixel_values)}") + # Case for models like PixtralHF that have dynamic image sizes + # so we need to produce a list of tensors + if image_sizes is not None: + images = pixel_values + + def flatten_to_3d_tensors(item): + if isinstance(item, torch.Tensor): + if item.dim() >= 3: + return [t for t in item.view(-1, *item.shape[-3:])] + else: + raise ValueError( + f"Unexpected tensor dimension: {item.dim()}") + elif isinstance(item, list): + return [ + t for subitem in item + for t in flatten_to_3d_tensors(subitem) + ] + else: + raise ValueError(f"Unexpected type: {type(item)}") + + # Restructure the batched images into a list of lists of images + images = flatten_to_3d_tensors(pixel_values) + + return LlavaImagePixelInputs( + type="pixel_values", + data=self._validate_image_sizes(images, image_sizes), + ) + return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( @@ -286,7 +405,8 @@ def _select_image_features(self, image_features: torch.Tensor, *, def _image_pixels_to_features( self, - vision_tower: Union[CLIPVisionModel, SiglipVisionModel], + vision_tower: Union[CLIPVisionModel, SiglipVisionModel, + PixtralHFVisionModel], pixel_values: torch.Tensor, ) -> torch.Tensor: diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 4dd472b04bb1a..d33d4ac5bfaed 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -13,23 +13,25 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_image_feature_size, get_clip_patch_grid_length, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP -from .llava import LlavaMultiModalProjector +from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) -from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) +from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, + init_vllm_registered_model) # Result in the max possible feature size (2x2 grid of 336x336px tiles) MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 @@ -257,32 +259,6 @@ def input_processor_for_llava_next(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaNextConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = 
vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - @MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next) @@ -301,7 +277,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( @@ -312,6 +288,10 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) + # The same model class supports both language generation and embedding + # because the architecture name is the same + self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -605,14 +585,12 @@ def forward( image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - + inputs_embeds = embed_multimodal( + input_ids, + self.config.image_token_index, + self.language_model.model.get_input_embeddings, + lambda _: self._process_image_input(image_input), + ) input_ids = None else: inputs_embeds = None @@ -641,6 +619,13 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 4a354b616c2f6..d02cf9044dfc0 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -26,6 +26,7 @@ from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP +from .llava import init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip) from .utils import (AutoWeightsLoader, init_vllm_registered_model, @@ -179,32 +180,6 @@ def input_processor_for_llava_next_video(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaNextVideoConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + 
vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): @@ -281,7 +256,7 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5bd3055ca181a..10aa8049a2347 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -31,6 +31,7 @@ dummy_video_for_clip, get_clip_image_feature_size, get_clip_patch_grid_length, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP +from .llava import init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, dummy_video_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) @@ -357,32 +358,6 @@ def input_processor_for_llava_onevision(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaOnevisionConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - class LlavaOnevisionMultiModalProjector(nn.Module): def __init__(self, config: LlavaOnevisionConfig): @@ -425,7 +400,7 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 7f2efb9895f25..9f4f391a6682e 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -22,7 +22,7 @@ QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( - VocabParallelEmbedding) + 
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( composed_weight_loader, default_weight_loader, sharded_weight_loader) from vllm.model_executor.models.interfaces import (HasInnerState, @@ -59,7 +59,7 @@ def __init__(self, config: MambaConfig, layer_idx): self.conv_kernel_size = config.conv_kernel self.intermediate_size = config.intermediate_size self.time_step_rank = int(config.time_step_rank) - + self.is_falcon_mamba = config.model_type == "falcon_mamba" self.conv1d = ColumnParallelLinear( input_size=self.conv_kernel_size, output_size=self.intermediate_size, @@ -109,6 +109,13 @@ def __init__(self, config: MambaConfig, layer_idx): input_is_parallel=True, ) self.activation = config.hidden_act + if self.is_falcon_mamba: + self.dt_layernorm = RMSNorm(self.time_step_rank, + eps=config.mixer_rms_eps) + self.b_layernorm = RMSNorm(self.ssm_state_size, + eps=config.mixer_rms_eps) + self.c_layernorm = RMSNorm(self.ssm_state_size, + eps=config.mixer_rms_eps) def forward(self, hidden_states: torch.Tensor, attn_metadata: AttentionMetadata, @@ -158,8 +165,12 @@ def forward(self, hidden_states: torch.Tensor, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1, ) - - # Note that Jamba normalizes B, C, and time_step here but Mamba doesn't. + # Note that Jamba and FalconMamba normalizes B, C, and time_step here + # but Mamba doesn't. + if self.is_falcon_mamba: + time_step = self.dt_layernorm(time_step.contiguous()) + B = self.b_layernorm(B.contiguous()) + C = self.c_layernorm(C.contiguous()) discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) # 3.c perform the recurrence y ← SSM(A, B, C)(x) @@ -213,11 +224,9 @@ def __init__(self, super().__init__() self.layer_idx = layer_idx self.config = config + self.is_falcon_mamba = config.model_type == "falcon_mamba" self.mixer = MambaMixer(config, layer_idx) - self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) - self.pre_ff_layernorm = RMSNorm(config.hidden_size, - eps=config.layer_norm_epsilon) def forward( self, @@ -319,8 +328,18 @@ def __init__( self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - - self.lm_head = self.backbone.embeddings + if config.tie_word_embeddings: + self.lm_head = self.backbone.embeddings + else: + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + ) # Used to track and store by the Mamba cache between steps. self.mamba_cache: Optional[MambaCacheManager] = None @@ -398,7 +417,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") - # Skip loading extra bias for GPTQ models. 
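# (Illustrative annotation, not part of the patch.) The "A_log" rename above points the weight at the "A" parameter; elsewhere in this file its loader is composed from sharded_weight_loader with a transform along the lines of `lambda w: -torch.exp(w.float())`, reconstructing A = -exp(A_log) at load time. The check below then skips ".bias" tensors (e.g. from GPTQ checkpoints) that have no matching vLLM parameter.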
if name.endswith(".bias") and name not in params_dict: continue diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index ca7c2be5a038e..2ec51dc4647f5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -395,7 +395,7 @@ def __init__( self.version = get_version_by_config(self.config) self.llm = self.init_llm(config, cache_config, quant_config) - self.vpm = self.init_vision_module() + self.vpm = self.init_vision_module(config, quant_config) param_dtype = torch.get_default_dtype() self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else @@ -647,7 +647,11 @@ def init_llm( ) -> nn.Module: raise NotImplementedError - def init_vision_module(self) -> nn.Module: + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: raise NotImplementedError def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: @@ -693,7 +697,11 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: # TODO :refactor this vision model try: import timm @@ -817,8 +825,13 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: - model = Idefics2VisionTransformer(self.config.vision_config) + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -929,9 +942,13 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: - - model = Idefics2VisionTransformer(self.config.vision_config) + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 378231f14455a..23e2b520e5b40 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -379,9 +379,13 @@ def forward( class MllamaVisionEncoderLayer(nn.Module): - def __init__(self, - config: config_mllama.MllamaVisionConfig, - is_gated: bool = False): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + is_gated: bool = False, + ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -390,7 +394,9 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.self_attn = MllamaVisionSdpaAttention(config) - self.mlp = CLIPMLP(config) + self.mlp = CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps) @@ -427,16 +433,23 @@ def forward( class MllamaVisionEncoder(nn.Module): - def __init__(self, - config: config_mllama.MllamaVisionConfig, - num_layers=32, - is_gated=False, - output_hidden_states=None): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: 
Optional[QuantizationConfig], + num_layers: int = 32, + is_gated: bool = False, + output_hidden_states=None, + prefix: str = "", + ) -> None: super().__init__() self.config = config self.layers = nn.ModuleList([ - MllamaVisionEncoderLayer(config, is_gated) - for _ in range(num_layers) + MllamaVisionEncoderLayer(config, + quant_config=quant_config, + is_gated=is_gated, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_layers) ]) self.output_hidden_states = output_hidden_states or [] @@ -463,8 +476,14 @@ def forward( class MllamaVisionModel(nn.Module): - def __init__(self, config: config_mllama.MllamaVisionConfig): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.image_size = config.image_size self.patch_size = config.patch_size self.max_num_tiles = config.max_num_tiles @@ -500,12 +519,19 @@ def __init__(self, config: config_mllama.MllamaVisionConfig): # encoders self.transformer = MllamaVisionEncoder( config, + quant_config, config.num_hidden_layers, is_gated=False, - output_hidden_states=config.intermediate_layers_indices) - self.global_transformer = MllamaVisionEncoder(config, - config.num_global_layers, - is_gated=True) + output_hidden_states=config.intermediate_layers_indices, + prefix=f"{prefix}.transformer", + ) + self.global_transformer = MllamaVisionEncoder( + config, + quant_config, + config.num_global_layers, + is_gated=True, + prefix=f"{prefix}.global_transformer", + ) def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: @@ -648,6 +674,7 @@ def __init__( config: Optional[config_mllama.MllamaTextConfig] = None, layer_idx: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -673,6 +700,7 @@ def __init__( self.num_key_value_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.num_heads * self.head_dim, @@ -680,6 +708,7 @@ def __init__( bias=False, input_is_parallel=True, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) # vllm.model_executor.layers.layernorm.RMSNorm has precision issue, # use huggingface's instead @@ -692,6 +721,7 @@ def __init__( self.head_dim, self.scaling, self.num_local_key_value_heads, + prefix=f"{prefix}.attn", ) def forward( @@ -791,15 +821,21 @@ class MllamaCrossAttentionDecoderLayer(torch.nn.Module): """Cross-attention transformer block with tanh-gated attention and feedforward.""" - def __init__(self, config: config_mllama.MllamaTextConfig, layer_idx: int, - quant_config: Optional[QuantizationConfig]) \ - -> None: + def __init__( + self, + config: config_mllama.MllamaTextConfig, + layer_idx: int, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.layer_idx = layer_idx self.cross_attn = MllamaTextCrossAttention( config=config, layer_idx=layer_idx, quant_config=quant_config, + prefix=f"{prefix}.cross_attn", ) self.input_layernorm = RMSNorm(config.hidden_size, @@ -811,6 +847,7 @@ def __init__(self, config: config_mllama.MllamaTextConfig, layer_idx: int, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -854,10 +891,15 @@ class MllamaTextModel(nn.Module): config_class = config_mllama.MllamaTextConfig base_model_prefix = 
"model" - def __init__(self, config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaTextConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, @@ -869,13 +911,20 @@ def __init__(self, config: config_mllama.MllamaTextConfig, if layer_idx in self.cross_attention_layers: layers.append( MllamaCrossAttentionDecoderLayer( - config, layer_idx, quant_config=quant_config)) + config, + layer_idx, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + )) else: # TODO: force LlamaDecoderLayer to config.attention_bias=False layers.append( - LlamaDecoderLayer(config, - cache_config=cache_config, - quant_config=quant_config)) + LlamaDecoderLayer( + config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + )) self.layers = nn.ModuleList(layers) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -932,12 +981,19 @@ class MllamaForCausalLM(nn.Module): "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" ] - def __init__(self, config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaTextConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() self.vocab_size = config.vocab_size - self.model = MllamaTextModel(config, cache_config, quant_config) + self.model = MllamaTextModel(config, + cache_config, + quant_config, + prefix=f"{prefix}.model") self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, @@ -994,11 +1050,13 @@ def __init__(self, config.pad_token_id if config.pad_token_id is not None else -1 self.image_size = config.vision_config.image_size - self.vision_model = MllamaVisionModel(config.vision_config) + self.vision_model = MllamaVisionModel(config.vision_config, + quant_config) self.language_model = MllamaForCausalLM( config.text_config, cache_config=cache_config, quant_config=quant_config, + prefix="language_model", ) self.multi_modal_projector = nn.Linear( config.vision_config.vision_output_dim, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index a52e3cb6039be..df4fd0a3256e9 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,10 +4,13 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- +from typing import Optional + import torch.nn as nn from transformers import PretrainedConfig from vllm.inputs import INPUT_REGISTRY +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from .intern_vit import InternVisionModel @@ -55,10 +58,31 @@ def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: nn.Linear(llm_intermediate_size, llm_hidden_size, bias=False), ) - def _init_vision_model(self, config: PretrainedConfig, - num_hidden_layers: int): - # We added additional dummy heads to the original num of heads to make - # the number of heads divisible by 8. 
- return InternVisionModel(config.vision_config, - num_hidden_layers_override=num_hidden_layers, - num_dummy_heads=7) + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): + if not is_mono: + vision_feature_layer = config.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + + # We added additional dummy heads to the original num of heads to + # make the number of heads divisible by 8. + return InternVisionModel( + config.vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + num_dummy_heads=7, + prefix=prefix, + ) + else: + msg = "Monolith mode is not applicable to NVLM_D" + raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 7806cd6ab4608..7a62a098a4525 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -142,7 +142,8 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config - self.vision_tower = SiglipVisionModel(config.vision_config) + self.vision_tower = SiglipVisionModel(config.vision_config, + quant_config) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, projection_dim=config.vision_config.projection_dim) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 0918f21a40e27..ec20cb249ba9b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -102,8 +102,9 @@ def __init__(self, # pylint: disable=C0301 # Refer to: # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518 - rope_theta = 10000 - max_position_embeddings = getattr(config, "n_positions", 2048) + rope_theta = getattr(config, "rope_theta", 10000.0) + max_position_embeddings = getattr(config, "max_position_embeddings", + 2048) self.rotary_emb = get_rope( self.head_size, rotary_dim=rotary_dim, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 91c14e32c946c..855a9b17585a4 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -70,7 +70,8 @@ projection_dim=768) -def _init_img_processor(hf_config: PretrainedConfig): +def _init_img_processor(hf_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG layer_idx = hf_config.img_processor.get('layer_idx', -2) @@ -82,7 +83,10 @@ def _init_img_processor(hf_config: PretrainedConfig): num_hidden_layers = layer_idx + 1 img_processor = CLIPVisionModel( - clip_config, num_hidden_layers_override=num_hidden_layers) + clip_config, + quant_config, + num_hidden_layers_override=num_hidden_layers, + ) return img_processor @@ -148,14 +152,15 @@ def get_img_features(self, class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): """Phi3 Image embedding with HD transform.""" - def __init__(self, config: PretrainedConfig) -> None: + def __init__(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]) -> None: super().__init__() # n_embed or hidden_size hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - self.img_processor = _init_img_processor(config) + self.img_processor = _init_img_processor(config, 
quant_config) image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -467,8 +472,6 @@ def input_processor_for_phi3v(ctx: InputContext, prompt_token_ids = inputs["prompt_token_ids"].copy() - print("prompt_token_ids (old)", prompt_token_ids) - # masked placeholder with image token id for idx in image_idx: candidates = _get_image_placeholder_token_id_candidates(model_config, @@ -537,7 +540,7 @@ def __init__(self, ) # TODO: Optionally initializes this for supporting input embeddings. - self.vision_embed_tokens = Phi3HDImageEmbedding(config) + self.vision_embed_tokens = Phi3HDImageEmbedding(config, quant_config) self.language_model = LlamaForCausalLM(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f34d21fdef56f..18dbee94e10b0 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -3,18 +3,25 @@ from itertools import tee from typing import Iterable, List, Mapping, Optional, Tuple, Union +import numpy import torch import torch.nn as nn import torch.nn.functional as F from mistral_common.protocol.instruct.messages import ImageChunk from PIL import Image -from transformers import PretrainedConfig +from transformers import PixtralVisionConfig, PretrainedConfig +from transformers.models.pixtral.image_processing_pixtral import ( + _num_image_tokens) +from transformers.models.pixtral.modeling_pixtral import ( + PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid) from xformers.ops.fmha import memory_efficient_attention from xformers.ops.fmha.attn_bias import BlockDiagonalMask from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.config import CacheConfig, ModelConfig, MultiModalConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, + token_inputs) +from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput @@ -25,6 +32,8 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import init_vllm_registered_model @@ -576,3 +585,448 @@ def __init__(self, args: VisionEncoderArgs, dim: int): def forward(self, x: torch.Tensor) -> torch.Tensor: return self.w_out(self.gelu(self.w_in(x))) + + +#### HF Transformers version of Pixtral #### +# Based off https://github.com/huggingface/transformers/blob/d7950bff82b18c823193d17d72188c5e46d06c83/src/transformers/models/pixtral/modeling_pixtral.py +# This model follows the Llava family, meaning image embeddings are placed +# instead of the `[IMG]` token placeholders. +# The model uses [`PixtralVisionModel`] for its vision encoder, +# and [`MistralForCausalLM`] for its language decoder. 
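A hedged, self-contained sketch of the token-count math the helpers below implement (constants are illustrative defaults, not taken from the diff): Pixtral-HF first downscales an image so neither side exceeds `image_size`, then emits one token per `patch_size`-sized patch, with an image-break token after each patch row and an image-end token replacing the final break.

import math

def pixtral_hf_tokens(width: int, height: int,
                      image_size: int = 1024, patch_size: int = 16) -> int:
    # Downscale only when either side exceeds the configured maximum.
    ratio = max(width / image_size, height / image_size)
    if ratio > 1:
        width = math.ceil(width / ratio)
        height = math.ceil(height / ratio)
    # One token per patch, using ceil division as in transformers'
    # _num_image_tokens helper.
    num_width_tokens = math.ceil(width / patch_size)
    num_height_tokens = math.ceil(height / patch_size)
    # Each row is num_width_tokens [IMG] tokens plus one [IMG_BREAK];
    # the final break is replaced by [IMG_END], so the count is unchanged.
    return num_height_tokens * (num_width_tokens + 1)

assert pixtral_hf_tokens(512, 256) == 16 * (32 + 1)  # 528 tokens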
+ + +def get_pixtral_hf_patch_grid_length(*, image_size: int, + patch_size: int) -> int: + # Since interpolation is applied, the image size need not be divisible + # assert image_size % patch_size == 0 + return image_size // patch_size + + +def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int: + grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size, + patch_size=patch_size) + return grid_length * grid_length + + +def get_max_pixtral_hf_image_feature_size( + hf_config: PixtralVisionConfig) -> int: + return get_pixtral_hf_num_patches(image_size=hf_config.image_size, + patch_size=hf_config.patch_size) + + +def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: + return get_max_pixtral_hf_image_feature_size(hf_config) + + +def dummy_seq_data_for_pixtral_hf( + hf_config: PixtralVisionConfig, + seq_len: int, + num_images: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, +): + if image_feature_size_override is None: + image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config) + else: + image_feature_size = image_feature_size_override + + return SequenceData.from_prompt_token_counts( + (image_token_id, image_feature_size * num_images), + (0, seq_len - image_feature_size * num_images), + ) + + +def dummy_image_for_pixtral_hf( + hf_config: PixtralVisionConfig, + num_images: int, + *, + image_width_override: Optional[int] = None, + image_height_override: Optional[int] = None, +): + width = height = hf_config.image_size + if image_width_override is not None: + width = image_width_override + if image_height_override is not None: + height = image_height_override + + image = Image.new("RGB", (width, height), color=0) + return {"image": image if num_images == 1 else [image] * num_images} + + +def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, + image_width: int, + image_height: int) -> Tuple[int, int]: + # Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 + # https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501 + max_width, max_height = hf_config.image_size, hf_config.image_size + patch_width, patch_height = hf_config.patch_size, hf_config.patch_size + + ratio = max(image_width / max_width, image_height / max_height) + + if ratio > 1: + image_width = int(numpy.ceil(image_width / ratio)) + image_height = int(numpy.ceil(image_height / ratio)) + + num_height_tokens, num_width_tokens = _num_image_tokens( + (image_height, image_width), (patch_height, patch_width)) + + return num_width_tokens, num_height_tokens + + +def input_processor_for_pixtral_hf( + model_config: ModelConfig, + hf_config: PixtralVisionConfig, + inputs: DecoderOnlyInputs, + *, + image_token_id: int, + image_feature_size_override: Optional[Union[int, List[int]]] = None, +) -> DecoderOnlyInputs: + assert image_feature_size_override is None, ( + "image_feature_size_override is not supported for Pixtral") + + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + processor = cached_get_processor(model_config.model) + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_data = [image_data] + elif not is_list_of(image_data, Image.Image): + raise TypeError(f"Invalid image type: {type(image_data)}") + + new_prompt = inputs.get("prompt") + 
new_token_ids = inputs["prompt_token_ids"] + + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token + + # Update new_prompt if present + if new_prompt: + parts = new_prompt.split(image_token) + assert len(parts) - 1 == len(image_data) + new_parts = [parts[0]] # Start with the part before any image tokens + + for image, next_part in zip(image_data, parts[1:]): + w, h = image.size + (num_width_tokens, + num_height_tokens) = get_pixtral_hf_image_feature_size( + hf_config, image_width=w, image_height=h) + + replace_tokens = [image_token] * num_width_tokens + [ + image_break_token + ] + replace_tokens = replace_tokens * num_height_tokens + replace_tokens[-1] = image_end_token + + new_parts.append("".join(replace_tokens)) + new_parts.append(next_part) + + new_prompt = "".join(new_parts) + + # Update new_token_ids + convert_tokens_to_ids = processor.tokenizer.convert_tokens_to_ids + image_token_id = convert_tokens_to_ids(image_token) + image_break_id = convert_tokens_to_ids(image_break_token) + image_end_id = convert_tokens_to_ids(image_end_token) + placeholder_token_id = -999 + # Find all image token indices at once + placeholder_indices = [ + idx for idx, token_id in enumerate(new_token_ids) + if token_id == image_token_id + ] + assert len(placeholder_indices) == len(image_data) + replace_tokens_list = [] + for placeholder_idx, image in zip(placeholder_indices, image_data): + new_token_ids[placeholder_idx] = placeholder_token_id + + w, h = image.size + (num_width_tokens, + num_height_tokens) = get_pixtral_hf_image_feature_size(hf_config, + image_width=w, + image_height=h) + + replace_tokens = [image_token_id] * num_width_tokens + [image_break_id] + replace_tokens = replace_tokens * num_height_tokens + replace_tokens[-1] = image_end_id + replace_tokens_list.append(replace_tokens) + + # Backward iteration for replacement without affecting known indices + for placeholder_idx, replace_tokens in zip(reversed(placeholder_indices), + reversed(replace_tokens_list)): + new_token_ids[placeholder_idx:placeholder_idx + 1] = replace_tokens + + # NOTE: Create a defensive copy of the original inputs + return token_inputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data) + + +class PixtralHFMLP(nn.Module): + + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + assert config.intermediate_size is not None + # TODO: Use quant_config and prefix after optimizing this + self.gate_proj = nn.Linear(config.hidden_size, + config.intermediate_size, + bias=False) + self.up_proj = nn.Linear(config.hidden_size, + config.intermediate_size, + bias=False) + self.down_proj = nn.Linear(config.intermediate_size, + config.hidden_size, + bias=False) + self.act = get_act_fn(config.hidden_act) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x)) + + +class PixtralHFAttention(nn.Module): + + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + assert not config.hidden_size % config.num_attention_heads + self.n_heads = config.num_attention_heads + self.head_dim = config.hidden_size // config.num_attention_heads + + self.scale = self.head_dim**-0.5 + + # TODO: Use quant_config and prefix after 
optimizing this + self.q_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + self.k_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + self.v_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + self.o_proj = nn.Linear(config.hidden_size, + config.hidden_size, + bias=False) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: BlockDiagonalMask, + position_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + batch, patches, _ = hidden_states.size() + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + # Transpose q and k to apply HF's Rotary Position Embedding + q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + cos, sin = position_embeddings + q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) + + # Transpose q and k back for attention + q = q.transpose(1, 2).contiguous() + k = k.transpose(1, 2).contiguous() + v = v.reshape(batch, patches, self.n_heads, self.head_dim) + + out = memory_efficient_attention(q, k, v, attn_bias=attention_mask) + out = out.reshape(batch, patches, self.n_heads * self.head_dim) + + return self.o_proj(out) + + +class PixtralHFTransformerBlock(nn.Module): + + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: + super().__init__() + + self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5) + self.attention = PixtralHFAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.attention") + self.feed_forward = PixtralHFMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward") + self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: BlockDiagonalMask, + position_embeddings: torch.Tensor, + ) -> torch.Tensor: + r = self.attention.forward(self.attention_norm(hidden_states), + attention_mask=attention_mask, + position_embeddings=position_embeddings) + h = hidden_states + r + r = self.feed_forward.forward(self.ffn_norm(h)) + out = h + r + return out + + +class PixtralHFTransformer(nn.Module): + + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: + super().__init__() + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + PixtralHFTransformerBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) + ]) + + def forward( + self, + x: torch.Tensor, + attention_mask: BlockDiagonalMask, + position_embeddings: torch.Tensor, + ) -> torch.Tensor: + for layer in self.layers: + x = layer(x, attention_mask, position_embeddings) + return x + + +class PixtralHFVisionModel(nn.Module): + + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + self.patch_conv = nn.Conv2d( + in_channels=config.num_channels, + out_channels=config.hidden_size, + 
kernel_size=config.patch_size, + stride=config.patch_size, + bias=False, + ) + self.ln_pre = RMSNorm(config.hidden_size, eps=1e-5) + self.transformer = PixtralHFTransformer( + config, + quant_config, + num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.transformer", + ) + + num_hidden_layers = config.num_hidden_layers + if len(self.transformer.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {num_hidden_layers} " + f"layers, but you requested {len(self.transformer.layers)} " + "layers.") + + if require_post_norm is True: + msg = "PixtralHFVisionModel does not have post-layernorm" + raise ValueError(msg) + + self.dtype = next(self.parameters()).dtype + self.device = next(self.parameters()).device + self.patch_positional_embedding = PixtralRotaryEmbedding( + config, self.device) + + def forward( + self, + pixel_values: List[torch.Tensor], + ) -> torch.Tensor: + """ + Args: + pixel_values: Each image to be processed will be a separate tensor + in pixel_values. This means it will be a list of tensors + because multiple requests batched can have multiple images, + each with their own shape potentially + + Returns: + image_features: tensor of token features for + all tokens of all images of shape (N_toks, D) + """ + # pass images through initial convolution independently + patch_embeds_list = [ + self.patch_conv(img.unsqueeze(0).to(self.dtype)) + for img in pixel_values + ] + + # flatten to a single sequence + patch_embeds = torch.cat( + [p.flatten(2).permute(0, 2, 1) for p in patch_embeds_list], dim=1) + patch_embeds = self.ln_pre(patch_embeds) + + # positional embeddings + position_ids = position_ids_in_meshgrid( + patch_embeds_list, + max_width=self.config.image_size // self.config.patch_size).to( + self.device) + + position_embedding = self.patch_positional_embedding( + patch_embeds, position_ids) + attention_mask = BlockDiagonalMask.from_seqlens( + [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], ) + out = self.transformer(patch_embeds, attention_mask, + position_embedding) + + return out + + # (TODO) Add prefix argument for filtering out weights to be loaded + # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [] + params_dict = dict(self.named_parameters()) + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index eb9a9aa9364cc..23eb1482ffef1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -365,6 +365,28 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + ] + + # in TP, these weights are partitioned along the column dimension (dim=-1) + column_parallel_weights_modules = [".down_proj.", ".o_proj."] + 
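As a hedged aside on the mapping declared just below (the routing helper here is illustrative, not vLLM API): BitsAndBytes checkpoints keep q/k/v and gate/up as separate tensors, so the loader needs to know which fused vLLM parameter, and which shard slot within it, each one targets.

# Sketch of that routing, mirroring bitsandbytes_stacked_params_mapping:
# checkpoint shard name -> (fused parameter name, shard index).
mapping = {
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}

def route(name: str):
    for shard_name, (fused_name, shard_id) in mapping.items():
        if f".{shard_name}." in name:
            # e.g. "...self_attn.q_proj.weight" is written into shard 0 of
            # "...self_attn.qkv_proj.weight" by that parameter's weight_loader.
            return name.replace(shard_name, fused_name), shard_id
    return name, None  # unfused weights load directly

assert route("model.layers.0.self_attn.k_proj.weight") == (
    "model.layers.0.self_attn.qkv_proj.weight", 1)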
bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + def __init__( self, config: Qwen2Config, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index f7d632a83cc33..3dc955b12ba0e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,7 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -from functools import lru_cache, partial +from functools import partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, Tuple, Type, TypedDict, Union) @@ -63,7 +63,7 @@ from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.config import uses_mrope -from vllm.transformers_utils.processor import get_processor +from vllm.transformers_utils.processor import cached_get_processor from .interfaces import SupportsMultiModal, SupportsPP from .utils import (PPMissingLayer, get_vit_attn_backend, @@ -78,7 +78,7 @@ class Qwen2VLImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: torch.Tensor - """Shape: + """Shape: `(num_patches, num_channels * patch_size * patch_size)` """ @@ -102,14 +102,14 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): class Qwen2VLVideoInputs(TypedDict): pixel_values_videos: torch.Tensor - """Shape: - `(num_patches, + """Shape: + `(num_patches, num_channels * temporal_patch_size * patch_size * patch_size)` """ video_grid_thw: torch.Tensor """Shape: `(num_videos, 3)` - + This should be in `(grid_t, grid_h, grid_w)` format. 
""" @@ -544,13 +544,14 @@ def forward( # === Vision input helpers === # -cached_get_processor = lru_cache(get_processor) - def mm_input_mapper_for_qwen2_vl( ctx: InputContext, data: MultiModalData[object], data_type_key: str, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, ) -> MultiModalInputs: """Input mapper for Qwen2-VL.""" if data_type_key == "image" and isinstance(data, dict): @@ -559,8 +560,19 @@ def mm_input_mapper_for_qwen2_vl( "image_grid_thw": data.get("image_grid_thw"), }) model_config = ctx.model_config + # Handle mm processor kwargs; we pass these at creation time + # because preprocess() in transformers doesn't expose them + mm_processor_kwargs = {} + if min_pixels: + mm_processor_kwargs["min_pixels"] = min_pixels + if max_pixels: + mm_processor_kwargs["max_pixels"] = max_pixels + image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) + model_config.model, + trust_remote_code=model_config.trust_remote_code, + **mm_processor_kwargs, + ) if image_processor is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -633,25 +645,36 @@ def _get_max_image_info( image_processor, data_type_key: str = "image", mm_count: int = 1, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, ): + # Limit min / max pixels unless they're explicitly provided + if min_pixels is None: + min_pixels = max(image_processor.min_pixels, 28 * 28) + if max_pixels is None: + max_pixels = min(image_processor.max_pixels, 1280 * 28 * 28) + return _get_vision_info( image_processor, height=9999999, width=9999999, - - # Limit min / max pixels. - min_pixels=max(image_processor.min_pixels, 28 * 28), - max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28), + min_pixels=min_pixels, + max_pixels=max_pixels, data_type_key=data_type_key, mm_count=mm_count, ) -def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: +def get_max_qwen2_vl_mm_tokens(ctx: InputContext, + data_type_key: str, + *, + min_pixels=None, + max_pixels=None) -> int: image_processor = cached_get_image_processor(ctx.model_config.model) max_resized_height, max_resized_width, max_llm_image_tokens = \ _get_max_image_info(image_processor, data_type_key=data_type_key, - mm_count=1) + mm_count=1, min_pixels=min_pixels, + max_pixels=max_pixels) return max_llm_image_tokens @@ -662,14 +685,20 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: def dummy_data_for_qwen2_vl( - ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] + ctx: InputContext, + seq_len: int, + mm_counts: Mapping[str, int], + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: image_processor = cached_get_image_processor(ctx.model_config.model) num_images = mm_counts["image"] max_resized_height, max_resized_width, max_llm_image_tokens = \ _get_max_image_info(image_processor, data_type_key="image", - mm_count=num_images) + mm_count=num_images, min_pixels=min_pixels, + max_pixels=max_pixels) if seq_len - max_llm_image_tokens - 2 < 0: raise RuntimeError( f"Qwen2-VL cannot process {num_images} images in a prompt, " @@ -680,10 +709,11 @@ def dummy_data_for_qwen2_vl( num_videos = mm_counts["video"] max_resized_height, max_resized_width, max_llm_video_tokens = \ _get_max_image_info(image_processor, data_type_key="video", - mm_count=num_videos) + mm_count=num_videos, min_pixels=min_pixels, + 
max_pixels=max_pixels) if seq_len - max_llm_video_tokens - 2 < 0: raise RuntimeError( - f"Qwen2-VL cannot process {num_images} videos in a prompt, " + f"Qwen2-VL cannot process {num_videos} videos in a prompt, " "please increase max_model_len or reduce video limit by " "--limit-mm-per-prompt.") @@ -708,6 +738,8 @@ def _get_llm_num_vision_tokens( mm_inputs: list, data_type_key: str, image_processor, + min_pixels: int, + max_pixels: int, ): """Get number of vision tokens of multimodal inputs. @@ -717,12 +749,13 @@ def _get_llm_num_vision_tokens( image = to_numpy_array(mm_inputs[0]) input_data_format = infer_channel_dimension_format(image) height, width = get_image_size(image, channel_dim=input_data_format) + _, _, llm_num_vision_tokens = _get_vision_info( image_processor, height=height, width=width, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, + min_pixels=min_pixels, + max_pixels=max_pixels, do_resize=image_processor.do_resize, data_type_key=data_type_key, mm_count=len(mm_inputs), @@ -732,7 +765,8 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable, data_type_key: str, image_processor: Any, - prompt_token_ids: List[int]) -> List[int]: + prompt_token_ids: List[int], min_pixels: Optional[int], + max_pixels: Optional[int]) -> List[int]: """ Expand pad tokens for multi-modal inputs (e.g., images or videos). @@ -743,6 +777,8 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable, data_type_key (str): The type of the multi-modal input. image_processor (Any): The image processor used to process the inputs. prompt_token_ids (List[int]): The list of token IDs in the prompt. + min_pixels (Optional[int]): min pixels to be used for image processing + max_pixels (Optional[int]): max pixels to be used for image processing Returns: List[int]: The list of token IDs for the multi-modal inputs. 
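A hedged usage sketch of what this plumbing enables (assumes vLLM's public `LLM` entrypoint and its `mm_processor_kwargs` argument; the values are illustrative): callers can now bound Qwen2-VL's image resizing, and hence the per-image vision-token count, at model-construction time.

from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2-VL-7B-Instruct",
    # Forwarded into the processor hooks patched above; a smaller max_pixels
    # caps how many vision tokens a single image can expand to.
    mm_processor_kwargs={"min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28},
)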
@@ -759,6 +795,8 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable, [data] if data_type_key == "image" else data, data_type_key=data_type_key, image_processor=image_processor, + min_pixels=min_pixels, + max_pixels=max_pixels, ) if cnt == 0: end_idx = indices[cnt] @@ -775,6 +813,9 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable, def input_processor_for_qwen2_vl( ctx: InputContext, inputs: DecoderOnlyInputs, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, ) -> DecoderOnlyInputs: multi_modal_data = inputs.get("multi_modal_data", None) if multi_modal_data is None: @@ -785,6 +826,10 @@ def input_processor_for_qwen2_vl( processor = cached_get_processor(ctx.model_config.model) image_processor = processor.image_processor + # Apply processor kwarg overrides for image processor options + min_pixels = min_pixels if min_pixels else image_processor.min_pixels + max_pixels = max_pixels if max_pixels else image_processor.max_pixels + hf_config = ctx.get_hf_config(Qwen2VLConfig) # To avoid redundant processing of vision objects (resize, rescale, etc.), @@ -832,16 +877,22 @@ def input_processor_for_qwen2_vl( else: prompt_token_ids = _expand_pad_tokens(image_inputs, hf_config.image_token_id, - make_batched_images, "image", + make_batched_images, + "image", image_processor, - prompt_token_ids) + prompt_token_ids, + min_pixels=min_pixels, + max_pixels=max_pixels) if video_inputs is not None: prompt_token_ids = _expand_pad_tokens(video_inputs, hf_config.video_token_id, - make_batched_videos, "video", + make_batched_videos, + "video", image_processor, - prompt_token_ids) + prompt_token_ids, + min_pixels=min_pixels, + max_pixels=max_pixels) return token_inputs( prompt_token_ids=prompt_token_ids, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3cd30257b17c1..3668366aba6fc 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,12 +47,14 @@ "GraniteMoeForCausalLM": ("granitemoe", "GraniteMoeForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), + "InternLM2VEForCausalLM": ("internlm2_ve", "InternLM2VEForCausalLM"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "JambaForCausalLM": ("jamba", "JambaForCausalLM"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), + "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), @@ -83,6 +85,7 @@ # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 } _EMBEDDING_MODELS = { @@ -92,6 +95,7 @@ "MistralModel": ("llama", "LlamaEmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), # [Multimodal] + "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), } diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index e717ab108c77b..91277b0ccd145 100644 --- 
a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -248,8 +248,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -266,12 +268,14 @@ def __init__( head_size=self.head_dim, total_num_heads=self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -314,8 +318,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.activation_fn = get_act_fn(config.hidden_act) @@ -326,11 +332,13 @@ def __init__( config.hidden_size, config.intermediate_size, quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc2", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -346,15 +354,20 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.embed_dim = config.hidden_size num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = SiglipParallelAttention(config, - quant_config=quant_config) + self.self_attn = SiglipParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: self.self_attn = SiglipSdpaAttention(config) @@ -363,6 +376,7 @@ def __init__( self.mlp = SiglipMLP( config, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -392,8 +406,10 @@ def __init__( config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, num_hidden_layers_override: Optional[int] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config if num_hidden_layers_override is None: @@ -402,8 +418,10 @@ def __init__( num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - SiglipEncoderLayer(config, quant_config=quant_config) - for _ in range(num_hidden_layers) + SiglipEncoderLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward( @@ -424,7 +442,8 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) @@ -433,7 +452,9 @@ def __init__( config.hidden_size, config.num_attention_heads, batch_first=True) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config=config, quant_config=quant_config) + self.mlp = SiglipMLP(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: batch_size = hidden_state.shape[0] @@ -454,9 +475,13 @@ def __init__( self, config: SiglipVisionConfig, quant_config: 
Optional[QuantizationConfig] = None, + *, num_hidden_layers_override: Optional[int] = None, - ): + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config embed_dim = config.hidden_size @@ -465,26 +490,34 @@ def __init__( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder", ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." ) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None self.use_head = (True if not hasattr(config, "vision_use_head") else config.vision_use_head) if self.use_head: self.head = SiglipMultiheadAttentionPoolingHead( - config=config, quant_config=quant_config) + config=config, + quant_config=quant_config, + prefix=f"{prefix}.head", + ) def forward( self, @@ -517,8 +550,11 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, num_hidden_layers_override: Optional[int] = None, - ): + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() num_heads = config.num_attention_heads @@ -529,6 +565,8 @@ def __init__( config, quant_config, num_hidden_layers_override=num_hidden_layers_override, + require_post_norm=require_post_norm, + prefix=f"{prefix}.vision_model", ) def get_input_embeddings(self) -> nn.Module: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 9e2f5476f3aff..d96e988fba384 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Tuple, Union, overload) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Protocol, Tuple, Union, overload) import torch import torch.nn as nn @@ -21,7 +21,7 @@ from vllm.multimodal.base import NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_cpu, is_pin_memory_available +from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -294,10 +294,11 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) -def merge_multimodal_embeddings(input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: NestedTensors, - placeholder_token_id: int) -> torch.Tensor: +def _merge_multimodal_embeddings( + inputs_embeds: torch.Tensor, + is_multimodal: torch.Tensor, + multimodal_embeddings: NestedTensors, +) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the positions in ``inputs_embeds`` corresponding to placeholder tokens in @@ -306,8 +307,7 @@ def 
merge_multimodal_embeddings(input_ids: torch.Tensor, Note: This updates ``inputs_embeds`` in place. """ - mask = (input_ids == placeholder_token_id) - num_expected_tokens = mask.sum().item() + num_expected_tokens = is_multimodal.sum().item() assert isinstance(num_expected_tokens, int) flattened = _flatten_embeddings(multimodal_embeddings) @@ -317,10 +317,70 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor, f"Attempted to assign {expr} = {flattened.shape[0]} " f"multimodal tokens to {num_expected_tokens} placeholders") - inputs_embeds[mask] = flattened + inputs_embeds[is_multimodal] = flattened return inputs_embeds +def embed_multimodal( + input_ids: torch.Tensor, + multimodal_token_id: int, + get_text_embeds: Callable[[torch.Tensor], torch.Tensor], + get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor, + List[torch.Tensor]]], +) -> torch.Tensor: + """ + Embed token IDs and multimodal inputs and combine their embeddings. + + ``multimodal_token_id`` is used to determine whether a token ID should + be embedded using ``get_text_embeds`` or ``get_multimodal_embeds``. + + Compared to ``merge_multimodal_embeddings``, this avoids running + ``get_text_embeds`` on ``input_ids[input_ids == multimodal_token_id]`` + which causes issues when the placeholder token ID exceeds the + vocabulary size of the language model. + """ + is_multimodal = input_ids == multimodal_token_id + is_text = ~is_multimodal + + text_embeds = get_text_embeds(input_ids[is_text]) + multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal]) + + merged_embeds = torch.empty( + (input_ids.shape[0], text_embeds.shape[1]), + dtype=text_embeds.dtype, + device=text_embeds.device, + ) + + merged_embeds[is_text] = text_embeds + + return _merge_multimodal_embeddings( + merged_embeds, + is_multimodal, + multimodal_embeds, + ) + + +def merge_multimodal_embeddings( + input_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + multimodal_embeddings: NestedTensors, + placeholder_token_id: int, +) -> torch.Tensor: + """ + Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the + positions in ``inputs_embeds`` corresponding to placeholder tokens in + ``input_ids``. + + Note: + This updates ``inputs_embeds`` in place. + """ + return _merge_multimodal_embeddings( + inputs_embeds, + (input_ids == placeholder_token_id), + multimodal_embeddings, + ) + + class LayerFn(Protocol): def __call__(self, prefix: str) -> torch.nn.Module: @@ -474,7 +534,7 @@ def make_empty_intermediate_tensors( class LLMWrapper(nn.Module): """ - To align with the key names of LoRA trained with PEFT, we need to add an + To align with the key names of LoRA trained with PEFT, we need to add an additional layer to the llm's implementation. """ @@ -515,7 +575,7 @@ def get_vit_attn_backend() -> _Backend: "so we use xformers backend instead. 
You can run " "`pip install flash-attn` to use flash-attention backend.") selected_backend = _Backend.XFORMERS - elif is_cpu(): + elif current_platform.is_cpu(): selected_backend = _Backend.TORCH_SDPA else: selected_backend = _Backend.XFORMERS diff --git a/vllm/outputs.py b/vllm/outputs.py index 15cb8d53186df..951976310e7ae 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,14 +1,13 @@ import time from dataclasses import dataclass -from typing import List, Optional +from typing import Dict, List, Optional from typing import Sequence as GenericSequence from typing import Union -from vllm.inputs import PromptType from vllm.lora.request import LoRARequest from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, - SequenceGroup, SequenceStatus) + SequenceGroup, SequenceGroupBase, SequenceStatus) @dataclass @@ -93,7 +92,7 @@ class RequestOutput: def __init__( self, request_id: str, - prompt: Optional[PromptType], + prompt: Optional[str], prompt_token_ids: Optional[List[int]], prompt_logprobs: Optional[PromptLogprobs], outputs: List[CompletionOutput], @@ -115,14 +114,28 @@ def __init__( self.encoder_prompt_token_ids = encoder_prompt_token_ids @classmethod - def from_seq_group(cls, seq_group: SequenceGroup, - use_cache: bool) -> Optional["RequestOutput"]: + def from_seq_group( + cls, seq_group: SequenceGroup, use_cache: bool, + seq_id_to_seq_group: Dict[str, SequenceGroupBase] + ) -> Optional["RequestOutput"]: + finished = seq_group.is_finished() + + if seq_group.request_id in seq_id_to_seq_group: + group: SequenceGroupBase = seq_id_to_seq_group[ + seq_group.request_id] + if finished: + group.finish_seq(seq_group) + assembled_seq_group = group.maybe_assemble_group(seq_group) + if assembled_seq_group is None: + return None + return cls.from_seq_group(assembled_seq_group, use_cache, + seq_id_to_seq_group) + sampling_params = seq_group.sampling_params if sampling_params is None: raise ValueError( "Sampling parameters are missing for a CompletionRequest.") - finished = seq_group.is_finished() if sampling_params.output_kind == RequestOutputKind.FINAL_ONLY and ( not finished): return None @@ -137,15 +150,7 @@ def from_seq_group(cls, seq_group: SequenceGroup, outputs=[], finished=False) - seqs = seq_group.get_seqs() - if len(seqs) == 1: - top_n_seqs = seqs - else: - # Get the top-n sequences. - n = sampling_params._real_n or sampling_params.n - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] + top_n_seqs = seq_group.get_seqs() # Create the outputs. 
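
The group-aware `from_seq_group` follows one contract: a child request that belongs to a parallel-sampling group never emits its own `RequestOutput`; the group decides when a single assembled output may be produced. A minimal sketch of that consumer-side pattern, with `ToyGroup` and `make_output` as illustrative stand-ins rather than the real `SequenceGroupBase`/`RequestOutput` API:

```python
from typing import Dict, List, Optional


class ToyGroup:
    """Illustrative stand-in for SequenceGroupBase."""

    def __init__(self, n: int) -> None:
        self.unfinished = n
        self.child_texts: List[str] = []

    def finish_child(self, text: str) -> None:
        self.unfinished -= 1
        self.child_texts.append(text)

    def maybe_assemble(self) -> Optional[List[str]]:
        # Emit one combined output only after every child has finished.
        return self.child_texts if self.unfinished == 0 else None


def make_output(child_id: str, text: str, finished: bool,
                groups: Dict[str, ToyGroup]) -> Optional[List[str]]:
    group = groups.get(child_id)
    if group is None:
        return [text]  # ordinary, ungrouped request
    if finished:
        group.finish_child(text)
    return group.maybe_assemble()  # None until the group is complete


group = ToyGroup(2)
groups = {"req_0": group, "req_1": group}
assert make_output("req_0", "a", finished=True, groups=groups) is None
assert make_output("req_1", "b", finished=True, groups=groups) == ["a", "b"]
```
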
# NOTE: We need omit logprobs here explicitly because the sequence @@ -209,7 +214,7 @@ def from_seq_group(cls, seq_group: SequenceGroup, else: output = CompletionOutput( - seqs.index(seq), output_text, [output_token_ids] + top_n_seqs.index(seq), output_text, [output_token_ids] if isinstance(output_token_ids, int) else output_token_ids, seq.get_cumulative_logprob() if include_logprobs else None, output_logprobs, @@ -310,10 +315,13 @@ def __repr__(self): class RequestOutputFactory: @staticmethod - def create(seq_group: SequenceGroup, use_cache: bool = False): + def create(seq_group: SequenceGroup, + seq_id_to_seq_group: Dict[str, SequenceGroupBase], + use_cache: bool = False): # Determine the type based on a condition, for example: if hasattr(seq_group, 'embeddings') and seq_group.embeddings is not None: return EmbeddingRequestOutput.from_seq_group(seq_group) else: - return RequestOutput.from_seq_group(seq_group, use_cache) + return RequestOutput.from_seq_group(seq_group, use_cache, + seq_id_to_seq_group) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index c648862b2d757..58912158139bd 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -58,6 +58,13 @@ except Exception: pass +is_neuron = False +try: + import transformers_neuronx # noqa: F401 + is_neuron = True +except ImportError: + pass + if is_tpu: # people might install pytorch built with cuda but run on tpu # so we need to check tpu first @@ -75,6 +82,9 @@ elif is_cpu: from .cpu import CpuPlatform current_platform = CpuPlatform() +elif is_neuron: + from .neuron import NeuronPlatform + current_platform = NeuronPlatform() else: current_platform = UnspecifiedPlatform() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 00742a290e42a..d36367f2bc9c1 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -10,6 +10,7 @@ class PlatformEnum(enum.Enum): TPU = enum.auto() XPU = enum.auto() CPU = enum.auto() + NEURON = enum.auto() UNSPECIFIED = enum.auto() @@ -48,6 +49,9 @@ def is_xpu(self) -> bool: def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU + def is_neuron(self) -> bool: + return self._enum == PlatformEnum.NEURON + def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py new file mode 100644 index 0000000000000..07d8398eda525 --- /dev/null +++ b/vllm/platforms/neuron.py @@ -0,0 +1,9 @@ +from .interface import Platform, PlatformEnum + + +class NeuronPlatform(Platform): + _enum = PlatformEnum.NEURON + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + return "neuron" diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 4f2ae75e65f3a..9993cec13d649 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -49,14 +49,17 @@ class GuidedDecodingParams: @staticmethod def from_optional( - json: Optional[Union[Dict, BaseModel, str]], + json: Optional[Union[Dict, BaseModel, str]] = None, regex: Optional[str] = None, choice: Optional[List[str]] = None, grammar: Optional[str] = None, json_object: Optional[bool] = None, backend: Optional[str] = None, whitespace_pattern: Optional[str] = None, - ) -> "GuidedDecodingParams": + ) -> Optional["GuidedDecodingParams"]: + if all(arg is None + for arg in (json, regex, choice, grammar, json_object)): + return None # Extract json schemas from pydantic models if isinstance(json, (BaseModel, 
type(BaseModel))): json = json.model_json_schema() diff --git a/vllm/sequence.py b/vllm/sequence.py index e580d69ec5afb..93f58f00ef77b 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -4,7 +4,7 @@ from abc import ABC, abstractmethod from array import array from collections import defaultdict -from dataclasses import dataclass +from dataclasses import dataclass, field from functools import cached_property, reduce from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional from typing import Sequence as GenericSequence @@ -17,7 +17,7 @@ from vllm.lora.request import LoRARequest from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics if TYPE_CHECKING: @@ -1401,3 +1401,121 @@ def clone( last_sampled_token_ids=self.last_sampled_token_ids.clone() if self.last_sampled_token_ids is not None else None, async_callback=self.async_callback) + + +@dataclass +class SequenceGroupBase: + group_id: str # the original request id before splitting + + assembled_seq_group: Optional[SequenceGroup] = None + + # seq id to a unique index inside this group + seq_id_to_index: Dict[str, int] = field(default_factory=dict) + + # seq ids to be finished + to_be_finished: Dict[str, SequenceGroup] = field(default_factory=dict) + + # seq id to finished sequences + finished_reqs: Dict[str, SequenceGroup] = field(default_factory=dict) + + streaming: bool = False + + output_produced: bool = False + + @staticmethod + def add_request(request_id: str, engine, params, *args, **kwargs): + """When we are ready to add a request with request_id and params + into the engine, we can split the request into multiple requests. + """ + raise NotImplementedError + + def finish_seq(self, seq: SequenceGroup): + """The sequence `seq` finishes, we should record the information. + """ + del self.to_be_finished[seq.request_id] + self.finished_reqs[seq.request_id] = seq + + def maybe_assemble_group( + self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: + """Assemble the sequence group, for producing the final + output, or adding request in the engine again. + """ + raise NotImplementedError + + +class ParallelSampleSequenceGroup(SequenceGroupBase): + + @staticmethod + def add_request(request_id: str, engine, params, **kwargs): + original_params = params + params = copy.deepcopy(original_params) + params.n = 1 + group = ParallelSampleSequenceGroup(request_id) + seqs = [] + for i in range(original_params.n): + request_id_i = f"{request_id}_parallel_sample_{i}" + group.seq_id_to_index[request_id_i] = i + seq_group = engine.add_request( + request_id_i, + params=params, + **kwargs, + ) # type: ignore + assert seq_group is not None + engine.seq_id_to_seq_group[request_id_i] = group + group.to_be_finished[request_id_i] = seq_group + seqs.append(seq_group.seqs[0]) + + # for parallel sampling, the `assembled_seq_group` is always + # available, since we have all the sequences ready, and they + # will not change. 
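
The fan-out half of the scheme can be shown independently of the engine: an `n`-sample request becomes `n` single-sample children sharing one group object. In this sketch, `FakeEngine`, `Params`, and `Group` are assumed stand-ins that exist only to make the pattern runnable:

```python
import copy
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Params:
    n: int = 1


@dataclass
class Group:
    group_id: str
    seq_id_to_index: Dict[str, int] = field(default_factory=dict)


@dataclass
class FakeEngine:
    """Assumed stand-in for the parts of LLMEngine used here."""
    requests: List[str] = field(default_factory=list)
    seq_id_to_seq_group: Dict[str, Group] = field(default_factory=dict)

    def add_request(self, request_id: str, params: Params) -> None:
        self.requests.append(request_id)


def add_parallel_request(request_id: str, engine: FakeEngine,
                         params: Params) -> Group:
    # Each child carries a deep copy of the params with n forced to 1,
    # so the engine schedules it like any ordinary request.
    child_params = copy.deepcopy(params)
    child_params.n = 1
    group = Group(request_id)
    for i in range(params.n):
        child_id = f"{request_id}_parallel_sample_{i}"
        group.seq_id_to_index[child_id] = i
        engine.seq_id_to_seq_group[child_id] = group
        engine.add_request(child_id, child_params)
    return group


engine = FakeEngine()
add_parallel_request("abc", engine, Params(n=3))
print(engine.requests)
# ['abc_parallel_sample_0', 'abc_parallel_sample_1', 'abc_parallel_sample_2']
```
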
+ group.assembled_seq_group = SequenceGroup( + request_id=request_id, + seqs=seqs, + arrival_time=seq_group.arrival_time, + sampling_params=original_params, + lora_request=seq_group.lora_request, + embeddings=seq_group.embeddings, + pooling_params=seq_group.pooling_params, + encoder_seq=seq_group.encoder_seq, + trace_headers=seq_group.trace_headers, + prompt_adapter_request=seq_group.prompt_adapter_request, + priority=seq_group.priority, + ) + + group.streaming = params.output_kind == RequestOutputKind.DELTA + group.output_produced = False + + def maybe_assemble_group( + self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: + + # in the streaming mode, we will return the assembled sequence + # for the first sequence, and then return None for the rest of + # sequences + if self.streaming: + if self.seq_id_to_index[seq_group.request_id] == 0: + return self.assembled_seq_group + return None + + # in the non-streaming mode, we will return the assembled sequence + # once after all sequences finish, and then return None for the + # rest of the time + + if len(self.to_be_finished) > 0: + return None + + assert self.assembled_seq_group is not None + params = self.assembled_seq_group.sampling_params + assert isinstance(params, SamplingParams) + if not self.output_produced: + self.output_produced = True + if params._real_n is not None: + # Get the top-n sequences. + n = params._real_n or params.n + seqs = self.assembled_seq_group.seqs + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] + self.assembled_seq_group.seqs = top_n_seqs + return self.assembled_seq_group + if self.output_produced: + return None diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index aaf6ec5f508c8..3aa999fcb9ebb 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -179,7 +179,7 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): return False # TODO: Add support for other attn backends - if self.attn_backend.get_name() != "flash-attn": + if self.attn_backend.get_name() != "FLASH_ATTN": return False # TODO: Add support for LORA diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 50d2767a03752..316db43502d3b 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -184,7 +184,7 @@ def create_worker( if not disable_mqa_scorer: if scorer_worker.model_runner.attn_backend.get_name( - ) != "flash-attn": + ) != "FLASH_ATTN": disable_mqa_scorer = True logger.info( "[Speculative Decoding] Disabling MQA scorer as the " diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 46405f3529215..9bd2531d7a15c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -232,6 +232,68 @@ def get_config( return config +def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None: + """Try to register HF model configuration class to serialize by value + + With trust_remote_code, the config class is typically an instance of a + custom class imported from the HF modules cache. The class will not be + importable in spawned workers by default (and won't exist at all on + other nodes), which breaks serialization of the config. 
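
To see the by-value trick in isolation: the sketch below registers the same `(pickle.loads, (cloudpickle.dumps(obj),))` reducer through the stdlib `copyreg` hook (rather than vLLM's `multiprocessing` wiring), so a plain `pickle.dumps` payload carries the class definition along with the data. `RemoteConfig` is a made-up stand-in for a generated config class:

```python
import copyreg
import pickle

import cloudpickle  # pip install cloudpickle


class RemoteConfig:
    """Stand-in for a class generated into the HF modules cache."""

    def __init__(self, hidden_size: int) -> None:
        self.hidden_size = hidden_size


def _reduce_by_value(cfg: RemoteConfig):
    # Embed the class definition itself in the payload, so unpickling
    # does not require importing RemoteConfig's module.
    return (pickle.loads, (cloudpickle.dumps(cfg), ))


copyreg.pickle(RemoteConfig, _reduce_by_value)

payload = pickle.dumps(RemoteConfig(hidden_size=1024))
print(pickle.loads(payload).hidden_size)  # 1024
```
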
+ + In this function we tell the cloudpickle serialization library to pass + instances of these generated classes by value instead of by reference, + i.e. the class definition is serialized along with its data so that the + class module does not need to be importable on the receiving end. This + registration only works if the modules cache has already been + initialized. + + + See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs + """ + if not trust_remote_code: + return + + try: + import transformers_modules + except ImportError: + logger.debug("Could not import transformers_modules used for remote" + " code. If remote code is not needed remove" + " `--trust-remote-code`.") + return + + try: + import cloudpickle + cloudpickle.register_pickle_by_value(transformers_modules) + + # ray vendors its own version of cloudpickle + from vllm.executor.ray_utils import ray + if ray: + ray.cloudpickle.register_pickle_by_value(transformers_modules) + + # multiprocessing uses pickle to serialize arguments when using spawn + # Here we get pickle to use cloudpickle to serialize ModelConfig objects + # that contain instances of the custom config class to avoid + # serialization problems if the generated module (and model) has a `.` + # in its name + import multiprocessing + import pickle + + from vllm.config import ModelConfig + + def _reduce_modelconfig(mc: ModelConfig): + return (pickle.loads, (cloudpickle.dumps(mc), )) + + multiprocessing.reducer.register(ModelConfig, _reduce_modelconfig) + + except Exception as e: + logger.warning( + "Unable to register remote classes used by" + " trust_remote_code with by-value serialization. This may" + " lead to a later error. If remote code is not needed" + " remove `--trust-remote-code`", + exc_info=e) + + def load_params_config(model, revision) -> PretrainedConfig: # This function loads a params.json config which # should be used when loading models in mistral format diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 2b418f3603a0b..7c8423d2b0a34 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -1,8 +1,10 @@ -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional from vllm.sequence import (VLLM_INVALID_TOKEN_ID, Logprob, SamplingParams, Sequence, SequenceGroup) +from .detokenizer_utils import (convert_prompt_ids_to_tokens, + detokenize_incrementally) from .tokenizer import AnyTokenizer from .tokenizer_group import BaseTokenizerGroup @@ -88,7 +90,7 @@ def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, prefix_offset = next_iter_prefix_offset read_offset = next_iter_read_offset if prev_tokens is None: - prev_tokens = next_iter_tokens + prev_tokens = next_iter_tokens.copy() else: prev_tokens.extend(next_iter_tokens) @@ -161,167 +163,3 @@ def decode_sequence_inplace(self, seq: Sequence, seq.output_text += new_decoded_token_text return len(new_decoded_token_text) - - -def _replace_none_with_empty(tokens: List[Optional[str]]): - for i, token in enumerate(tokens): - if token is None: - tokens[i] = "" - - -def _convert_tokens_to_string_with_added_encoders( - tokenizer: AnyTokenizer, - output_tokens: List[str], - skip_special_tokens: bool, - spaces_between_special_tokens: bool, -) -> str: - # Adapted from - # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921 - # NOTE(woosuk): The following code is 
slow because it runs a for loop over - # the output_tokens. In Python, running a for loop over a list can be slow - # even when the loop body is very simple. - sub_texts: List[str] = [] - current_sub_text: List[str] = [] - all_special_tokens = set(tokenizer.all_special_tokens) - for token in output_tokens: - if skip_special_tokens and token in all_special_tokens: - continue - if token in tokenizer.get_added_vocab(): - if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - current_sub_text = [] - sub_texts.append(token) - else: - current_sub_text.append(token) - if current_sub_text: - sub_text = tokenizer.convert_tokens_to_string(current_sub_text) - sub_texts.append(sub_text) - if spaces_between_special_tokens: - return " ".join(sub_texts) - else: - return "".join(sub_texts) - - -# 5 is an arbitrary value that should work for all -# tokenizers (bigger = more conservative). -INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5 - - -def convert_prompt_ids_to_tokens( - tokenizer: AnyTokenizer, - prompt_ids: List[int], - skip_special_tokens: bool = False, -) -> Tuple[List[str], int, int]: - """Converts the prompt ids to tokens and returns the tokens and offsets - for incremental detokenization. - - Note that not all tokens are converted to strings. Only the tokens that - are necessary for incremental detokenization are converted to strings. - """ - # We do not need to convert the whole prompt to tokens. - # Offset a little more in case we have special tokens. - new_tokens = tokenizer.convert_ids_to_tokens( - prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:], - skip_special_tokens=skip_special_tokens) - read_offset = len(new_tokens) - prefix_offset = max( - read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0) - # This is required to guard against out-of-vocab prompt token ids - _replace_none_with_empty(new_tokens) # type: ignore[arg-type] - return new_tokens, prefix_offset, read_offset - - -# Based on -# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 -# under Apache 2.0 license -def detokenize_incrementally( - tokenizer: AnyTokenizer, - all_input_ids: List[int], - prev_tokens: Optional[List[str]], - prefix_offset: int, - read_offset: int, - skip_special_tokens: bool = False, - spaces_between_special_tokens: bool = True, -) -> Tuple[List[str], str, int, int]: - """Detokenizes the input ids incrementally and returns the new tokens - and the new text. - - If `prev_tokens` is None, this function will convert the input ids to - tokens and return the tokens and the new text. Otherwise, it will return the - new tokens and the new text. - - This function will also return the new prefix offset and the new read - offset to be used in the next iteration. - - The offsets are necessary to defeat cleanup algorithms in the decode which - decide to add a space or not depending on the surrounding ids. - - Args: - tokenizer: The tokenizer to use. - all_input_ids: The input ids. The last id is the new token id. - prev_tokens: The previous tokens. If None, this function will convert - the input ids to tokens and return the tokens and the new text. - prefix_offset: The prefix offset. - read_offset: The read offset. - skip_special_tokens: Whether to skip special tokens. - spaces_between_special_tokens: Whether to add spaces between special - tokens. 
- """ - new_token_id = all_input_ids[-1] - # This is the first iteration for this sequence - is_first_iter = prev_tokens is None - if is_first_iter: - (prev_tokens, prefix_offset, - read_offset) = convert_prompt_ids_to_tokens( - tokenizer, - all_input_ids[:-1], - skip_special_tokens=skip_special_tokens) - assert prev_tokens is not None - - # If the new token id is out of bounds, return an empty string. - if 0 <= new_token_id < len(tokenizer): - # Put new_token_id in a list so skip_special_tokens is respected - new_tokens = tokenizer.convert_ids_to_tokens( - [new_token_id], skip_special_tokens=skip_special_tokens) - if isinstance(new_tokens, str): - new_tokens = [new_tokens] - else: - new_tokens = [""] - output_tokens = prev_tokens + new_tokens - - # If this is the first iteration, return all tokens. - if is_first_iter: - new_tokens = output_tokens - - # The prefix text is necessary only to defeat cleanup algorithms in - # the decode which decide to add a space or not depending on the - # surrounding ids. - if tokenizer.is_fast or not tokenizer.get_added_vocab(): - prefix_text = tokenizer.convert_tokens_to_string( - output_tokens[prefix_offset:read_offset]) - new_text = tokenizer.convert_tokens_to_string( - output_tokens[prefix_offset:]) - else: - prefix_text = _convert_tokens_to_string_with_added_encoders( - tokenizer, - output_tokens[prefix_offset:read_offset], - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - new_text = _convert_tokens_to_string_with_added_encoders( - tokenizer, - output_tokens[prefix_offset:], - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - - if len(new_text) <= len(prefix_text) or new_text.endswith("�"): - # utf-8 char at the end means it's a potential unfinished byte sequence - # from byte fallback tokenization. - # If it's in the middle, it's probably a real invalid id generated - # by the model - return new_tokens, "", prefix_offset, read_offset - - new_text = new_text[len(prefix_text):] - return new_tokens, new_text, read_offset, len(output_tokens) diff --git a/vllm/transformers_utils/detokenizer_utils.py b/vllm/transformers_utils/detokenizer_utils.py new file mode 100644 index 0000000000000..37ff8a236e791 --- /dev/null +++ b/vllm/transformers_utils/detokenizer_utils.py @@ -0,0 +1,167 @@ +from typing import List, Optional, Tuple + +from .tokenizer import AnyTokenizer + + +def _replace_none_with_empty(tokens: List[Optional[str]]): + for i, token in enumerate(tokens): + if token is None: + tokens[i] = "" + + +def _convert_tokens_to_string_with_added_encoders( + tokenizer: AnyTokenizer, + output_tokens: List[str], + skip_special_tokens: bool, + spaces_between_special_tokens: bool, +) -> str: + # Adapted from + # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921 + # NOTE(woosuk): The following code is slow because it runs a for loop over + # the output_tokens. In Python, running a for loop over a list can be slow + # even when the loop body is very simple. 
+ sub_texts: List[str] = [] + current_sub_text: List[str] = [] + all_special_tokens = set(tokenizer.all_special_tokens) + for token in output_tokens: + if skip_special_tokens and token in all_special_tokens: + continue + if token in tokenizer.get_added_vocab(): + if current_sub_text: + sub_text = tokenizer.convert_tokens_to_string(current_sub_text) + sub_texts.append(sub_text) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_text = tokenizer.convert_tokens_to_string(current_sub_text) + sub_texts.append(sub_text) + if spaces_between_special_tokens: + return " ".join(sub_texts) + else: + return "".join(sub_texts) + + +# 5 is an arbitrary value that should work for all +# tokenizers (bigger = more conservative). +INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5 + + +def convert_prompt_ids_to_tokens( + tokenizer: AnyTokenizer, + prompt_ids: List[int], + skip_special_tokens: bool = False, +) -> Tuple[List[str], int, int]: + """Converts the prompt ids to tokens and returns the tokens and offsets + for incremental detokenization. + + Note that not all tokens are converted to strings. Only the tokens that + are necessary for incremental detokenization are converted to strings. + """ + # We do not need to convert the whole prompt to tokens. + # Offset a little more in case we have special tokens. + new_tokens = tokenizer.convert_ids_to_tokens( + prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:], + skip_special_tokens=skip_special_tokens) + read_offset = len(new_tokens) + prefix_offset = max( + read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0) + # This is required to guard against out-of-vocab prompt token ids + _replace_none_with_empty(new_tokens) # type: ignore[arg-type] + return new_tokens, prefix_offset, read_offset + + +# Based on +# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 +# under Apache 2.0 license +def detokenize_incrementally( + tokenizer: AnyTokenizer, + all_input_ids: List[int], + prev_tokens: Optional[List[str]], + prefix_offset: int, + read_offset: int, + skip_special_tokens: bool = False, + spaces_between_special_tokens: bool = True, +) -> Tuple[List[str], str, int, int]: + """Detokenizes the input ids incrementally and returns the new tokens + and the new text. + + If `prev_tokens` is None, this function will convert the input ids to + tokens and return the tokens and the new text. Otherwise, it will return the + new tokens and the new text. + + This function will also return the new prefix offset and the new read + offset to be used in the next iteration. + + The offsets are necessary to defeat cleanup algorithms in the decode which + decide to add a space or not depending on the surrounding ids. + + Args: + tokenizer: The tokenizer to use. + all_input_ids: The input ids. The last id is the new token id. + prev_tokens: The previous tokens. If None, this function will convert + the input ids to tokens and return the tokens and the new text. + prefix_offset: The prefix offset. + read_offset: The read offset. + skip_special_tokens: Whether to skip special tokens. + spaces_between_special_tokens: Whether to add spaces between special + tokens. 
+ """ + new_token_id = all_input_ids[-1] + # This is the first iteration for this sequence + is_first_iter = prev_tokens is None + if is_first_iter: + (prev_tokens, prefix_offset, + read_offset) = convert_prompt_ids_to_tokens( + tokenizer, + all_input_ids[:-1], + skip_special_tokens=skip_special_tokens) + assert prev_tokens is not None + + # If the new token id is out of bounds, return an empty string. + if 0 <= new_token_id < len(tokenizer): + # Put new_token_id in a list so skip_special_tokens is respected + new_tokens = tokenizer.convert_ids_to_tokens( + [new_token_id], skip_special_tokens=skip_special_tokens) + if isinstance(new_tokens, str): + new_tokens = [new_tokens] + else: + new_tokens = [""] + output_tokens = prev_tokens + new_tokens + + # If this is the first iteration, return all tokens. + if is_first_iter: + new_tokens = output_tokens + + # The prefix text is necessary only to defeat cleanup algorithms in + # the decode which decide to add a space or not depending on the + # surrounding ids. + if tokenizer.is_fast or not tokenizer.get_added_vocab(): + prefix_text = tokenizer.convert_tokens_to_string( + output_tokens[prefix_offset:read_offset]) + new_text = tokenizer.convert_tokens_to_string( + output_tokens[prefix_offset:]) + else: + prefix_text = _convert_tokens_to_string_with_added_encoders( + tokenizer, + output_tokens[prefix_offset:read_offset], + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + ) + new_text = _convert_tokens_to_string_with_added_encoders( + tokenizer, + output_tokens[prefix_offset:], + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + ) + + if len(new_text) <= len(prefix_text) or new_text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. 
+ # If it's in the middle, it's probably a real invalid id generated + # by the model + return new_tokens, "", prefix_offset, read_offset + + new_text = new_text[len(prefix_text):] + return new_tokens, new_text, read_offset, len(output_tokens) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index 98663f7f0bd07..f1523667b0466 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,3 +1,4 @@ +from functools import lru_cache from typing import Any, cast @@ -37,6 +38,9 @@ def get_processor( return cast(ProcessorMixin, processor) +cached_get_processor = lru_cache(get_processor) + + def get_image_processor( processor_name: str, *args: Any, diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index dcb5cf216c996..23ea657ffb0a9 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -2,11 +2,12 @@ import re from dataclasses import dataclass from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast +import huggingface_hub from huggingface_hub import HfApi, hf_hub_download +from mistral_common.protocol.instruct.request import ChatCompletionRequest # yapf: disable -from mistral_common.tokens.tokenizers.mistral import ChatCompletionRequest from mistral_common.tokens.tokenizers.mistral import ( MistralTokenizer as PublicMistralTokenizer) # yapf: enable @@ -24,6 +25,26 @@ class Encoding: input_ids: List[int] +def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: + repo_cache = os.path.join( + huggingface_hub.constants.HF_HUB_CACHE, + huggingface_hub.constants.REPO_ID_SEPARATOR.join( + ["models", *repo_id.split("/")])) + + if revision is None: + revision_file = os.path.join(repo_cache, "refs", "main") + if os.path.isfile(revision_file): + with open(revision_file) as file: + revision = file.read() + + if revision: + revision_dir = os.path.join(repo_cache, "snapshots", revision) + if os.path.isdir(revision_dir): + return os.listdir(revision_dir) + + return [] + + def find_tokenizer_file(files: List[str]): file_pattern = re.compile(r"^tokenizer\.model\.v.*$|^tekken\.json$") @@ -90,9 +111,16 @@ def from_pretrained(cls, @staticmethod def _download_mistral_tokenizer_from_hf(tokenizer_name: str, revision: Optional[str]) -> str: - api = HfApi() - repo_info = api.model_info(tokenizer_name) - files = [s.rfilename for s in repo_info.siblings] + try: + hf_api = HfApi() + files = hf_api.list_repo_files(repo_id=tokenizer_name, + revision=revision) + except ConnectionError as exc: + files = list_local_repo_files(repo_id=tokenizer_name, + revision=revision) + + if len(files) == 0: + raise exc filename = find_tokenizer_file(files) @@ -166,7 +194,7 @@ def apply_chat_template(self, tools: Optional[Dict[str, Any]] = None, **kwargs) -> List[int]: - last_message = messages[-1] + last_message = cast(Dict[str, Any], messages[-1]) if last_message["role"] == "assistant": last_message["prefix"] = True diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index ce46082247639..ef7ca149266b6 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -1,10 +1,13 @@ from importlib.util import find_spec from vllm.logger import init_logger +from vllm.platforms import current_platform logger = init_logger(__name__) -HAS_TRITON = find_spec("triton") is not None +# 
neuron has too old torch +HAS_TRITON = find_spec( + "triton") is not None and not current_platform.is_neuron() if not HAS_TRITON: logger.info("Triton not installed; certain GPU-related functions" diff --git a/vllm/utils.py b/vllm/utils.py index 07769da3c86d4..0e9b241b6f9f6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -13,10 +13,11 @@ import sys import tempfile import threading +import time import uuid import warnings import weakref -from asyncio import FIRST_COMPLETED, ensure_future +from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task from collections.abc import Mapping from functools import lru_cache, partial, wraps from platform import uname @@ -317,15 +318,6 @@ def is_hip() -> bool: return torch.version.hip is not None -@lru_cache(maxsize=None) -def is_cpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "cpu" in version("vllm") - except PackageNotFoundError: - return False - - @lru_cache(maxsize=None) def is_openvino() -> bool: from importlib.metadata import PackageNotFoundError, version @@ -335,38 +327,6 @@ def is_openvino() -> bool: return False -@lru_cache(maxsize=None) -def is_neuron() -> bool: - try: - import transformers_neuronx - except ImportError: - transformers_neuronx = None - return transformers_neuronx is not None - - -@lru_cache(maxsize=None) -def is_xpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - is_xpu_flag = "xpu" in version("vllm") - except PackageNotFoundError: - return False - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() - - @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -396,7 +356,7 @@ def seed_everything(seed: int) -> None: if current_platform.is_cuda_alike(): torch.cuda.manual_seed_all(seed) - if is_xpu(): + if current_platform.is_xpu(): torch.xpu.manual_seed_all(seed) @@ -437,6 +397,12 @@ def _async_wrapper(*args: P.args, **kwargs: P.kwargs) -> asyncio.Future: return _async_wrapper +def _next_task(iterator: AsyncGenerator[T, None], + loop: AbstractEventLoop) -> Task: + # Can use anext() in python >= 3.10 + return loop.create_task(iterator.__anext__()) # type: ignore[arg-type] + + async def iterate_with_cancellation( iterator: AsyncGenerator[T, None], is_cancelled: Callable[[], Awaitable[bool]], @@ -445,19 +411,27 @@ async def iterate_with_cancellation( at least once per second to check for client cancellation. 
""" - # Can use anext() in python >= 3.10 - awaits = [ensure_future(iterator.__anext__())] + loop = asyncio.get_running_loop() + + awaits: List[Future[T]] = [_next_task(iterator, loop)] + next_cancel_check: float = 0 while True: - done, pending = await asyncio.wait(awaits, timeout=1) - if await is_cancelled(): - with contextlib.suppress(BaseException): - awaits[0].cancel() - await iterator.aclose() - raise asyncio.CancelledError("client cancelled") + done, pending = await asyncio.wait(awaits, timeout=1.5) + + # Check for cancellation at most once per second + time_now = time.time() + if time_now >= next_cancel_check: + if await is_cancelled(): + with contextlib.suppress(BaseException): + awaits[0].cancel() + await iterator.aclose() + raise asyncio.CancelledError("client cancelled") + next_cancel_check = time_now + 1 + if done: try: item = await awaits[0] - awaits[0] = ensure_future(iterator.__anext__()) + awaits[0] = _next_task(iterator, loop) yield item except StopAsyncIteration: # we are done @@ -478,25 +452,29 @@ async def merge_async_iterators( to check for client cancellation. """ - # Can use anext() in python >= 3.10 - awaits = { - ensure_future(pair[1].__anext__()): pair - for pair in enumerate(iterators) - } - timeout = None if is_cancelled is None else 1 + loop = asyncio.get_running_loop() + + awaits = {_next_task(pair[1], loop): pair for pair in enumerate(iterators)} + timeout = None if is_cancelled is None else 1.5 + next_cancel_check: float = 0 try: while awaits: done, pending = await asyncio.wait(awaits.keys(), return_when=FIRST_COMPLETED, timeout=timeout) - if is_cancelled is not None and await is_cancelled(): - raise asyncio.CancelledError("client cancelled") + if is_cancelled is not None: + # Check for cancellation at most once per second + time_now = time.time() + if time_now >= next_cancel_check: + if await is_cancelled(): + raise asyncio.CancelledError("client cancelled") + next_cancel_check = time_now + 1 for d in done: pair = awaits.pop(d) try: item = await d i, it = pair - awaits[ensure_future(it.__anext__())] = pair + awaits[_next_task(it, loop)] = pair yield i, item except StopAsyncIteration: pass @@ -773,13 +751,13 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. " "This may slow down the performance.") return False - elif is_xpu(): + elif current_platform.is_xpu(): print_warning_once("Pin memory is not supported on XPU.") return False - elif is_neuron(): + elif current_platform.is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False - elif is_cpu() or is_openvino(): + elif current_platform.is_cpu() or is_openvino(): return False return True @@ -794,7 +772,7 @@ def current_memory_usage(self) -> float: if current_platform.is_cuda_alike(): torch.cuda.reset_peak_memory_stats(self.device) mem = torch.cuda.max_memory_allocated(self.device) - elif is_xpu(): + elif current_platform.is_xpu(): torch.xpu.reset_peak_memory_stats(self.device) # type: ignore mem = torch.xpu.max_memory_allocated(self.device) # type: ignore return mem @@ -949,6 +927,8 @@ def flatten_2d_lists(lists: List[List[T]]) -> List[T]: return [item for sublist in lists for item in sublist] +# TODO: This function can be removed if transformer_modules classes are +# serialized by value when communicating between processes def init_cached_hf_modules() -> None: """ Lazy initialization of the Hugging Face modules. 
@@ -1034,10 +1014,54 @@ def identity(value: T) -> T: F = TypeVar('F', bound=Callable[..., Any]) +def deprecate_args( + start_index: int, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: + + if not callable(is_deprecated): + is_deprecated = partial(identity, is_deprecated) + + def wrapper(fn: F) -> F: + + params = inspect.signature(fn).parameters + pos_types = ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) + pos_kws = [ + kw for kw, param in params.items() if param.kind in pos_types + ] + + @wraps(fn) + def inner(*args, **kwargs): + if is_deprecated(): + deprecated_args = pos_kws[start_index:len(args)] + if deprecated_args: + msg = ( + f"The positional arguments {deprecated_args} are " + "deprecated and will be removed in a future update.") + if additional_message is not None: + msg += f" {additional_message}" + + warnings.warn( + DeprecationWarning(msg), + stacklevel=3, # The inner function takes up one level + ) + + return fn(*args, **kwargs) + + return inner # type: ignore + + return wrapper + + def deprecate_kwargs( - *kws: str, - is_deprecated: Union[bool, Callable[[], bool]] = True, - additional_message: Optional[str] = None) -> Callable[[F], F]: + *kws: str, + is_deprecated: Union[bool, Callable[[], bool]] = True, + additional_message: Optional[str] = None, +) -> Callable[[F], F]: deprecated_kws = set(kws) if not callable(is_deprecated): diff --git a/vllm/v1/attention/__init__.py b/vllm/v1/attention/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/attention/backends/__init__.py b/vllm/v1/attention/backends/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py new file mode 100644 index 0000000000000..0530b1a6762ce --- /dev/null +++ b/vllm/v1/attention/backends/flash_attn.py @@ -0,0 +1,241 @@ +"""Attention layer with FlashAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.forward_context import get_forward_context +from vllm.vllm_flash_attn import flash_attn_varlen_func + + +class FlashAttentionBackend(AttentionBackend): + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @staticmethod + def get_name() -> str: + return "flash-attn-vllm-v1" + + @staticmethod + def get_impl_cls() -> Type["FlashAttentionImpl"]: + return FlashAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + +@dataclass +class FlashAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. 
+ # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + max_query_len: int + query_start_loc: torch.Tensor + max_seq_len: int + seq_start_loc: torch.Tensor + block_table: torch.Tensor + slot_mapping: torch.Tensor + + +class FlashAttentionImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + ) -> None: + if blocksparse_params is not None: + raise ValueError( + "FlashAttention does not support block-sparse attention.") + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_kv_heads + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32) + self.alibi_slopes = alibi_slopes + self.sliding_window = ((sliding_window, sliding_window) + if sliding_window is not None else (-1, -1)) + self.kv_cache_dtype = kv_cache_dtype + if logits_soft_cap is None: + # In flash-attn, setting logits_soft_cap as 0 means no soft cap. + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + if sliding_window is not None: + # NOTE(woosuk): flash-attn's sliding window does not work with + # paged KV cache. + raise ValueError( + "Sliding window is not supported in FlashAttention.") + + support_head_sizes = FlashAttentionBackend.get_supported_head_sizes() + if head_size not in support_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by FlashAttention. " + f"Supported head sizes are: {support_head_sizes}.") + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + k_scale: float = 1.0, + v_scale: float = 1.0, + attn_type: AttentionType = AttentionType.DECODER, + ) -> torch.Tensor: + """Forward pass with FlashAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashAttentionImpl") + + # NOTE(woosuk): FlashAttention does not support FP8 KV cache. 
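
The cache layout from `get_kv_cache_shape` -- `(2, num_blocks, block_size, num_kv_heads, head_size)` -- pairs with `slot_mapping`, which flattens each (block id, offset) pair into `block_id * block_size + offset`. A pure-PyTorch toy of the write path that `reshape_and_cache_flash` performs natively, with made-up sizes:

```python
import torch

# Toy sizes; real shapes come from FlashAttentionBackend.get_kv_cache_shape.
num_blocks, block_size, num_kv_heads, head_size = 8, 16, 2, 4
kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)

# Three new tokens. Their slots encode (block_id, offset) pairs as
# block_id * block_size + offset: here block 0 offsets 5-6, block 1 offset 1.
key = torch.randn(3, num_kv_heads, head_size)
value = torch.randn(3, num_kv_heads, head_size)
slot_mapping = torch.tensor([5, 6, 17])

key_cache = kv_cache[0].view(num_blocks * block_size, num_kv_heads, head_size)
value_cache = kv_cache[1].view(num_blocks * block_size, num_kv_heads,
                               head_size)
key_cache[slot_mapping] = key  # writes through the view into kv_cache
value_cache[slot_mapping] = value

assert torch.equal(kv_cache[0, 1, 1], key[2])  # slot 17 = block 1, offset 1
```
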
+ assert k_scale == 1.0 and v_scale == 1.0, ( + "key/v_scale is not supported in FlashAttention.") + + output = torch.ops.vllm.unified_flash_attention( + query, + key, + value, + self.num_heads, + self.head_size, + self.num_kv_heads, + kv_cache, + self.kv_cache_dtype, + k_scale, + v_scale, + self.scale, + self.sliding_window, + self.alibi_slopes, + self.logits_soft_cap, + ) + return output + + +@torch.library.custom_op("vllm::unified_flash_attention", + mutates_args=["kv_cache"]) +def unified_flash_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, + head_size: int, + num_kv_heads: int, + kv_cache: torch.Tensor, + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + softmax_scale: float, + window_size: Optional[List[int]] = None, + alibi_slopes: Optional[torch.Tensor] = None, + logits_soft_cap: Optional[float] = None, +) -> torch.Tensor: + current_metadata = get_forward_context() + if current_metadata is None: + # Profiling run. + return torch.empty_like(query) + + assert current_metadata is not None + assert isinstance(current_metadata, FlashAttentionMetadata) + attn_metadata: FlashAttentionMetadata = current_metadata + + num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. + query = query.view(-1, num_heads, head_size) + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) + + # Reshape the input keys and values and store them in the cache. + key_cache = kv_cache[0] + value_cache = kv_cache[1] + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + kv_cache[0], + kv_cache[1], + attn_metadata.slot_mapping, + kv_cache_dtype, + k_scale, + v_scale, + ) + + output = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + cu_seqlens_k=attn_metadata.seq_start_loc, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=softmax_scale, + causal=True, + alibi_slopes=alibi_slopes, + window_size=window_size, + block_table=attn_metadata.block_table, + softcap=logits_soft_cap, + ) + return output.view(num_tokens, hidden_size) + + +@unified_flash_attention.register_fake +def _( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + num_heads: int, + head_size: int, + num_kv_heads: int, + kv_cache: torch.Tensor, + kv_cache_dtype: str, + k_scale: float, + v_scale: float, + softmax_scale: float, + window_size: Optional[List[int]] = None, + alibi_slopes: Optional[torch.Tensor] = None, + logits_soft_cap: Optional[float] = None, +) -> torch.Tensor: + return torch.empty_like(query) diff --git a/vllm/v1/core/__init__.py b/vllm/v1/core/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py new file mode 100644 index 0000000000000..9b735a8be10d7 --- /dev/null +++ b/vllm/v1/core/kv_cache_manager.py @@ -0,0 +1,108 @@ +from typing import Dict, List, Optional + +import numpy as np + +from vllm.logger import init_logger +from vllm.utils import cdiv +from vllm.v1.request import Request + +logger = init_logger(__name__) + + +class KVCacheManager: + + def __init__( + self, + block_size: int, + num_gpu_blocks: int, + sliding_window: Optional[int] = None, + enable_caching: bool = True, + num_preallocate_tokens: int = 64, + ) -> None: + self.block_size = block_size + self.num_gpu_blocks = num_gpu_blocks + self.sliding_window = sliding_window + self.enable_caching = enable_caching 
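
Before the allocation methods below, the block arithmetic is worth making concrete: `cdiv` rounds token counts up to whole blocks, so the default `num_preallocate_tokens=64` with `block_size=16` means each allocation may grab up to four extra blocks from the free list (the NOTE that follows explains why). A worked example with assumed numbers:

```python
def cdiv(a: int, b: int) -> int:
    """Ceiling division, as in vllm.utils."""
    return -(-a // b)


block_size = 16
num_preallocate_blocks = cdiv(64, block_size)  # 4 extra blocks

# append_slots-style math for a request that has 30 computed tokens,
# holds 2 blocks, and wants to append 5 more tokens:
num_required = cdiv(30 + 5, block_size)  # 3 blocks cover 35 tokens
num_new = num_required - 2               # 1 block strictly needed
num_free = 10
num_allocated = min(num_new + num_preallocate_blocks, num_free)
print(num_allocated)  # 5: one needed block plus four preallocated ones
```
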
+ # NOTE(woosuk): To avoid frequent block allocation, we preallocate some + # blocks for each request. For example, when a request reaches the end + # of its block table, we preallocate N blocks in advance. This way, we + # reduce the overhead of updating free_block_ids and ref_cnts for each + # request every step (at the cost of some memory waste). + # NOTE(woosuk): This is different from the "lookahead" slots since this + # does not guarantee that the request always has N empty blocks. After + # the request gets N empty blocks, it starts to use the blocks without + # further allocation. When it uses up all the N empty blocks, it gets + # N new empty blocks. + self.num_preallocate_tokens = num_preallocate_tokens + self.num_preallocate_blocks = cdiv(num_preallocate_tokens, block_size) + + self.free_block_ids = list(range(num_gpu_blocks)) + self.req_to_block_ids: Dict[str, List[int]] = {} + self.ref_cnts = np.zeros(num_gpu_blocks, dtype=np.int32) + + def get_computed_blocks(self, request: Request) -> List[int]: + if not self.enable_caching: + # No prefix caching. + return [] + # TODO(woosuk): Implement hash-based caching. + return [] + + def append_slots( + self, + request: Request, + num_tokens: int, + ) -> Optional[List[int]]: + num_required_blocks = cdiv(request.num_computed_tokens + num_tokens, + self.block_size) + req_block_ids = self.req_to_block_ids[request.request_id] + if num_required_blocks <= len(req_block_ids): + # No new block is needed. + return [] + + num_new_blocks = num_required_blocks - len(req_block_ids) + num_free_blocks = len(self.free_block_ids) + if num_new_blocks > num_free_blocks: + # Cannot allocate new blocks. + return None + + # Allocate new blocks. + num_new_blocks = min(num_new_blocks + self.num_preallocate_blocks, + num_free_blocks) + new_block_ids = self._get_new_blocks(num_new_blocks) + req_block_ids.extend(new_block_ids) + self.ref_cnts[new_block_ids] += 1 + return new_block_ids + + def allocate_slots( + self, + request: Request, + num_tokens: int, + computed_block_ids: List[int], + ) -> Optional[List[int]]: + num_required_blocks = cdiv(num_tokens, self.block_size) + num_free_blocks = len(self.free_block_ids) + if num_required_blocks > num_free_blocks: + # Cannot allocate new blocks. 
+ return None + + num_new_blocks = min(num_required_blocks + self.num_preallocate_blocks, + num_free_blocks) + new_block_ids = self._get_new_blocks(num_new_blocks) + block_ids = computed_block_ids + new_block_ids + self.req_to_block_ids[request.request_id] = block_ids + self.ref_cnts[block_ids] += 1 + return new_block_ids + + def free(self, request: Request) -> None: + block_ids = self.req_to_block_ids.pop(request.request_id) + self.ref_cnts[block_ids] -= 1 + for block_id in block_ids: + ref_cnt = self.ref_cnts[block_id] + if ref_cnt == 0: + self.free_block_ids.append(block_id) + + def _get_new_blocks(self, num_blocks: int) -> List[int]: + assert num_blocks <= len(self.free_block_ids) + new_block_ids = self.free_block_ids[-num_blocks:] + self.free_block_ids = self.free_block_ids[:-num_blocks] + return new_block_ids diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py new file mode 100644 index 0000000000000..41659ff62747d --- /dev/null +++ b/vllm/v1/core/scheduler.py @@ -0,0 +1,412 @@ +from collections import deque +from dataclasses import dataclass +from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union + +from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.logger import init_logger +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.request import Request, RequestStatus + +logger = init_logger(__name__) + + +class Scheduler: + + def __init__( + self, + scheduler_config: SchedulerConfig, + cache_config: CacheConfig, + lora_config: Optional[LoRAConfig], + ) -> None: + self.scheduler_config = scheduler_config + self.cache_config = cache_config + self.lora_config = lora_config + # TODO: Support LoRA. + assert lora_config is None, "V1 does not support LoRA yet." + + num_gpu_blocks = cache_config.num_gpu_blocks + assert isinstance(num_gpu_blocks, int) and num_gpu_blocks > 0 + # Create the block space manager. + self.kv_cache_manager = KVCacheManager( + block_size=self.cache_config.block_size, + num_gpu_blocks=num_gpu_blocks, + sliding_window=self.cache_config.sliding_window, + enable_caching=True) + self.block_size = self.cache_config.block_size + + # Scheduling constraints. + self.max_num_running_reqs = self.scheduler_config.max_num_seqs + self.max_num_scheduled_tokens = \ + self.scheduler_config.max_num_batched_tokens + self.max_model_len = self.scheduler_config.max_model_len + + # req_id -> Request + self.requests: Dict[str, Request] = {} + # Priority queues for requests. + self.waiting: Deque[Request] = deque() + self.running: List[Request] = [] + + # The request IDs that are finished in between the previous and the + # current steps. This is used to notify the workers about the finished + # requests so that they can free the cached states for those requests. + # This is flushed at the end of each scheduling step. + self.finished_req_ids: Set[str] = set() + + # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating + # them at each scheduling step. 
+ # Request id -> RunningRequestData + self.running_reqs_data: Dict[str, RunningRequestData] = {} + + def schedule(self) -> "SchedulerOutput": + scheduled_new_reqs: List[Request] = [] + scheduled_resumed_reqs: List[Request] = [] + scheduled_running_reqs: List[Request] = [] + preempted_reqs: List[Request] = [] + + # NOTE(woosuk) on the scheduling algorithm: + # There's no "decoding phase" nor "prefill phase" in the scheduler. + # Each request just has the num_computed_tokens and num_tokens, + # which is equal to len(prompt_token_ids) + len(output_token_ids). + # At each step, the scheduler tries to assign tokens to the requests + # so that each request's num_computed_tokens can catch up its + # num_tokens. This is general enough to cover chunked prefills, + # prefix caching, and the "jump forward" optimization in the future. + + req_to_new_block_ids: Dict[str, List[int]] = {} + num_scheduled_tokens: Dict[str, int] = {} + token_budget = self.max_num_scheduled_tokens + + # First, schedule the RUNNING requests. + req_index = 0 + while req_index < len(self.running): + if token_budget == 0: + break + + request = self.running[req_index] + num_new_tokens = request.num_tokens - request.num_computed_tokens + num_new_tokens = min(num_new_tokens, token_budget) + assert num_new_tokens > 0 + + while True: + new_block_ids = self.kv_cache_manager.append_slots( + request, num_new_tokens) + if new_block_ids is None: + # The request cannot be scheduled. + # Preempt the lowest-priority request. + preempted_req = self.running.pop() + self.kv_cache_manager.free(preempted_req) + preempted_req.status = RequestStatus.PREEMPTED + preempted_req.num_computed_tokens = 0 + + self.waiting.appendleft(preempted_req) + preempted_reqs.append(preempted_req) + if preempted_req == request: + # No more request to preempt. + break + else: + # The request can be scheduled. + scheduled_running_reqs.append(request) + + req_to_new_block_ids[request.request_id] = new_block_ids + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + break + + # Next, schedule the WAITING requests. + if not preempted_reqs: + while self.waiting: + if len(self.running) == self.max_num_running_reqs: + break + if token_budget == 0: + break + + request = self.waiting[0] + # Get already-cached tokens. + computed_block_ids = self.kv_cache_manager.get_computed_blocks( + request) + # NOTE(woosuk): Since incomplete blocks are not eligible for + # sharing, `num_computed_tokens` is always a multiple of + # `block_size`. + num_computed_tokens = len(computed_block_ids) * self.block_size + # Number of tokens to be scheduled. + # We use `request.num_tokens` instead of + # `request.num_prompt_tokens` to consider the resumed requests, + # which have output tokens. + num_new_tokens = request.num_tokens - num_computed_tokens + num_new_tokens = min(num_new_tokens, token_budget) + assert num_new_tokens > 0 + new_block_ids = self.kv_cache_manager.allocate_slots( + request, num_new_tokens, computed_block_ids) + if new_block_ids is None: + # The request cannot be scheduled. 
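+ # NOTE: Break instead of moving on to the next waiting request,
+ # so that admission stays strictly first-come-first-served.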
+ break + request.num_computed_tokens = num_computed_tokens + + self.waiting.popleft() + self.running.append(request) + if request.status == RequestStatus.WAITING: + scheduled_new_reqs.append(request) + elif request.status == RequestStatus.PREEMPTED: + scheduled_resumed_reqs.append(request) + else: + raise RuntimeError( + f"Invalid request status: {request.status}") + + req_to_new_block_ids[request.request_id] = ( + computed_block_ids + new_block_ids) + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + request.status = RequestStatus.RUNNING + + # Check if the scheduling constraints are satisfied. + total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) + assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens + assert token_budget >= 0 + assert len(self.running) <= self.max_num_running_reqs + assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + + len(scheduled_running_reqs) == len(self.running)) + + # Construct the scheduler output. + new_reqs_data = [ + NewRequestData.from_request(req, + req_to_new_block_ids[req.request_id], + req.num_computed_tokens) + for req in scheduled_new_reqs + ] + resumed_reqs_data = [ + ResumedRequestData.from_request( + req, req_to_new_block_ids[req.request_id], + req.num_computed_tokens) for req in scheduled_resumed_reqs + ] + running_reqs_data = [ + self._make_running_request_data( + req, req_to_new_block_ids[req.request_id], + req.num_computed_tokens) for req in scheduled_running_reqs + ] + preempted_req_ids = {req.request_id for req in preempted_reqs} + scheduler_output = SchedulerOutput( + scheduled_new_reqs=new_reqs_data, + scheduled_resumed_reqs=resumed_reqs_data, + scheduled_running_reqs=running_reqs_data, + num_scheduled_tokens=num_scheduled_tokens, + total_num_scheduled_tokens=total_num_scheduled_tokens, + preempted_req_ids=preempted_req_ids, + # finished_req_ids is an existing state in the scheduler, + # instead of being newly scheduled in this step. + # It contains the request IDs that are finished in between + # the previous and the current steps. + finished_req_ids=self.finished_req_ids, + ) + + self.finished_req_ids = set() + return scheduler_output + + def _make_running_request_data( + self, + request: Request, + new_block_ids: List[int], + num_computed_tokens: int, + ) -> "RunningRequestData": + # OPTIMIZATION: Cache the RunningRequestData objects to avoid creating + # them at each scheduling step. + if request.request_id in self.running_reqs_data: + req_data = self.running_reqs_data[request.request_id] + req_data.new_block_ids = new_block_ids + req_data.num_computed_tokens = num_computed_tokens + else: + req_data = RunningRequestData.from_request(request, new_block_ids, + num_computed_tokens) + self.running_reqs_data[request.request_id] = req_data + return req_data + + def update_from_output( + self, + scheduler_output: "SchedulerOutput", + model_runner_output: "ModelRunnerOutput", + ) -> List[Tuple[Request, int]]: + # NOTE(woosuk): This method doesn't consider speculative decoding. + sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() + num_scheduled_tokens = scheduler_output.num_scheduled_tokens + new_running: List[Request] = [] + # (request, num_sampled_tokens) + sampled: List[Tuple[Request, int]] = [] + for request in self.running: + req_id = request.request_id + request.num_computed_tokens += num_scheduled_tokens[req_id] + # When the request's num_computed_tokens catches up its num_tokens, + # the request generates output tokens. 
Otherwise, we ignore the + # sampler output for the request. + assert request.num_computed_tokens <= request.num_tokens + if request.num_computed_tokens == request.num_tokens: + req_index = model_runner_output.req_id_to_index[req_id] + # NOTE(woosuk): Currently, we assume that each request + # generates at most one token at each step. + token_id = sampled_token_ids[req_index] + request.output_token_ids.append(token_id) + sampled.append((request, 1)) + # TODO: Update the KV cache manager for prefix caching. + + # Check if the request is finished. + stopped = self._check_stop(request) + if stopped: + continue + + new_running.append(request) + self.running = new_running + return sampled + + def _check_stop(self, request: Request) -> bool: + if (request.num_tokens >= self.max_model_len + or request.num_output_tokens >= request.max_tokens): + request.status = RequestStatus.FINISHED_LENGTH_CAPPED + self._free_request(request) + return True + + sampling_params = request.sampling_params + last_token_id = request.output_token_ids[-1] + if (not sampling_params.ignore_eos + and last_token_id == request.eos_token_id): + request.status = RequestStatus.FINISHED_STOPPED + self._free_request(request) + return True + + if last_token_id in (sampling_params.stop_token_ids or ()): + request.status = RequestStatus.FINISHED_STOPPED + request.stop_reason = last_token_id + self._free_request(request) + return True + return False + + def add_request(self, request: Request) -> None: + self.waiting.append(request) + self.requests[request.request_id] = request + + def finish_requests( + self, + request_ids: Union[str, Iterable[str]], + finished_status: RequestStatus, + ) -> None: + """Handles the finish signal from outside the scheduler. + + For example, the API server can abort a request when the client + disconnects. + """ + assert RequestStatus.is_finished(finished_status) + if isinstance(request_ids, str): + request_ids = (request_ids, ) + request_ids = set(request_ids) + + for req_id in request_ids: + request = self.requests.get(req_id) + if request is None: + # Invalid request ID. 
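+ # The request may have already finished and been freed;
+ # silently skip it.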
+ continue + + if request.status == RequestStatus.RUNNING: + self.running.remove(request) + else: + self.waiting.remove(request) + request.status = finished_status + self._free_request(request) + + def _free_request(self, request: Request) -> None: + assert request.is_finished() + self.kv_cache_manager.free(request) + self.running_reqs_data.pop(request.request_id, None) + del self.requests[request.request_id] + self.finished_req_ids.add(request.request_id) + + def get_num_unfinished_requests(self) -> int: + return len(self.waiting) + len(self.running) + + def has_unfinished_requests(self) -> bool: + return self.get_num_unfinished_requests() > 0 + + +@dataclass +class NewRequestData: + + req_id: str + prompt_token_ids: List[int] + prompt: Optional[str] + multi_modal_data: Optional[MultiModalDataDict] + sampling_params: SamplingParams + block_ids: List[int] + num_computed_tokens: int + + @classmethod + def from_request( + cls, + request: Request, + block_ids: List[int], + num_computed_tokens: int, + ) -> "NewRequestData": + return cls( + req_id=request.request_id, + prompt_token_ids=request.inputs["prompt_token_ids"], + prompt=request.inputs.get("prompt"), + multi_modal_data=request.inputs.get("multi_modal_data"), + sampling_params=request.sampling_params, + block_ids=block_ids, + num_computed_tokens=num_computed_tokens, + ) + + +@dataclass +class ResumedRequestData: + + req_id: str + block_ids: List[int] + num_computed_tokens: int + + @classmethod + def from_request( + cls, + request: Request, + block_ids: List[int], + num_computed_tokens: int, + ) -> "ResumedRequestData": + return cls( + req_id=request.request_id, + block_ids=block_ids, + num_computed_tokens=num_computed_tokens, + ) + + +@dataclass +class RunningRequestData: + + req_id: str + new_block_ids: List[int] + num_computed_tokens: int + + @classmethod + def from_request( + cls, + request: Request, + new_block_ids: List[int], + num_computed_tokens: int, + ) -> "RunningRequestData": + return cls( + req_id=request.request_id, + new_block_ids=new_block_ids, + num_computed_tokens=num_computed_tokens, + ) + + +@dataclass +class SchedulerOutput: + + scheduled_new_reqs: List[NewRequestData] + scheduled_resumed_reqs: List[ResumedRequestData] + scheduled_running_reqs: List[RunningRequestData] + + num_scheduled_tokens: Dict[str, int] + total_num_scheduled_tokens: int + + preempted_req_ids: Set[str] + finished_req_ids: Set[str] diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py new file mode 100644 index 0000000000000..511b417086c63 --- /dev/null +++ b/vllm/v1/engine/llm_engine.py @@ -0,0 +1,523 @@ +import time +from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type, + Union) + +from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, + EngineConfig, LoadConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig) +from vllm.engine.arg_utils import EngineArgs +from vllm.engine.metrics_types import StatLoggerBase +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, + EncoderDecoderLLMInputs, InputRegistry, PromptType) +from vllm.inputs.preprocess import InputPreprocessor +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request 
import PromptAdapterRequest +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.transformers_utils.config import try_get_generation_config +from vllm.transformers_utils.tokenizer_group import ( + BaseTokenizerGroup, init_tokenizer_from_configs) +from vllm.usage.usage_lib import UsageContext +from vllm.v1.core.scheduler import Scheduler +from vllm.v1.executor.gpu_executor import GPUExecutor +from vllm.v1.request import Request, RequestStatus +from vllm.v1.tokenizer.detokenizer import Detokenizer, DetokenizerInputs +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger(__name__) + + +class LLMEngine: + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + speculative_config: Optional[SpeculativeConfig], + decoding_config: Optional[DecodingConfig], + observability_config: Optional[ObservabilityConfig], + prompt_adapter_config: Optional[PromptAdapterConfig], + executor_class: Type[GPUExecutor], + log_stats: bool, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + input_registry: InputRegistry = INPUT_REGISTRY, + use_cached_outputs: bool = False, + ) -> None: + # Override the configs for V1. + # FIXME + if usage_context == UsageContext.LLM_CLASS: + scheduler_config.max_num_seqs = 1024 + scheduler_config.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + scheduler_config.max_num_seqs = 1024 + scheduler_config.max_num_batched_tokens = 2048 + + logger.info( + "Initializing an LLM engine (v%s) with config: " + "model=%r, speculative_config=%r, tokenizer=%r, " + "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "override_neuron_config=%s, " + "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " + "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " + "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " + "pipeline_parallel_size=%d, " + "disable_custom_all_reduce=%s, quantization=%s, " + "enforce_eager=%s, kv_cache_dtype=%s, " + "quantization_param_path=%s, device_config=%s, " + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s, " + "num_scheduler_steps=%d, enable_prefix_caching=%s, " + "use_async_output_proc=%s, mm_processor_kwargs=%s)", + VLLM_VERSION, + model_config.model, + speculative_config, + model_config.tokenizer, + model_config.skip_tokenizer_init, + model_config.tokenizer_mode, + model_config.revision, + model_config.override_neuron_config, + model_config.rope_scaling, + model_config.rope_theta, + model_config.tokenizer_revision, + model_config.trust_remote_code, + model_config.dtype, + model_config.max_model_len, + load_config.download_dir, + load_config.load_format, + parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size, + parallel_config.disable_custom_all_reduce, + model_config.quantization, + model_config.enforce_eager, + cache_config.cache_dtype, + model_config.quantization_param_path, + device_config.device, + decoding_config, + observability_config, + model_config.seed, + model_config.served_model_name, + scheduler_config.num_scheduler_steps, + cache_config.enable_prefix_caching, + model_config.use_async_output_proc, + model_config.mm_processor_kwargs, + ) + + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = 
lora_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.speculative_config = speculative_config + self.load_config = load_config + self.decoding_config = decoding_config or DecodingConfig() + self.prompt_adapter_config = prompt_adapter_config + self.observability_config = observability_config or ObservabilityConfig( + ) + self.log_stats = log_stats + + assert not self.model_config.skip_tokenizer_init + self.tokenizer = self._init_tokenizer() + if self.tokenizer: + # Ping the tokenizer to ensure liveness if it runs in a + # different process. + self.tokenizer.ping() + self.detokenizer = Detokenizer(self.model_config.tokenizer) + + self.generation_config_fields = _load_generation_config_dict( + model_config) + self.input_preprocessor = InputPreprocessor(model_config, + self.tokenizer) + self.input_registry = input_registry + self.input_processor = input_registry.create_input_processor( + model_config) + + # Request id -> Request + self.requests: Dict[str, Request] = {} + # NOTE(woosuk): Now that the detokenizer works asynchronously, we need + # to keep track of how many steps each request has been lagged behind + # in terms of detokenization. + # Request id -> how many detokenizer steps the request should wait for. + self.num_lagged_steps: Dict[str, int] = {} + # OPTIMIZATION: Cache the request output and update it incrementally. + # This is used to avoid creating a new RequestOutput object every step. + # Request id -> RequestOutput + self.request_outputs: Dict[str, RequestOutput] = {} + + self.model_executor = executor_class( + model_config=model_config, + cache_config=cache_config, + parallel_config=parallel_config, + scheduler_config=scheduler_config, + device_config=device_config, + lora_config=lora_config, + speculative_config=speculative_config, + load_config=load_config, + prompt_adapter_config=prompt_adapter_config, + observability_config=self.observability_config, + ) + assert self.model_config.task != "embedding" + self._initialize_kv_caches() + + # Create the scheduler. + # NOTE: the cache_config here have been updated with the numbers of + # GPU and CPU blocks, which are profiled in the distributed executor. + self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) + + def _initialize_kv_caches(self) -> None: + num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( + ) + + if self.cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_gpu_blocks, + num_gpu_blocks_override) + num_gpu_blocks = num_gpu_blocks_override + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = 0 + self.model_executor.initialize_cache(num_gpu_blocks) + + @classmethod + def from_engine_args( + cls, + engine_args: EngineArgs, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + ) -> "LLMEngine": + """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. + engine_config = engine_args.create_engine_config() + executor_class = cls._get_executor_cls(engine_config) + # Create the LLM engine. 
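+ # NOTE: engine_config.to_dict() expands the EngineConfig dataclass
+ # into the individual *_config keyword arguments of __init__.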
+ engine = cls( + **engine_config.to_dict(), + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats, + usage_context=usage_context, + stat_loggers=stat_loggers, + ) + return engine + + def _init_tokenizer(self) -> BaseTokenizerGroup: + return init_tokenizer_from_configs( + model_config=self.model_config, + scheduler_config=self.scheduler_config, + parallel_config=self.parallel_config, + enable_lora=bool(self.lora_config)) + + def _verify_args(self) -> None: + self.model_config.verify_with_parallel_config(self.parallel_config) + self.cache_config.verify_with_parallel_config(self.parallel_config) + if self.lora_config: + self.lora_config.verify_with_model_config(self.model_config) + self.lora_config.verify_with_scheduler_config( + self.scheduler_config) + if self.prompt_adapter_config: + self.prompt_adapter_config.verify_with_model_config( + self.model_config) + + def _add_processed_request( + self, + request_id: str, + processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderLLMInputs], + params: Union[SamplingParams, PoolingParams], + arrival_time: float, + lora_request: Optional[LoRARequest], + prompt_adapter_request: Optional[PromptAdapterRequest], + trace_headers: Optional[Mapping[str, str]] = None, + ) -> None: + assert prompt_adapter_request is None + assert trace_headers is None + self._validate_model_inputs(processed_inputs) + eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + + # TODO(woosuk): Support embedding mode. + assert isinstance(params, SamplingParams) + sampling_params = params.clone() + sampling_params.update_from_generation_config( + self.generation_config_fields, eos_token_id) + + # TODO(woosuk): Check max_logprobs + # TODO(woosuk): Support encoder-decoder models. + req = Request(request_id, processed_inputs, params, eos_token_id, + arrival_time) + self.requests[request_id] = req + self.num_lagged_steps[request_id] = 0 + self.scheduler.add_request(req) + + def stop_remote_worker_execution_loop(self) -> None: + raise NotImplementedError("TP not implemented yet.") + + def add_request( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> None: + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + if arrival_time is None: + arrival_time = time.time() + assert priority == 0, "vLLM V1 does not support priority at the moment." 
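+ # NOTE: Inputs go through two stages below: the InputPreprocessor
+ # tokenizes and normalizes the prompt, and the registry's input
+ # processor applies any model-specific processing.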
+ + preprocessed_inputs = self.input_preprocessor.preprocess( + prompt, + request_id=request_id, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + processed_inputs = self.input_processor(preprocessed_inputs) + + self._add_processed_request( + request_id=request_id, + processed_inputs=processed_inputs, + params=params, + arrival_time=arrival_time, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + trace_headers=trace_headers, + ) + + def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: + self.scheduler.finish_requests(request_id, + RequestStatus.FINISHED_ABORTED) + + def get_num_unfinished_requests(self) -> int: + """Gets the number of unfinished requests.""" + return len(self.requests) + + def has_unfinished_requests(self) -> bool: + """Returns True if there are unfinished requests.""" + return len(self.requests) > 0 + + def step(self) -> List[RequestOutput]: + # NOTE(woosuk): This method may return an empty list when the + # detokenizer is still processing the outputs. This should not be + # considered as the end of the generation process. + # FIXME(woosuk): Currently, the step method is inefficient because it + # creates RequestOutput objects for all running requests, while they + # may not be needed unless the output is streamed to the client. + if self.scheduler.has_unfinished_requests(): + scheduler_output = self.scheduler.schedule() + output = self.model_executor.execute_model(scheduler_output) + sampled = self.scheduler.update_from_output( + scheduler_output, output) + self.send_to_detokenizer(sampled) + req_outputs = self.recv_from_detokenizer() + return req_outputs + + def send_to_detokenizer(self, sampled: List[Tuple[Request, int]]) -> None: + inputs = DetokenizerInputs( + req_ids=[], + prompt_token_ids=[], + new_token_ids=[], + skip_special_tokens=[], + spaces_between_special_tokens=[], + free_req_ids=[], # TODO(woosuk): Implement freeing. + ) + for req, num_tokens in sampled: + inputs.req_ids.append(req.request_id) + if len(req.output_token_ids) == num_tokens: + # The request is first detokenized. + inputs.prompt_token_ids.append(req.prompt_token_ids) + else: + # The prompt token ids are already cached in the detokenizer. + inputs.prompt_token_ids.append([]) + inputs.new_token_ids.append(req.output_token_ids[-num_tokens:]) + inputs.skip_special_tokens.append( + req.sampling_params.skip_special_tokens) + inputs.spaces_between_special_tokens.append( + req.sampling_params.spaces_between_special_tokens) + + # Update the number of lagged steps. 
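+ # (Decremented again in recv_from_detokenizer once the detokenized
+ # text for this step comes back.)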
+ self.num_lagged_steps[req.request_id] += 1 + self.detokenizer.send(inputs) + + def recv_from_detokenizer(self) -> List[RequestOutput]: + detokenizer_output = self.detokenizer.recv() + if detokenizer_output is None: + return [] + + req_outputs: List[RequestOutput] = [] + num_reqs = len(detokenizer_output.req_ids) + for i in range(num_reqs): + req_id = detokenizer_output.req_ids[i] + req = self.requests[req_id] + req.output_text += detokenizer_output.detokenized_texts[i] + + self.num_lagged_steps[req_id] -= 1 + finished = (self.num_lagged_steps[req_id] == 0 + and req.is_finished()) + req_output = self._make_request_output( + req, detokenizer_output.num_output_token_ids[i], + detokenizer_output.detokenized_texts[i], finished) + req_outputs.append(req_output) + + if finished: + del self.requests[req_id] + del self.num_lagged_steps[req_id] + del self.request_outputs[req_id] + return req_outputs + + def terminate_detokenizer(self) -> None: + self.detokenizer.terminate() + + def _make_request_output( + self, + request: Request, + num_output_tokens: int, + new_output_text: str, + finished: bool, + ) -> RequestOutput: + req_output = self.request_outputs.get(request.request_id) + if req_output is None: + # TODO: Support `n` > 1. + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, # TODO + finish_reason=None, + stop_reason=None, + lora_request=None, + ) + req_output = RequestOutput( + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + prompt_logprobs=None, # TODO + outputs=[completion_output], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + self.request_outputs[request.request_id] = req_output + + completion_output = req_output.outputs[0] + if request.sampling_params.output_kind == RequestOutputKind.CUMULATIVE: + completion_output.text += new_output_text + completion_output.token_ids = ( + request.output_token_ids[:num_output_tokens]) + elif request.sampling_params.output_kind == RequestOutputKind.DELTA: + completion_output.text = new_output_text + num_prev_tokens = len(completion_output.token_ids) + completion_output.token_ids = request.output_token_ids[ + num_prev_tokens:num_output_tokens] + elif (request.sampling_params.output_kind == + RequestOutputKind.FINAL_ONLY): + if finished: + completion_output.text = request.output_text + completion_output.token_ids = request.output_token_ids + else: + completion_output.text = "" + completion_output.token_ids = [] + + if finished: + completion_output.finish_reason = request.get_finished_reason() + completion_output.stop_reason = request.stop_reason + req_output.finished = finished + return req_output + + def check_health(self) -> None: + if self.tokenizer: + self.tokenizer.check_health() + self.model_executor.check_health() + + def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, + EncoderDecoderLLMInputs]): + prompt_ids = inputs.get("prompt_token_ids") + if prompt_ids is None or len(prompt_ids) == 0: + raise ValueError("Prompt cannot be empty") + + if self.model_config.is_multimodal_model: + max_prompt_len = self.model_config.max_model_len + + if len(prompt_ids) > max_prompt_len: + raise ValueError( + f"The prompt (total length {len(prompt_ids)}) is too long " + f"to fit into the model (context length {max_prompt_len}). " + "Make sure that `max_model_len` is no smaller than the " + "number of text tokens plus multimodal tokens. 
For image " + "inputs, the number of image tokens depends on the number " + "of images, and possibly their aspect ratios as well.") + + @classmethod + def validate_outputs(cls, outputs, output_type): + return outputs + + def get_model_config(self) -> ModelConfig: + """Gets the model configuration.""" + return self.model_config + + def get_parallel_config(self) -> ParallelConfig: + """Gets the parallel configuration.""" + return self.parallel_config + + def get_decoding_config(self) -> DecodingConfig: + """Gets the decoding configuration.""" + return self.decoding_config + + def get_scheduler_config(self) -> SchedulerConfig: + """Gets the scheduler configuration.""" + return self.scheduler_config + + def get_lora_config(self) -> LoRAConfig: + """Gets the LoRA configuration.""" + return self.lora_config + + @classmethod + def _get_executor_cls(cls, engine_config: EngineConfig): + return GPUExecutor + + def is_tracing_enabled(self) -> bool: + return False + + def do_log_stats(self, *args, **kwargs) -> None: + pass + + def is_encoder_decoder_model(self) -> bool: + return False + + def start_profile(self) -> None: + pass + + def stop_profile(self) -> None: + pass + + def get_tokenizer_group(self, *args, **kwargs): + return self.tokenizer + + +def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: + config = try_get_generation_config( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.revision, + ) + + if config is None: + return {} + + return config.to_diff_dict() diff --git a/vllm/v1/executor/__init__.py b/vllm/v1/executor/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py new file mode 100644 index 0000000000000..c780c7031c3d6 --- /dev/null +++ b/vllm/v1/executor/gpu_executor.py @@ -0,0 +1,100 @@ +import os +from typing import Optional, Tuple + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig) +from vllm.logger import init_logger +from vllm.utils import get_distributed_init_method, get_ip, get_open_port +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.worker.gpu_worker import Worker + +logger = init_logger(__name__) + + +class GPUExecutor: + + def __init__( + self, + model_config: ModelConfig, + cache_config: CacheConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig], + speculative_config: Optional[SpeculativeConfig], + prompt_adapter_config: Optional[PromptAdapterConfig], + observability_config: Optional[ObservabilityConfig], + ) -> None: + self.model_config = model_config + self.cache_config = cache_config + self.lora_config = lora_config + self.load_config = load_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.speculative_config = speculative_config + self.prompt_adapter_config = prompt_adapter_config + self.observability_config = observability_config + + self.worker = self._create_worker() + self.worker.initialize() + self.worker.load_model() + + def _create_worker( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None) -> Worker: + """Return worker init args for a given rank.""" + # see https://github.com/NVIDIA/nccl/issues/1234 
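+ # Setting NCCL_CUMEM_ENABLE=0 disables NCCL's cuMem-based allocator,
+ # working around the issue referenced above.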
+ os.environ['NCCL_CUMEM_ENABLE'] = '0' + + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return Worker( + model_config=self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + device_config=self.device_config, + cache_config=self.cache_config, + load_config=self.load_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + lora_config=self.lora_config, + speculative_config=self.speculative_config, + prompt_adapter_config=self.prompt_adapter_config, + observability_config=self.observability_config, + ) + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ + return self.worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# GPU blocks: %d", num_gpu_blocks) + self.worker.initialize_cache(num_gpu_blocks) + self.worker.compile_or_warm_up_model() + + def execute_model( + self, + scheduler_output, + ) -> ModelRunnerOutput: + output = self.worker.execute_model(scheduler_output) + return output + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py new file mode 100644 index 0000000000000..8574987728844 --- /dev/null +++ b/vllm/v1/outputs.py @@ -0,0 +1,37 @@ +from dataclasses import dataclass +from typing import Dict, List, Optional + +import torch + + +@dataclass +class SamplerOutput: + + # [num_reqs] + sampled_token_ids: torch.Tensor + + # [num_reqs, max_num_logprobs + 1] + logprob_token_ids: Optional[torch.Tensor] + # [num_reqs, max_num_logprobs + 1] + logprobs: Optional[torch.Tensor] + + # TODO: Support prompt logprobs. + prompt_logprob_token_ids: Optional[torch.Tensor] + prompt_logprobs: Optional[torch.Tensor] + + +@dataclass +class ModelRunnerOutput: + + # [num_reqs] + req_ids: List[str] + # req_id -> index + req_id_to_index: Dict[str, int] + + # [num_reqs] + sampled_token_ids_cpu: torch.Tensor + + # [num_reqs, max_num_logprobs + 1] + logprob_token_ids_cpu: Optional[torch.Tensor] + # [num_reqs, max_num_logprobs + 1] + logprobs_cpu: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py new file mode 100644 index 0000000000000..be7d4d165d280 --- /dev/null +++ b/vllm/v1/request.py @@ -0,0 +1,92 @@ +import enum +from typing import TYPE_CHECKING, List, Optional, Union + +from vllm.lora.request import LoRARequest +from vllm.sampling_params import SamplingParams +from vllm.sequence import RequestMetrics + +if TYPE_CHECKING: + from vllm.inputs import DecoderOnlyInputs + + +class Request: + + def __init__( + self, + request_id: str, + inputs: "DecoderOnlyInputs", + sampling_params: SamplingParams, + eos_token_id: Optional[int], + arrival_time: float, + lora_request: Optional[LoRARequest] = None, + ) -> None: + self.request_id = request_id + self.inputs = inputs + self.sampling_params = sampling_params + # Because of LoRA, the eos token id can be different for each request. 
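+ # (A LoRA adapter can carry its own tokenizer, so the EOS token is
+ # resolved per request rather than once per engine.)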
+ self.eos_token_id = eos_token_id + self.metrics = RequestMetrics(arrival_time=arrival_time, + last_token_time=arrival_time, + first_scheduled_time=None, + first_token_time=None, + time_in_queue=None) + self.lora_request = lora_request + + self.status = RequestStatus.WAITING + self.stop_reason: Union[int, str, None] = None + assert sampling_params.max_tokens is not None + self.max_tokens = sampling_params.max_tokens + + self.prompt = inputs.get("prompt") + self.prompt_token_ids = inputs["prompt_token_ids"] + self.num_prompt_tokens = len(self.prompt_token_ids) + self.output_token_ids: List[int] = [] + self.output_text = "" + self.num_computed_tokens = 0 + + @property + def num_tokens(self) -> int: + return self.num_prompt_tokens + len(self.output_token_ids) + + @property + def num_output_tokens(self) -> int: + return len(self.output_token_ids) + + def is_finished(self) -> bool: + return RequestStatus.is_finished(self.status) + + def get_finished_reason(self) -> Union[str, None]: + return RequestStatus.get_finished_reason(self.status) + + +class RequestStatus(enum.IntEnum): + """Status of a sequence.""" + WAITING = 0 + RUNNING = 1 + PREEMPTED = 2 + # Note: anything after PREEMPTED (2) will be considered + # as a finished status. + FINISHED_STOPPED = 3 + FINISHED_LENGTH_CAPPED = 4 + FINISHED_ABORTED = 5 + FINISHED_IGNORED = 6 + + @staticmethod + def is_finished(status: "RequestStatus") -> bool: + return status > RequestStatus.PREEMPTED + + @staticmethod + def get_finished_reason(status: "RequestStatus") -> Union[str, None]: + return _FINISHED_REASON_MAP.get(status) + + +# Mapping of finished statuses to their finish reasons. +# NOTE: The ignored sequences are the sequences whose prompt lengths +# are longer than the model's length cap. Therefore, the stop +# reason should also be "length" as in OpenAI API. 
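+# "abort" is vLLM-specific (it has no OpenAI equivalent) and is reported
+# when a request is cancelled from outside the scheduler.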
+_FINISHED_REASON_MAP = { + RequestStatus.FINISHED_STOPPED: "stop", + RequestStatus.FINISHED_LENGTH_CAPPED: "length", + RequestStatus.FINISHED_ABORTED: "abort", + RequestStatus.FINISHED_IGNORED: "length", +} diff --git a/vllm/v1/sample/__init__.py b/vllm/v1/sample/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py new file mode 100644 index 0000000000000..28614377b27b9 --- /dev/null +++ b/vllm/v1/sample/metadata.py @@ -0,0 +1,22 @@ +from dataclasses import dataclass +from typing import List, Optional + +import torch + + +@dataclass +class SamplingMetadata: + + temperature: torch.Tensor + all_greedy: bool + all_random: bool + + top_p: torch.Tensor + top_k: torch.Tensor + no_top_p: bool + no_top_k: bool + + generators: List[Optional[torch.Generator]] + no_generator: bool + + max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py new file mode 100644 index 0000000000000..157c4dd6d771e --- /dev/null +++ b/vllm/v1/sample/sampler.py @@ -0,0 +1,161 @@ +"""A layer that samples the next tokens from the model's outputs.""" +from typing import List, Optional + +import torch +import torch.nn as nn + +from vllm.v1.outputs import SamplerOutput +from vllm.v1.sample.metadata import SamplingMetadata + +_SAMPLING_EPS = 1e-5 + + +class Sampler(nn.Module): + + def forward( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + logits = self.apply_temperature(logits, sampling_metadata.temperature) + logits = self.apply_top_k_top_p(logits, sampling_metadata) + + probs = self.get_probs(logits) + sampled = self.sample(probs, sampling_metadata) + # Use int32 to reduce the tensor size. + sampled = sampled.to(torch.int32) + + if sampling_metadata.max_num_logprobs > 0: + logprobs = self.get_logprobs(logits) + # FIXME: Mask the sampled token_id, get topk logprobs, + # and concatenate the topk with the sampled token_id. + topk_logprobs, topk_indices = torch.topk( + logprobs, sampling_metadata.max_num_logprobs, dim=-1) + # Use int32 to reduce the tensor size. + topk_indices = topk_indices.to(torch.int32) + else: + topk_logprobs = None + topk_indices = None + + sampler_output = SamplerOutput( + sampled_token_ids=sampled, + logprob_token_ids=topk_indices, + logprobs=topk_logprobs, + prompt_logprob_token_ids=None, + prompt_logprobs=None, + ) + return sampler_output + + def apply_temperature( + self, + logits: torch.Tensor, + temp: torch.Tensor, + ) -> torch.Tensor: + # Use float32 to apply temperature scaling. + logits = logits.to(torch.float32) + # Avoid division by zero. + temp = torch.where(temp < _SAMPLING_EPS, 1.0, temp) + # Use in-place division to avoid creating a new tensor. 
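+ # (unsqueeze makes temp broadcast across the vocab dimension.)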
+ logits.div_(temp.unsqueeze(dim=1)) + return logits + + def apply_top_k_top_p( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + return _apply_top_k_top_p( + logits, + sampling_metadata.no_top_k, + sampling_metadata.top_k, + sampling_metadata.no_top_p, + sampling_metadata.top_p, + ) + + def get_probs(self, logits: torch.Tensor) -> torch.Tensor: + return torch.softmax(logits, dim=-1, dtype=torch.float32) + + def get_logprobs(self, logits: torch.Tensor) -> torch.Tensor: + return torch.log_softmax(logits, dim=-1, dtype=torch.float32) + + def greedy_sample(self, probs: torch.Tensor) -> torch.Tensor: + return probs.argmax(dim=-1).view(-1) + + def random_sample( + self, + probs: torch.Tensor, + generators: List[Optional[torch.Generator]], + no_generator: bool, + ) -> torch.Tensor: + q = torch.empty_like(probs) + # NOTE(woosuk): To batch-process the requests without their own seeds, + # which is the common case, we first assume that every request does + # not have its own seed. Then, we overwrite the values for the requests + # that have their own seeds. + q.exponential_() + if not no_generator: + assert len(generators) == probs.shape[0] + # TODO(woosuk): This can be slow because we handle each request + # one by one. Optimize this. + for i, generator in enumerate(generators): + if generator is not None: + q[i].exponential_(generator=generator) + return probs.div_(q).argmax(dim=-1).view(-1) + + def sample( + self, + probs: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + assert not (sampling_metadata.all_greedy + and sampling_metadata.all_random) + if sampling_metadata.all_greedy: + return self.greedy_sample(probs) + if sampling_metadata.all_random: + return self.random_sample(probs, sampling_metadata.generators, + sampling_metadata.no_generator) + + greedy_sampled = self.greedy_sample(probs) + random_sampled = self.random_sample(probs, + sampling_metadata.generators, + sampling_metadata.no_generator) + sampled = torch.where( + sampling_metadata.temperature < _SAMPLING_EPS, + greedy_sampled, + random_sampled, + ) + return sampled + + +# TODO(woosuk): Optimize this with a custom kernel. +def _apply_top_k_top_p( + logits: torch.Tensor, + no_top_k: bool, + k: torch.Tensor, + no_top_p: bool, + p: torch.Tensor, +) -> torch.Tensor: + if no_top_k and no_top_p: + return logits + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) + + if not no_top_k: + # Apply top-k. + top_k_mask = logits_sort.size(1) - k.to(torch.long) + # Get all the top_k values. + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) + top_k_mask = logits_sort < top_k_mask + logits_sort.masked_fill_(top_k_mask, -float("inf")) + + if not no_top_p: + # Apply top-p. + probs_sort = logits_sort.softmax(dim=-1) + probs_sum = probs_sort.cumsum(dim=-1) + top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) + # at least one + top_p_mask[:, -1] = False + logits_sort.masked_fill_(top_p_mask, -float("inf")) + + # Re-sort the probabilities. 
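+ # (More precisely: scatter the filtered logits back into their
+ # original column order using the sort indices.)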
+ logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort) + return logits diff --git a/vllm/v1/tokenizer/__init__.py b/vllm/v1/tokenizer/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py new file mode 100644 index 0000000000000..4bbcf4717981e --- /dev/null +++ b/vllm/v1/tokenizer/detokenizer.py @@ -0,0 +1,215 @@ +import multiprocessing +from dataclasses import dataclass +from typing import Dict, List, Optional + +import msgspec +import zmq +from msgspec import msgpack + +from vllm.transformers_utils.detokenizer_utils import ( + convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils import get_open_port + + +class DetokenizerInputs(msgspec.Struct): + + # [num_reqs] + req_ids: List[str] + # A request's prompt token ids is sent to the detokenizer only when + # the request is first detokenized. Otherwise, an empty list is sent. + prompt_token_ids: List[List[int]] + new_token_ids: List[List[int]] + skip_special_tokens: List[bool] + spaces_between_special_tokens: List[bool] + + # [num_free_reqs] + free_req_ids: List[str] + + +class DetokenizerOutputs(msgspec.Struct): + + # [num_reqs] + req_ids: List[str] + detokenized_texts: List[str] + # NOTE(woosuk): The number of the output token ids of each request + # at the time of detokenization. The detokenizer returns this to the engine + # because the request state (including the output token ids) is + # asynchronously updated in the engine, while RequestOutput requires the + # output token ids to be consistent with the detokenized text. + num_output_token_ids: List[int] + + +class Detokenizer: + + def __init__(self, tokenizer_name: str): + # FIXME(woosuk): Currently, the detokenizer is just a hacky prototype. + # For example, it does not terminate properly. We need to improve this. + self.push_port = get_open_port() + self.pull_port = get_open_port() + self.detokenizer = DetokenizerProc(tokenizer_name, self.push_port, + self.pull_port) + self.detokenizer.start() + + self.zmq_context = zmq.Context() + self.push_socket = self.zmq_context.socket(zmq.PUSH) + self.push_socket.connect(f"tcp://localhost:{self.push_port}") + self.pull_socket = self.zmq_context.socket(zmq.PULL) + self.pull_socket.connect(f"tcp://localhost:{self.pull_port}") + self.poller = zmq.Poller() + self.poller.register(self.pull_socket, zmq.POLLIN) + self.msgpack_encoder = msgpack.Encoder() + self.msgpack_decoder = msgpack.Decoder(DetokenizerOutputs) + + def send(self, inputs: DetokenizerInputs) -> None: + self.push_socket.send(self.msgpack_encoder.encode(inputs), + flags=zmq.NOBLOCK) + + def recv(self) -> Optional[DetokenizerOutputs]: + socks = dict(self.poller.poll(timeout=0)) + if self.pull_socket in socks and socks[self.pull_socket] == zmq.POLLIN: + msg = self.pull_socket.recv() + return self.msgpack_decoder.decode(msg) + return None + + def terminate(self) -> None: + self.push_socket.send(b"", flags=zmq.NOBLOCK) + self.detokenizer.join() + + +class DetokenizerProc(multiprocessing.Process): + + def __init__( + self, + tokenizer_name: str, + pull_port: int, + push_port: int, + ): + super().__init__() + self.tokenizer_name = tokenizer_name + # NOTE: The pull_port of the detokenizer should be the same as the + # push_port of the engine. Vice versa. 
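+ # (The Detokenizer front-end above passes (push_port, pull_port)
+ # here, so the two ends of each socket pair line up.)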
+ self.pull_port = pull_port + self.push_port = push_port + + def run(self): + # Initialize these objects after the process is forked since they are + # not picklable. + self.msgpack_encoder = msgpack.Encoder() + self.msgpack_decoder = msgpack.Decoder(DetokenizerInputs) + self.tokenizer = get_tokenizer(self.tokenizer_name) + # req_id -> RequestState + self.request_states: Dict[str, RequestState] = {} + + self.zmq_context = zmq.Context() + self.pull_socket = self.zmq_context.socket(zmq.PULL) + self.pull_socket.bind(f"tcp://*:{self.pull_port}") + self.push_socket = self.zmq_context.socket(zmq.PUSH) + self.push_socket.bind(f"tcp://*:{self.push_port}") + + while True: + message = self.pull_socket.recv() + if message == b"": + # Terminate signal. + break + inputs = self.msgpack_decoder.decode(message) + + for req_id in inputs.free_req_ids: + self.free(req_id) + + detokenized_texts: List[str] = [] + num_output_token_ids: List[int] = [] + num_reqs = len(inputs.req_ids) + for i in range(num_reqs): + req_id = inputs.req_ids[i] + if req_id not in self.request_states: + self.add_request( + request_id=req_id, + prompt_token_ids=inputs.prompt_token_ids[i], + skip_special_tokens=inputs.skip_special_tokens[i], + spaces_between_special_tokens=inputs. + spaces_between_special_tokens[i], + ) + new_str = self.detokenize(req_id, inputs.new_token_ids[i]) + detokenized_texts.append(new_str) + req_state = self.request_states[req_id] + num_output_token_ids.append( + len(req_state.token_ids) - req_state.num_prompt_tokens) + + detokenized = DetokenizerOutputs( + req_ids=inputs.req_ids, + detokenized_texts=detokenized_texts, + num_output_token_ids=num_output_token_ids, + ) + self.push_socket.send(self.msgpack_encoder.encode(detokenized), + flags=zmq.NOBLOCK) + + def add_request( + self, + request_id: str, + prompt_token_ids: List[int], + skip_special_tokens: bool, + spaces_between_special_tokens: bool, + ) -> None: + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=self.tokenizer, + prompt_ids=prompt_token_ids, + skip_special_tokens=skip_special_tokens, + ) + self.request_states[request_id] = RequestState( + req_id=request_id, + token_ids=prompt_token_ids, + tokens=tokens, + num_prompt_tokens=len(prompt_token_ids), + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=skip_special_tokens, + spaces_between_special_tokens=spaces_between_special_tokens, + ) + + def free(self, request_id: str) -> None: + del self.request_states[request_id] + + def detokenize(self, request_id: str, new_token_ids: List[int]) -> str: + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. + req_state = self.request_states[request_id] + decoded_text = "" + for new_token_id in new_token_ids: + req_state.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=req_state.token_ids, + prev_tokens=req_state.tokens, + prefix_offset=req_state.prefix_offset, + read_offset=req_state.read_offset, + skip_special_tokens=req_state.skip_special_tokens, + spaces_between_special_tokens=req_state. 
+ spaces_between_special_tokens, + ) + + req_state.tokens.extend(new_tokens) + req_state.prefix_offset = prefix_offset + req_state.read_offset = read_offset + req_state.output_text += new_decoded_token_text + decoded_text += new_decoded_token_text + return decoded_text + + +@dataclass +class RequestState: + + req_id: str + + token_ids: List[int] + tokens: List[str] + num_prompt_tokens: int + + prefix_offset: int + read_offset: int + + skip_special_tokens: bool + spaces_between_special_tokens: bool + + output_text: str = "" diff --git a/vllm/v1/worker/__init__.py b/vllm/v1/worker/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py new file mode 100644 index 0000000000000..e84645ac7a4ae --- /dev/null +++ b/vllm/v1/worker/gpu_model_runner.py @@ -0,0 +1,690 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Optional, Set +from unittest.mock import patch + +import numpy as np +import torch +import torch.distributed +import torch.nn as nn + +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) +from vllm.forward_context import set_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, + is_pin_memory_available) +from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, + FlashAttentionMetadata) +from vllm.v1.outputs import ModelRunnerOutput +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler + +if TYPE_CHECKING: + from vllm.v1.core.scheduler import SchedulerOutput + +logger = init_logger(__name__) + + +class GPUModelRunner: + + def __init__( + self, + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + lora_config: Optional[LoRAConfig] = None, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, + observability_config: Optional[ObservabilityConfig] = None, + ): + self.model_config = model_config + self.parallel_config = parallel_config + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.lora_config = lora_config + self.load_config = load_config + self.prompt_adapter_config = prompt_adapter_config + self.observability_config = observability_config + + self.device = self.device_config.device + self.pin_memory = is_pin_memory_available() + self.dtype = self.model_config.dtype + if cache_config.cache_dtype == "auto": + self.kv_cache_dtype = self.dtype + else: + self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + cache_config.cache_dtype] + + self.sliding_window = model_config.get_sliding_window() + self.block_size = cache_config.block_size + self.max_model_len = model_config.max_model_len + self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) + self.max_num_tokens = scheduler_config.max_num_batched_tokens + + # Model-related. 
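+ # Per-rank shapes (attention layers, KV heads, head size) used when
+ # sizing the KV cache.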
+ self.num_attn_layers = model_config.get_num_attention_layers( + parallel_config) + self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) + self.head_size = model_config.get_head_size() + + # Lazy initialization + # self.model: nn.Module # Set after load_model + self.kv_caches: List[torch.Tensor] = [] + + # Request states. + self.requests: Dict[str, CachedRequestState] = {} + # Persistent batch. + self.input_batch = InputBatch( + max_num_reqs=self.scheduler_config.max_num_seqs, + max_model_len=self.max_model_len, + max_num_blocks_per_req=self.max_num_blocks_per_req, + device=self.device, + pin_memory=self.pin_memory, + ) + + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + # Remove stopped requests from the cached states. + # Keep the states of the pre-empted requests. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + + # Remove the requests from the persistent batch. + stopped_req_ids = set().union( + scheduler_output.preempted_req_ids, + scheduler_output.finished_req_ids, + ) + removed_req_indices: List[int] = [] + for req_id in stopped_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) + + # Update the states of the running requests. + for req_data in scheduler_output.scheduled_running_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + req_index = self.input_batch.req_id_to_index[req_id] + + # Update the num_computed_tokens. + req_state.num_computed_tokens = req_data.num_computed_tokens + self.input_batch.num_computed_tokens_cpu[req_index] = ( + req_data.num_computed_tokens) + + # Update the block table. + num_new_blocks = len(req_data.new_block_ids) + if num_new_blocks == 0: + continue + start_index = len(req_state.block_ids) + end_index = start_index + num_new_blocks + req_state.block_ids.extend(req_data.new_block_ids) + self.input_batch.block_table_cpu[ + req_index, start_index:end_index] = req_data.new_block_ids + + req_ids_to_add: List[str] = [] + # Add new requests to the cached states. + for req_data in scheduler_output.scheduled_new_reqs: + req_id = req_data.req_id + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=req_data.prompt_token_ids, + prompt=req_data.prompt, + multi_modal_data=req_data.multi_modal_data, + sampling_params=req_data.sampling_params, + generator=None, # TODO + block_ids=req_data.block_ids, + num_computed_tokens=req_data.num_computed_tokens, + output_token_ids=[], + ) + req_ids_to_add.append(req_id) + + # Update the cached states of the resumed requests. + for req_data in scheduler_output.scheduled_resumed_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + + req_state.block_ids = req_data.block_ids + req_state.num_computed_tokens = req_data.num_computed_tokens + req_ids_to_add.append(req_id) + + # Add the new or resumed requests to the persistent batch. + # The smaller empty indices are filled first. + removed_req_indices = sorted(removed_req_indices, reverse=True) + for req_id in req_ids_to_add: + req_state = self.requests[req_id] + if removed_req_indices: + # Fill the empty index. + req_index = removed_req_indices.pop() + else: + # Append to the end. + req_index = None + self.input_batch.add_request(req_state, req_index) + + # Condense the batched states if there are empty indices. 
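+ # (removed_req_indices was sorted in descending order above, which
+ # is the order condense() consumes when backfilling the holes.)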
+ if removed_req_indices: + self.input_batch.condense(removed_req_indices) + + def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + + # OPTIMIZATION: Start copying the block table first. + # This way, we can overlap the copy with the following CPU operations. + self.input_batch.block_table[:num_reqs].copy_( + self.input_batch.block_table_cpu_tensor[:num_reqs], + non_blocking=True) + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = [] + max_num_scheduled_tokens = 0 + for req_id in self.input_batch.req_ids[:num_reqs]: + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens.append(num_tokens) + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + num_scheduled_tokens = np.array(num_scheduled_tokens, dtype=np.int32) + assert max_num_scheduled_tokens > 0 + + # Get request indices. + # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] + indices = np.arange(num_reqs) + req_indices = np.repeat(indices, num_scheduled_tokens) + + # Get batched arange. + # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange_matrix = np.tile(np.arange(max_num_scheduled_tokens), + (num_reqs, 1)) + mask = arange_matrix < num_scheduled_tokens[:, np.newaxis] + arange = arange_matrix[mask] + + # Get positions. + positions = torch.empty((total_num_scheduled_tokens, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + positions_np = positions.numpy() + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) + + # Get token indices. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] + # where M is the max_model_len. + token_indices = positions_np + req_indices * self.max_model_len + token_indices = torch.from_numpy(token_indices) + input_ids = torch.empty((total_num_scheduled_tokens, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + torch.index_select(torch.from_numpy( + self.input_batch.token_ids_cpu).flatten(), + 0, + token_indices, + out=input_ids) + + # Calculate the slot mapping. + block_numbers = self.input_batch.block_table_cpu_tensor.flatten()[ + token_indices // self.block_size] + block_offsets = token_indices % self.block_size + slot_mapping = torch.empty((total_num_scheduled_tokens, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + torch.add(block_numbers * self.block_size, + block_offsets, + out=slot_mapping) + + # Prepare the attention metadata. 
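+ # E.g., num_scheduled_tokens [2, 5, 3]
+ # -> query_start_loc [0, 2, 7, 10]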
+ query_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + query_start_loc_np = query_start_loc.numpy() + query_start_loc_np[0] = 0 + np.cumsum(num_scheduled_tokens, out=query_start_loc_np[1:]) + + seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + max_seq_len = seq_lens.max() + seq_start_loc = torch.empty((num_reqs + 1, ), + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + seq_start_loc_np = seq_start_loc.numpy() + seq_start_loc_np[0] = 0 + np.cumsum(seq_lens, out=seq_start_loc_np[1:]) + + input_ids = input_ids.to(self.device, non_blocking=True) + positions = positions.to(self.device, non_blocking=True).long() + query_start_loc = query_start_loc.to(self.device, non_blocking=True) + seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) + slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() + attn_metadata = FlashAttentionMetadata( + max_query_len=max_num_scheduled_tokens, + query_start_loc=query_start_loc, + max_seq_len=max_seq_len, + seq_start_loc=seq_start_loc, + block_table=self.input_batch.block_table[:num_reqs], + slot_mapping=slot_mapping, + ) + # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial + # request in the batch. While we should not sample any token from this + # partial request, we do so for simplicity. We will ignore the sampled + # token from the partial request. + # TODO: Support prompt logprobs. + logits_indices = query_start_loc[1:] - 1 + return input_ids, positions, attn_metadata, logits_indices + + def _prepare_sampling( + self, + scheduler_output: "SchedulerOutput", + ) -> SamplingMetadata: + skip_copy = True + if (scheduler_output.finished_req_ids + or scheduler_output.preempted_req_ids): + skip_copy = False + if (scheduler_output.scheduled_new_reqs + or scheduler_output.scheduled_resumed_reqs): + skip_copy = False + # Create the sampling metadata. + sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) + return sampling_metadata + + @torch.inference_mode() + def execute_model( + self, + scheduler_output: "SchedulerOutput", + ) -> ModelRunnerOutput: + self._update_states(scheduler_output) + inputs = self._prepare_inputs(scheduler_output) + input_ids, positions, attn_metadata, logits_indices = inputs + + with set_forward_context(attn_metadata): + hidden_states = self.model( + input_ids=input_ids, + positions=positions, + kv_caches=self.kv_caches, + attn_metadata=attn_metadata, + ) + hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(hidden_states, None) + + # Sample the next token and get logprobs if needed. + sampling_metadata = self._prepare_sampling(scheduler_output) + sampler_output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + + # NOTE: CPU-GPU synchronization happens here. + sampled_token_ids = sampler_output.sampled_token_ids.cpu() + sampled_token_ids_list = sampled_token_ids.tolist() + # TODO(woosuk): The following loop can be slow since it iterates over + # the requests one by one. Optimize. + num_reqs = self.input_batch.num_reqs + for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + req_state = self.requests[req_id] + seq_len = (req_state.num_computed_tokens + + scheduler_output.num_scheduled_tokens[req_id]) + assert seq_len <= req_state.num_tokens + if seq_len == req_state.num_tokens: + # Append the sampled token to the output token ids. 
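Continuing the same toy schedule, the cumulative start locations and the `logits_indices` trick above check out as follows. Since each request samples from its last scheduled token, that token's index is simply the next request's start minus one (values again illustrative):

```python
import numpy as np

num_scheduled = np.array([2, 5, 3], dtype=np.int32)
num_computed = np.array([0, 10, 4], dtype=np.int32)

query_start_loc = np.zeros(len(num_scheduled) + 1, dtype=np.int32)
np.cumsum(num_scheduled, out=query_start_loc[1:])
assert query_start_loc.tolist() == [0, 2, 7, 10]

seq_lens = num_computed + num_scheduled
seq_start_loc = np.zeros(len(seq_lens) + 1, dtype=np.int32)
np.cumsum(seq_lens, out=seq_start_loc[1:])
assert int(seq_lens.max()) == 15

# Last scheduled token of each request, used to gather the sampling logits.
logits_indices = query_start_loc[1:] - 1
assert logits_indices.tolist() == [1, 6, 9]
```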
+ token_id = sampled_token_ids_list[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids.append(token_id) + else: + # Ignore the sampled token from the partial request. + # Rewind the generator state as if the token was not sampled. + generator = self.input_batch.generators[i] + if generator is not None: + offset = generator.get_offset() + generator = generator.set_offset(offset - 1) + self.input_batch.generators[i] = generator + + if sampler_output.logprob_token_ids is None: + logprob_token_ids = None + else: + logprob_token_ids = sampler_output.logprob_token_ids.cpu() + if sampler_output.logprobs is None: + logprobs = None + else: + logprobs = sampler_output.logprobs.cpu() + model_runner_output = ModelRunnerOutput( + req_ids=self.input_batch.req_ids[:num_reqs], + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids_cpu=sampled_token_ids, + logprob_token_ids_cpu=logprob_token_ids, + logprobs_cpu=logprobs, + ) + return model_runner_output + + def load_model(self) -> None: + logger.info("Starting to load model %s...", self.model_config.model) + with DeviceMemoryProfiler() as m: # noqa: SIM117 + with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): + self.model = get_model(model_config=self.model_config, + device_config=self.device_config, + load_config=self.load_config, + lora_config=self.lora_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config, + cache_config=self.cache_config) + + self.model_memory_usage = m.consumed_memory + logger.info("Loading model weights took %.4f GB", + self.model_memory_usage / float(2**30)) + + def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: + input_ids = torch.zeros(num_tokens, + dtype=torch.int32, + device=self.device) + positions = torch.zeros(num_tokens, + dtype=torch.long, + device=self.device) + kv_caches = [None for _ in range(self.num_attn_layers)] + model(input_ids, positions, kv_caches, attn_metadata=None) + return + + @torch.inference_mode() + def profile_run(self) -> None: + self._dummy_run(self.model, self.max_num_tokens) + torch.cuda.synchronize() + return + + @torch.inference_mode() + def capture_model(self) -> None: + # TODO: Implement CUDA graph support. 
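The keep-or-discard decision for the sampled token, the `seq_len == req_state.num_tokens` check in the loop above, deserves a worked example. With chunked prefills, a request that is still mid-prompt produces a throwaway token. A hypothetical helper with made-up lengths:

```python
def keeps_sampled_token(num_prompt: int, num_output: int,
                        num_computed: int, num_scheduled: int) -> bool:
    """Mirror of the check above: keep the sampled token only when the
    request has consumed all of its known tokens this step."""
    num_tokens = num_prompt + num_output        # CachedRequestState.num_tokens
    seq_len = num_computed + num_scheduled
    assert seq_len <= num_tokens
    return seq_len == num_tokens


# Chunked prefill: 50 of 100 prompt tokens done, so the token is discarded.
assert not keeps_sampled_token(100, 0, 0, 50)
# Decode step: prompt finished, one new token scheduled, so it is kept.
assert keeps_sampled_token(100, 3, 102, 1)
```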
+ return + + def initialize_kv_cache(self, num_blocks: int) -> None: + assert len(self.kv_caches) == 0 + kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size) + for _ in range(self.num_attn_layers): + self.kv_caches.append( + torch.zeros(kv_cache_shape, + dtype=self.kv_cache_dtype, + device=self.device)) + + +@dataclass +class CachedRequestState: + + req_id: str + prompt_token_ids: List[int] + prompt: Optional[str] + multi_modal_data: Optional["MultiModalDataDict"] + sampling_params: SamplingParams + generator: Optional[torch.Generator] + + block_ids: List[int] + num_computed_tokens: int + output_token_ids: List[int] + + @property + def num_tokens(self) -> int: + return len(self.prompt_token_ids) + len(self.output_token_ids) + + +class InputBatch: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_blocks_per_req: int, + device: torch.device, + pin_memory: bool, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req + self.device = device + self.pin_memory = pin_memory + + self.req_ids: List[Optional[str]] = [None] * max_num_reqs + self.req_id_to_index: Dict[str, int] = {} + + self.token_ids_cpu = np.empty((max_num_reqs, max_model_len), + dtype=np.int32) + self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + + # Attention-related. + self.block_table = torch.zeros((max_num_reqs, max_num_blocks_per_req), + device=self.device, + dtype=torch.int32) + self.block_table_cpu_tensor = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_cpu = self.block_table_cpu_tensor.numpy() + + # Sampling-related. + self.temperature = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.temperature_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.temperature_cpu = self.temperature_cpu_tensor.numpy() + self.greedy_reqs: Set[str] = set() + self.random_reqs: Set[str] = set() + + self.top_p = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.top_p_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.top_p_cpu = self.top_p_cpu_tensor.numpy() + self.top_p_reqs: Set[str] = set() + + self.top_k = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device=device) + self.top_k_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.top_k_cpu = self.top_k_cpu_tensor.numpy() + self.top_k_reqs: Set[str] = set() + + self.generators: List[Optional[torch.Generator]] = [None + ] * max_num_reqs + + self.num_logprobs: Dict[str, int] = {} + self.prompt_logprob_reqs: Set[str] = set() + + def add_request( + self, + request: "CachedRequestState", + req_index: Optional[int] = None, + ) -> None: + if req_index is None: + req_index = self.num_reqs + assert req_index < self.max_num_reqs + + self.req_ids[req_index] = request.req_id + self.req_id_to_index[request.req_id] = req_index + + # Copy the prompt token ids and output token ids. 
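Stepping back from `add_request` for a moment: a recurring pattern in `InputBatch` above is that every per-request parameter lives in a pinned CPU tensor whose `.numpy()` view is mutated in place, with the device copy refreshed asynchronously. A minimal sketch of that double-buffering, assuming only stock PyTorch (pinning is skipped when CUDA is unavailable):

```python
import torch

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
max_num_reqs = 8

# Pinned staging tensor; .numpy() shares its storage, so NumPy writes
# mutate the tensor with no extra copy.
temperature_cpu_tensor = torch.empty(max_num_reqs, dtype=torch.float32,
                                     pin_memory=use_cuda)
temperature_np = temperature_cpu_tensor.numpy()
temperature = torch.empty(max_num_reqs, dtype=torch.float32, device=device)

temperature_np[:3] = [1.0, 0.7, 0.0]  # per-request updates land on the CPU
# Only the active prefix is shipped to the device, without blocking the host.
temperature[:3].copy_(temperature_cpu_tensor[:3], non_blocking=True)
```

Pinned memory is what makes the `non_blocking=True` host-to-device copies genuinely asynchronous; with pageable memory the copy would silently synchronize.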
+        num_prompt_tokens = len(request.prompt_token_ids)
+        self.token_ids_cpu[
+            req_index, :num_prompt_tokens] = request.prompt_token_ids
+        start_idx = num_prompt_tokens
+        end_idx = start_idx + len(request.output_token_ids)
+        self.token_ids_cpu[req_index,
+                           start_idx:end_idx] = request.output_token_ids
+
+        self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens
+        num_blocks = len(request.block_ids)
+        self.block_table_cpu[req_index, :num_blocks] = request.block_ids
+
+        sampling_params = request.sampling_params
+        self.temperature_cpu[req_index] = sampling_params.temperature
+        # NOTE: These sets store request ids (str) so that remove_request()
+        # below can discard by id; do not add the batch index here.
+        if sampling_params.sampling_type == SamplingType.GREEDY:
+            self.greedy_reqs.add(request.req_id)
+        elif sampling_params.sampling_type == SamplingType.RANDOM:
+            self.random_reqs.add(request.req_id)
+        elif sampling_params.sampling_type == SamplingType.RANDOM_SEED:
+            # TODO(woosuk): Support per-request random seed.
+            raise NotImplementedError("Per-request seed is not supported yet.")
+
+        self.top_p_cpu[req_index] = sampling_params.top_p
+        if sampling_params.top_p < 1:
+            self.top_p_reqs.add(request.req_id)
+        self.top_k_cpu[req_index] = sampling_params.top_k
+        if sampling_params.top_k > 0:
+            self.top_k_reqs.add(request.req_id)
+
+        self.generators[req_index] = request.generator
+
+        num_logprobs = sampling_params.logprobs
+        if num_logprobs is not None and num_logprobs > 0:
+            self.num_logprobs[request.req_id] = num_logprobs
+        if sampling_params.prompt_logprobs:
+            self.prompt_logprob_reqs.add(request.req_id)
+
+    def remove_request(self, req_id: str) -> Optional[int]:
+        req_index = self.req_id_to_index.pop(req_id, None)
+        if req_index is None:
+            return None
+        self.req_ids[req_index] = None
+
+        self.greedy_reqs.discard(req_id)
+        self.random_reqs.discard(req_id)
+        self.top_p_reqs.discard(req_id)
+        self.top_k_reqs.discard(req_id)
+        self.generators[req_index] = None
+        self.num_logprobs.pop(req_id, None)
+        self.prompt_logprob_reqs.discard(req_id)
+        return req_index
+
+    def clear(self) -> None:
+        self.req_ids = [None] * self.max_num_reqs
+        self.req_id_to_index.clear()
+        self.greedy_reqs.clear()
+        self.random_reqs.clear()
+        self.top_p_reqs.clear()
+        self.top_k_reqs.clear()
+        # Keep the generator list at its fixed size so that index
+        # assignments in add_request() remain valid after a clear.
+        self.generators = [None] * self.max_num_reqs
+        self.num_logprobs.clear()
+        self.prompt_logprob_reqs.clear()
+
+    def condense(self, empty_req_indices: List[int]) -> None:
+        if self.num_reqs == 0:
+            # The batched states are empty.
+            return
+
+        # NOTE(woosuk): This function assumes that empty_req_indices
+        # is sorted in descending order.
+        last_req_index = self.num_reqs + len(empty_req_indices) - 1
+        while empty_req_indices:
+            # Find the largest non-empty index.
+            while last_req_index in empty_req_indices:
+                last_req_index -= 1
+
+            # Find the smallest empty index.
+            empty_index = empty_req_indices.pop()
+            if empty_index >= last_req_index:
+                break
+
+            # Swap the states.
+            req_id = self.req_ids[last_req_index]
+            self.req_ids[empty_index] = req_id
+            self.req_ids[last_req_index] = None
+            self.req_id_to_index[req_id] = empty_index
+
+            # TODO(woosuk): Optimize the copy of token_ids_cpu and
+            # block_table_cpu.
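The condense loop above (its per-row tensor copies follow directly below) is a classic two-pointer compaction: pull live rows down from the tail into the smallest holes until the pointers cross. A self-contained rendition over a plain list, assuming the descending sort that the NOTE requires; the `rows`/`hole` names are illustrative:

```python
from typing import List, Optional


def condense(rows: List[Optional[str]], num_live: int,
             empty_indices: List[int]) -> None:
    """empty_indices must be sorted in descending order."""
    last = num_live + len(empty_indices) - 1
    while empty_indices:
        while rows[last] is None:   # skip holes sitting at the tail
            last -= 1
        hole = empty_indices.pop()  # smallest remaining hole
        if hole >= last:
            break                   # everything left of `last` is dense
        rows[hole], rows[last] = rows[last], None
        last -= 1


rows = ["a", None, "c", None, "e", None]
condense(rows, num_live=3, empty_indices=[5, 3, 1])
assert rows[:3] == ["a", "e", "c"]
```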
+            self.token_ids_cpu[empty_index] = self.token_ids_cpu[
+                last_req_index]
+            self.num_computed_tokens_cpu[
+                empty_index] = self.num_computed_tokens_cpu[last_req_index]
+            self.block_table_cpu[empty_index] = self.block_table_cpu[
+                last_req_index]
+            self.temperature_cpu[empty_index] = self.temperature_cpu[
+                last_req_index]
+            self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index]
+            self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index]
+            self.generators[empty_index] = self.generators[last_req_index]
+
+            # Decrement last_req_index since it is now empty.
+            last_req_index -= 1
+
+    def make_sampling_metadata(
+        self,
+        skip_copy: bool = False,
+    ) -> SamplingMetadata:
+        if not skip_copy:
+            self.temperature[:self.num_reqs].copy_(
+                self.temperature_cpu_tensor[:self.num_reqs], non_blocking=True)
+            self.top_p[:self.num_reqs].copy_(
+                self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True)
+            self.top_k[:self.num_reqs].copy_(
+                self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True)
+        return SamplingMetadata(
+            temperature=self.temperature[:self.num_reqs],
+            all_greedy=self.all_greedy,
+            all_random=self.all_random,
+            top_p=self.top_p[:self.num_reqs],
+            top_k=self.top_k[:self.num_reqs],
+            no_top_p=self.no_top_p,
+            no_top_k=self.no_top_k,
+            generators=self.generators[:self.num_reqs],
+            no_generator=self.no_generator,
+            max_num_logprobs=self.max_num_logprobs,
+        )
+
+    @property
+    def num_reqs(self) -> int:
+        return len(self.req_id_to_index)
+
+    @property
+    def all_greedy(self) -> bool:
+        return len(self.random_reqs) == 0
+
+    @property
+    def all_random(self) -> bool:
+        return len(self.greedy_reqs) == 0
+
+    @property
+    def no_top_p(self) -> bool:
+        return len(self.top_p_reqs) == 0
+
+    @property
+    def no_top_k(self) -> bool:
+        return len(self.top_k_reqs) == 0
+
+    @property
+    def no_generator(self) -> bool:
+        # NOTE: self.generators is a fixed-size list, so check the active
+        # slots rather than the list length.
+        return all(g is None for g in self.generators[:self.num_reqs])
+
+    @property
+    def max_num_logprobs(self) -> int:
+        if self.num_logprobs:
+            return max(self.num_logprobs.values())
+        else:
+            return 0
+
+    @property
+    def no_logprob(self) -> bool:
+        return len(self.num_logprobs) == 0
+
+    @property
+    def no_prompt_logprob(self) -> bool:
+        return len(self.prompt_logprob_reqs) == 0
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
new file mode 100644
index 0000000000000..8c5ca2ec35666
--- /dev/null
+++ b/vllm/v1/worker/gpu_worker.py
@@ -0,0 +1,245 @@
+"""A GPU worker class."""
+import gc
+import os
+from typing import TYPE_CHECKING, Optional, Tuple
+
+import torch
+import torch.distributed
+
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, ObservabilityConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
+from vllm.distributed import (ensure_model_parallel_initialized,
+                              init_distributed_environment,
+                              set_custom_all_reduce)
+from vllm.logger import init_logger
+from vllm.model_executor import set_random_seed
+from vllm.platforms import current_platform
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size
+from vllm.v1.outputs import ModelRunnerOutput
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
+
+logger = init_logger(__name__)
+
+if TYPE_CHECKING:
+    from vllm.v1.core.scheduler import SchedulerOutput
+
+
+class Worker:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        cache_config: CacheConfig,
+        load_config: LoadConfig,
+        local_rank: int,
+        rank: int,
+        distributed_init_method: str,
+        speculative_config: Optional[SpeculativeConfig] = None,
+        lora_config: Optional[LoRAConfig] = None,
+        prompt_adapter_config: Optional[PromptAdapterConfig] = None,
+        observability_config: Optional[ObservabilityConfig] = None,
+    ):
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.local_rank = local_rank
+        self.rank = rank
+        self.distributed_init_method = distributed_init_method
+        self.lora_config = lora_config
+        self.speculative_config = speculative_config
+        self.prompt_adapter_config = prompt_adapter_config
+        self.observability_config = observability_config
+
+        if self.model_config.trust_remote_code:
+            # note: lazy import to avoid importing torch before initializing
+            from vllm.utils import init_cached_hf_modules
+            init_cached_hf_modules()
+
+        self.model_runner = GPUModelRunner(
+            model_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            cache_config,
+            load_config,
+            lora_config=lora_config,
+        )
+
+    def initialize(self):
+        if self.device_config.device.type == "cuda":
+            # torch.distributed.all_reduce does not free the input tensor until
+            # the synchronization point. This causes the memory usage to grow
+            # as the number of all_reduce calls increases. This env var disables
+            # this behavior.
+            # Related issue:
+            # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
+            os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
+
+            # This env var set by Ray causes exceptions with graph building.
+            os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+            self.device = torch.device(f"cuda:{self.local_rank}")
+            torch.cuda.set_device(self.device)
+
+            _check_if_gpu_supports_dtype(self.model_config.dtype)
+            gc.collect()
+            torch.cuda.empty_cache()
+            self.init_gpu_memory = torch.cuda.mem_get_info()[0]
+        else:
+            raise RuntimeError(
+                f"Unsupported device type: {self.device_config.device}")
+        # Initialize the distributed environment.
+        init_worker_distributed_environment(self.parallel_config, self.rank,
+                                            self.distributed_init_method,
+                                            self.local_rank)
+        # Set random seed.
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self) -> None:
+        self.model_runner.load_model()
+
+    @torch.inference_mode()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model to determine how many
+        KV blocks may be allocated without OOMs.
+
+        The engine will first profile the existing memory usage.
+        Then, it calculates the maximum possible number of GPU and CPU blocks
+        that can be allocated with the remaining free memory.
+
+        .. tip::
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Profile the memory usage of the model and get the maximum number of
+        # cache blocks that can be allocated with the remaining free memory.
+        torch.cuda.empty_cache()
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        self.model_runner.profile_run()
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
+        torch.cuda.synchronize()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        # NOTE(woosuk): Here we assume that the other processes using the same
+        # GPU did not change their memory usage during the profiling.
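Before reading the block-count arithmetic that follows, it may help to plug in numbers. Assuming a hypothetical 80 GiB GPU at 0.9 utilization, a 60 GiB profiled peak, a block size of 16 tokens, and the 2 MiB-per-block figure derived from `_get_cache_block_size` further below (all values illustrative):

```python
GiB = 1024 ** 3
total_gpu_memory = 80 * GiB
gpu_memory_utilization = 0.9
peak_memory = 60 * GiB            # init_gpu_memory - free_gpu_memory
cache_block_size = 2 * 1024 ** 2  # see _get_cache_block_size below

num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
                      - peak_memory) // cache_block_size)
num_gpu_blocks = max(num_gpu_blocks, 0)
assert num_gpu_blocks == 6144     # 6144 blocks * 16 tokens = 98,304 tokens
```

In words: only the memory budget left over after the profiled peak, scaled by `gpu_memory_utilization`, is handed to the KV cache.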
+        peak_memory = self.init_gpu_memory - free_gpu_memory
+        assert peak_memory > 0, (
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_gpu_memory}, current free memory"
+            f" {free_gpu_memory}. This happens when the GPU memory was "
+            "not properly cleaned up before initializing the vLLM instance.")
+
+        cache_block_size = _get_cache_block_size(self.cache_config,
+                                                 self.model_config,
+                                                 self.parallel_config)
+        num_gpu_blocks = int(
+            (total_gpu_memory * self.cache_config.gpu_memory_utilization -
+             peak_memory) // cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        # if self.model_runner.lora_manager:
+        #     self.model_runner.remove_all_loras()
+        gc.collect()
+        torch.cuda.empty_cache()
+        return num_gpu_blocks, 0
+
+    def initialize_cache(self, num_gpu_blocks: int) -> None:
+        """Allocate GPU KV cache with the specified number of blocks."""
+        if num_gpu_blocks <= 0:
+            raise ValueError("No available memory for the cache blocks. "
+                             "Try increasing `gpu_memory_utilization` when "
+                             "initializing the engine.")
+
+        max_seq_len = self.cache_config.block_size * num_gpu_blocks
+        max_model_len = self.model_config.max_model_len
+        if max_model_len > max_seq_len:
+            raise ValueError(
+                f"The model's max seq len ({max_model_len}) "
+                "is larger than the maximum number of tokens that can be "
+                f"stored in KV cache ({max_seq_len}). Try increasing "
+                "`gpu_memory_utilization` or decreasing `max_model_len` when "
+                "initializing the engine.")
+
+        self.model_runner.initialize_kv_cache(num_gpu_blocks)
+
+    def compile_or_warm_up_model(self) -> None:
+        if not self.model_config.enforce_eager:
+            self.model_runner.capture_model()
+        # Reset the seed to ensure that the random state is not affected by
+        # the model initialization and profiling.
+        set_random_seed(self.model_config.seed)
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        scheduler_output: "SchedulerOutput",
+    ) -> ModelRunnerOutput:
+        output = self.model_runner.execute_model(scheduler_output)
+        # TODO(woosuk): Send the output to the engine process.
+        return output
+
+
+def init_worker_distributed_environment(
+    parallel_config: ParallelConfig,
+    rank: int,
+    distributed_init_method: Optional[str] = None,
+    local_rank: int = -1,
+) -> None:
+    """Initialize the distributed environment."""
+    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+    init_distributed_environment(parallel_config.world_size, rank,
+                                 distributed_init_method, local_rank)
+
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+
+def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
+    # Check if the GPU supports the dtype.
+    if torch_dtype == torch.bfloat16:  # noqa: SIM102
+        if not current_platform.has_device_capability(80):
+            capability = current_platform.get_device_capability()
+            gpu_name = current_platform.get_device_name()
+
+            if capability is None:
+                compute_str = "does not have a compute capability"
+            else:
+                version_str = capability.as_version_str()
+                compute_str = f"has compute capability {version_str}"
+
+            raise ValueError(
+                "Bfloat16 is only supported on GPUs with compute capability "
+                f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
+                "You can use float16 instead by explicitly setting the "
+                "`dtype` flag in CLI, for example: --dtype=half.")
+
+
+def _get_cache_block_size(
+    cache_config: CacheConfig,
+    model_config: ModelConfig,
+    parallel_config: ParallelConfig,
+) -> int:
+    head_size = model_config.get_head_size()
+    num_heads = model_config.get_num_kv_heads(parallel_config)
+    num_attention_layers = model_config.get_num_attention_layers(
+        parallel_config)
+
+    key_cache_block = cache_config.block_size * num_heads * head_size
+    value_cache_block = key_cache_block
+    total = num_attention_layers * (key_cache_block + value_cache_block)
+    if cache_config.cache_dtype == "auto":
+        dtype = model_config.dtype
+    else:
+        dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]
+    dtype_size = get_dtype_size(dtype)
+    return dtype_size * total
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 090f95e6e892c..ac3270d1c9909 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -53,7 +53,6 @@ def __init__(
         # Get attention backend.
         self.attn_backend = get_attn_backend(self.head_size,
-                                             model_config.get_sliding_window(),
                                              model_config.dtype,
                                              cache_config.cache_dtype,
                                              self.block_size,
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index dd38b550eb011..5032896600b3b 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -420,7 +420,6 @@ def __init__(
         self.block_size = cache_config.block_size
         self.attn_backend = get_attn_backend(
             self.model_config.get_head_size(),
-            self.model_config.get_sliding_window(),
             self.model_config.dtype,
             self.kv_cache_dtype,
             self.block_size,
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index b84562851f0f8..ab93471b5af74 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -57,7 +57,6 @@ def __init__(self, cache_config: CacheConfig, model_config: ModelConfig,
         # Get attention backend.
self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a82956985af55..8b74f06e77be0 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -828,7 +828,7 @@ def build(self) -> ModelInputForGPU: cuda_graph_pad_size = self._get_cuda_graph_pad_size( num_seqs=len(seq_lens), - max_decode_seq_len=max_encoder_seq_len, + max_decode_seq_len=max_decode_seq_len, max_encoder_seq_len=max_encoder_seq_len) batch_size = len(input_tokens) @@ -1011,7 +1011,6 @@ def __init__( self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, @@ -1856,7 +1855,7 @@ def forward( self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True) self.input_buffers["positions"].copy_(positions, non_blocking=True) - if self.backend_name != "placeholder-attn": + if self.backend_name != "NO_ATTENTION": self.input_buffers["slot_mapping"].copy_( attn_metadata.slot_mapping, non_blocking=True) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 0cd0047bebf2d..be2f0d79154d6 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -29,8 +29,8 @@ logger = init_logger(__name__) -MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "rocm-flash-attn", "flashinfer"] -MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["flash-attn"] +MULTI_STEP_ATTENTION_BACKENDS = ["FLASH_ATTN", "ROCM_FLASH", "FLASHINFER"] +MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["FLASH_ATTN"] def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ -> List[str]: diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 760b18427e22b..a164fbe3393c4 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -75,7 +75,6 @@ def __init__( self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.kv_cache_dtype, self.block_size, diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 24425fece850f..bc245d19663d6 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -71,7 +71,6 @@ def __init__( # Get attention backend. 
self.attn_backend = get_attn_backend( self.head_size, - self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index f7e5f660c0249..87ced7818a676 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -114,7 +114,6 @@ def __init__( dtype=np.int32) self.attn_backend = get_attn_backend( self.model_config.get_head_size(), - self.model_config.get_sliding_window(), self.model_config.dtype, self.cache_config.cache_dtype, self.block_size, diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9c46bb4258609..fd30962e5d6bb 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -92,7 +92,7 @@ def __init__( ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner if model_runner_cls is not None: ModelRunnerClass = model_runner_cls - elif self._is_embedding_model(): + elif model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner elif self._is_encoder_decoder_model(): ModelRunnerClass = EncoderDecoderModelRunner @@ -147,9 +147,6 @@ def stop_profile(self): def _is_encoder_decoder_model(self): return self.model_config.is_encoder_decoder_model - def _is_embedding_model(self): - return self.model_config.is_embedding_model - def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -235,10 +232,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # gpu outside of `torch`. NCCL operations, for example, can use a few # GB during a forward pass torch.cuda.empty_cache() - # After emptying the torch cache, any other increase in gpu ram should - # be from non-torch allocations. 
-        non_torch_allocations = free_memory_pre_profile - \
-            torch.cuda.mem_get_info()[0]
+        torch_allocated_bytes = torch.cuda.memory_stats(
+        )["allocated_bytes.all.current"]
+        total_allocated_bytes = torch.cuda.mem_get_info(
+        )[1] - torch.cuda.mem_get_info()[0]
+        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
         if non_torch_allocations > 0:
             peak_memory += non_torch_allocations
@@ -262,10 +260,12 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         logger.info(
             "Memory profiling results: total_gpu_memory=%.2fGiB"
             " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB"
+            " memory_usage_post_profile=%.2fGiB"
             " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB"
             " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3),
             (total_gpu_memory - free_memory_pre_profile) / (1024**3),
             (peak_memory - non_torch_allocations) / (1024**3),
+            total_allocated_bytes / (1024**3),
             non_torch_allocations / (1024**3),
             available_kv_cache_memory / (1024**3),
             self.cache_config.gpu_memory_utilization)
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 5ff4626c060b3..75a6de3b24ba4 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -374,7 +374,6 @@ def __init__(
         self.attn_backend = get_attn_backend(
             self.model_config.get_head_size(),
-            self.model_config.get_sliding_window(),
             self.model_config.dtype,
             self.kv_cache_dtype,
             self.block_size,
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 9ad070d042a3d..917866f2d985b 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -17,7 +17,7 @@
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor import set_random_seed
-from vllm.utils import is_xpu
+from vllm.platforms import current_platform
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.worker import Worker
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase
@@ -53,7 +53,7 @@ def __init__(
         observability_config: Optional[ObservabilityConfig] = None,
     ) -> None:
         assert device_config.device_type == "xpu"
-        assert is_xpu()
+        assert current_platform.is_xpu()

         self.model_config = model_config
         self.parallel_config = parallel_config
@@ -91,7 +91,8 @@ def __init__(
         self.gpu_cache: Optional[List[List[torch.Tensor]]]

     def init_device(self) -> None:
-        if self.device_config.device.type == "xpu" and is_xpu():
+        if self.device_config.device.type == "xpu" and current_platform.is_xpu(
+        ):
             self.device = torch.device(f"xpu:{self.local_rank}")
             torch.xpu.set_device(self.device)
             torch.xpu.empty_cache()
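The reworked accounting above separates what the CUDA caching allocator owns from everything else resident on the device (NCCL buffers, CUDA context, other processes). A small probe that reproduces the three quantities the patch combines; it requires a CUDA device and simply reports whatever happens to be resident at the time:

```python
import torch

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info()
    torch_allocated = torch.cuda.memory_stats()["allocated_bytes.all.current"]
    total_allocated = total - free                  # everything on the GPU
    non_torch = total_allocated - torch_allocated   # context, NCCL, others
    print(f"non-torch allocations: {non_torch / 1024**3:.2f} GiB")
```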