diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index c331a9c49c0d0..2dbeee8562971 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -46,7 +46,7 @@ docker exec cpu-test bash -c " docker exec cpu-test bash -c " export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=48-92 - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ --backend vllm \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3e940549862ea..705e81d15ad65 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -120,6 +120,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile commands: + - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -431,7 +432,6 @@ steps: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" diff --git a/.github/mergify.yml b/.github/mergify.yml index 1ce5039a061b2..ca4bd7ee2b87f 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -46,7 +46,9 @@ pull_request_rules: comment: message: | This pull request has merge conflicts that must be resolved before it can be - merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + merged. Please rebase the PR, @{{author}}. + + https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork - name: remove 'needs-rebase' label when conflict is resolved conditions: diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index edf98ce2fcab0..1a6beca0b87c0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -15,12 +15,17 @@ on: pull_request: branches: - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + #paths: + # - "**/*.py" + # - pyproject.toml + # - requirements-lint.txt + # - .github/workflows/matchers/ruff.json + # - .github/workflows/ruff.yml jobs: ruff: diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 34735700a224e..284196bc2d279 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,7 +6,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: '3.9' + python: "3.12" sphinx: configuration: docs/source/conf.py diff --git a/CMakeLists.txt b/CMakeLists.txt index c372ba98befbf..25c0865a90a67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. 
These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12") +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") diff --git a/Dockerfile b/Dockerfile index 343364da2ebf5..4c0f5aebe859d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,9 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# Copy in the v1 package for testing (it isn't distributed yet) +COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 + # doc requires source code # we hide them inside `test_docs/` , so that this source code # will not be imported by other tests diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f1a21d6bd13fc..287b4958da4e5 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.4.0 +RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 0ecb46df6256c..63bc682770422 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ -r requirements-xpu.txt +RUN git clone https://github.com/intel/pti-gpu && \ + cd pti-gpu/sdk && \ + git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ + make -j && \ + cmake --install . --config Release --prefix "/usr/local" + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" + COPY . . ARG GIT_REPO_CHECK RUN --mount=type=bind,source=.git,target=.git \ diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 7237d246ddf55..776a0bb11ae64 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # list(APPEND CXX_COMPILE_FLAGS "-fopenmp" + "-mf16c" "-DVLLM_CPU_EXTENSION") execute_process(COMMAND cat /proc/cpuinfo diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index abb4e3bea14bb..e3953c7c45719 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -22,6 +22,16 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::FP16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::FP16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +}; + #ifdef __AVX512BF16__ template <> struct KernelVecType { diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index a325153b470cc..12d5757b495be 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -// FIXME: FP16 is not fully supported in Torch-CPU #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -50,37 +50,37 @@ template struct Vec { struct FP32Vec8; struct FP32Vec16; -#ifdef __AVX512FP16__ struct FP16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; - __m128h reg; + __m128i reg; - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + explicit FP16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + explicit FP16Vec8(const FP32Vec8 &); - explicit FP16Vec8(__m128h data) : reg(data) {} + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } + __m256i reg; - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } + explicit FP16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } + explicit FP16Vec16(const FP32Vec16 &); - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } }; -#endif struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif + explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} explicit FP32Vec8(const BF16Vec8 &v) : reg(_mm256_castsi256_ps( @@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec { : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const INT32Vec16 &v) @@ -534,24 +536,34 @@ template using vec_t = typename VecType::vec_type; template <> struct VecType { using vec_type = FP32Vec8; }; -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; -#endif +template <> struct VecType { using vec_type = FP16Vec8; }; template <> struct VecType { using vec_type = BF16Vec8; }; template void storeFP32(float v, T *ptr) { *ptr = v; } -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { acc = acc + a * b; } +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast(ptr) = + _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtps_ph(v.reg, 
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + +#ifdef __AVX512F__ +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} +#else +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +#endif + #ifdef __AVX512BF16__ template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp index 024ad4ae43da8..8b5011dc065f0 100644 --- a/csrc/cpu/dnnl_helper.hpp +++ b/csrc/cpu/dnnl_helper.hpp @@ -2,6 +2,7 @@ #define DNNL_HELPER_HPP #include +#include #include "oneapi/dnnl/dnnl.hpp" @@ -32,6 +33,11 @@ struct DNNLType { static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; }; +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + template constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index b493fd793818a..f42fa2361a2db 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -23,6 +23,13 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP16Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + #ifdef __AVX512F__ template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 301337aebcf4c..ece5d785e0c65 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -13,8 +13,6 @@ Requirements * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * ROCm 6.2 -Note: PyTorch 2.5+/ROCm6.2 dropped the support for python 3.8. - Installation options: #. :ref:`Build from source with docker ` diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index d12aeebbbc184..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,13 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features: +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - Tensor Parallel (``-tp = N``) - Quantization (``INT8 W8A8, AWQ``) .. note:: - FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. + More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. Table of contents: @@ -72,8 +72,6 @@ Build from source $ VLLM_TARGET_DEVICE=cpu python setup.py install .. note:: - - BF16 is the default data type in the current CPU backend (that means the backend will cast FP16 to BF16), and is compatible will all CPUs with AVX512 ISA support. 
- - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 61871cdf41125..f02626bda4c64 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. +Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. Another way to access the latest code is to use the docker images: diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index ec99fc013057b..025ba6ef7ebd8 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -11,7 +11,7 @@ Requirements ------------ * OS: Linux -* Python: 3.8 -- 3.11 +* Python: 3.9 -- 3.11 * Accelerator: NeuronCore_v2 (in trn1/inf2 instances) * Pytorch 2.0.1/2.1.1 * AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 00b762ccc2ccb..0c0491c860563 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -12,7 +12,7 @@ This guide will help you quickly get started with vLLM to: Prerequisites -------------- - OS: Linux -- Python: 3.8 - 3.12 +- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Installation diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 87f45cf695c8d..5a474043078db 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -446,6 +446,12 @@ Text Generation - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - ✅︎ + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. 
+ - + - * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` @@ -534,7 +540,7 @@ Text Generation - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - + - ✅︎ - ✅︎ * - :code:`UltravoxModel` - Ultravox diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index cab19e4ec5b6c..f629b3ca78318 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -359,7 +359,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - `✗ `__ + - ✅ - ✗ * - :abbr:`logP (Logprobs)` - ✅ diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4fd002caf1763..8d17ce3754515 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -377,6 +377,22 @@ def run_glm4v(question: str, modality: str): return llm, prompt, stop_token_ids +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM(model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -397,6 +413,7 @@ def run_glm4v(question: str, modality: str): "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, + "idefics3": run_idefics3, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index d99684078ff3d..7e883568995a4 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -290,6 +290,30 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. 
+ llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + model_example_map = { "phi3_v": load_phi3v, "h2ovl_chat": load_h2onvl, @@ -298,6 +322,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: "qwen2_vl": load_qwen2_vl, "qwen_vl_chat": load_qwenvl_chat, "mllama": load_mllama, + "idefics3": load_idefics3, } diff --git a/pyproject.toml b/pyproject.toml index 3562569647391..1aebc543a733a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,4 +97,5 @@ markers = [ "skip_global_cleanup", "core_model: run this model test in each PR instead of just daily", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "skip_v1: do not run this test with v1", ] diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 27ca8ca5dbc58..749b03a0603d8 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,5 +2,5 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.4.0+cpu; platform_machine != "ppc64le" +torch == 2.5.1+cpu; platform_machine != "ppc64le" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch diff --git a/requirements-xpu.txt b/requirements-xpu.txt index eb76a33dab5c2..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -8,9 +8,9 @@ packaging setuptools-scm>=8 wheel jinja2 -# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -torch == 2.3.1+cxx11.abi -intel-extension-for-pytorch == 2.3.110+xpu -oneccl_bind_pt == 2.3.100+xpu + +torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl triton-xpu == 3.0.0b1 diff --git a/setup.py b/setup.py index 4a20e49235ac8..d2438ae74c455 100644 --- a/setup.py +++ b/setup.py @@ -55,12 +55,6 @@ def is_ninja_available() -> bool: return which("ninja") is not None -def remove_prefix(text, prefix): - if text.startswith(prefix): - return text[len(prefix):] - return text - - class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: @@ -197,8 +191,10 @@ def build_extensions(self) -> None: os.makedirs(self.build_temp) targets = [] - target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."), - "vllm_flash_attn.") + + def target_name(s: str) -> str: + return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.") + # Build all the extensions for ext in self.extensions: self.configure(ext) diff --git a/tests/conftest.py b/tests/conftest.py index f9dfabc82639b..6cf791dc62ce5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ from enum import Enum from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, TypedDict, TypeVar, Union) 
+from unittest.mock import patch import numpy as np import pytest @@ -108,6 +109,23 @@ def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: """Singleton instance of :class:`_VideoAssets`.""" +@pytest.fixture(params=[True, False]) +def run_with_both_engines(request): + # Automatically runs tests twice, once with V1 and once without + use_v1 = request.param + # Tests decorated with `@skip_v1` are only run without v1 + skip_v1 = request.node.get_closest_marker("skip_v1") + + if use_v1: + if skip_v1: + pytest.skip("Skipping test on vllm V1") + with patch('vllm.envs.VLLM_USE_V1', True): + yield + else: + with patch('vllm.envs.VLLM_USE_V1', False): + yield + + @pytest.fixture(autouse=True) def init_test_http_connection(): # pytest_asyncio may use a different event loop per test diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index a51a9909f6f41..3c7facc12c59a 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,9 +1,15 @@ +import pytest import ray +import torch +import torch.distributed as dist import vllm.envs as envs +from vllm.distributed.utils import stateless_init_process_group from vllm.utils import (cuda_device_count_stateless, update_environment_variables) +from ..utils import multi_gpu_test + @ray.remote class _CUDADeviceCountStatelessTestActor: @@ -24,10 +30,75 @@ def test_cuda_device_count_stateless(): CUDA_VISIBLE_DEVICES is changed.""" actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore num_gpus=2).remote() - assert sorted(ray.get( - actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] + assert len( + sorted(ray.get( + actor.get_cuda_visible_devices.remote()).split(","))) == 2 assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 ray.get(actor.set_cuda_visible_devices.remote("")) assert ray.get(actor.get_count.remote()) == 0 + + +def cpu_worker(rank, WORLD_SIZE): + pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29500", + rank=rank, + world_size=WORLD_SIZE, + backend="gloo") + if rank <= 2: + pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29501", + rank=rank, + world_size=3, + backend="gloo") + data = torch.tensor([rank]) + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + if rank <= 2: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) + item = data[0].item() + print(f"rank: {rank}, item: {item}") + if rank == 3: + assert item == 6 + else: + assert item == 18 + + +def gpu_worker(rank, WORLD_SIZE): + pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29502", + rank=rank, + world_size=WORLD_SIZE, + backend="nccl") + if rank <= 2: + pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29503", + rank=rank, + world_size=3, + backend="nccl") + torch.cuda.set_device(rank) + data = torch.tensor([rank]).cuda() + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + if rank <= 2: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) + item = data[0].item() + print(f"rank: {rank}, item: {item}") + if rank == 3: + assert item == 6 + else: + assert item == 18 + + +@multi_gpu_test(num_gpus=4) +@pytest.mark.parametrize("worker", [cpu_worker, gpu_worker]) +def test_stateless_init_process_group(worker): + WORLD_SIZE = 4 + from multiprocessing import get_context + ctx = get_context("fork") + processes = [] + for i in range(WORLD_SIZE): + rank = i + processes.append(ctx.Process(target=worker, args=(rank, WORLD_SIZE))) + for p in 
processes: + p.start() + for p in processes: + p.join() + for p in processes: + assert not p.exitcode + print("All processes finished.") diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 675a980ab3f3f..ee7010a238114 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -3,12 +3,21 @@ from vllm import LLM +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def test_empty_prompt(): llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) +@pytest.mark.skip_v1 def test_out_of_vocab_token(): llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='out of vocabulary'): diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 3fe9ca0b0450f..169ce040d370c 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -44,6 +44,8 @@ def test_env(name: str, device: str, monkeypatch): def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # which_attn_to_use override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index a1dd5eeeaa398..3d3724c50421d 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -16,7 +16,7 @@ from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, get_attn_backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) from vllm.forward_context import set_forward_context from vllm.platforms import current_platform @@ -774,7 +774,7 @@ def set_reset_environment(attn_backend): default_dtype = torch.get_default_dtype() if attn_backend.name == 'FLASH_ATTN': torch.set_default_dtype(torch.bfloat16) - get_attn_backend.cache_clear() + _cached_get_attn_backend.cache_clear() yield # Reset the torch datatype to what it was before the test # so as not to impact the remaining tests. 
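
The `run_with_both_engines` fixture added to `tests/conftest.py` above patches `vllm.envs.VLLM_USE_V1` so a test runs once per engine, and the new `skip_v1` marker (registered in `pyproject.toml`) opts individual tests out of the V1 pass. As the fixture's own comment notes, the autouse wrapper can be promoted into a package-level `conftest.py`. A minimal sketch of that opt-in pattern, assuming the package and test names here are hypothetical:

```python
# conftest.py of a hypothetical test package: promotes the wrapper so every
# test in the package runs under both the V0 and V1 engines.
import pytest


@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
    # `run_with_both_engines` is parametrized over [True, False] and patches
    # vllm.envs.VLLM_USE_V1 accordingly before each test in this package runs.
    pass


# test_example.py (hypothetical module in the same package)
@pytest.mark.skip_v1  # executed only for the non-V1 parametrization
def test_v0_only_behavior():
    ...


def test_runs_on_both_engines():
    ...
```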
diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 05117666f8c3f..d705909c24bf8 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -32,8 +32,7 @@ "openbmb/MiniCPM3-4B", ] -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" +target_dtype = "half" @pytest.mark.parametrize("model", MODELS) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index cfd2d61f2b633..3dbfaafb781af 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -327,6 +327,22 @@ vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), + "idefics3": VLMTestInfo( + models=["HuggingFaceM4/Idefics3-8B-Llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + marks=[ + pytest.mark.skipif( + transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0" + ), + large_gpu_mark(min_gb=48), + ], + ), ### Tensor parallel / multi-gpu broadcast tests "broadcast-chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/tests/test_config.py b/tests/test_config.py index 68950106e8898..66bdb883657c5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -237,3 +237,41 @@ def test_rope_customization(): assert getattr(longchat_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING assert longchat_model_config.max_model_len == 4096 + + +@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ + ("facebook/opt-125m", False), + ("facebook/bart-base", True), + ("meta-llama/Llama-3.2-1B", False), + ("meta-llama/Llama-3.2-11B-Vision", True), +]) +def test_is_encoder_decoder(model_id, is_encoder_decoder): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + + assert config.is_encoder_decoder == is_encoder_decoder + + +@pytest.mark.parametrize(("model_id", "uses_mrope"), [ + ("facebook/opt-125m", False), + ("Qwen/Qwen2-VL-2B-Instruct", True), +]) +def test_uses_mrope(model_id, uses_mrope): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + + assert config.uses_mrope == uses_mrope diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 31fcc4c3256a8..28b804f765a3a 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -74,20 +74,12 @@ def paged_attention_v1( assert kv_cache_dtype == "auto" num_heads = out.size(1) num_queries_per_tokens = num_heads // num_kv_heads - head_mapping = torch.arange( - 0, - num_kv_heads, - device=query.device, - dtype=torch.int32, - ).view(num_kv_heads, - 1).repeat_interleave(num_queries_per_tokens).flatten() - # todo: ipex will refactor namespace - torch.xpu.paged_attention_v1( # type: ignore + ipex.llm.modules.PagedAttention.single_query_kv_attention( out, query.contiguous(), key_cache.view_as(value_cache), value_cache, - head_mapping, + num_queries_per_tokens, scale, block_tables, context_lens, @@ -124,26 
+116,15 @@ def paged_attention_v2( assert kv_cache_dtype == "auto" num_heads = out.size(1) num_queries_per_tokens = num_heads // num_kv_heads - head_mapping = torch.arange( - 0, - num_kv_heads, - dtype=torch.int32, - device=query.device, - ).view(num_kv_heads, - 1).repeat_interleave(num_queries_per_tokens).flatten() - # todo: ipex will refactor namespace - torch.xpu.paged_attention_v2( # type: ignore + ipex.llm.modules.PagedAttention.single_query_kv_attention( out, - exp_sum, - max_logits, - tmp_out, query.contiguous(), key_cache.view_as(value_cache), value_cache, - head_mapping, + num_queries_per_tokens, + scale, block_tables, context_lens, - scale, block_size, max_context_len, alibi_slopes, @@ -202,6 +183,7 @@ def varlen_attention( is_causal: bool, return_softmax: bool, gen_: torch.Generator, + logits_soft_cap: float, ) -> None: ipex.llm.functional.varlen_attention(query.contiguous(), key.contiguous(), @@ -210,7 +192,8 @@ def varlen_attention( max_seqlen_q, max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal, - return_softmax, gen_) + return_softmax, gen_, + logits_soft_cap) @staticmethod def reshape_and_cache( diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 1eb5fe10d76db..87bdb1e0e6565 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -119,8 +119,6 @@ def __init__( if blocksparse_params is not None: raise ValueError( "IPEX backend does not support block-sparse attention.") - if logits_soft_cap is not None: - raise ValueError("IPEX backend does not support logits_soft_cap.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -135,6 +133,9 @@ def __init__( self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.need_mask = (self.alibi_slopes is not None or self.sliding_window is not None) + if logits_soft_cap is None: + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap supported_head_sizes = PagedAttention.get_supported_head_sizes() if head_size not in supported_head_sizes: @@ -239,20 +240,23 @@ def forward( (num_tokens, self.num_heads, self.head_size), dtype=query.dtype, device=query.device) - ipex_ops.varlen_attention(query, - key, - value, - output, - attn_metadata.seqlen_q, - attn_metadata.seqlen_q, - attn_metadata.max_seqlen, - attn_metadata.max_seqlen, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None) + ipex_ops.varlen_attention( + query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None, + logits_soft_cap=self.logits_soft_cap, + ) else: # prefix-enabled attention raise RuntimeError( diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 991602da2853a..664707e9dc65d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -89,7 +89,6 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: return forced_attn_backend -@lru_cache(maxsize=None) def get_attn_backend( head_size: int, dtype: torch.dtype, @@ -99,6 +98,31 @@ def get_attn_backend( is_blocksparse: bool = False, ) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" + # Accessing envs.* behind an @lru_cache decorator can cause the wrong + # value to be returned from the cache if the value changes between calls. 
+ # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the + # private function. + return _cached_get_attn_backend( + head_size=head_size, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + block_size=block_size, + is_attention_free=is_attention_free, + is_blocksparse=is_blocksparse, + use_v1=envs.VLLM_USE_V1, + ) + + +@lru_cache(maxsize=None) +def _cached_get_attn_backend( + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: Optional[str], + block_size: int, + is_attention_free: bool, + is_blocksparse: bool = False, + use_v1: bool = False, +) -> Type[AttentionBackend]: if is_blocksparse: logger.info("Using BlocksparseFlashAttention backend.") from vllm.attention.backends.blocksparse_attn import ( @@ -106,7 +130,7 @@ def get_attn_backend( return BlocksparseFlashAttentionBackend backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, - is_attention_free) + is_attention_free, use_v1) if backend == _Backend.FLASH_ATTN: logger.info("Using Flash Attention backend.") from vllm.attention.backends.flash_attn import ( # noqa: F401 @@ -162,13 +186,12 @@ def get_attn_backend( raise ValueError("Invalid attention backend.") -def which_attn_to_use( - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: Optional[str], - block_size: int, - is_attention_free: bool, -) -> _Backend: +def which_attn_to_use(head_size: int, + dtype: torch.dtype, + kv_cache_dtype: Optional[str], + block_size: int, + is_attention_free: bool, + use_v1: bool = False) -> _Backend: """Returns which flash attention backend to use.""" # Default case. selected_backend = _Backend.FLASH_ATTN @@ -228,7 +251,7 @@ def which_attn_to_use( if current_platform.is_hpu(): return _Backend.HPU_ATTN - if envs.VLLM_USE_V1: + if use_v1: return _Backend.FLASH_ATTN_VLLM_V1 # FlashAttn in NVIDIA GPUs. 
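
The attention-selector change above works around the caching pitfall called out in its comment: if `envs.VLLM_USE_V1` is read inside an `@lru_cache`-decorated function, a later change to the environment (for example, the `run_with_both_engines` fixture patching it) is not reflected, because the cached result is keyed only on the other arguments. Reading the flag in the public wrapper and passing it as an explicit argument makes it part of the cache key. A minimal, self-contained sketch of the same pattern, with made-up function names:

```python
import os
from functools import lru_cache


@lru_cache(maxsize=None)
def _cached_pick_backend(head_size: int, use_v1: bool) -> str:
    # The env-derived flag is part of the cache key, so flipping it between
    # calls yields a result cached per value rather than a stale one.
    return "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"


def pick_backend(head_size: int) -> str:
    # Read the mutable environment *outside* the cached helper and forward it
    # explicitly; this mirrors how get_attn_backend now passes VLLM_USE_V1.
    use_v1 = os.environ.get("VLLM_USE_V1", "0") == "1"
    return _cached_pick_backend(head_size, use_v1)
```

Correspondingly, tests that need to reset the selector now clear the cache on the private helper (`_cached_get_attn_backend.cache_clear()`), since the public `get_attn_backend` is no longer cached.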
diff --git a/vllm/config.py b/vllm/config.py index a80fb726a5ec6..e844a46bf06e6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -16,7 +16,7 @@ from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, - get_sentence_transformer_tokenizer_config) + get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, print_warning_once) @@ -682,12 +682,13 @@ def get_multimodal_config(self) -> "MultiModalConfig": return self.multimodal_config @property - def is_encoder_decoder_model(self) -> bool: + def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" - return getattr( - self.hf_config, "is_encoder_decoder", - False) or (hasattr(self.hf_config, "text_config") and getattr( - self.hf_config.text_config, "is_encoder_decoder", False)) + return is_encoder_decoder(self.hf_config) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) @property def is_multimodal_model(self) -> bool: diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index efa3525910a5e..0d15403264eee 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -89,12 +89,11 @@ def _get_unique_name(name: str) -> str: return newname -_groups: Dict[str, Callable[[], "GroupCoordinator"]] = {} +_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} def _register_group(group: "GroupCoordinator") -> None: - # looks like Python 3.8 does not understand `ReferenceType` - _groups[group.unique_name] = weakref.ref(group) # type: ignore + _groups[group.unique_name] = weakref.ref(group) if supports_custom_op(): diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 8c94ef8cb10ce..d24ce898707fc 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -5,6 +5,11 @@ from typing import Sequence, Tuple import torch +from torch.distributed import ProcessGroup +from torch.distributed.distributed_c10d import (Backend, PrefixStore, + _get_default_timeout, + is_nccl_available) +from torch.distributed.rendezvous import rendezvous import vllm.envs as envs from vllm.logger import init_logger @@ -84,3 +89,71 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int, end_layer = num_hidden_layers return (start_layer, end_layer) + + +def stateless_init_process_group(init_method: str, rank: int, world_size: int, + backend: str) -> ProcessGroup: + """A replacement for `torch.distributed.init_process_group` that does not + pollute the global state. + + If we have process A and process B called `torch.distributed.init_process_group` + to form a group, and then we want to form another group with process A, B, C, + D, it is not possible in PyTorch, because process A and process B have already + formed a group, and process C and process D cannot join that group. This + function is a workaround for this issue. + + `torch.distributed.init_process_group` is a global call, while this function + is a stateless call. It will return a `ProcessGroup` object that can be used + for collective communication. With this function, process A and process B + can call `stateless_init_process_group` to form a group, and then process A, B, + C, and D can call `stateless_init_process_group` to form another group. 
+ """ # noqa + + backend = Backend(backend) # it is basically string + timeout = _get_default_timeout(backend) + + store, rank, world_size = next( + rendezvous(init_method, rank, world_size, timeout=timeout)) + store.set_timeout(timeout) + + group_rank = rank + group_size = world_size + + # Use a PrefixStore to avoid accidental overrides of keys used by + # different systems (e.g. RPC) in case the store is multi-tenant. + prefix_store = PrefixStore(init_method, store) + + pg_options = ProcessGroup.Options(backend=backend, timeout=timeout) + + pg: ProcessGroup = ProcessGroup( + prefix_store, + group_rank, + group_size, + pg_options, + ) + + if backend == "gloo": + from torch.distributed.distributed_c10d import ProcessGroupGloo + backend_class = ProcessGroupGloo(prefix_store, + group_rank, + group_size, + timeout=timeout) + backend_type = ProcessGroup.BackendType.GLOO + device = torch.device("cpu") + elif backend == "nccl": + assert is_nccl_available() + from torch.distributed.distributed_c10d import ProcessGroupNCCL + + backend_options = ProcessGroupNCCL.Options() + backend_options._timeout = timeout + + backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, + backend_options) + backend_type = ProcessGroup.BackendType.NCCL + device = torch.device("cuda") + + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + + return pg diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index e1dcb82829d76..889845ee67312 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -6,7 +6,9 @@ import cloudpickle import zmq +import vllm.envs from vllm import AsyncEngineArgs, SamplingParams +from vllm.engine.llm_engine import LLMEngine # yapf conflicts with isort for this block # yapf: disable from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, @@ -17,17 +19,11 @@ RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable -from vllm.envs import VLLM_USE_V1 from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext -if VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine -else: - from vllm.engine.llm_engine import LLMEngine - logger = init_logger(__name__) POLLING_TIMEOUT_MS = 10000 @@ -117,11 +113,17 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, load_general_plugins() engine_config = engine_args.create_engine_config() + if vllm.envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + engine_class = V1LLMEngine + else: + engine_class = LLMEngine - executor_class = LLMEngine._get_executor_cls(engine_config) + executor_class = engine_class._get_executor_cls(engine_config) use_async_sockets = (engine_config.model_config.use_async_output_proc - and not VLLM_USE_V1) + and not vllm.envs.VLLM_USE_V1) return cls(ipc_path=ipc_path, use_async_sockets=use_async_sockets, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 0ada0aaacda24..ed4e4399d5514 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -187,6 +187,8 @@ def _placeholder_str(self, modality: ModalityStr, return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" + if model_type == "idefics3": + return "" raise TypeError(f"Unknown {modality} model type: {model_type}") elif 
modality == "audio": diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b18974c5a0c57..d8b60a5e01471 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,7 +1,7 @@ import itertools import warnings from contextlib import contextmanager -from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, +from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, Union, cast, overload) from tqdm import tqdm @@ -10,6 +10,7 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) from vllm.engine.arg_utils import EngineArgs, TaskOption +from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, apply_mistral_chat_template, @@ -31,11 +32,6 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, deprecate_args, deprecate_kwargs, is_list_of -if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine # type: ignore -else: - from vllm.engine.llm_engine import LLMEngine # type: ignore - logger = init_logger(__name__) @@ -206,10 +202,21 @@ def __init__( pooling_returned_token_ids=pooling_returned_token_ids, **kwargs, ) - self.llm_engine = LLMEngine.from_engine_args( + # Logic to switch between engines is done at runtime instead of import + # to avoid import order issues + self.engine_class = self.get_engine_class() + self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) self.request_counter = Counter() + @staticmethod + def get_engine_class() -> Type[LLMEngine]: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + return V1LLMEngine # type: ignore + return LLMEngine + def get_tokenizer(self) -> AnyTokenizer: return self.llm_engine.get_tokenizer_group(TokenizerGroup).tokenizer @@ -394,7 +401,7 @@ def generate( priority=priority) outputs = self._run_engine(use_tqdm=use_tqdm) - return LLMEngine.validate_outputs(outputs, RequestOutput) + return self.engine_class.validate_outputs(outputs, RequestOutput) def beam_search( self, @@ -769,7 +776,8 @@ def encode( ) outputs = self._run_engine(use_tqdm=use_tqdm) - return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput) + return self.engine_class.validate_outputs(outputs, + EmbeddingRequestOutput) def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 570232be38379..db31b1153d97e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -189,13 +189,7 @@ async def create_completion( try: async for i, res in result_generator: final_res_batch[i] = res - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - try: for i, final_res in enumerate(final_res_batch): assert final_res is not None @@ -217,6 +211,8 @@ async def create_completion( tokenizer, request_metadata, ) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py 
b/vllm/entrypoints/openai/serving_embedding.py index 917856cd2b2dd..bbe7db8f13231 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -205,12 +205,8 @@ async def create_embedding( try: async for i, res in result_generator: final_res_batch[i] = res - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - try: - for final_res in final_res_batch: - assert final_res is not None + assert all(final_res is not None for final_res in final_res_batch) final_res_batch_checked = cast(List[EmbeddingRequestOutput], final_res_batch) @@ -218,6 +214,8 @@ async def create_embedding( response = request_output_to_embedding_response( final_res_batch_checked, request_id, created_time, model_name, encoding_format) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index ab3ebb4e43d18..4ceb5a837dd7f 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -2,8 +2,6 @@ from functools import partial from typing import Any, Awaitable, List, Optional, Set, Tuple, Union -import torch - import vllm.envs as envs from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) @@ -316,9 +314,6 @@ async def check_health_async(self) -> None: def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - if config.dtype == torch.float16: - logger.warning("float16 is not supported on CPU, casting to bfloat16.") - config.dtype = torch.bfloat16 # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid if not config.enforce_eager: diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index a5c787a56b5a9..509b0448b9e51 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -580,4 +580,4 @@ async def preprocess_async( ) def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model + return self.model_config.is_encoder_decoder diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index e347ca80ff765..34d65ed51ef3f 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -9,7 +9,6 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.utils import set_weight_attrs from vllm.utils import LazyDict @@ -277,28 +276,14 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): }) -def get_act_fn( - act_fn_name: str, - quant_config: Optional[QuantizationConfig] = None, - intermediate_size: Optional[int] = None, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, -) -> nn.Module: +def get_act_fn(act_fn_name: str) -> nn.Module: """Get an activation function by name.""" act_fn_name = act_fn_name.lower() if act_fn_name not in _ACTIVATION_REGISTRY: raise ValueError( f"Activation function {act_fn_name!r} is not supported.") - act_fn = _ACTIVATION_REGISTRY[act_fn_name] - if (quant_config is not None - and act_fn_name in quant_config.get_scaled_act_names()): - if intermediate_size is None: - raise ValueError("intermediate_size must be 
specified for scaled " - "activation functions.") - return ScaledActivation(act_fn, intermediate_size, input_is_parallel, - params_dtype) - return act_fn + return _ACTIVATION_REGISTRY[act_fn_name] _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ @@ -307,25 +292,11 @@ def get_act_fn( }) -def get_act_and_mul_fn( - act_fn_name: str, - quant_config: Optional[QuantizationConfig] = None, - intermediate_size: Optional[int] = None, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, -) -> nn.Module: +def get_act_and_mul_fn(act_fn_name: str) -> nn.Module: """Get an activation-and-mul (i.e. SiluAndMul) function by name.""" act_fn_name = act_fn_name.lower() if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY: raise ValueError( f"Activation function {act_fn_name!r} is not supported.") - act_fn = _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] - if (quant_config is not None - and act_fn_name in quant_config.get_scaled_act_names()): - if intermediate_size is None: - raise ValueError("intermediate_size must be specified for scaled " - "activation functions.") - return ScaledActivation(act_fn, intermediate_size, input_is_parallel, - params_dtype) - return act_fn + return _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index c88ca340ebcc5..72c89fe2b0e48 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -213,9 +213,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AQLMLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class AQLMLinearMethod(LinearMethodBase): """Linear method for AQLM. diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 38dd1f2e10fcd..d83528e9ec79c 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -77,9 +77,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AWQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index ea69bee45f8d9..4d1a837d11585 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -127,9 +127,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AWQMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 75fa8249cd3c2..6dfac8aad5358 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -133,11 +133,3 @@ def get_quant_method(self, layer: torch.nn.Module, method. """ raise NotImplementedError - - @abstractmethod - def get_scaled_act_names(self) -> List[str]: - """Returns the activation function names that should be post-scaled. - - For now, this is only used by AWQ. 
- """ - raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 78965d7b9495c..39965ac9115c2 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -114,9 +114,6 @@ def get_quant_method(self, layer: torch.nn.Module, return BitsAndBytesLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): # Split the prefix into its dot-separated components diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ecc345f116c37..4f5758a42dbbc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -45,9 +45,6 @@ def __init__(self, def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) - def get_scaled_act_names(self) -> List[str]: - return [] - def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.float16, torch.bfloat16] diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 29484801dc380..36598b3e2990f 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -50,9 +50,6 @@ def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig": def get_linear_method(self) -> "DeepSpeedFPLinearMethod": return DeepSpeedFPLinearMethod(self) - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 116a4ea0aed89..97297970d9317 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -45,9 +45,6 @@ def get_quant_method(self, layer: torch.nn.Module, return ExpertsInt8MoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ExpertsInt8MoEMethod(FusedMoEMethodBase): diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 825d01d1b3551..7b71e13b50ccc 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -64,9 +64,6 @@ def get_quant_method(self, layer: torch.nn.Module, return FBGEMMFp8LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class FBGEMMFp8LinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d34579b7099bb..978e727bc7cb3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -92,9 +92,6 @@ def get_quant_method(self, layer: torch.nn.Module, return Fp8KVCacheMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. 
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index d73b9f6d92832..24138662eb25c 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -48,9 +48,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GGUFEmbeddingMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 1cfadb4f42ca8..0aa605e62454e 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -80,9 +80,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ExllamaState(Enum): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index b97dd108d6785..1f72e3afbbce5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -125,9 +125,6 @@ def get_quant_method( return GPTQMarlinMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index 0971aedba4c3c..07552c0f13348 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -127,9 +127,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQMarlin24LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class GPTQMarlin24LinearMethod(LinearMethodBase): """Linear method for Marlin24. diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index e54052632e468..330c2ad195d78 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -54,7 +54,7 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16] + return [torch.bfloat16, torch.float16] @classmethod def get_min_capability(cls) -> int: @@ -93,12 +93,6 @@ def get_quant_method(self, layer: torch.nn.Module, return self.quant_method(self) return None - def get_scaled_act_names(self) -> List[str]: - if self.method == "awq": - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - else: - return [] - class IPEXAWQLinearMethod(AWQLinearMethod): """AWQ linear method using IPEX for the CPU backend. diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 8f1b5370b4538..20212e672eab0 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -110,9 +110,6 @@ def get_quant_method(self, layer: torch.nn.Module, return MarlinLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class MarlinLinearMethod(LinearMethodBase): """Linear method for Marlin. 
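With get_scaled_act_names removed from the abstract QuantizationConfig (base_config.py above), quantization backends no longer need the empty overrides that this run of hunks deletes. A hypothetical out-of-tree config after this change could look like the sketch below; the class name and return values are placeholders, and only methods that appear elsewhere in this diff are stubbed:

    from typing import Any, Dict, List, Optional

    import torch

    from vllm.model_executor.layers.quantization.base_config import (
        QuantizationConfig)


    class MyQuantConfig(QuantizationConfig):
        """Placeholder config: no get_scaled_act_names() override required."""

        @classmethod
        def get_name(cls) -> str:
            return "my-quant"

        @classmethod
        def get_supported_act_dtypes(cls) -> List[torch.dtype]:
            return [torch.half, torch.bfloat16]

        @classmethod
        def get_min_capability(cls) -> int:
            return 70

        @classmethod
        def get_config_filenames(cls) -> List[str]:
            return ["quant_config.json"]

        @classmethod
        def from_config(cls, config: Dict[str, Any]) -> "MyQuantConfig":
            return cls()

        def get_quant_method(self, layer: torch.nn.Module,
                             prefix: str) -> Optional[Any]:
            return None  # fall back to unquantized layers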
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 9694f2b8208e2..a1b3eeb43cbee 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -68,9 +68,6 @@ def get_quant_method(self, layer: torch.nn.Module, return ModelOptFp8KVCacheMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): """ diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 2624981f6a614..2d5cdfa165775 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -57,9 +57,6 @@ def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: "Neuron Quantization is only supported through" " transformers_neuronx.") - def get_scaled_act_names(self) -> List[str]: - return [] - def get_quantization_config(self): from transformers_neuronx.config import QuantizationConfig return QuantizationConfig(quant_dtype=self.quant_dtype, diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 5bc3737520865..2ccd082029610 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -112,9 +112,6 @@ def get_quant_method(self, layer: torch.nn.Module, return QQQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class QQQLinearMethod(LinearMethodBase): """Linear method for QQQ. diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index be8235b468f68..605c3a38644ac 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -50,9 +50,6 @@ def get_quant_method(self, layer: Module, return TPUInt8LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class TPUInt8LinearMethod(LinearMethodBase): """Int8 Linear method for TPU Quant. """ diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 445117ac99a34..ec73533126ab6 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -7,8 +7,7 @@ # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale -TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \ - if current_platform.is_rocm() else None +TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) def cutlass_fp8_supported() -> bool: @@ -166,8 +165,7 @@ def apply_fp8_linear( # Making sure the dummy tensor is on the same device as the weight global TORCH_DEVICE_IDENTITY - if (TORCH_DEVICE_IDENTITY is not None - and TORCH_DEVICE_IDENTITY.device != weight.device): + if TORCH_DEVICE_IDENTITY.device != weight.device: TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device) # GEMM diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index f86c6ec362ebe..c10efefea5471 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -30,6 +30,15 @@ else: flashinfer_top_k_top_p_sampling = None + +def get_sampler() -> torch.nn.Module: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.sample.sampler import Sampler as V1Sampler + return V1Sampler() + return Sampler() + + # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 5b712ba83c25a..4fec314a70aa4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -436,7 +436,7 @@ def __init__(self, self.unpadded_vocab_size = config.vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 1fbf4135add7a..cce182da4820f 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -352,7 +352,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 0543ca978b7dd..fd600adceb21c 100644 --- a/vllm/model_executor/models/bart.py +++ 
b/vllm/model_executor/models/bart.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -393,8 +393,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.activation_fn = get_act_fn(config.activation_function, - quant_config) + self.activation_fn = get_act_fn(config.activation_function) ffn_hidden_size = self.embed_dim ffn_intermediate_size = config.encoder_ffn_dim @@ -405,7 +404,7 @@ def __init__( bias=ffn_has_bias, quant_config=quant_config, ) - self.act = get_act_fn("gelu", quant_config, ffn_intermediate_size) + self.act = get_act_fn("gelu") self.fc2 = RowParallelLinear( ffn_intermediate_size, ffn_hidden_size, @@ -473,8 +472,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config) - self.activation_fn = get_act_fn(config.activation_function, - quant_config) + self.activation_fn = get_act_fn(config.activation_function) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) ''' @@ -840,7 +838,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index db1f92649bd49..efd24e7cf40f6 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -13,7 +13,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import consecutive_placeholder_ranges @@ -525,7 +525,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 83ff39a30fbe3..c2440ee75d588 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -146,7 +146,7 @@ def __init__( 4 * hidden_size, quant_config=quant_config, ) - self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.gelu_impl = 
get_act_fn("gelu") self.dense_4h_to_h = RowParallelLinear( 4 * hidden_size, hidden_size, @@ -298,7 +298,7 @@ def __init__( self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 9f6c6786c0fa4..58841f177ec22 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -946,7 +946,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 881b86564e811..032fa82ab93cd 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -616,7 +616,7 @@ def __init__( self.transformer.embedding.weight) self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 835682ca3b379..718f26bed443f 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -355,7 +355,7 @@ def __init__( cache_config, quant_config, lora_config=lora_config) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/dbrx.py 
b/vllm/model_executor/models/dbrx.py index 3e60eee2d8fe2..ae43383155ffc 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -373,7 +373,7 @@ def __init__( ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index d278ea5b6a991..53a1c7cfbfef4 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -41,7 +41,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -399,7 +399,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 834be78bce87b..95bbf4fb59c6a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -496,7 +496,7 @@ def __init__( config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 23efe0359cb4a..a8d591b921cd6 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from 
vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -478,7 +478,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index ad07fc3b3776e..daf49521637b0 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -38,7 +38,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -212,7 +212,7 @@ def __init__( bias=config.bias, skip_bias_add=True, quant_config=quant_config) - self.act = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.act = get_act_fn("gelu") self.reduce_row_parallel_results = not (config.new_decoder_architecture or config.parallel_attn) self.dense_4h_to_h = RowParallelLinear( @@ -426,7 +426,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 6840ac8b9e303..184bee5f65671 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, BartParallelLMHead, @@ -112,7 +112,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index fc3f5cb20afb0..1cc3ea679c553 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from 
vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -393,7 +393,7 @@ def __init__( quant_config, prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c365880109ef8..16e0d6b30713a 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -414,7 +414,7 @@ def __init__( self.model = Gemma2Model(config, cache_config, quant_config) self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index a06200c4b7e08..7f81bbff94932 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -123,8 +123,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) @@ -260,7 +259,7 @@ def __init__( self.lm_head = ParallelLMHead(self.config.vocab_size, self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 7612ea641d95c..4be8e4199f04d 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( 
ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -135,8 +135,7 @@ def __init__( bias=True, quant_config=quant_config, ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) @@ -286,7 +285,7 @@ def __init__( self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index b28a6081b868f..834b4aff2e4ba 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -32,7 +32,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -130,8 +130,7 @@ def __init__( hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc_in(hidden_states) @@ -248,7 +247,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 931052c7cccf0..1903156d7efe1 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -32,7 +32,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -128,8 +128,7 @@ def __init__( config.hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states): hidden_states, _ = self.dense_h_to_4h(hidden_states) @@ -261,7 +260,7 @@ def __init__( if self.config.tie_word_embeddings: self.embed_out.weight = self.gpt_neox.embed_in.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.gpt_neox.make_empty_intermediate_tensors) diff --git 
a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index bee48f377e0f5..8a75b9cb1d55d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -411,7 +411,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 691a6e77c46c4..b4da986efabe3 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -371,7 +371,7 @@ def __init__( scale=1 / self.config.logits_scaling) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 53869b8fa6bd8..b21bc2a3f9ce1 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -15,7 +15,7 @@ # limitations under the License. 
"""PyTorch Idefics2 model.""" -from typing import Optional +from typing import Iterable, Optional, Tuple import torch from torch import nn @@ -29,6 +29,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader class Idefics2VisionEmbeddings(nn.Module): @@ -329,3 +330,25 @@ def forward( encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py new file mode 100644 index 0000000000000..e4c98f22fb16f --- /dev/null +++ b/vllm/model_executor/models/idefics3.py @@ -0,0 +1,632 @@ +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Idefics3 model compatible with HuggingFace weights.""" + +import math +from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, + TypedDict, Union) + +import torch +import torch.utils.checkpoint +from PIL import Image +from torch import nn +# Temporary solution for transformers below 4.46.0. 
+from transformers import PretrainedConfig as Idefics3Config + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal.image import cached_get_image_processor +from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of + +# yapf: disable +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable +from .interfaces import SupportsMultiModal +from .llama import LlamaModel +from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings + +logger = init_logger(__name__) + + +class Idefics3ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + """ + rows: List[int] + cols: List[int] + pixel_attention_mask: Optional[torch.BoolTensor] + + +class Idefics3ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] + + +def input_mapper_for_idefics3( + ctx: InputContext, + data: object, +): + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + + if isinstance(data, Image.Image): + images = [[data]] + elif is_list_of(data, Image.Image): + images = [data] + else: + raise TypeError(f"Invalid image type: {type(data)}") + + try: + batch_data = image_processor(images, + return_tensors="pt", + return_row_col_info=True).data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + + return MultiModalInputs(batch_data) + + +def _resize_output_size(height: int, + width: int, + max_len: Optional[int] = None, + min_len: Optional[int] = 1, + max_size: Optional[int] = None) -> Tuple[int, int]: + # Set default value for max_len if not provided + max_len = max(height, width) if max_len is None else max_len + aspect_ratio = width / height + + # Handle the maximum size constraint + if max_size is not None: + max_len = min(max_len, max_size) + + # Adjust dimensions according to the aspect ratio + if width >= height: + width = max_len + height = int(width / aspect_ratio) + else: + height = max_len + width = int(height * aspect_ratio) + + # Ensure both width and height are even (if needed) + height += 1 if height % 2 != 0 else 0 + width += 1 if width % 2 != 0 else 0 + + # Ensure dimensions are not smaller than the minimum length + height = max(height, min_len) + width = max(width, min_len) + + return height, width + + +def _get_resize_output_image_size( + image_size: Tuple[int, int], + resolution_max_side: int, + max_image_size: int = 1820, +) -> Tuple[int, int]: + if resolution_max_side > max_image_size: + raise ValueError( + "`resolution_max_side` cannot be larger than `max_image_size`") + + height, width = image_size + + # Find the output size, when rescaling the longest edge to max_len and + # preserving the aspect ratio + height, width = _resize_output_size(height, + width, + max_len=resolution_max_side) + + return height, width + + +def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int, + fake_token_around_image: str, image_token: str, + global_img_token: str) -> str: + """ + Prompt with expanded image tokens for when the image is split + into patches. 
+ """ + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += (fake_token_around_image + + f"<row_{n_h + 1}_col_{n_w + 1}>" + + image_token * image_seq_len) + text_split_images += "\n" + + text_split_images += "\n" + _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token) + return text_split_images + + +def _prompt_single_image(image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + """Prompt with expanded image tokens for a single image.""" + return (fake_token_around_image + global_img_token + + image_token * image_seq_len + fake_token_around_image) + + +def _get_image_prompt_string(image_rows: int, image_cols: int, + image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + if image_rows == 0 and image_cols == 0: + return _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token, + ) + return _prompt_split_image(image_seq_len, image_rows, image_cols, + fake_token_around_image, image_token, + global_img_token) + + +def input_processor_for_idefics3(ctx: InputContext, inputs: DecoderOnlyInputs): + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + model_config = ctx.model_config + processor = cached_get_processor(model_config.model) + image_processor = processor.image_processor + tokenizer = processor.tokenizer + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_list = [image_data] + elif is_list_of(image_data, Image.Image): + image_list = image_data + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_rows = [] + image_cols = [] + for image in image_list: + height, width = _get_resize_output_image_size(image.size, size) + + rows = math.ceil(height / max_image_size) + cols = math.ceil(width / max_image_size) + image_rows.append(rows) + image_cols.append(cols) + image_rows = [image_rows] + image_cols = [image_cols] + + n_images_in_text = [] + + text = inputs.get("prompt") + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. 
Please provide a string, " + "or a list of strings") + + fake_image_token = processor.fake_image_token.content + image_token = processor.image_token.content + global_img_token = processor.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, + image_cols): + n_images_in_text.append(sample.count(image_token)) + + # Replace the image token with fake tokens around the expanded + # image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = _get_image_prompt_string( + n_rows, + n_cols, + processor.image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError( + "The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) + + prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + + return token_inputs( + prompt_token_ids=prompt_token_ids, + prompt=prompt_strings[0], + multi_modal_data=multi_modal_data, + ) + + +def get_max_idefics3_image_tokens(ctx: InputContext, + *, + num_crops: Optional[int] = None): + model_config = ctx.model_config + processor = cached_get_processor(model_config.model) + image_seq_len = processor.image_seq_len + image_processor = processor.image_processor + + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + resized_height, resized_width = size, size + + grid_h = resized_height // max_image_size + grid_w = resized_width // max_image_size + + return (grid_h * grid_w + 1) * image_seq_len + + +def dummy_data_for_idefics3(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]) -> DummyData: + hf_config = ctx.get_hf_config() + num_images = mm_counts["image"] + + processor = cached_get_processor(ctx.model_config.model) + image_seq_len = processor.image_seq_len + max_llm_image_tokens = 17 * image_seq_len * num_images + + seq_data = SequenceData.from_prompt_token_counts( + (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) + + width = height = hf_config.vision_config.image_size + image = Image.new("RGB", (width, height), color=0) + mm_data = {"image": [image] if num_images == 1 else [image] * num_images} + + return DummyData(seq_data, mm_data) + + +class Idefics3SimpleMLP(nn.Module): + + def __init__(self, config): + super().__init__() + input_size = config.vision_config.hidden_size * (config.scale_factor** + 2) + output_size = config.text_config.hidden_size + self.proj = ReplicatedLinear(input_size, output_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out, _ = self.proj(x) + return out + + +class Idefics3Connector(nn.Module): + + def __init__(self, config): + super().__init__() + self.scale_factor = config.scale_factor + self.modality_projection = Idefics3SimpleMLP(config) + + def pixel_shuffle(self, + x: torch.Tensor, + scale_factor: int = 2) -> torch.Tensor: + bsz, seq, embed_dim = x.size() + height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) + x = x.view(bsz, height, int(width / scale_factor), + embed_dim * scale_factor) + x = x.permute(0, 2, 1, 3) + x = 
x.reshape( + bsz, + int(width / scale_factor), + int(height / scale_factor), + embed_dim * (scale_factor**2), + ) + x = x.permute(0, 2, 1, 3) + x = x.reshape(bsz, int(seq / (scale_factor**2)), + embed_dim * (scale_factor**2)) + return x + + def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor: + image_hidden_states = self.pixel_shuffle(image_hidden_states, + self.scale_factor) + image_hidden_states = self.modality_projection(image_hidden_states) + return image_hidden_states + + +class Idefics3Model(nn.Module): + + def __init__( + self, + config: Idefics3Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.padding_idx = self.config.text_config.pad_token_id + self.vocab_size = self.config.text_config.vocab_size + + self.vision_model = Idefics3VisionTransformer(config.vision_config, + quant_config) + self.connector = Idefics3Connector(config) + self.text_model = LlamaModel(config.text_config, cache_config, + quant_config) + + self.image_seq_len = int( + ((config.vision_config.image_size // + config.vision_config.patch_size)**2) / (config.scale_factor**2)) + self.image_token_id = self.config.image_token_id + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + rows = kwargs.pop("rows", None) + cols = kwargs.pop("cols", None) + pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) + + if pixel_values is None and image_embeds is None: + return None + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return Idefics3ImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + return Idefics3ImagePixelInputs(type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, + concat=True)), + rows=rows, + cols=cols, + pixel_attention_mask=flatten_bn( + pixel_attention_mask, + concat=True)) + + raise AssertionError("This line should be unreachable.") + + def _image_pixels_to_features( + self, + pixel_values: torch.Tensor, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + batch_size, num_images, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.to( + dtype=self.vision_model.embeddings.patch_embedding.weight.dtype + ) # fp16 compatibility + pixel_values = pixel_values.view(batch_size * num_images, + *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. + nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3)) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=(pixel_values.size(0), pixel_values.size(2), + pixel_values.size(3)), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask + pixel_attention_mask = pixel_attention_mask.view( + batch_size * num_images, *pixel_attention_mask.shape[2:]) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold(dimension=1, + size=patch_size, + step=patch_size) + patches_subgrid = patches_subgrid.unfold(dimension=2, + size=patch_size, + step=patch_size) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + return image_hidden_states + + def _process_image_pixels( + self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + pixel_attention_mask = inputs["pixel_attention_mask"] + + return self._image_pixels_to_features(pixel_values, + pixel_attention_mask) + + def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + return self.connector(image_features) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + input_ids = None + inputs_embeds = None + else: + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + 
else: + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + input_ids = None + + hidden_states = self.text_model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) +@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): + + def __init__( + self, + config: Idefics3Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + self.model = Idefics3Model(config, cache_config, quant_config) + self.image_token_id = self.config.image_token_id + + self.lm_head = ParallelLMHead( + config.text_config.vocab_size, + config.text_config.hidden_size, + quant_config=quant_config, + ) + if self.config.text_config.tie_word_embeddings: + self.lm_head.weight = self.model.text_model.wte.weight + self.logits_processor = LogitsProcessor(config.text_config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + **kwargs, + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index afefb6cd9fa96..7ddb1e2a1ab10 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -338,7 +338,7 @@ def __init__( if self.config.tie_word_embeddings: self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internvl.py 
b/vllm/model_executor/models/internvl.py index d2ec0ff6e74c6..bb9d38889a175 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -21,7 +21,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.quantization import (AWQConfig, QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -467,7 +467,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _init_vision_model( self, diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 301893f74cb87..23fdca09493b7 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -34,7 +34,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -308,7 +308,7 @@ def __init__( config.mup_width_scale) self.logits_processor = LogitsProcessor(vocab_size=config.vocab_size, scale=self.output_logits_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 81d88a47c1941..9b18a1b68f9d3 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -383,7 +383,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d768a57b7ef8a..9e8a403b2f1fc 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from 
vllm.model_executor.model_loader.weight_utils import ( @@ -536,7 +536,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 7fbd59ebd98fd..bdd67b12a06d8 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -14,7 +14,7 @@ InputContext) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors @@ -302,7 +302,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 7a2c95594ddcd..37b8baa8c6be0 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -16,7 +16,7 @@ InputContext) from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -327,7 +327,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b755e2347f6ed..69bfc80a4372c 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -15,7 +15,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -289,7 +289,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_video_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index f410d64577a77..26ece8190e7de 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,7 +19,7 @@ InputContext, 
token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import (cached_get_tokenizer, @@ -437,7 +437,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index aac4b7aa2661d..91161957642f9 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -169,7 +169,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index acf03cd8cb8ad..7704431a4d90a 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -43,7 +43,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -496,7 +496,7 @@ def __init__( self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 5acd3f65896c7..4ffe33bb6ce41 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -41,7 +41,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -420,7 
+420,7 @@ def __init__( quant_config=quant_config, prefix="llm.lm_head") self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e9b9c4d838faa..f5c28e7d74811 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -38,7 +38,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -366,7 +366,7 @@ def __init__( self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 9647d69be8a0a..007c4e2eabc90 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -366,7 +366,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 5fa8d19b97fe8..d442ffe3c1fb1 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -44,7 +44,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -1141,7 +1141,7 @@ def __init__(self, ) self.logits_processor = LogitsProcessor(config.output_hidden_states, config.text_config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def compute_logits( self, diff --git a/vllm/model_executor/models/mlp_speculator.py 
b/vllm/model_executor/models/mlp_speculator.py index ae218d749fc0b..fde44265414c5 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -6,7 +6,7 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -137,7 +137,7 @@ def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None: self.config = config self.logits_processor = LogitsProcessor(config.vocab_size, config.vocab_size, 1.0) - self.sampler = Sampler() + self.sampler = get_sampler() def generate_proposals( self, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 785b53670542f..3a50923de3741 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -1053,7 +1053,7 @@ def __init__( self.logits_processor = LogitsProcessor(config.embedding_size or config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index fdd8af79b5470..b3977812cb273 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -16,7 +16,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -153,7 +153,7 @@ def __init__( bias=not config.no_bias, quant_config=quant_config, ) - self.act = get_act_fn("gelu", quant_config, intermediate_size) + self.act = get_act_fn("gelu") self.down_proj = RowParallelLinear( intermediate_size, hidden_size, @@ -281,7 +281,7 @@ def __init__( self.transformer = MPTModel(config, cache_config, quant_config) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index b649064536dc2..8d128a42b14b8 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -36,7 +36,7 @@ from 
vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -441,7 +441,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index dd3f58289a227..545d86eebb5ec 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -309,7 +309,7 @@ def __init__(self, quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 374cbb8df1fcd..de30b5270e7e8 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -323,7 +323,7 @@ def __init__( config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7a76e4a0906db..a453376d02552 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from 
vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -147,8 +147,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.fc1", ) - self.activation_fn = get_act_fn(config.activation_function, - quant_config, config.ffn_dim) + self.activation_fn = get_act_fn(config.activation_function) self.fc2 = RowParallelLinear( config.ffn_dim, self.embed_dim, @@ -363,7 +362,7 @@ def __init__( self.lm_head = ParallelLMHead(config.vocab_size, config.word_embed_proj_dim) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index a338a93c2dd9a..d6ec1fb602f05 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -284,7 +284,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index bd4a9f698bacd..11e7c8abd4888 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -60,7 +60,7 @@ def __init__(self, self.dense_4h_to_h = RowParallelLinear(config.intermediate_size, config.hidden_size, quant_config=quant_config) - self.act = get_act_fn(config.hidden_act, quant_config) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states) -> torch.Tensor: hidden_states, _ = self.dense_h_to_4h(hidden_states) @@ -279,7 +279,7 @@ def __init__(self, config.hidden_size, bias=False) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 492122450b237..4dae6e323654b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -51,7 +51,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import 
QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -152,7 +152,7 @@ def __init__(self, config.hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, n_inner) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states): hidden_states, _ = self.fc1(hidden_states) @@ -300,7 +300,7 @@ def __init__( bias=True, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 3a7afc606bb9a..92bf0e61448e5 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -386,7 +386,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 1c41891ced416..a84d6b317b479 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -32,7 +32,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.clip import CLIPVisionModel @@ -570,7 +570,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 59843ae3dfd59..19e2621ead996 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from 
vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -562,7 +562,7 @@ def __init__( ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6e9092432467a..facf1969b9479 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -25,7 +25,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -190,7 +190,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 3a0e33e8a3eff..c91c2caa3d519 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -203,7 +203,7 @@ def __init__( intermediate_size, bias=True, quant_config=quant_config) - self.act_fn = get_act_fn("gelu", quant_config, intermediate_size) + self.act_fn = get_act_fn("gelu") self.c_proj = RowParallelLinear( intermediate_size, hidden_size, @@ -884,7 +884,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 49b3de1304cca..1e99c1b13b31f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -444,7 +444,7 @@ def 
__init__( prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 556c09400ee83..54a7085f69ba9 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) @@ -295,7 +295,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.text_config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 98bb48a274e49..c8c48c0894c36 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -393,7 +393,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index e30b84e8dd44c..af263262bd239 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -40,7 +40,7 @@ from vllm.attention import AttentionMetadata from vllm.attention.selector import _Backend -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.distributed import get_pp_group, parallel_state from vllm.distributed import utils as dist_utils from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, @@ -52,7 +52,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 
import Qwen2Model @@ -65,7 +65,7 @@ from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (PPMissingLayer, get_vit_attn_backend, is_pp_missing_parameter, make_empty_intermediate_tensors_factory) @@ -927,13 +927,37 @@ def input_processor_for_qwen2_vl( @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + # TODO Support LoRA for the visual encoder in the future. + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, config: Qwen2VLConfig, multimodal_config: MultiModalConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None) -> None: + super().__init__() assert not cache_config.enable_prefix_caching, \ @@ -966,7 +990,7 @@ def __init__(self, self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 792c6cec34ae0..32750602b988c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -120,6 +120,7 @@ "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), + "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 1b233ac7427dd..931e48a44f631 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -449,7 +449,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git 
a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 34389b645a7c1..4cb55506bb237 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -261,7 +261,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index b24c5dadb2b2b..0b0e3f21065b4 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -139,8 +139,7 @@ def __init__(self, bias=config.use_bias, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) @@ -270,7 +269,7 @@ def __init__(self, ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 749750fc9c16e..3a343986a9345 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, @@ -379,7 +379,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _audio_features_to_embeddings( self, input_features: torch.Tensor) -> torch.Tensor: 
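The hunks above all make the same mechanical change: decoder heads now obtain their sampler from the `get_sampler()` factory (and multimodal wrappers fall back to it in their `sampler` property) instead of instantiating `Sampler` directly, presumably so an alternative sampler implementation can be selected in one place. Below is a minimal illustrative sketch of the resulting wiring, using only the imports already shown in these hunks; `ToySampledLM` is a hypothetical stand-in, not a class from this PR.

# Illustrative sketch only -- mirrors the get_sampler() pattern applied above.
from typing import Optional

import torch
import torch.nn as nn

from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata


class ToySampledLM(nn.Module):
    """Hypothetical head wired the same way as the models patched above."""

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.logits_processor = LogitsProcessor(vocab_size)
        # Previously `Sampler()`; the factory lets the sampler implementation
        # be swapped centrally without touching every model file.
        self.sampler = get_sampler()

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        # Delegate token selection to whichever sampler the factory returned.
        return self.sampler(logits, sampling_metadata)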
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index e559988ada753..1d08b382b0b00 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -37,7 +37,7 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -334,7 +334,7 @@ def __init__(
         if self.config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight
         self.logits_processor = LogitsProcessor(config.vocab_size)
-        self.sampler = Sampler()
+        self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 78bca391c028e..6b38ee31c2657 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -132,6 +132,15 @@ def uses_mrope(config: PretrainedConfig) -> bool:
     return "mrope_section" in rope_scaling
 
 
+def is_encoder_decoder(config: PretrainedConfig) -> bool:
+    """Detect if the model with this config is used as an encoder/decoder."""
+    text_config = getattr(config, "text_config", None)
+    if text_config is not None:
+        return is_encoder_decoder(text_config)
+
+    return getattr(config, "is_encoder_decoder", False)
+
+
 def get_config(
     model: Union[str, Path],
     trust_remote_code: bool,
diff --git a/vllm/utils.py b/vllm/utils.py
index d78130873d3dc..13d7f6d475346 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -88,9 +88,6 @@
                                     "currently supported with encoder/"
                                     "decoder models.")
 
-STR_NOT_IMPL_ENC_DEC_CPU = ("CPU is not currently supported with "
-                            "encoder/decoder models.")
-
 # Efficiently import all enc/dec error strings
 # rather than having to import all of the above
 STR_NOT_IMPL_ENC_DEC_ERR_STRS = {
@@ -105,7 +102,6 @@
     "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC,
     "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND,
     "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER,
-    "STR_NOT_IMPL_ENC_DEC_CPU": STR_NOT_IMPL_ENC_DEC_CPU
 }
 
 # Constants related to forcing the attention backend selection
diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 906f06777a136..e73a1e60b2730 100644
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -136,7 +136,7 @@ def forward(
                 "key/v_scale is not supported in FlashAttention.")
 
         output = torch.empty_like(query)
-        torch.ops.vllm.unified_flash_attention(
+        torch.ops.vllm.unified_v1_flash_attention(
             output,
             query,
             key,
@@ -156,7 +156,7 @@ def forward(
         return output
 
 
-def unified_flash_attention(
+def unified_v1_flash_attention(
     output: torch.Tensor,
     query: torch.Tensor,
     key: torch.Tensor,
@@ -222,7 +222,7 @@ def unified_flash_attention(
     output[:num_actual_tokens].copy_(attn_output)
 
 
-def unified_flash_attention_fake(
+def unified_v1_flash_attention_fake(
     output: torch.Tensor,
     query: torch.Tensor,
     key: torch.Tensor,
@@ -243,8 +243,8 @@ def unified_flash_attention_fake(
 
 
 direct_register_custom_op(
-    op_name="unified_flash_attention",
-    op_func=unified_flash_attention,
+    op_name="unified_v1_flash_attention",
+    op_func=unified_v1_flash_attention,
     mutates_args=["kv_cache", "output"],
-    fake_impl=unified_flash_attention_fake,
+    fake_impl=unified_v1_flash_attention_fake,
 )
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 64cc18149d6c5..5f5720480abdc 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -155,6 +155,12 @@ def __init__(
         # GPU and CPU blocks, which are profiled in the distributed executor.
         self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
 
+    def __del__(self):
+        # Small hack- implicit clean up of resources on garbage collect
+        # TODO: this should probably be explicitly invoked when we're done with
+        # the engine
+        self.terminate_detokenizer()
+
     def _initialize_kv_caches(self) -> None:
         num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks(
         )
diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py
index 4bbcf4717981e..e485fcc3522d9 100644
--- a/vllm/v1/tokenizer/detokenizer.py
+++ b/vllm/v1/tokenizer/detokenizer.py
@@ -73,7 +73,7 @@ def recv(self) -> Optional[DetokenizerOutputs]:
         return None
 
     def terminate(self) -> None:
-        self.push_socket.send(b"", flags=zmq.NOBLOCK)
+        self.detokenizer.kill()
         self.detokenizer.join()
 
@@ -108,10 +108,10 @@ def run(self):
         self.push_socket.bind(f"tcp://*:{self.push_port}")
 
         while True:
+            if self.pull_socket.poll(timeout=1000) == 0:
+                # Nothing to read
+                continue
             message = self.pull_socket.recv()
-            if message == b"":
-                # Terminate signal.
-                break
             inputs = self.msgpack_decoder.decode(message)
 
             for req_id in inputs.free_req_ids:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 63bf7c2e605a2..9bb49a21453d0 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2,7 +2,6 @@
 import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Set
-from unittest.mock import patch
 
 import numpy as np
 import torch
@@ -26,7 +25,6 @@
                          FlashAttentionMetadata)
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
-from vllm.v1.sample.sampler import Sampler
 
 if TYPE_CHECKING:
     from vllm.v1.core.scheduler import SchedulerOutput
@@ -148,7 +146,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         for req_data in scheduler_output.scheduled_new_reqs:
             req_id = req_data.req_id
             sampling_params = req_data.sampling_params
-            if sampling_params.seed is not None:
+            if sampling_params.sampling_type == SamplingType.RANDOM_SEED:
                 generator = torch.Generator(device=self.device)
                 generator.manual_seed(sampling_params.seed)
             else:
@@ -384,7 +382,8 @@ def execute_model(
                 # Rewind the generator state as if the token was not sampled.
generator = self.input_batch.generators.get(i) if generator is not None: - generator.set_offset(generator.get_offset() - 1) + # This relies on cuda-specific torch-internal impl details + generator.set_offset(generator.get_offset() - 4) if sampler_output.logprob_token_ids is None: logprob_token_ids = None @@ -418,8 +417,7 @@ def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 - with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): - self.model = get_model(vllm_config=self.vllm_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index fdd72a452f2ad..26a15ed645c43 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -18,7 +18,6 @@ MultiModalInputs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.transformers_utils.config import uses_mrope from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, @@ -163,7 +162,7 @@ def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata, # special processing for mrope position deltas. mrope_positions = None - if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -446,12 +445,6 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. 
- mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3778707ae07e8..2914f520d823c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -151,7 +151,7 @@ def __init__( self.local_omp_cpuid = omp_cpuids.split("|")[rank] ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner - if self._is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunner = ModelRunnerClass( vllm_config=vllm_config, @@ -188,9 +188,6 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model - def init_device(self) -> None: if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1e8ea4e8e79cf..a1ec2e85be7b8 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -47,7 +47,6 @@ LRUCacheWorkerPromptAdapterManager) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.transformers_utils.config import uses_mrope from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, async_tensor_h2d, flatten_2d_lists, is_pin_memory_available, supports_dynamo, @@ -493,7 +492,7 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder_model: + self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() @@ -666,7 +665,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. - if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -711,7 +710,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len = 0 - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() inter_data = self.init_cached_inter_data( @@ -837,7 +836,7 @@ def build(self) -> ModelInputForGPU: if not inter_data.is_prompt: max_decode_seq_len = max(max_decode_seq_len, max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: max_encoder_seq_len = max(max_encoder_seq_len, inter_data.encoder_seq_len) @@ -1375,12 +1374,6 @@ def list_prompt_adapters(self) -> Set[int]: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. 
- mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - @torch.inference_mode() def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. @@ -1411,7 +1404,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: max_batch_size = self.max_batchsize_to_capture input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() - if self.model_is_mrope: + if self.model_config.uses_mrope: input_positions = torch.tile(input_positions, (3, 1)) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. @@ -1447,7 +1440,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.attn_state.graph_capture_get_metadata_for_batch( batch_size, is_encoder_decoder_model=self.model_config. - is_encoder_decoder_model)) + is_encoder_decoder)) if self.lora_config: lora_mapping = LoRAMapping( @@ -1466,7 +1459,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner = CUDAGraphRunner( self.model, self.attn_backend.get_name(), self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder_model) + self.model_config.is_encoder_decoder) capture_inputs = { "input_ids": @@ -1497,7 +1490,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model.get_seqlen_agnostic_capture_inputs( batch_size) }) - if self.model_config.is_encoder_decoder_model: + if self.model_config.is_encoder_decoder: # add the additional inputs to capture for # encoder-decoder models. self._update_inputs_to_capture_for_enc_dec_model( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8928936b4f9fc..d8c8011a585d8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,7 @@ def __init__( ModelRunnerClass = model_runner_cls elif model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner - elif self._is_encoder_decoder_model(): + elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( vllm_config=self.vllm_config, @@ -119,9 +119,6 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model - def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until