diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index f90e464288cf1..7cf05610b9953 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -56,7 +56,7 @@ def read_markdown(file): if os.path.exists(file): - with open(file, "r") as f: + with open(file) as f: return f.read() + "\n" else: return f"{file} not found.\n" @@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving): # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) if "serving" in str(test_file): # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 6059588fe7277..052060c576300 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -72,7 +72,7 @@ def main(args): # collect results for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: + with open(test_file) as f: results = results + json.loads(f.read()) # generate markdown table @@ -80,7 +80,7 @@ def main(args): md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - with open(args.description, "r") as f: + with open(args.description) as f: description = f.read() description = description.format( diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 4e4d4cd4ca3c6..92d6fad73a94c 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -36,11 +36,11 @@ # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 18b354948f0cc..28d2e5fb8dbd9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -25,7 +25,7 @@ jobs: runs-on: 
ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f959a1cacf866..578c3fbd4e816 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -48,7 +48,7 @@ jobs: fail-fast: false matrix: os: ['ubuntu-20.04'] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. cuda-version: ['11.8', '12.1'] diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 197f918765e7d..edf98ce2fcab0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -29,19 +29,19 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 35579302c5c14..4221c139ccf79 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -23,16 +23,16 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + pip install toml==0.10.2 + - name: Running yapf + run: | + yapf --diff --recursive . 
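The workflow changes above remove Python 3.8 from the CI matrices, which is what permits the source-level cleanups in the rest of this patch (enabling ruff's pyupgrade "UP" rules plus hand edits). For reference only, and not part of the diff itself, here is a minimal sketch of the idioms the patch standardizes on once 3.9 is the floor; the names used are illustrative, not taken from the vLLM code base:

    import functools
    from collections import UserList


    class IntList(UserList[int]):  # PEP 585: UserList is subscriptable on 3.9+
        pass


    @functools.lru_cache  # bare decorator form, no trailing "()" needed
    def square(n: int) -> int:
        return n * n


    chunk = 'data: {"text": "hi"}'
    payload = chunk.removeprefix("data: ")  # str.removeprefix(), added in 3.9

    print(payload, square(4), IntList([1, 2, 3]))

These three patterns account for most of the Python changes that follow: subscripting collections classes directly instead of the sys.version_info guard in tests/conftest.py, dropping the redundant parentheses on lru_cache, and replacing the hand-rolled remove_prefix helper in benchmarks/backend_request_func.py with str.removeprefix.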
diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 42cbf18a0f712..34735700a224e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,17 +6,16 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: '3.9' sphinx: - configuration: docs/source/conf.py - fail_on_warning: true + configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: [] # Optionally declare the Python requirements required to build your docs python: - install: - - requirements: docs/requirements-docs.txt - + install: + - requirements: docs/requirements-docs.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 943424bc4edfa..c372ba98befbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,9 +128,9 @@ endif() if(VLLM_GPU_LANG STREQUAL "CUDA") # - # For cuda we want to be able to control which architectures we compile for on + # For cuda we want to be able to control which architectures we compile for on # a per-file basis in order to cut down on compile time. So here we extract - # the set of architectures we want to compile for and remove the from the + # the set of architectures we want to compile for and remove the from the # CMAKE_CUDA_FLAGS so that they are not applied globally. # clear_cuda_arches(CUDA_ARCH_FLAGS) @@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") # Filter the target architectures by the supported supported archs # since for some files we will build for all CUDA_ARCHS. - cuda_archs_loose_intersection(CUDA_ARCHS + cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") else() @@ -236,7 +236,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) - set(MARLIN_SRCS + set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" @@ -277,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures") endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't # build any 3x kernels set(SCALED_MM_3X_ARCHS) endif() @@ -285,7 +285,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) @@ -316,10 +316,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) # - # For the Machete kernels we automatically generate sources for various + # For the Machete kernels we automatically generate sources for various # preselected input type pairs and schedules. 
# Generate sources: - set(MACHETE_GEN_SCRIPT + set(MACHETE_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) @@ -329,8 +329,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -340,11 +340,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT machete_generation_result EQUAL 0) message(FATAL_ERROR "Machete generation failed." - " Result: \"${machete_generation_result}\"" + " Result: \"${machete_generation_result}\"" "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") else() - set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} CACHE STRING "Last run machete generate script hash" FORCE) message(STATUS "Machete generation completed successfully.") endif() @@ -366,7 +366,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " @@ -392,8 +392,8 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# If CUTLASS is compiled on NVCC >= 12.5, it by default uses -# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) @@ -471,9 +471,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") return() endif () -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) # we need to manually set VLLM_GPU_ARCHES here. if(VLLM_GPU_LANG STREQUAL "CUDA") foreach(_ARCH ${CUDA_ARCHS}) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0a903877f000d..a42e70170ba28 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -79,7 +79,7 @@ async def async_request_tgi( # any data, we should skip it. 
if chunk_bytes.startswith(":"): continue - chunk = remove_prefix(chunk_bytes, "data:") + chunk = chunk_bytes.removeprefix("data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -144,8 +144,8 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -261,8 +261,8 @@ async def async_request_openai_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -349,8 +349,8 @@ async def async_request_openai_chat_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -389,14 +389,6 @@ async def async_request_openai_chat_completions( return output -# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix):] - return text - - def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index b70c4b94c97a1..665b50bf18cf0 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -269,10 +269,10 @@ def run_square_bench(args): def run_range_bench(args): - m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")] - m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")] + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) m_increment, k_increment, n_increment = \ - [int(x) for x in args.dim_increment.split(",")] + (int(x) for x in args.dim_increment.split(",")) Ms = list(range(m_start, m_end + 1, m_increment)) Ks = list(range(k_start, k_end + 1, k_increment)) Ns = list(range(n_start, n_end + 1, n_increment)) diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index ebbe76cfb944a..d126af1849024 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -468,7 +468,7 @@ def generate(): impl_configs = [] GPTQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -476,7 +476,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) GPTQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=False, with_scales=True) @@ -490,7 +490,7 @@ def generate(): ] AWQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -498,7 +498,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (DataType.u4, DataType.u8) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) 
AWQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=True, with_scales=True) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index a706b285edede..61871cdf41125 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -10,7 +10,7 @@ Requirements ============ * OS: Linux -* Python: 3.8 - 3.12 +* Python: 3.9 -- 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions @@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. @@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example $ export MAX_JOBS=6 $ pip install -e . -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. @@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build -------------------- -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. 
Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: diff --git a/pyproject.toml b/pyproject.toml index 0bbab3cd3fbc3..3562569647391 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ select = [ # Pyflakes "F", # pyupgrade - # "UP", + "UP", # flake8-bugbear "B", # flake8-simplify @@ -55,7 +55,7 @@ ignore = [ ] [tool.mypy] -python_version = "3.8" +python_version = "3.9" ignore_missing_imports = true check_untyped_defs = true diff --git a/setup.py b/setup.py index 8abeb0ba739db..f145a33258d70 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import importlib.util -import io import logging import os import re @@ -327,7 +326,7 @@ def get_neuronxcc_version(): "__init__.py") # Check if the command was executed successfully - with open(version_file, "rt") as fp: + with open(version_file) as fp: content = fp.read() # Extract the version using a regular expression @@ -404,7 +403,8 @@ def read_readme() -> str: """Read the README file if present.""" p = get_path("README.md") if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() else: return "" @@ -498,7 +498,6 @@ def _read_requirements(filename: str) -> List[str]: "Documentation": "https://vllm.readthedocs.io/en/latest/", }, classifiers=[ - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -512,7 +511,7 @@ def _read_requirements(filename: str) -> List[str]: ], packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", "tests*")), - python_requires=">=3.8", + python_requires=">=3.9", install_requires=get_requirements(), ext_modules=ext_modules, extras_require={ diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 9c65059c6b348..73fa9e9906936 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -429,8 +429,8 @@ def benchmark(): # print in tabular format print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") for b in cudagraph_sizes: - print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" - f"\t{piecewise_cudagraph_time[b]:.3f}")) + print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}") if __name__ == "__main__": diff --git a/tests/conftest.py b/tests/conftest.py index bdc6ffb148602..f9dfabc82639b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,5 @@ import json import os -import sys import tempfile from collections import UserList from enum import Enum @@ -52,7 +51,7 @@ def _read_prompts(filename: str) -> List[str]: - with open(filename, "r") as f: + with open(filename) as f: prompts = f.readlines() return prompts @@ -62,14 +61,8 @@ class _ImageAssetPrompts(TypedDict): cherry_blossom: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _ImageAssetsBase(UserList): - pass -else: - - class _ImageAssetsBase(UserList[ImageAsset]): - pass +class _ImageAssetsBase(UserList[ImageAsset]): + pass class _ImageAssets(_ImageAssetsBase): @@ -94,14 +87,8 @@ class _VideoAssetPrompts(TypedDict): sample_demo_1: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _VideoAssetsBase(UserList): - pass -else: - - class _VideoAssetsBase(UserList[VideoAsset]): - pass +class _VideoAssetsBase(UserList[VideoAsset]): + pass class 
_VideoAssets(_VideoAssetsBase): @@ -958,7 +945,7 @@ def dummy_opt_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: @@ -977,7 +964,7 @@ def dummy_llava_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyLlava"] with open(json_path, "w") as f: @@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyGemma2Embedding"] with open(json_path, "w") as f: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 1a6e17ef7b445..d325b9606843e 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -99,13 +99,11 @@ def test_blocks_have_correct_hash_in_chain(block_size: int, token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - first_chain, second_chain = [ - TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2) - ] + first_chain, second_chain = (TestPrefixCachingBlock.create_chain( + block_size=block_size, + token_ids=token_ids, + num_empty_trailing_blocks=num_empty_trailing_blocks) + for _ in range(2)) for first_chain_block, second_chain_block in zip( first_chain, second_chain): diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index ad05a97685351..19d1158c79c73 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, for var in (u_ref, delta_ref, B_ref, C_ref, z_ref) ] for i in range(len(seqlens[0])): - u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] + u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits) if padded_state_indices[i] == PAD_SLOT_ID: continue out_ref_s, _ = selective_scan_ref( diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index a01651b171d60..6ae8a6a704b0a 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, # Sad path tests for the multimodal input processor and mapper, respectively @pytest.mark.parametrize("mm_data", [ { - "image": torch.rand((5)) + "image": torch.rand(5) }, { "image": torch.rand((5, 5, 5, 5, 5)) diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index a8deab3718be1..f5497976faf7a 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -413,12 +413,10 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): def generate_probs_for_test( self, draft_and_target_probs_equal: bool ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = [ - F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2) - ] + draft_probs, target_probs = 
(F.softmax( + torch.rand(self.vocab_size, dtype=torch.float32), + dim=-1, + ) for _ in range(2)) num_reference_probs = 100 reference_probs = F.softmax( diff --git a/tests/test_logger.py b/tests/test_logger.py index fadf66f2b61d4..a937b0812ed0c 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -29,7 +29,7 @@ def test_trace_function_call(): cur_dir = os.path.dirname(__file__) enable_trace_function_call(path, cur_dir) f1(1) - with open(path, 'r') as f: + with open(path) as f: content = f.read() assert "f1" in content diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index a3e70a40db979..84348cbc0bced 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -93,10 +93,10 @@ def test_mistral_edge_case(tokenizer, truth): def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]: if "mistral" in tokenizer_name: yield ( - bool(True) if request.param else + True if request.param else pytest.skip("mistral doesn't support skip_special_tokens=False")) else: - yield bool(True) if request.param else bool(False) + yield bool(request.param) @pytest.mark.parametrize("truth", TRUTH) diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index bbd24b085e3a7..081076ad7dbdc 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -46,7 +46,7 @@ def get_entries(node, curr_depth=0): args = parser.parse_args() - with open(args.json_trace, "r") as f: + with open(args.json_trace) as f: profile_data = json.load(f) if args.table == "summary": diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 65ee3ae108ae1..efd6beee865c2 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -434,7 +434,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: f"{', Sparsity ' + sparsity if sparsity else ''}") profile_json = None - with open(json_trace, "r") as f: + with open(json_trace) as f: profile_json = json.load(f) assert profile_json is not None diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 33431a33ac837..51ad2adc74fe1 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -81,7 +81,7 @@ def WeightedDuration(self): # Allow for modest floating-point errors epsilon = 0.000002 if (self.weighted_duration > self.Duration() + epsilon): - print('%s > %s?' % (self.weighted_duration, self.Duration())) + print('{} > {}?'.format(self.weighted_duration, self.Duration())) assert (self.weighted_duration <= self.Duration() + epsilon) return self.weighted_duration @@ -104,7 +104,7 @@ def ReadTargets(log, show_all): The result is a list of Target objects.""" header = log.readline() assert header == '# ninja log v5\n', \ - 'unrecognized ninja log version %r' % header + 'unrecognized ninja log version {!r}'.format(header) targets_dict = {} last_end_seen = 0.0 for line in log: @@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types): # Warn if the sum of weighted times is off by more than half a second. if abs(length - weighted_total) > 500: print('Warning: Possible corrupt ninja log, results may be ' - 'untrustworthy. Length = %.3f, weighted total = %.3f' % - (length, weighted_total)) + 'untrustworthy. 
Length = {:.3f}, weighted total = {:.3f}'.format( + length, weighted_total)) entries_by_ext = defaultdict(list) for target in entries: @@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types): entries_by_ext[extension].append(target) for key, values in entries_by_ext.items(): - print(' Longest build steps for %s:' % key) + print(' Longest build steps for {}:'.format(key)) values.sort(key=lambda x: x.WeightedDuration()) for target in values[-long_count:]: - print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % - (target.WeightedDuration(), target.DescribeTargets(), - target.Duration())) - - print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' - 'parallelism)' % - (length, total_cpu_time, total_cpu_time * 1.0 / length)) + print( + ' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'. + format(target.WeightedDuration(), target.DescribeTargets(), + target.Duration())) + + print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' + 'parallelism)'.format(length, total_cpu_time, + total_cpu_time * 1.0 / length)) print(' %d build steps completed, average of %1.2f/s' % (len(entries), len(entries) / (length))) @@ -298,11 +299,12 @@ def main(): long_ext_count += len(args.step_types.split(';')) try: - with open(log_file, 'r') as log: + with open(log_file) as log: entries = ReadTargets(log, False) SummarizeEntries(entries, args.step_types) - except IOError: - print('Log file %r not found, no build summary created.' % log_file) + except OSError: + print('Log file {!r} not found, no build summary created.'.format( + log_file)) return errno.ENOENT diff --git a/use_existing_torch.py b/use_existing_torch.py index e11746459908b..319d262898fe3 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -4,7 +4,7 @@ requires_files += ["pyproject.toml"] for file in requires_files: print(f">>> cleaning {file}") - with open(file, 'r') as f: + with open(file) as f: lines = f.readlines() if "torch" in "".join(lines).lower(): print("removed:") diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index a98eb431ac7fc..350f88c8f9740 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -192,10 +192,8 @@ def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen] q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1) - k2, v2 = [ - self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) - for x in [k, v] - ] + k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) + for x in [k, v]) spda_output = torch.nn.functional.scaled_dot_product_attention( q2, k2, v2, attn_mask=attn_mask, scale=sm_scale) return self.transpose_and_unpad(spda_output, cu_seqlens) diff --git a/vllm/config.py b/vllm/config.py index 814e00c8785f0..851d35dfd9fb0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -668,9 +668,10 @@ def get_multimodal_config(self) -> "MultiModalConfig": @property def is_encoder_decoder_model(self) -> bool: """Extract the HF encoder/decoder model flag.""" - return getattr(self.hf_config, "is_encoder_decoder", False) or ( - (hasattr(self.hf_config, "text_config") and getattr( - self.hf_config.text_config, "is_encoder_decoder", False))) + return getattr( + self.hf_config, "is_encoder_decoder", + False) or (hasattr(self.hf_config, "text_config") and getattr( + self.hf_config.text_config, "is_encoder_decoder", False)) @property def 
is_multimodal_model(self) -> bool: diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0b943e6e65f1c..ed7e06cab2996 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -52,7 +52,7 @@ def num_blocks(self) -> int: pass -class BlockMetaData(): +class BlockMetaData: """Data structure for storing key data describe cached block, so that evitor could use to make its decision which one to choose for eviction diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 983e772a3f79b..1f78e10cc1dcd 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: if is_distributed: get_world_group().barrier() logger.info("reading GPU P2P access cache from %s", path) - with open(path, "r") as f: + with open(path) as f: cache = json.load(f) _gpu_p2p_access_cache = cache return _gpu_p2p_access_cache[f"{src}->{tgt}"] diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index b0fdc67776bbd..161b85646b6e8 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -812,7 +812,7 @@ async def _engine_abort(self, request_ids: Iterable[str]): async def run_engine_loop(engine_ref: ReferenceType): """We use a weakref to the engine so that the running loop doesn't prevent the engine being garbage collected.""" - engine: Optional["AsyncLLMEngine"] = engine_ref() + engine: Optional[AsyncLLMEngine] = engine_ref() if not engine: return diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index a1809b1a9dd26..404e7ed2c6ef9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1541,8 +1541,8 @@ def _has_remaining_steps( seq_group.state.remaining_steps != ref_remaining_steps for seq_group in seq_group_metadata_list[1:] ]): - raise AssertionError(("All running sequence groups should " - "have the same remaining steps.")) + raise AssertionError("All running sequence groups should " + "have the same remaining steps.") return ref_remaining_steps > 0 diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 25b7a7479672a..19dcbfe57d112 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -77,7 +77,7 @@ def __init__(self, local_interval: float) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval - self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 3ed37a269c4b4..223790806ab18 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -63,7 +63,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, single_step_process_prompt_logprob(self, seq_group, output) @staticmethod - @functools.lru_cache() + @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 2b339ab6d44e4..0ada0aaacda24 100644 --- a/vllm/entrypoints/chat_utils.py +++ 
b/vllm/entrypoints/chat_utils.py @@ -362,7 +362,7 @@ def load_chat_template( if chat_template is None: return None try: - with open(chat_template, "r") as f: + with open(chat_template) as f: resolved_chat_template = f.read() except OSError as e: if isinstance(chat_template, Path): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index a64467a311523..0d016d949d22b 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -120,7 +120,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r", encoding="utf-8") as f: + with open(path_or_url, encoding="utf-8") as f: return f.read() diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 9433dce842b09..66bab2c686c67 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/logger.py b/vllm/logger.py index ccf09691a052a..d6fcda02a0fb3 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None: raise RuntimeError( "Could not load logging config. File does not exist: %s", VLLM_LOGGING_CONFIG_PATH) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8", - mode="r") as file: + with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d0279f273db7a..81e274612b73b 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -343,7 +343,7 @@ def __init__( # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) self.packed_modules: Dict[str, List[str]] = {} - self.modules: Dict[str, "BaseLayerWithLoRA"] = {} + self.modules: Dict[str, BaseLayerWithLoRA] = {} # Dict instead of a Set for compatibility with LRUCache. self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() @@ -548,7 +548,7 @@ def create_dummy_lora( else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras: List[Optional["LoRALayerWeights"]] = [] + subloras: List[Optional[LoRALayerWeights]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." + r, diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 764f4e9c99df8..bfca15c2b6a3e 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -103,7 +103,7 @@ def enabled(cls) -> bool: # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. 
@staticmethod - @lru_cache() + @lru_cache def default_on() -> bool: count_none = envs.VLLM_CUSTOM_OPS.count("none") count_all = envs.VLLM_CUSTOM_OPS.count("all") diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index bca44d2bf2e28..aae806f6af323 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2158ad3339673..ac60e0e6d48a0 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 1f8d531198324..464915248c9ad 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -746,7 +746,7 @@ def __init__(self, load_config: LoadConfig): config_file_path = self._get_config_file(qlora_adapter) - with open(config_file_path, "r") as f: + with open(config_file_path) as f: config = json.load(f) self.target_modules = config["target_modules"] diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 573f2a04895d9..e6299295c85a2 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -190,7 +190,7 @@ def get_model( kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: - lora_config = kwargs.get("lora_config", None) + lora_config = kwargs.get("lora_config") ov_core = kwargs.get("ov_core") if lora_config: raise ValueError( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 36f33d6d139ee..437d2772e1f28 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -280,7 +280,7 @@ def __init__(self, tensorizer_config: TensorizerConfig, self.tensorizer_args = ( self.tensorizer_config._construct_tensorizer_args()) self.extra_kwargs = extra_kwargs - if extra_kwargs.get("quant_config", None) is not None: + if extra_kwargs.get("quant_config") is not None: self.quant_config = extra_kwargs["quant_config"] else: self.quant_config = quant_config @@ -380,8 +380,7 @@ def tensorizer_weights_iterator( stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params) with TensorDeserializer(stream, **deserializer_args, device="cpu") as state: - for name, param in state.items(): - yield name, param + yield from state.items() del state diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0c51314bc90df..9488d54edf365 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig, f"{quant_config_files}") quant_config_file = quant_config_files[0] - with open(quant_config_file, "r") as f: + with open(quant_config_file) as f: config = json.load(f) if model_config.quantization == "bitsandbytes": @@ -306,7 
+306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], # Iterate through the weight_map (weight_name: safetensors files) # to identify weights that we should use. - with open(index_file_name, "r") as f: + with open(index_file_name) as f: weight_map = json.load(f)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: @@ -382,7 +382,7 @@ def np_cache_weights_iterator( with open(weight_names_file, "w") as f: json.dump(weight_names, f) - with open(weight_names_file, "r") as f: + with open(weight_names_file) as f: weight_names = json.load(f) for name in weight_names: @@ -423,8 +423,7 @@ def pt_weights_iterator( bar_format=_BAR_FORMAT, ): state = torch.load(bin_file, map_location="cpu") - for name, param in state.items(): - yield name, param + yield from state.items() del state torch.cuda.empty_cache() diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd29d4ccc59d8..5b712ba83c25a 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -48,7 +48,7 @@ def __init__(self, is_residual_mlp: bool = False, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMLP, self).__init__() + super().__init__() self.hidden_size = config.hidden_size self.expert_id = expert_id self.layer_id = layer_id @@ -89,7 +89,7 @@ def __init__(self, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMoE, self).__init__() + super().__init__() self.tp_size = tp_size or get_tensor_model_parallel_world_size() self.hidden_size = config.hidden_size diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f2cfdf8ffd30a..1fbf4135add7a 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 77ab7de6165fb..83ff39a30fbe3 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 181f3c2b0fc35..881b86564e811 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only ChatGLM model compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 348e6d20f3297..835682ca3b379 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index aae7ab7370b74..3e60eee2d8fe2 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,3 @@ -# coding=utf-8 from typing import Iterable, List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 7ed2b96e65c49..8c9653463858b 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 DeciAI Research Team. All rights reserved. diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 5b4db8f258711..d278ea5b6a991 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d4ad0c6b5c99e..834be78bce87b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 22f194c776b69..23efe0359cb4a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py # Copyright 2024 The LG U+ CTO AI Tech Lab. diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index c376347811965..ad07fc3b3776e 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0de590d1d8372..3db82a898159b 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. # Copyright 2023 HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 029178af61da0..fc3f5cb20afb0 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. # diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 9238ed839c9de..c365880109ef8 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. 
# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 3213a8b29a104..025615b0920fd 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 3330d84021368..a06200c4b7e08 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 24c79a8855475..7612ea641d95c 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 9a42b359ae44f..b28a6081b868f 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1bccef7a5f173..931052c7cccf0 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c968817747754..bee48f377e0f5 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5307bb21adb96..691a6e77c46c4 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 43f4f29814e6d..53869b8fa6bd8 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. # Copyright 2024 the HuggingFace Inc. team. All rights reserved. 
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 313d98b649b48..afefb6cd9fa96 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index edd867e4b6457..108fc8382049d 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from typing import List, Optional, Tuple, Union import torch diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index b947f24a693b5..301893f74cb87 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 6f7949c880e61..81d88a47c1941 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """Inference-only Jamba model.""" from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 38a31f420cec9..6c0a8b5ef8451 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 985ba6f3c60c1..aac4b7aa2661d 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """PyTorch MAMBA model.""" from typing import Iterable, List, Optional, Tuple diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 03fb036020f2f..acf03cd8cb8ad 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 3b5fd95328d74..eeedf55cf3e57 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2024 The ModelBest team. diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f90df6b7df036..5acd3f65896c7 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1514243ad59c9..e9b9c4d838faa 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 63e2c60a84271..9647d69be8a0a 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 251bfc079684e..5fa8d19b97fe8 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 42ccd01298169..ae218d749fc0b 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -37,7 +37,7 @@ def __init__( eps=1e-06, elementwise_scale_and_shift=True, ): - super(MLPSpeculatorLayerNorm, self).__init__() + super().__init__() self.elementwise_scale_and_shift = elementwise_scale_and_shift if self.elementwise_scale_and_shift: self.weight = nn.Parameter(torch.empty(normalized_shape)) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 522aa748f78b6..785b53670542f 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1121,9 +1121,9 @@ def _merge_multimodal_embeddings( batch_size * num_image * num_patch, -1).contiguous() image_input_idx = image_input_idx * valid.to(image_input_idx.dtype) - offset = torch.cat( - [seq_len.new_zeros( - (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None] + offset = torch.cat([seq_len.new_zeros(1), + seq_len.cumsum(dim=0)[:-1]], + dim=0)[:, None] image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx.flatten()[:, None] mat = image_input_idx == torch.arange( diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index ee802030a5ef3..fdd8af79b5470 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from typing import Iterable, List, Optional, Tuple, Union diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 72a09129fed63..b649064536dc2 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
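The `mlp_speculator.py` hunk above switches to the zero-argument `super()` call. On Python 3 the compiler supplies the enclosing class and the instance implicitly, so the two spellings are interchangeable inside a method body. A small sketch with made-up class names, illustrative only:

class Base:
    def __init__(self) -> None:
        self.initialized = True

class TinyNorm(Base):  # hypothetical stand-in for MLPSpeculatorLayerNorm
    def __init__(self) -> None:
        # Equivalent to super(TinyNorm, self).__init__(); Python 3 resolves
        # the class and instance for the zero-argument form automatically.
        super().__init__()

assert TinyNorm().initialized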
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 90ab8abcb84b4..dd3f58289a227 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py # Copyright 2024 The vLLM team. diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7521ab749e10f..7a76e4a0906db 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 055407587c598..a338a93c2dd9a 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py # Copyright (c) OrionStar Inc. diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index fc9ef15db26c0..bd4a9f698bacd 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 4e7935a7636c5..492122450b237 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py # Copyright 2023 The vLLM team. diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 02b2ff01c3832..34141511ea791 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5b477a8ed5f49..1c41891ced416 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index bb8a9327b4ac8..59843ae3dfd59 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index ee9f150b17cfc..6e9092432467a 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -136,11 +136,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): if image_token_id not in inputs['prompt_token_ids']: raise ValueError( - (f"You've passed {inputs=} without {image_token_id=}" - " Make sure to process your input via mistral_common's" - " tokenizer or pass a chat completion request. For more" - " For more info, see: " - "https://github.com/vllm-project/vllm/issues/8411.")) + f"You've passed {inputs=} without {image_token_id=}" + " Make sure to process your input via mistral_common's" + " tokenizer or pass a chat completion request. For more" + " For more info, see: " + "https://github.com/vllm-project/vllm/issues/8411.") return inputs diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index b2b5c70182135..3a0e33e8a3eff 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py # Copyright (c) Alibaba Cloud. diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 72b286fe6f6d6..49b3de1304cca 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py # Copyright 2024 The Qwen team. @@ -417,9 +416,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6114548bda42c..556c09400ee83 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 2d6f3e90f761c..b9e3b74c477e2 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 Kakao Corp. (Kanana-X Team) @@ -60,9 +59,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index dac85e35d369d..98bb48a274e49 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py # Copyright 2024 The Qwen team. diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 901b1daaa14a4..0fbf305da8b94 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 The Qwen team. @@ -71,9 +70,9 @@ def __init__( and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d801903f8f9fe..e30b84e8dd44c 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. @@ -246,9 +245,8 @@ def forward( q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) batch_size = q.shape[1] - q, k, v = [ - rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) - ] + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) @@ -258,7 +256,7 @@ def forward( # flash_attn_varlen_func) from flash_attn import flash_attn_varlen_func - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() output = flash_attn_varlen_func(q, @@ -276,7 +274,7 @@ def forward( b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: seq_length = q.size(1) - q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e3e7ccb5cf179..1b233ac7427dd 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
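The `qwen2_vl.py` hunks above replace list comprehensions with generator expressions on the right-hand side of tuple unpacking; unpacking simply iterates whatever is on the right, so the generator form behaves identically while skipping the temporary list. (The accompanying qwen2/qwen2_cls/qwen2_rm hunks move the sliding-window error message from %-interpolation to str.format(), which renders the same message and is purely a style change.) A minimal sketch of the unpacking behaviour with placeholder data rather than the real tensors:

def rearrange_all(*items):
    # Stand-in for the rearrange(...) calls: lazily yields one result per input.
    return (tuple(reversed(item)) for item in items)

q, k, v = rearrange_all((1, 2), (3, 4), (5, 6))  # unpacking drains the generator
assert (q, k, v) == ((2, 1), (4, 3), (6, 5))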
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 083a48588d01a..34389b645a7c1 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. # diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 8f0644bca3e2e..b24c5dadb2b2b 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 036789642d3c4..e559988ada753 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6b10d0c609f13..5ff6f93fb25b4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,4 +1,3 @@ -import sys from abc import ABC, abstractmethod from collections import UserDict, defaultdict from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, @@ -34,14 +33,9 @@ :meth:`MultiModalInputs.batch`. """ -if sys.version_info < (3, 9): - # UserDict cannot be subscripted - class _MultiModalInputsBase(UserDict): - pass -else: - class _MultiModalInputsBase(UserDict[str, NestedTensors]): - pass +class _MultiModalInputsBase(UserDict[str, NestedTensors]): + pass class MultiModalInputs(_MultiModalInputsBase): @@ -262,18 +256,23 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already has an input mapper " "registered to %s. It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) - self._input_mappers[model_cls] = mapper \ - or self._default_input_mapper + self._input_mappers[model_cls] = (mapper + or self._default_input_mapper) return model_cls return wrapper - def map_input(self, model_config: "ModelConfig", - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs: + def map_input( + self, + model_config: "ModelConfig", + data: MultiModalData[object], + mm_processor_kwargs: Dict[str, Any], + ) -> MultiModalInputs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -348,13 +347,15 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already calculates maximum number of " "tokens in %s. 
It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) if isinstance(max_mm_tokens, int): self._validate_max_multimodal_tokens(max_mm_tokens) - self._max_mm_tokens[model_cls] = max_mm_tokens \ - or self._default_max_multimodal_tokens + self._max_mm_tokens[model_cls] = ( + max_mm_tokens or self._default_max_multimodal_tokens) return model_cls @@ -482,8 +483,10 @@ def from_seq_group( placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict( MultiModalPlaceholderMap) - for modality, placeholders in seq_group.multi_modal_placeholders.items( - ): + for ( + modality, + placeholders, + ) in seq_group.multi_modal_placeholders.items(): mm_items = mm_data.pop(modality) if not isinstance(mm_items, list): mm_items = [mm_items] @@ -499,8 +502,11 @@ def from_seq_group( return mm_data, placeholder_maps def append_items_from_seq_group( - self, positions: range, multi_modal_items: List[_T], - multi_modal_placeholders: List[PlaceholderRange]) -> List[_T]: + self, + positions: range, + multi_modal_items: List[_T], + multi_modal_placeholders: List[PlaceholderRange], + ) -> List[_T]: """ Adds the multi-modal items that intersect ```positions`` to this placeholder map and returns the intersecting items. @@ -515,20 +521,26 @@ def append_items_from_seq_group( multi_modal_items): placeholder = range( placeholder_dict["offset"], - placeholder_dict["offset"] + placeholder_dict["length"]) - intersection = range(max(positions.start, placeholder.start), - min(positions.stop, placeholder.stop)) + placeholder_dict["offset"] + placeholder_dict["length"], + ) + intersection = range( + max(positions.start, placeholder.start), + min(positions.stop, placeholder.stop), + ) if not intersection: # Skip this multi-modal item. continue - token_embedding_range = range(intersection.start - positions.start, - intersection.stop - positions.start) + token_embedding_range = range( + intersection.start - positions.start, + intersection.stop - positions.start, + ) multimodal_embedding_range = range( intersection.start - placeholder.start + self.src_len, - intersection.stop - placeholder.start + self.src_len) + intersection.stop - placeholder.start + self.src_len, + ) intersecting_items.append(mm_item) self.dest_ranges.append(token_embedding_range) diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 4cde2a0254b90..473b87c89c21d 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -37,9 +37,8 @@ def load_peft_weights(model_id: str, Additional arguments to pass to the `hf_hub_download` method when loading from the HuggingFace Hub. 
""" - path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) - if hf_hub_download_kwargs.get("subfolder", None) is not None else - model_id) + path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if + hf_hub_download_kwargs.get("subfolder") is not None else model_id) if device is None: device = infer_device() @@ -51,19 +50,19 @@ def load_peft_weights(model_id: str, filename = os.path.join(path, WEIGHTS_NAME) use_safetensors = False else: - token = hf_hub_download_kwargs.get("token", None) + token = hf_hub_download_kwargs.get("token") if token is None: - token = hf_hub_download_kwargs.get("use_auth_token", None) + token = hf_hub_download_kwargs.get("use_auth_token") hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], SAFETENSORS_WEIGHTS_NAME) - if hf_hub_download_kwargs.get("subfolder", None) - is not None else SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder") is not None + else SAFETENSORS_WEIGHTS_NAME) has_remote_safetensors_file = file_exists( repo_id=model_id, filename=hub_filename, - revision=hf_hub_download_kwargs.get("revision", None), - repo_type=hf_hub_download_kwargs.get("repo_type", None), + revision=hf_hub_download_kwargs.get("revision"), + repo_type=hf_hub_download_kwargs.get("repo_type"), token=token, ) use_safetensors = has_remote_safetensors_file diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 08697274854e0..1a5870aa4f84c 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -308,7 +308,7 @@ def load_params_config(model, revision) -> PretrainedConfig: config_path = Path( hf_hub_download(model, config_file_name, revision=revision)) - with open(config_path, "r") as file: + with open(config_path) as file: config_dict = json.load(file) config_mapping = { diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 49d2b8d8e21b1..e563bf6268d72 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 805b8ad930039..f60a59f554133 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b06a946f34a47..82f129eb2018e 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright 2023 Cerebras Systems. 
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 497db0ae48c96..0f047c8b0361c 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" @@ -117,10 +116,10 @@ def _validate_config(self) -> None: init_config_defaults) if self.d_model % self.n_heads != 0: raise ValueError('d_model must be divisible by n_heads') - if any(( + if any( prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] - )): + [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop + ]): raise ValueError( "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " "probabilities and must be between 0 and 1") diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 139e6b3cdacbe..93fec667d1cf3 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # @@ -144,7 +143,7 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - head_dim = head_dim or kwargs.get("kv_channels", None) + head_dim = head_dim or kwargs.get("kv_channels") self.head_dim = head_dim if head_dim is not None else ( hidden_size // num_attention_heads) @@ -160,8 +159,8 @@ def __init__( self.rope_theta = rope_theta self.rope_scaling = rope_scaling # for backward compatibility - partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get( - "rope_percentage", None) or partial_rotary_factor + partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get( + "rope_percentage") or partial_rotary_factor self.partial_rotary_factor = partial_rotary_factor self._rope_scaling_validation() self.attention_bias = attention_bias diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index d5113bf01695a..0c1c048f670ee 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/utils.py b/vllm/utils.py index 0b75e8761c916..6edc8d72f6bcf 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1153,7 +1153,7 @@ class SortedHelpFormatter(argparse.HelpFormatter): def add_arguments(self, actions): actions = sorted(actions, key=lambda x: x.option_strings) - super(SortedHelpFormatter, self).add_arguments(actions) + super().add_arguments(actions) class FlexibleArgumentParser(argparse.ArgumentParser): @@ -1279,7 +1279,7 @@ def _load_config_file(self, file_path: str) -> List[str]: config: Dict[str, Union[int, str]] = {} try: - with open(file_path, 'r') as config_file: + with open(file_path) as config_file: config = yaml.safe_load(config_file) except Exception as ex: logger.error(