diff --git a/Cargo.lock b/Cargo.lock
index 2ed9cff0d..f5ea07c37 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -556,9 +556,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.0"
+version = "1.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8"
+checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1"
 dependencies = [
  "jobserver",
  "libc",
@@ -729,9 +729,9 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-channel"
-version = "0.5.13"
+version = "0.5.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
+checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
 dependencies = [
  "crossbeam-utils",
 ]
@@ -1875,7 +1875,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -2514,9 +2514,9 @@ dependencies = [
 
 [[package]]
 name = "openssl"
-version = "0.10.70"
+version = "0.10.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6"
+checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da"
 dependencies = [
  "bitflags 2.6.0",
  "cfg-if",
@@ -2546,9 +2546,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.105"
+version = "0.9.108"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc"
+checksum = "e145e1651e858e820e4860f7b9c5e169bc1d8ce1c86043be79fa7b7634821847"
 dependencies = [
  "cc",
  "libc",
@@ -3396,7 +3396,7 @@ dependencies = [
  "cc",
  "libc",
  "once_cell",
- "spin 0.5.2",
+ "spin",
  "untrusted 0.7.1",
  "web-sys",
  "winapi",
@@ -3404,15 +3404,14 @@
 
 [[package]]
 name = "ring"
-version = "0.17.8"
+version = "0.17.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
 dependencies = [
  "cc",
  "cfg-if",
  "getrandom",
  "libc",
- "spin 0.9.8",
  "untrusted 0.9.0",
  "windows-sys 0.52.0",
 ]
@@ -3520,7 +3519,7 @@ dependencies = [
 "aws-lc-rs",
  "log",
  "once_cell",
- "ring 0.17.8",
+ "ring 0.17.14",
  "rustls-pki-types",
  "rustls-webpki",
  "subtle",
@@ -3549,7 +3548,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
 dependencies = [
  "aws-lc-rs",
- "ring 0.17.8",
+ "ring 0.17.14",
  "rustls-pki-types",
  "untrusted 0.9.0",
 ]
@@ -3596,7 +3595,7 @@ version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
 dependencies = [
- "ring 0.17.8",
+ "ring 0.17.14",
  "untrusted 0.9.0",
 ]
@@ -3801,12 +3800,6 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
 
-[[package]]
-name = "spin"
-version = "0.9.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
-
 [[package]]
 name = "spm_precompiled"
 version = "0.1.4"
@@ -4888,7 +4881,7 @@ version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53"
 dependencies = [
- "ring 0.17.8",
+ "ring 0.17.14",
  "untrusted 0.9.0",
 ]
diff --git a/Dockerfile b/Dockerfile
index 0988daf58..302fb6e57 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,6 +17,8 @@ FROM chef AS builder
 ARG GIT_SHA
 ARG DOCKER_LABEL
 
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -37,8 +39,8 @@ RUN cargo build --release
 
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
 FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 as pytorch-install
-ARG PYTORCH_VERSION=2.4.0
-ARG PYTHON_VERSION=3.10
+ARG PYTORCH_VERSION=2.6.0
+ARG PYTHON_VERSION=3.11
 # Keep in sync with `server/pyproject.toml
 ARG CUDA_VERSION=12.4
 ARG MAMBA_VERSION=24.3.0-0
@@ -47,6 +49,7 @@ ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
 ARG TARGETPLATFORM
 
+WORKDIR /usr/src
 ENV PATH /opt/conda/bin:$PATH
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
@@ -57,31 +60,19 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     git && \
     rm -rf /var/lib/apt/lists/*
 
-# Install conda
-# translating Docker's TARGETPLATFORM into mamba arches
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
-    *) MAMBA_ARCH=x86_64 ;; \
-    esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    rm ~/mambaforge.sh
-
-# Install pytorch
-# On arm64 we exit with an error code
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") exit 1 ;; \
-    *) /opt/conda/bin/conda update -y conda && \
-    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
-    esac && \
-    /opt/conda/bin/conda clean -ya
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+ENV PATH="$PATH:/root/.local/bin"
+RUN uv python install ${PYTHON_VERSION}
+RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} torchvision pip setuptools packaging
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"
+
 
 # CUDA kernels builder image
 FROM pytorch-install as kernel-builder
 
-ARG MAX_JOBS=2
-
+ARG MAX_JOBS=8
+ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     ninja-build cmake \
     && rm -rf /var/lib/apt/lists/*
@@ -90,38 +81,38 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
 FROM kernel-builder as flash-att-builder
 WORKDIR /usr/src
 COPY server/Makefile-flash-att Makefile
-RUN make build-flash-attention
+RUN . .venv/bin/activate && make build-flash-attention
 
 # Build Flash Attention v2 CUDA kernels
 FROM kernel-builder as flash-att-v2-builder
 WORKDIR /usr/src
 COPY server/Makefile-flash-att-v2 Makefile
-RUN make build-flash-attention-v2-cuda
+RUN . .venv/bin/activate && make build-flash-attention-v2-cuda
 
 # Build Transformers exllama kernels
 FROM kernel-builder as exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 
 # Build Transformers exllama kernels
 FROM kernel-builder as exllamav2-kernels-builder
 WORKDIR /usr/src
 COPY server/exllamav2_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 
 # Build Transformers awq kernels
 FROM kernel-builder as awq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
 
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
 # Build specific version of transformers
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build
 
 # Build vllm CUDA kernels
 FROM kernel-builder as vllm-builder
@@ -136,14 +127,14 @@ RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 COPY server/Makefile-vllm Makefile
 # Build specific version of vllm
-RUN make build-vllm-cuda
+RUN . .venv/bin/activate && make build-vllm-cuda
 
 # Build megablocks kernels
 FROM kernel-builder as megablocks-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-megablocks Makefile
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
-RUN make build-megablocks
+RUN . .venv/bin/activate && make build-megablocks
 
 # Build punica CUDA kernels
 FROM kernel-builder as punica-builder
@@ -151,14 +142,14 @@ WORKDIR /usr/src
 COPY server/punica_kernels/ .
 # Build specific version of punica
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build
 
 # Build eetq kernels
 FROM kernel-builder as eetq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-eetq Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
 
 # LoRAX base image
 FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base
@@ -179,44 +170,59 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     ca-certificates \
     make \
     sudo \
+    build-essential \
+    g++ \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
+# COPY --from=pytorch-install /opt/conda /opt/conda
+
+# RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+# ENV PATH="$PATH:/root/.local/bin"
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+# Install flash-attention dependencies
+# RUN pip install einops --no-cache-dir
+
+# Copy env with PyTorch installed
+COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv
+ENV PYTHON_VERSION=3.11
+RUN uv python install ${PYTHON_VERSION}
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"
 
 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy builds artifacts from punica builder
-COPY --from=punica-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=punica-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from megablocks builder
-COPY --from=megablocks-kernels-builder /usr/src/megablocks/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=megablocks-kernels-builder /usr/src/megablocks/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from eetq builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+RUN uv pip install einops --no-cache-dir
 
 # Install flashinfer
-RUN pip install --no-cache-dir flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu124/torch2.4
+RUN uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu126/torch2.6/
 
 # Install server
 COPY proto proto
@@ -225,20 +231,14 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install --no-cache-dir -r requirements.txt && \
-    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
+    uv pip install --no-cache-dir -r requirements.txt && \
+    uv pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
 
 # Install router
 COPY --from=builder /usr/src/target/release/lorax-router /usr/local/bin/lorax-router
 # Install launcher
 COPY --from=builder /usr/src/target/release/lorax-launcher /usr/local/bin/lorax-launcher
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-    g++ \
-    && rm -rf /var/lib/apt/lists/*
-
-
 # Final image
 FROM base
 LABEL source="https://github.com/predibase/lorax"
diff --git a/server/Makefile b/server/Makefile
index 0cf2a2911..5eee31fdf 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -19,7 +19,7 @@ gen-server:
 
 install: gen-server
 	pip install pip --upgrade
-	pip install torch==2.4.0
+	pip install torch==2.6.0
 	pip install -r requirements.txt
 	pip install -e ".[bnb, accelerate, quantize, peft, outlines]"
diff --git a/server/Makefile-eetq b/server/Makefile-eetq
index 726e47b57..1b97f6fd7 100644
--- a/server/Makefile-eetq
+++ b/server/Makefile-eetq
@@ -1,4 +1,4 @@
-eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0
+eetq_commit := 465e9726bf7ae30803a2d0dd9e5d4315aef17491
 
 eetq:
 	# Clone eetq
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 4c92391b3..e4569c95b 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,10 +1,10 @@
 vllm-cuda:
 	# Clone vllm
-	pip install -U ninja packaging --no-cache-dir
+	pip install -U setuptools_scm ninja packaging --no-cache-dir
 	git clone https://github.com/vllm-project/vllm.git vllm
 
 build-vllm-cuda: vllm-cuda
-	cd vllm && git fetch && git checkout 766435e660a786933392eb8ef0a873bc38cf0c8b
+	cd vllm && git fetch && git checkout 296c6572dd1f76b31b93be19e550790afcfb8843
 	cd vllm && python setup.py build
 
 install-vllm-cuda: build-vllm-cuda
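Note: the Dockerfile hunks above replace the conda/mambaforge toolchain with a uv-managed virtualenv at /usr/src/.venv (Python 3.11, PyTorch 2.6.0), activate that venv in every kernel-builder stage, and copy the built extensions into /usr/src/.venv/lib/python3.11/site-packages. A minimal, hypothetical smoke test (not part of this patch) that could be run inside the final image to confirm the venv and copied artifacts resolve as expected:

```python
# Hypothetical check, not part of the patch: verify the uv-managed venv is active
# and provides the interpreter and torch build the Dockerfile now targets.
import sys
import sysconfig

import torch

print("python:", sys.version.split()[0])                    # expect 3.11.x
print("site-packages:", sysconfig.get_paths()["purelib"])   # expect /usr/src/.venv/lib/python3.11/site-packages
print("torch:", torch.__version__, "cuda:", torch.version.cuda)  # expect 2.6.x built against CUDA 12.x
print("cuda available:", torch.cuda.is_available())
```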
diff --git a/server/lorax_server/models/flash_causal_lm.py b/server/lorax_server/models/flash_causal_lm.py
index c064cbe5d..726c60135 100644
--- a/server/lorax_server/models/flash_causal_lm.py
+++ b/server/lorax_server/models/flash_causal_lm.py
@@ -1092,7 +1092,10 @@ def __init__(
         else:
             self.kv_dtype = dtype
 
-        torch.distributed.barrier(group=self.process_group)
+        if self.process_group.size() > 1:
+            torch.distributed.barrier(group=self.process_group)
+        else:
+            self.process_group.barrier()
 
         filenames = weight_files(model_id, revision=revision, extension=".safetensors", embedding_dim=embedding_dim)
         merged_weight_filenames = None
@@ -1122,7 +1125,10 @@ def __init__(
             prefix = ""
 
         model = model_cls(prefix, config, weights)
-        torch.distributed.barrier(group=self.process_group)
+        if self.process_group.size() > 1:
+            torch.distributed.barrier(group=self.process_group)
+        else:
+            self.process_group.barrier()
 
         # VLM models define the config we care about in their text_config
         text_config = getattr(config, "text_config", None)
@@ -1273,7 +1279,6 @@ def adapter_memory_size(self) -> int:
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
         return ADAPTER_MEMORY_FRACTION * total_gpu_memory
 
-
     def init_graph_wrapper(self, max_total_tokens: int):
         self.model_graph_wrapper = GraphCache(
             self.model,
@@ -1291,7 +1296,7 @@ def init_graph_wrapper(self, max_total_tokens: int):
         )
 
     def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model: bool = False):
-        logger.info(f'Pre warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Pre warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
 
         # The warmup batch is the biggest batch we could ever receive
         max_total_tokens = batch.max_input_length + max_new_tokens + get_speculative_tokens()
@@ -1316,7 +1321,7 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model
             self.kv_dtype,
             self.device,
         )
-        logger.info(f'Pre warmup kv init cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Pre warmup kv init cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
 
         if not embedding_model:
             with warmup_mode():
@@ -1351,12 +1356,12 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model
                 logger.info("Estimated graph cache memory: {} MB", graph_cache_memory / 1024 / 1024)
             torch.cuda.synchronize(self.device)
 
-        logger.info(f'Post warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Post warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
         self.model_graph_wrapper = None
         self.kv_cache = []
         torch.cuda.synchronize(self.device)
         torch.cuda.empty_cache()
-        logger.info(f'Post warmup empty_cache cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Post warmup empty_cache cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
         dtype_size = torch.tensor([], dtype=self.dtype).element_size()
@@ -1365,8 +1370,7 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model
         preloaded_adapter_memory_fraction = sum(self.preloaded_adapter_memory_fractions.values())
 
         free_memory = get_cuda_free_memory(
-            self.device,
-            MEMORY_FRACTION - ADAPTER_MEMORY_FRACTION - preloaded_adapter_memory_fraction
+            self.device, MEMORY_FRACTION - ADAPTER_MEMORY_FRACTION - preloaded_adapter_memory_fraction
         )
         free_memory = max(0, free_memory - graph_cache_memory)
         logger.info("Memory remaining for kv cache: {} MB", free_memory / 1024 / 1024)
@@ -1762,8 +1766,7 @@ def generate_token(
                     # Only save tokens if we are done prefilling for this request
                     batch.all_input_ids_tensor[
                         i,
-                        batch.cache_lengths_tensor[i]
-                        + batch.input_lengths[i] : batch.cache_lengths_tensor[i]
+                        batch.cache_lengths_tensor[i] + batch.input_lengths[i] : batch.cache_lengths_tensor[i]
                         + batch.input_lengths[i]
                         + accepted_ids[i],
                     ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
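Note: the two __init__ hunks above wrap the collective in a size check so that single-GPU deployments, where the process group has size 1 and torch.distributed may not be fully initialized, fall back to the group's own barrier. A minimal sketch of the pattern; the helper name sync_group is hypothetical and not part of this patch:

```python
# Sketch of the guarded-barrier pattern from the hunks above; `sync_group` is a
# hypothetical helper name, not a function in the LoRAX codebase.
import torch.distributed


def sync_group(process_group) -> None:
    if process_group.size() > 1:
        # Sharded, multi-process case: synchronize all ranks via torch.distributed.
        torch.distributed.barrier(group=process_group)
    else:
        # Single-process case: use the group's own barrier so the call does not
        # depend on a multi-rank torch.distributed initialization.
        process_group.barrier()
```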
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 2f46784bd..47f4fdb84 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -19,10 +19,10 @@ grpcio-status = "^1.51.1"
 grpcio-reflection = "^1.51.1"
 grpc-interceptor = "^0.15.0"
 typer = "^0.6.1"
-accelerate = { version = "^0.24.1", optional = true }
+accelerate = { version = "^1.0.0", optional = true }
 bitsandbytes = { version = "^0.43.1", optional = true }
 scipy = { version = "^1.0.0", optional = true }
-safetensors = "0.4.2"
+safetensors = "0.4.3"
 loguru = "^0.6.0"
 opentelemetry-api = "^1.15.0"
 opentelemetry-exporter-otlp = "^1.15.0"
@@ -31,13 +31,13 @@ hf-transfer = "^0.1.2"
 sentencepiece = "^0.2"
 tokenizers = "^0.21"
 huggingface-hub = "^0.26"
-transformers = "^4.49"
+transformers = "^4.50.0"
 einops = "^0.6.1"
 tiktoken = "^0.5.2"
 texttable = { version = "^1.6.7", optional = true }
 datasets = { version = "^2.14.0", optional = true }
 torch = { version = "2.4.0", optional = true }
-peft = { version = "0.4.0", optional = true }
+peft = { version = "0.15.2", optional = true }
 boto3 = "^1.28.34"
 urllib3 = "1.26.19"
 hqq = { version = "^0.1.7", optional = true }
diff --git a/server/requirements.txt b/server/requirements.txt
index 3d1ef97b2..91b453bcf 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -29,22 +29,24 @@ jmespath==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0"
 markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "4.0"
 mpmath==1.3.0 ; python_version >= "3.9" and python_version < "4.0"
+msgspec==0.6.0 ; python_version >= "3.9" and python_version < "4.0"
 multidict==6.1.0 ; python_version >= "3.9" and python_version < "4.0"
 networkx==3.2.1 ; python_version >= "3.9" and python_version < "4.0"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "4.0"
-nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cublas-cu12==12.4.5.8 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cuda-cupti-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cuda-nvrtc-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cuda-runtime-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
 nvidia-cudnn-cu12==9.1.0.70 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cufft-cu12==11.2.1.3 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-curand-cu12==10.3.5.147 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cusolver-cu12==11.6.1.9 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cusparse-cu12==12.3.1.170 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cusparselt-cu12==0.6.2 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
 nvidia-ml-py==12.570.86 ; python_version >= "3.9" and python_version < "4.0"
-nvidia-nccl-cu12==2.20.5 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-nvjitlink-cu12==12.8.61 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-nccl-cu12==2.21.5 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-nvjitlink-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-nvtx-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
 opentelemetry-api==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-exporter-otlp-proto-common==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-exporter-otlp-proto-grpc==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
@@ -66,18 +68,18 @@ pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "4.0"
 regex==2024.11.6 ; python_version >= "3.9" and python_version < "4.0"
 requests==2.32.3 ; python_version >= "3.9" and python_version < "4.0"
 s3transfer==0.11.4 ; python_version >= "3.9" and python_version < "4.0"
-safetensors==0.4.2 ; python_version >= "3.9" and python_version < "4.0"
+safetensors==0.4.3 ; python_version >= "3.9" and python_version < "4.0"
 sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
 setuptools==75.8.2 ; python_version >= "3.9" and python_version < "4.0"
 six==1.17.0 ; python_version >= "3.9" and python_version < "4.0"
 stanford-stk==0.7.1 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux"
-sympy==1.13.3 ; python_version >= "3.9" and python_version < "4.0"
+sympy==1.13.1 ; python_version >= "3.9" and python_version < "4.0"
 tiktoken==0.5.2 ; python_version >= "3.9" and python_version < "4.0"
 tokenizers==0.21.0 ; python_version >= "3.9" and python_version < "4.0"
-torch==2.4.0 ; python_version >= "3.9" and python_version < "4.0"
+torch==2.6.0 ; python_version >= "3.9" and python_version < "4.0"
 tqdm==4.67.1 ; python_version >= "3.9" and python_version < "4.0"
-transformers==4.49.0 ; python_version >= "3.9" and python_version < "4.0"
-triton==3.0.0 ; python_version >= "3.9" and sys_platform == "linux" and python_version < "4.0" or python_version >= "3.9" and python_version < "3.13" and platform_machine == "x86_64" and platform_system == "Linux"
+transformers==4.50.0 ; python_version >= "3.9" and python_version < "4.0"
+triton==3.2.0 ; python_version >= "3.9" and sys_platform == "linux" and python_version < "4.0" or python_version >= "3.9" and python_version < "3.13" and platform_machine == "x86_64" and platform_system == "Linux"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "4.0"
 urllib3==1.26.19 ; python_version >= "3.9" and python_version < "4.0"