diff --git a/Cargo.lock b/Cargo.lock
index 2ed9cff0d..f5ea07c37 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -556,9 +556,9 @@ dependencies = [
 
 [[package]]
 name = "cc"
-version = "1.2.0"
+version = "1.2.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1aeb932158bd710538c73702db6945cb68a8fb08c519e6e12706b94263b36db8"
+checksum = "32db95edf998450acc7881c932f94cd9b05c87b4b2599e8bab064753da4acfd1"
 dependencies = [
  "jobserver",
  "libc",
@@ -729,9 +729,9 @@ dependencies = [
 
 [[package]]
 name = "crossbeam-channel"
-version = "0.5.13"
+version = "0.5.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
+checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2"
 dependencies = [
  "crossbeam-utils",
 ]
@@ -1875,7 +1875,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
 dependencies = [
  "cfg-if",
- "windows-targets 0.52.6",
+ "windows-targets 0.48.5",
 ]
 
 [[package]]
@@ -2514,9 +2514,9 @@ dependencies = [
 
 [[package]]
 name = "openssl"
-version = "0.10.70"
+version = "0.10.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "61cfb4e166a8bb8c9b55c500bc2308550148ece889be90f609377e58140f42c6"
+checksum = "fedfea7d58a1f73118430a55da6a286e7b044961736ce96a16a17068ea25e5da"
 dependencies = [
  "bitflags 2.6.0",
  "cfg-if",
@@ -2546,9 +2546,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
 
 [[package]]
 name = "openssl-sys"
-version = "0.9.105"
+version = "0.9.108"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b22d5b84be05a8d6947c7cb71f7c849aa0f112acd4bf51c2a7c1c988ac0a9dc"
+checksum = "e145e1651e858e820e4860f7b9c5e169bc1d8ce1c86043be79fa7b7634821847"
 dependencies = [
  "cc",
  "libc",
@@ -3396,7 +3396,7 @@ dependencies = [
  "cc",
  "libc",
  "once_cell",
- "spin 0.5.2",
+ "spin",
  "untrusted 0.7.1",
  "web-sys",
  "winapi",
@@ -3404,15 +3404,14 @@
 
 [[package]]
 name = "ring"
-version = "0.17.8"
+version = "0.17.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
 dependencies = [
  "cc",
  "cfg-if",
  "getrandom",
  "libc",
- "spin 0.9.8",
  "untrusted 0.9.0",
  "windows-sys 0.52.0",
 ]
@@ -3520,7 +3519,7 @@ dependencies = [
 "aws-lc-rs",
  "log",
  "once_cell",
- "ring 0.17.8",
+ "ring 0.17.14",
  "rustls-pki-types",
  "rustls-webpki",
  "subtle",
@@ -3549,7 +3548,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
 dependencies = [
  "aws-lc-rs",
- "ring 0.17.8",
+ "ring 0.17.14",
  "rustls-pki-types",
  "untrusted 0.9.0",
 ]
@@ -3596,7 +3595,7 @@ version = "0.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414"
 dependencies = [
- "ring 0.17.8",
+ "ring 0.17.14",
  "untrusted 0.9.0",
 ]
@@ -3801,12 +3800,6 @@ version = "0.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
 
-[[package]]
-name = "spin"
-version = "0.9.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
-
 [[package]]
 name = "spm_precompiled"
 version = "0.1.4"
@@ -4888,7 +4881,7 @@ version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed63aea5ce73d0ff405984102c42de94fc55a6b75765d621c65262469b3c9b53"
 dependencies = [
- "ring 0.17.8",
+ "ring 0.17.14",
  "untrusted 0.9.0",
 ]
diff --git a/Dockerfile b/Dockerfile
index 0988daf58..302fb6e57 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,6 +17,8 @@ FROM chef AS builder
 ARG GIT_SHA
 ARG DOCKER_LABEL
 
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    python3.11-dev
 RUN PROTOC_ZIP=protoc-21.12-linux-x86_64.zip && \
     curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP && \
     unzip -o $PROTOC_ZIP -d /usr/local bin/protoc && \
@@ -37,8 +39,8 @@ RUN cargo build --release
 
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
 FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 as pytorch-install
-ARG PYTORCH_VERSION=2.4.0
-ARG PYTHON_VERSION=3.10
+ARG PYTORCH_VERSION=2.6.0
+ARG PYTHON_VERSION=3.11
 # Keep in sync with `server/pyproject.toml
 ARG CUDA_VERSION=12.4
 ARG MAMBA_VERSION=24.3.0-0
@@ -47,6 +49,7 @@ ARG INSTALL_CHANNEL=pytorch
 # Automatically set by buildx
 ARG TARGETPLATFORM
 
+WORKDIR /usr/src
 ENV PATH /opt/conda/bin:$PATH
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
@@ -57,31 +60,19 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     git && \
     rm -rf /var/lib/apt/lists/*
 
-# Install conda
-# translating Docker's TARGETPLATFORM into mamba arches
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") MAMBA_ARCH=aarch64 ;; \
-    *) MAMBA_ARCH=x86_64 ;; \
-    esac && \
-    curl -fsSL -v -o ~/mambaforge.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MAMBA_VERSION}/Mambaforge-${MAMBA_VERSION}-Linux-${MAMBA_ARCH}.sh"
-RUN chmod +x ~/mambaforge.sh && \
-    bash ~/mambaforge.sh -b -p /opt/conda && \
-    rm ~/mambaforge.sh
-
-# Install pytorch
-# On arm64 we exit with an error code
-RUN case ${TARGETPLATFORM} in \
-    "linux/arm64") exit 1 ;; \
-    *) /opt/conda/bin/conda update -y conda && \
-    /opt/conda/bin/conda install -c "${INSTALL_CHANNEL}" -c "${CUDA_CHANNEL}" -y "python=${PYTHON_VERSION}" "pytorch=$PYTORCH_VERSION" "pytorch-cuda=$(echo $CUDA_VERSION | cut -d'.' -f 1-2)" ;; \
-    esac && \
-    /opt/conda/bin/conda clean -ya
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+ENV PATH="$PATH:/root/.local/bin"
+RUN uv python install ${PYTHON_VERSION}
+RUN uv venv --python ${PYTHON_VERSION} && uv pip install torch==${PYTORCH_VERSION} torchvision pip setuptools packaging
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"
+
 
 # CUDA kernels builder image
 FROM pytorch-install as kernel-builder
 
-ARG MAX_JOBS=2
-
+ARG MAX_JOBS=8
+ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0+PTX"
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
     ninja-build cmake \
     && rm -rf /var/lib/apt/lists/*
@@ -90,38 +81,38 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
 FROM kernel-builder as flash-att-builder
 WORKDIR /usr/src
 COPY server/Makefile-flash-att Makefile
-RUN make build-flash-attention
+RUN . .venv/bin/activate && make build-flash-attention
 
 # Build Flash Attention v2 CUDA kernels
 FROM kernel-builder as flash-att-v2-builder
 WORKDIR /usr/src
 COPY server/Makefile-flash-att-v2 Makefile
-RUN make build-flash-attention-v2-cuda
+RUN . .venv/bin/activate && make build-flash-attention-v2-cuda
 
 # Build Transformers exllama kernels
 FROM kernel-builder as exllama-kernels-builder
 WORKDIR /usr/src
 COPY server/exllama_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 
 # Build Transformers exllama kernels
 FROM kernel-builder as exllamav2-kernels-builder
 WORKDIR /usr/src
 COPY server/exllamav2_kernels/ .
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" python setup.py build
 
 # Build Transformers awq kernels
 FROM kernel-builder as awq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
 
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
 COPY server/custom_kernels/ .
 # Build specific version of transformers
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build
 
 # Build vllm CUDA kernels
 FROM kernel-builder as vllm-builder
@@ -136,14 +127,14 @@ RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 COPY server/Makefile-vllm Makefile
 # Build specific version of vllm
-RUN make build-vllm-cuda
+RUN . .venv/bin/activate && make build-vllm-cuda
 
 # Build megablocks kernels
 FROM kernel-builder as megablocks-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-megablocks Makefile
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
-RUN make build-megablocks
+RUN . .venv/bin/activate && make build-megablocks
 
 # Build punica CUDA kernels
 FROM kernel-builder as punica-builder
@@ -151,14 +142,14 @@ WORKDIR /usr/src
 COPY server/punica_kernels/ .
 # Build specific version of punica
 ENV TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
-RUN python setup.py build
+RUN . .venv/bin/activate && python setup.py build
 
 # Build eetq kernels
 FROM kernel-builder as eetq-kernels-builder
 WORKDIR /usr/src
 COPY server/Makefile-eetq Makefile
 # Build specific version of transformers
-RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
+RUN . .venv/bin/activate && TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
 
 # LoRAX base image
 FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base
@@ -179,44 +170,59 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-ins
     ca-certificates \
     make \
     sudo \
+    build-essential \
+    g++ \
     && rm -rf /var/lib/apt/lists/*
 
 # Copy conda with PyTorch installed
-COPY --from=pytorch-install /opt/conda /opt/conda
+# COPY --from=pytorch-install /opt/conda /opt/conda
+
+# RUN curl -LsSf https://astral.sh/uv/install.sh | sh
+# ENV PATH="$PATH:/root/.local/bin"
+COPY --from=ghcr.io/astral-sh/uv:0.5.31 /uv /uvx /bin/
+# Install flash-attention dependencies
+# RUN pip install einops --no-cache-dir
+
+# Copy env with PyTorch installed
+COPY --from=pytorch-install /usr/src/.venv /usr/src/.venv
+ENV PYTHON_VERSION=3.11
+RUN uv python install ${PYTHON_VERSION}
+ENV VIRTUAL_ENV=/usr/src/.venv/
+ENV PATH="$PATH:/usr/src/.venv/bin/"
 
 # Copy build artifacts from flash attention builder
-COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
-COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/layer_norm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
+COPY --from=flash-att-builder /usr/src/flash-attention/csrc/rotary/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from flash attention v2 builder
-COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from custom kernels builder
-COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from exllama kernels builder
-COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from exllamav2 kernels builder
-COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=exllamav2-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from awq kernels builder
-COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=awq-kernels-builder /usr/src/llm-awq/awq/kernels/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy builds artifacts from vllm builder
-COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy builds artifacts from punica builder
-COPY --from=punica-builder /usr/src/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=punica-builder /usr/src/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from megablocks builder
-COPY --from=megablocks-kernels-builder /usr/src/megablocks/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=megablocks-kernels-builder /usr/src/megablocks/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Copy build artifacts from eetq builder
-COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-310 /opt/conda/lib/python3.10/site-packages
+COPY --from=eetq-kernels-builder /usr/src/eetq/build/lib.linux-x86_64-cpython-311 /usr/src/.venv/lib/python3.11/site-packages
 
 # Install flash-attention dependencies
-RUN pip install einops --no-cache-dir
+RUN uv pip install einops --no-cache-dir
 
 # Install flashinfer
-RUN pip install --no-cache-dir flashinfer==0.1.6 -i https://flashinfer.ai/whl/cu124/torch2.4
+RUN uv pip install flashinfer-python -i https://flashinfer.ai/whl/cu126/torch2.6/
 
 # Install server
 COPY proto proto
@@ -225,20 +231,14 @@ COPY server/Makefile server/Makefile
 RUN cd server && \
     make gen-server && \
-    pip install --no-cache-dir -r requirements.txt && \
-    pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
+    uv pip install --no-cache-dir -r requirements.txt && \
+    uv pip install ".[bnb, accelerate, quantize, peft, outlines]" --no-cache-dir
 
 # Install router
 COPY --from=builder /usr/src/target/release/lorax-router /usr/local/bin/lorax-router
 # Install launcher
 COPY --from=builder /usr/src/target/release/lorax-launcher /usr/local/bin/lorax-launcher
 
-RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
-    build-essential \
-    g++ \
-    && rm -rf /var/lib/apt/lists/*
-
-
 # Final image
 FROM base
 LABEL source="https://github.com/predibase/lorax"
diff --git a/server/Makefile b/server/Makefile
index 0cf2a2911..5eee31fdf 100644
--- a/server/Makefile
+++ b/server/Makefile
@@ -19,7 +19,7 @@ gen-server:
 
 install: gen-server
 	pip install pip --upgrade
-	pip install torch==2.4.0
+	pip install torch==2.6.0
 	pip install -r requirements.txt
 	pip install -e ".[bnb, accelerate, quantize, peft, outlines]"
diff --git a/server/Makefile-eetq b/server/Makefile-eetq
index 726e47b57..1b97f6fd7 100644
--- a/server/Makefile-eetq
+++ b/server/Makefile-eetq
@@ -1,4 +1,4 @@
-eetq_commit := 1657b1504faa359e2ce0ac02999439d7ac8c74c0
+eetq_commit := 465e9726bf7ae30803a2d0dd9e5d4315aef17491
 
 eetq:
 	# Clone eetq
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 4c92391b3..e4569c95b 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,10 +1,10 @@
 vllm-cuda:
 	# Clone vllm
-	pip install -U ninja packaging --no-cache-dir
+	pip install -U setuptools_scm ninja packaging --no-cache-dir
 	git clone https://github.com/vllm-project/vllm.git vllm
 
 build-vllm-cuda: vllm-cuda
-	cd vllm && git fetch && git checkout 766435e660a786933392eb8ef0a873bc38cf0c8b
+	cd vllm && git fetch && git checkout 296c6572dd1f76b31b93be19e550790afcfb8843
 	cd vllm && python setup.py build
 
 install-vllm-cuda: build-vllm-cuda
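Note: the Dockerfile hunks above replace the conda/mambaforge toolchain with a uv-managed virtualenv at /usr/src/.venv (Python 3.11, PyTorch 2.6.0), activate that venv in every kernel-builder stage, and copy the built extensions into /usr/src/.venv/lib/python3.11/site-packages. A minimal, hypothetical smoke test (not part of this patch) that could be run inside the final image to confirm the venv and copied artifacts resolve as expected:

```python
# Hypothetical check, not part of the patch: verify the uv-managed venv is active
# and provides the interpreter and torch build the Dockerfile now targets.
import sys
import sysconfig

import torch

print("python:", sys.version.split()[0])                    # expect 3.11.x
print("site-packages:", sysconfig.get_paths()["purelib"])   # expect /usr/src/.venv/lib/python3.11/site-packages
print("torch:", torch.__version__, "cuda:", torch.version.cuda)  # expect 2.6.x built against CUDA 12.x
print("cuda available:", torch.cuda.is_available())
```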
diff --git a/server/lorax_server/models/flash_causal_lm.py b/server/lorax_server/models/flash_causal_lm.py
index c064cbe5d..726c60135 100644
--- a/server/lorax_server/models/flash_causal_lm.py
+++ b/server/lorax_server/models/flash_causal_lm.py
@@ -1092,7 +1092,10 @@ def __init__(
         else:
             self.kv_dtype = dtype
 
-        torch.distributed.barrier(group=self.process_group)
+        if self.process_group.size() > 1:
+            torch.distributed.barrier(group=self.process_group)
+        else:
+            self.process_group.barrier()
 
         filenames = weight_files(model_id, revision=revision, extension=".safetensors", embedding_dim=embedding_dim)
         merged_weight_filenames = None
@@ -1122,7 +1125,10 @@ def __init__(
             prefix = ""
 
         model = model_cls(prefix, config, weights)
-        torch.distributed.barrier(group=self.process_group)
+        if self.process_group.size() > 1:
+            torch.distributed.barrier(group=self.process_group)
+        else:
+            self.process_group.barrier()
 
         # VLM models define the config we care about in their text_config
         text_config = getattr(config, "text_config", None)
@@ -1273,7 +1279,6 @@ def adapter_memory_size(self) -> int:
         total_gpu_memory = torch.cuda.get_device_properties(self.device).total_memory
         return ADAPTER_MEMORY_FRACTION * total_gpu_memory
 
-
     def init_graph_wrapper(self, max_total_tokens: int):
         self.model_graph_wrapper = GraphCache(
             self.model,
@@ -1291,7 +1296,7 @@ def init_graph_wrapper(self, max_total_tokens: int):
         )
 
     def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model: bool = False):
-        logger.info(f'Pre warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Pre warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
 
         # The warmup batch is the biggest batch we could ever receive
         max_total_tokens = batch.max_input_length + max_new_tokens + get_speculative_tokens()
@@ -1316,7 +1321,7 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model
             self.kv_dtype,
             self.device,
         )
-        logger.info(f'Pre warmup kv init cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Pre warmup kv init cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
 
         if not embedding_model:
             with warmup_mode():
@@ -1351,12 +1356,12 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model
                 logger.info("Estimated graph cache memory: {} MB", graph_cache_memory / 1024 / 1024)
             torch.cuda.synchronize(self.device)
 
-        logger.info(f'Post warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Post warmup cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
         self.model_graph_wrapper = None
         self.kv_cache = []
         torch.cuda.synchronize(self.device)
         torch.cuda.empty_cache()
-        logger.info(f'Post warmup empty_cache cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB')
+        logger.info(f"Post warmup empty_cache cuda memory: {get_cuda_free_memory(self.device, 1) / (1024 ** 3):.2f} GB")
         # Inspired by the original implementation in [vllm](https://github.com/vllm-project/vllm)
         # Calculate the number of blocks that can be allocated with the free memory
         dtype_size = torch.tensor([], dtype=self.dtype).element_size()
@@ -1365,8 +1370,7 @@ def warmup(self, batch: FlashCausalLMBatch, max_new_tokens: int, embedding_model
         preloaded_adapter_memory_fraction = sum(self.preloaded_adapter_memory_fractions.values())
 
         free_memory = get_cuda_free_memory(
-            self.device,
-            MEMORY_FRACTION - ADAPTER_MEMORY_FRACTION - preloaded_adapter_memory_fraction
+            self.device, MEMORY_FRACTION - ADAPTER_MEMORY_FRACTION - preloaded_adapter_memory_fraction
         )
         free_memory = max(0, free_memory - graph_cache_memory)
         logger.info("Memory remaining for kv cache: {} MB", free_memory / 1024 / 1024)
@@ -1762,8 +1766,7 @@ def generate_token(
                     # Only save tokens if we are done prefilling for this request
                     batch.all_input_ids_tensor[
                         i,
-                        batch.cache_lengths_tensor[i]
-                        + batch.input_lengths[i] : batch.cache_lengths_tensor[i]
+                        batch.cache_lengths_tensor[i] + batch.input_lengths[i] : batch.cache_lengths_tensor[i]
                         + batch.input_lengths[i]
                         + accepted_ids[i],
                     ] = next_input_ids[cu_accepted_ids[i] : cu_accepted_ids[i + 1]]
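Note: the two __init__ hunks above wrap the collective in a size check so that single-GPU deployments, where the process group has size 1 and torch.distributed may not be fully initialized, fall back to the group's own barrier. A minimal sketch of the pattern; the helper name sync_group is hypothetical and not part of this patch:

```python
# Sketch of the guarded-barrier pattern from the hunks above; `sync_group` is a
# hypothetical helper name, not a function in the LoRAX codebase.
import torch.distributed


def sync_group(process_group) -> None:
    if process_group.size() > 1:
        # Sharded, multi-process case: synchronize all ranks via torch.distributed.
        torch.distributed.barrier(group=process_group)
    else:
        # Single-process case: use the group's own barrier so the call does not
        # depend on a multi-rank torch.distributed initialization.
        process_group.barrier()
```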
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 2f46784bd..47f4fdb84 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -19,10 +19,10 @@ grpcio-status = "^1.51.1"
 grpcio-reflection = "^1.51.1"
 grpc-interceptor = "^0.15.0"
 typer = "^0.6.1"
-accelerate = { version = "^0.24.1", optional = true }
+accelerate = { version = "^1.0.0", optional = true }
 bitsandbytes = { version = "^0.43.1", optional = true }
 scipy = { version = "^1.0.0", optional = true }
-safetensors = "0.4.2"
+safetensors = "0.4.3"
 loguru = "^0.6.0"
 opentelemetry-api = "^1.15.0"
 opentelemetry-exporter-otlp = "^1.15.0"
@@ -31,13 +31,13 @@ hf-transfer = "^0.1.2"
 sentencepiece = "^0.2"
 tokenizers = "^0.21"
 huggingface-hub = "^0.26"
-transformers = "^4.49"
+transformers = "^4.50.0"
 einops = "^0.6.1"
 tiktoken = "^0.5.2"
 texttable = { version = "^1.6.7", optional = true }
 datasets = { version = "^2.14.0", optional = true }
 torch = { version = "2.4.0", optional = true }
-peft = { version = "0.4.0", optional = true }
+peft = { version = "0.15.2", optional = true }
 boto3 = "^1.28.34"
 urllib3 = "1.26.19"
 hqq = { version = "^0.1.7", optional = true }
diff --git a/server/requirements.txt b/server/requirements.txt
index 3d1ef97b2..91b453bcf 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -29,22 +29,24 @@ jmespath==1.0.1 ; python_version >= "3.9" and python_version < "4.0"
 loguru==0.6.0 ; python_version >= "3.9" and python_version < "4.0"
 markupsafe==3.0.2 ; python_version >= "3.9" and python_version < "4.0"
 mpmath==1.3.0 ; python_version >= "3.9" and python_version < "4.0"
+msgspec==0.6.0 ; python_version >= "3.9" and python_version < "4.0"
 multidict==6.1.0 ; python_version >= "3.9" and python_version < "4.0"
 networkx==3.2.1 ; python_version >= "3.9" and python_version < "4.0"
 numpy==1.26.4 ; python_version >= "3.9" and python_version < "4.0"
-nvidia-cublas-cu12==12.1.3.1 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-cupti-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-nvrtc-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cuda-runtime-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cublas-cu12==12.4.5.8 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cuda-cupti-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cuda-nvrtc-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cuda-runtime-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
 nvidia-cudnn-cu12==9.1.0.70 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cufft-cu12==11.0.2.54 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-curand-cu12==10.3.2.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cusolver-cu12==11.4.5.107 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-cusparse-cu12==12.1.0.106 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cufft-cu12==11.2.1.3 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-curand-cu12==10.3.5.147 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cusolver-cu12==11.6.1.9 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cusparse-cu12==12.3.1.170 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-cusparselt-cu12==0.6.2 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
 nvidia-ml-py==12.570.86 ; python_version >= "3.9" and python_version < "4.0"
-nvidia-nccl-cu12==2.20.5 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-nvjitlink-cu12==12.8.61 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
-nvidia-nvtx-cu12==12.1.105 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-nccl-cu12==2.21.5 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-nvjitlink-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
+nvidia-nvtx-cu12==12.4.127 ; platform_system == "Linux" and platform_machine == "x86_64" and python_version >= "3.9" and python_version < "4.0"
 opentelemetry-api==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-exporter-otlp-proto-common==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
 opentelemetry-exporter-otlp-proto-grpc==1.21.0 ; python_version >= "3.9" and python_version < "4.0"
@@ -66,18 +68,18 @@ pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "4.0"
 regex==2024.11.6 ; python_version >= "3.9" and python_version < "4.0"
 requests==2.32.3 ; python_version >= "3.9" and python_version < "4.0"
 s3transfer==0.11.4 ; python_version >= "3.9" and python_version < "4.0"
-safetensors==0.4.2 ; python_version >= "3.9" and python_version < "4.0"
+safetensors==0.4.3 ; python_version >= "3.9" and python_version < "4.0"
 sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "4.0"
 setuptools==75.8.2 ; python_version >= "3.9" and python_version < "4.0"
 six==1.17.0 ; python_version >= "3.9" and python_version < "4.0"
 stanford-stk==0.7.1 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux"
-sympy==1.13.3 ; python_version >= "3.9" and python_version < "4.0"
+sympy==1.13.1 ; python_version >= "3.9" and python_version < "4.0"
 tiktoken==0.5.2 ; python_version >= "3.9" and python_version < "4.0"
 tokenizers==0.21.0 ; python_version >= "3.9" and python_version < "4.0"
-torch==2.4.0 ; python_version >= "3.9" and python_version < "4.0"
+torch==2.6.0 ; python_version >= "3.9" and python_version < "4.0"
 tqdm==4.67.1 ; python_version >= "3.9" and python_version < "4.0"
-transformers==4.49.0 ; python_version >= "3.9" and python_version < "4.0"
-triton==3.0.0 ; python_version >= "3.9" and sys_platform == "linux" and python_version < "4.0" or python_version >= "3.9" and python_version < "3.13" and platform_machine == "x86_64" and platform_system == "Linux"
+transformers==4.50.0 ; python_version >= "3.9" and python_version < "4.0"
+triton==3.2.0 ; python_version >= "3.9" and sys_platform == "linux" and python_version < "4.0" or python_version >= "3.9" and python_version < "3.13" and platform_machine == "x86_64" and platform_system == "Linux"
 typer==0.6.1 ; python_version >= "3.9" and python_version < "4.0"
 typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "4.0"
 urllib3==1.26.19 ; python_version >= "3.9" and python_version < "4.0"