Skip to content

Commit

Permalink
Upgrade to torch 2.5.0 (#20)
Browse files: browse the repository at this point in the history
* Upgrade to torch 2.5.0

* Drop `mamba-ssm` for now

* Fix transformers broadcast issue
  • Loading branch information
chiragjn authored Dec 23, 2024
1 parent 46298e8 commit 9675011
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 21 deletions.
12 changes: 7 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
# https://hub.docker.com/layers/winglian/axolotl/main-20241111-py3.11-cu121-2.3.1/images/sha256-67c35533cf8e7a399de19cdaf3852be093b9e184b9554ea38801a482da5d7231?context=explore
FROM winglian/axolotl@sha256:1f892444717a6781ad0e6e02b3548cd76be14d65a7162f2d82eab5c809936bc5
# https://hub.docker.com/layers/winglian/axolotl/main-20241217/images/sha256-5ed6e068d193ac35d092f8d6ccb56b1750779415cd07047edbbfb8d4edd87ae2
FROM winglian/axolotl@sha256:0966ba0bdfda0a317016614a6eb9f599325d0e42109544f95f5540d144ddeebd
SHELL ["/bin/bash", "-c"]
USER root
RUN [ "$(/usr/local/cuda/bin/nvcc --version | egrep -o "V[0-9]+\.[0-9]+" | cut -c2-)" = "12.1" ] || (echo "Error: CUDA version is not 12.1" && exit 1)

# Install torch and axolotl requirements
COPY torch-requirements.txt base-requirements.txt requirements.txt /tmp/llm-finetune/
COPY base-requirements.txt requirements.txt /tmp/llm-finetune/
RUN pip install -U pip wheel setuptools && \
pip uninstall -y axolotl && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-cache-dir --no-build-isolation --use-pep517 -r /tmp/llm-finetune/requirements.txt && \
pip uninstall -y axolotl torch && \
pip install -U --no-cache-dir --use-pep517 -r /tmp/llm-finetune/base-requirements.txt && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install --no-cache-dir --no-build-isolation --use-pep517 -r /tmp/llm-finetune/requirements.txt && \
rm -rf /root/.cache/pip

# Install axolotl_truefoundry plugin
Expand Down
13 changes: 7 additions & 6 deletions Dockerfile-notebook
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
FROM tfy.jfrog.io/tfy-images/jupyter:0.3.8-cu121-py3.11.10-sudo
SHELL ["/bin/bash", "-c"]
USER root
RUN [ "$(/usr/local/cuda/bin/nvcc --version | egrep -o "V[0-9]+\.[0-9]+" | cut -c2-)" = "12.1" ] || (echo "Error: CUDA version is not 12.1" && exit 1)
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ENV DEBIAN_FRONTEND=noninteractive

# upgrade libnccl
USER root
RUN apt update && \
apt install -y --no-install-recommends git curl wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -O /tmp/cuda-keyring_1.1-1_all.deb && \
Expand All @@ -14,11 +14,12 @@ RUN apt update && \

# Install torch and axolotl
USER jovyan
COPY torch-requirements.txt base-requirements.txt requirements.txt notebook-requirements.txt /tmp/llm-finetune/
COPY base-requirements.txt requirements.txt notebook-requirements.txt /tmp/llm-finetune/
RUN pip install -U pip wheel setuptools && \
pip uninstall -y axolotl && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/torch-requirements.txt && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-cache-dir --no-build-isolation --use-pep517 -r /tmp/llm-finetune/notebook-requirements.txt
pip uninstall -y axolotl torch && \
pip install -U --no-cache-dir --use-pep517 -r /tmp/llm-finetune/base-requirements.txt && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install --no-cache-dir --no-build-isolation --use-pep517 -r /tmp/llm-finetune/requirements.txt && \
pip install --no-cache-dir --use-pep517 -r /tmp/llm-finetune/notebook-requirements.txt

# Setup editable packages
USER root
Expand Down
9 changes: 5 additions & 4 deletions base-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
--extra-index-url https://download.pytorch.org/whl/cu121
cloud-files==4.29.0
cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@1f3ebdb20653a26598a2722acd21e9b1528608c3
fsspec==2024.9.0
hf-transfer<0.2.0
pyarrow>=15.0.0,<19.0.0
rich>=13.0.0,<14
s3fs==2024.9.0
snowflake-connector-python[pandas]==3.12.3
torch==2.3.1+cu121
torchao==0.6.1+cu121
truefoundry==0.5.2rc1
unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git@9dc399a6b6625ee40835c5eab361426d3c5d4abb
torch==2.5.0+cu121
torchao==0.7.0+cu121
transformers @ git+https://github.com/truefoundry/transformers.git@09a21295f3a13bf81b4fc22057bb2fc9ae063891
truefoundry==0.5.3rc2
5 changes: 5 additions & 0 deletions config-base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ val_set_size: 0.1
data_dir: auto # type: string
datasets: auto # type: list
test_datasets: auto # type: list
batch_flattening: auto # type: bool
bf16: auto # type: bool
bfloat16: auto # type: bool
flash_attention: auto # type: bool
Expand Down Expand Up @@ -90,6 +91,7 @@ peft_use_rslora: True
plugins:
- axolotl_truefoundry.TrueFoundryMLPlugin
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
pad_to_sequence_len: True
remove_unused_columns: True
report_to: tensorboard
Expand Down Expand Up @@ -140,3 +142,6 @@ liger_rms_norm: True
liger_glu_activation: True
liger_layer_norm: True
liger_fused_linear_cross_entropy: True

## CutCrossEntropy
cut_cross_entropy: False
1 change: 0 additions & 1 deletion notebook-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
-r requirements.txt
jupyter-app-launcher==0.3.1
2 changes: 1 addition & 1 deletion plugins/axolotl_truefoundry/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ dependencies = [
"transformers>=4.0.0,<5",
"truefoundry>=0.5.1,<0.6.0",
"pynvml>=11.0.0,<12",
"torch>=2.3.0,<2.4.0",
"torch>=2.0.0,<3.0.0",
"pydantic>=2.0.0,<3",
"orjson",
]
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
--extra-index-url https://download.pytorch.org/whl/cu121
-r base-requirements.txt
axolotl[deepspeed,flash-attn,mamba-ssm,optimizers,lion-pytorch,galore] @ git+https://github.com/truefoundry/axolotl@c7fc338e67c4313ec82fcca304733c9ececae5c0
unsloth[cu121-ampere-torch250] @ git+https://github.com/unslothai/unsloth.git@9dc399a6b6625ee40835c5eab361426d3c5d4abb
axolotl[deepspeed,flash-attn,optimizers,lion-pytorch,galore] @ git+https://github.com/truefoundry/axolotl@5878daa3beec58bf4f4d21a6abd6dba3c40e74f4
2 changes: 0 additions & 2 deletions torch-requirements.txt

This file was deleted.

11 changes: 10 additions & 1 deletion train.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,13 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
else:
cfg[k] = kwargs[k]
if not cfg.output_dir:
raise ValueError("`output_dir` must be set in config base")
raise ValueError("`output_dir` must be set")

if cfg.dataset_type == "chat" and cfg.long_sequences_strategy == "truncate":
raise ValueError(
"Chat datasets cannot be truncated. Please set `long_sequences_strategy` either to "
"`drop` to drop sequences longer than `sequence_len` or `error` to raise an error."
)

if is_main_process():
if cfg.cleanup_output_dir_on_start is True:
Expand Down Expand Up @@ -181,6 +187,9 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
set_cfg_option_if_auto(cfg, "flash_attn_fuse_mlp", cfg.adapter not in {"qlora", "lora"})
set_cfg_option_if_auto(cfg, "flash_attn_fuse_qkv", cfg.adapter not in {"qlora", "lora"})

set_cfg_option_if_auto(
cfg, "batch_flattening", not cfg.sample_packing and cfg.flash_attention and cfg.micro_batch_size > 1
)
set_cfg_option_if_auto(cfg, "optimizer", "adamw_torch_fused" if cfg.adapter == "qlora" else "adamw_torch")

if cfg.datasets == "auto":
Expand Down

0 comments on commit 9675011

Please sign in to comment.