diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml
index 62b05de9..fc0d6dd0 100644
--- a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml
+++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml
@@ -25,10 +25,9 @@ tokenizer:
 
 # Dataset
 dataset:
-  _component_: torchtune.datasets.wiki_text
-  train_on_input: True
+  _component_: torchtune.datasets.wikitext_dataset
 seed: null
-shuffle: True
+shuffle: False
 
 # Model Arguments
 model:
@@ -75,8 +74,8 @@ checkpointer:
 resume_from_checkpoint: False
 
 # Fine-tuning arguments
-batch_size: 2
-epochs: 3
+batch_size: 1
+epochs: 1
 
 optimizer:
   _component_: torch.optim.AdamW
@@ -95,6 +94,7 @@ device: cuda
 # Memory management
 enable_activation_checkpointing: True
 memory_efficient_fsdp_wrap: True
+fsdp_cpu_offload: True
 
 # Reduced precision
 dtype: bf16
diff --git a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch
index 239c8b02..a3a3b235 100644
--- a/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch
+++ b/3.test_cases/torchtune/slurm/tutorials/e2e-llama3-70b-development/full_finetune_distributed.sbatch
@@ -77,7 +77,7 @@ declare -a TORCHRUN_ARGS=(
     --rdzv_endpoint=$(hostname)
 )
 declare -a TRAIN_ARGS=(
-    --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/lora_finetune_distributed.yaml
+    --config ${PWD}/tutorials/e2e-llama3-70b-development/configs/full_finetune_distributed.yaml
     tokenizer.path=${MODEL_PATH}/${HF_MODEL}/original/tokenizer.model
     checkpointer.checkpoint_dir=${MODEL_PATH}/${HF_MODEL}
     checkpointer.output_dir=${MODEL_PATH}/${HF_MODEL}-tuned