Update tests/smoketest.sh to support FSDP + LoRA as a testing path.
Additionally introduce a max_seq_len parameter to support testing
on lower-end hardware.

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Nov 13, 2024
1 parent 200d707 commit b945f56
Showing 2 changed files with 26 additions and 4 deletions.
25 changes: 24 additions & 1 deletion src/instructlab/training/utils.py
@@ -46,7 +46,11 @@
import torch.nn.functional as F

# First Party
from instructlab.training.config import DistributedBackend, TrainingArgs
from instructlab.training.config import (
    DistributedBackend,
    QuantizeDataType,
    TrainingArgs,
)


def check_valid_train_args(train_args: TrainingArgs):
@@ -76,6 +80,25 @@ def check_valid_train_args(train_args: TrainingArgs):
            "\033[33m WARNING: is_padding_free is being deprecated due to adoption of the default padding-free support in Hugging Face Transformers. As such, this flag is non-functional in 0.6.0 and beyond. If you would like to use the older Dolomite padding-free implementation, please set use_dolomite moving forward.\033[0m"
        )

    if (
        train_args.accelerate_full_state_at_epoch
        and train_args.lora
        and train_args.lora.rank > 0
    ):
        raise ValueError(
            "`accelerate_full_state_at_epoch` is not currently supported when training LoRA models."
        )

    if (
        train_args.lora
        and train_args.lora.rank > 0
        and train_args.lora.quantize_data_type != QuantizeDataType.NONE
        and train_args.distributed_backend == DistributedBackend.FSDP.value
    ):
        raise ValueError(
            "Quantization is not supported when training LoRA models with FSDP. For quantized LoRA training, please switch to DeepSpeed."
        )


def retrieve_chat_template(chat_tmpl_path):
    try:
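For orientation, here is a minimal sketch of the two guards this hunk adds to check_valid_train_args(). It deliberately uses SimpleNamespace stand-ins in place of the real TrainingArgs/LoraOptions config models, and plain strings in place of the QuantizeDataType/DistributedBackend enums, so only the attribute names are taken from the diff; everything else is illustrative.

from types import SimpleNamespace

# Stand-ins for the real config objects; attribute names follow the diff above.
quantized_lora = SimpleNamespace(rank=4, quantize_data_type="nf4")
train_args = SimpleNamespace(
    accelerate_full_state_at_epoch=False,
    lora=quantized_lora,
    distributed_backend="fsdp",  # stands in for DistributedBackend.FSDP.value
)

# Guard 1: saving full accelerate state at epoch is rejected for LoRA runs.
if (
    train_args.accelerate_full_state_at_epoch
    and train_args.lora
    and train_args.lora.rank > 0
):
    raise ValueError("accelerate_full_state_at_epoch is not supported with LoRA")

# Guard 2: quantized LoRA under FSDP is rejected; with the stand-in values
# above this branch fires and raises, pointing users at DeepSpeed instead.
if (
    train_args.lora
    and train_args.lora.rank > 0
    and train_args.lora.quantize_data_type != "none"  # stands in for QuantizeDataType.NONE
    and train_args.distributed_backend == "fsdp"
):
    raise ValueError("Quantized LoRA is not supported with FSDP; use DeepSpeed")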
5 changes: 2 additions & 3 deletions tests/smoketest.sh
@@ -19,6 +19,7 @@ NUM_GPUS="${2:-${DEFAULT_GPUS}}"
# ############### User-modifiable parameters ###############
# Change these as needed
MAX_BATCH_LEN=60000
MAX_SEQ_LEN=4096
NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.

# ############### Test Functions ###############
@@ -203,17 +204,14 @@ function test_standard_loop_fsdp_lora() {
        --nproc_per_node="${NUM_GPUS}" \
        main_ds.py \
        --model_name_or_path="${MODEL_NAME}" \
        --is_granite \
        --data_path="${COMPUTED_DATA_PATH}" \
        --output_dir="${CHECKPOINTS_DIR}" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="${DISTRIB_FRAMEWORK}" \
        --max_batch_len="${MAX_BATCH_LEN}" \
        --is_granite \
        --lora_r=4 \
        --lora_alpha=32 \
        --lora_dropout=0.1
@@ -235,6 +233,7 @@ function main () {
    test_standard_loop_nongranite
    _cleanup_saved_checkpoints
    test_standard_loop
    test_standard_loop_fsdp_lora
}

main
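Two hedged usage notes on the smoke-test side. The new MAX_SEQ_LEN variable is presumably forwarded to main_ds.py alongside the existing --max_batch_len limit; the flag name below is an assumption, since its actual use sits in an elided part of the diff. Likewise, the hunk above shows NUM_GPUS coming from $2, while treating $1 as the distributed backend is inferred from context.

# Hypothetical sketch only, not part of this commit.
# Assumed forwarding of the new limit inside the test functions:
#   --max_seq_len="${MAX_SEQ_LEN}" \
#   --max_batch_len="${MAX_BATCH_LEN}" \

# Running the smoke test, which now also exercises the FSDP + LoRA path via main():
cd tests
./smoketest.sh fsdp 2   # $1 = distributed backend (assumed), $2 = number of GPUs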
