diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index 41148c8d..e6b62f48 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -46,7 +46,11 @@
 import torch.nn.functional as F
 
 # First Party
-from instructlab.training.config import DistributedBackend, TrainingArgs
+from instructlab.training.config import (
+    DistributedBackend,
+    QuantizeDataType,
+    TrainingArgs,
+)
 
 
 def check_valid_train_args(train_args: TrainingArgs):
@@ -76,6 +80,25 @@
             "\033[33m WARNING: is_padding_free is being deprecated due to adoption of the default padding-free support in Hugging Face Transformers. As such, this flag is non-functional in 0.6.0 and beyond. If you would like to use the older Dolomite padding-free implementation, please set use_dolomite moving forward.\033[0m"
         )
 
+    if (
+        train_args.accelerate_full_state_at_epoch
+        and train_args.lora
+        and train_args.lora.rank > 0
+    ):
+        raise ValueError(
+            "`accelerate_full_state_at_epoch` is not currently supported when training LoRA models."
+        )
+
+    if (
+        train_args.lora
+        and train_args.lora.rank > 0
+        and train_args.lora.quantize_data_type != QuantizeDataType.NONE
+        and train_args.distributed_backend == DistributedBackend.FSDP.value
+    ):
+        raise ValueError(
+            "Quantization is not supported when training LoRA models with FSDP. For quantized LoRA training, please switch to DeepSpeed."
+        )
+
 
 def retrieve_chat_template(chat_tmpl_path):
     try:
diff --git a/tests/smoketest.sh b/tests/smoketest.sh
index a54c9764..6918fb03 100755
--- a/tests/smoketest.sh
+++ b/tests/smoketest.sh
@@ -19,6 +19,7 @@ NUM_GPUS="${2:-${DEFAULT_GPUS}}"
 # ############### User-modifiable parameters ###############
 # Change these as needed
 MAX_BATCH_LEN=60000
+MAX_SEQ_LEN=4096
 NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.
 
 # ############### Test Functions ###############
@@ -203,17 +204,14 @@ function test_standard_loop_fsdp_lora() {
         --nproc_per_node="${NUM_GPUS}" \
         main_ds.py \
         --model_name_or_path="${MODEL_NAME}" \
-        --is_granite \
         --data_path="${COMPUTED_DATA_PATH}" \
         --output_dir="${CHECKPOINTS_DIR}" \
         --num_epochs=1 \
         --effective_batch_size=128 \
         --save_samples=0 \
         --checkpoint_at_epoch \
-        --accelerate_full_state_at_epoch \
         --distributed_training_framework="${DISTRIB_FRAMEWORK}" \
         --max_batch_len="${MAX_BATCH_LEN}" \
-        --is_granite \
         --lora_r=4 \
         --lora_alpha=32 \
         --lora_dropout=0.1
@@ -235,6 +233,7 @@ function main () {
     test_standard_loop_nongranite
     _cleanup_saved_checkpoints
     test_standard_loop
+    test_standard_loop_fsdp_lora
 }
 
 main