Update tests/smoketest.sh to support FSDP + LoRA as a testing path.
Additionally introduce a max_seq_len parameter to support testing
on lower-end hardware.

Signed-off-by: Oleg S <[email protected]>
RobotSail committed Nov 13, 2024
1 parent 200d707 commit b945f56
Showing 2 changed files with 26 additions and 4 deletions.
25 changes: 24 additions & 1 deletion src/instructlab/training/utils.py
@@ -46,7 +46,11 @@
import torch.nn.functional as F

# First Party
from instructlab.training.config import DistributedBackend, TrainingArgs
from instructlab.training.config import (
    DistributedBackend,
    QuantizeDataType,
    TrainingArgs,
)


def check_valid_train_args(train_args: TrainingArgs):
@@ -76,6 +80,25 @@ def check_valid_train_args(train_args: TrainingArgs):
            "\033[33m WARNING: is_padding_free is being deprecated due to adoption of the default padding-free support in Hugging Face Transformers. As such, this flag is non-functional in 0.6.0 and beyond. If you would like to use the older Dolomite padding-free implementation, please set use_dolomite moving forward.\033[0m"
        )

    if (
        train_args.accelerate_full_state_at_epoch
        and train_args.lora
        and train_args.lora.rank > 0
    ):
        raise ValueError(
            "`accelerate_full_state_at_epoch` is not currently supported when training LoRA models."
        )

    if (
        train_args.lora
        and train_args.lora.rank > 0
        and train_args.lora.quantize_data_type != QuantizeDataType.NONE
        and train_args.distributed_backend == DistributedBackend.FSDP.value
    ):
        raise ValueError(
            "Quantization is not supported when training LoRA models with FSDP. For quantized LoRA training, please switch to DeepSpeed."
        )


def retrieve_chat_template(chat_tmpl_path):
    try:
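For orientation, here is a minimal sketch of the two guards this hunk adds to check_valid_train_args(). It deliberately uses SimpleNamespace stand-ins in place of the real TrainingArgs/LoraOptions config models, and plain strings in place of the QuantizeDataType/DistributedBackend enums, so only the attribute names are taken from the diff; everything else is illustrative.

from types import SimpleNamespace

# Stand-ins for the real config objects; attribute names follow the diff above.
quantized_lora = SimpleNamespace(rank=4, quantize_data_type="nf4")
train_args = SimpleNamespace(
    accelerate_full_state_at_epoch=False,
    lora=quantized_lora,
    distributed_backend="fsdp",  # stands in for DistributedBackend.FSDP.value
)

# Guard 1: saving full accelerate state at epoch is rejected for LoRA runs.
if (
    train_args.accelerate_full_state_at_epoch
    and train_args.lora
    and train_args.lora.rank > 0
):
    raise ValueError("accelerate_full_state_at_epoch is not supported with LoRA")

# Guard 2: quantized LoRA under FSDP is rejected; with the stand-in values
# above this branch fires and raises, pointing users at DeepSpeed instead.
if (
    train_args.lora
    and train_args.lora.rank > 0
    and train_args.lora.quantize_data_type != "none"  # stands in for QuantizeDataType.NONE
    and train_args.distributed_backend == "fsdp"
):
    raise ValueError("Quantized LoRA is not supported with FSDP; use DeepSpeed")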
5 changes: 2 additions & 3 deletions tests/smoketest.sh
@@ -19,6 +19,7 @@ NUM_GPUS="${2:-${DEFAULT_GPUS}}"
# ############### User-modifiable parameters ###############
# Change these as needed
MAX_BATCH_LEN=60000
MAX_SEQ_LEN=4096
NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.

# ############### Test Functions ###############
@@ -203,17 +204,14 @@ function test_standard_loop_fsdp_lora() {
        --nproc_per_node="${NUM_GPUS}" \
        main_ds.py \
        --model_name_or_path="${MODEL_NAME}" \
        --is_granite \
        --data_path="${COMPUTED_DATA_PATH}" \
        --output_dir="${CHECKPOINTS_DIR}" \
        --num_epochs=1 \
        --effective_batch_size=128 \
        --save_samples=0 \
        --checkpoint_at_epoch \
        --accelerate_full_state_at_epoch \
        --distributed_training_framework="${DISTRIB_FRAMEWORK}" \
        --max_batch_len="${MAX_BATCH_LEN}" \
        --is_granite \
        --lora_r=4 \
        --lora_alpha=32 \
        --lora_dropout=0.1
@@ -235,6 +233,7 @@ function main () {
    test_standard_loop_nongranite
    _cleanup_saved_checkpoints
    test_standard_loop
    test_standard_loop_fsdp_lora
}

main
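Two hedged usage notes on the smoke-test side. The new MAX_SEQ_LEN variable is presumably forwarded to main_ds.py alongside the existing --max_batch_len limit; the flag name below is an assumption, since its actual use sits in an elided part of the diff. Likewise, the hunk above shows NUM_GPUS coming from $2, while treating $1 as the distributed backend is inferred from context.

# Hypothetical sketch only, not part of this commit.
# Assumed forwarding of the new limit inside the test functions:
#   --max_seq_len="${MAX_SEQ_LEN}" \
#   --max_batch_len="${MAX_BATCH_LEN}" \

# Running the smoke test, which now also exercises the FSDP + LoRA path via main():
cd tests
./smoketest.sh fsdp 2   # $1 = distributed backend (assumed), $2 = number of GPUs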
