
Commit

added more finetuning scripts
huseinzol05 committed Jul 19, 2024
1 parent 80ae1e2 commit d166194
Showing 9 changed files with 82 additions and 228 deletions.
2 changes: 1 addition & 1 deletion README-pypi.rst
@@ -56,7 +56,7 @@ If you use our software for research, please cite:
Acknowledgement
----------------

Thanks to `KeyReply <https://www.keyreply.com/>`_ for private V100s cloud and `Mesolitica <https://mesolitica.com/>`_ for private RTXs cloud to train Malaya-Speech models.
Thanks to `KeyReply <https://www.keyreply.com/>`_ for private V100s cloud and `Mesolitica <https://mesolitica.com/>`_ for private RTXs cloud to train Malaya models.

Also, thanks to `Tensorflow Research Cloud <https://www.tensorflow.org/tfrc>`_ for free TPUs access.

13 changes: 13 additions & 0 deletions session/llama3/README.md
@@ -40,4 +40,17 @@ cd EasyContext
pip install --pre torch==2.4.0.dev20240324 --index-url https://download.pytorch.org/whl/nightly/cu118
pip install packaging && pip install ninja && pip install flash-attn --no-build-isolation --no-cache-dir
pip install -r requirements.txt
```

## Unsloth

1. Install dependencies,

```
pip3 install pip -U
pip3 uninstall torch torchvision flash-attn -y
pip3 install torch torchvision
pip3 install mosaicml-streaming
pip3 install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip3 install flash-attn --no-build-isolation
```
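
A quick way to confirm the Unsloth install above works is to load the base checkpoint that `train_unsloth.sh` fine-tunes. This is a minimal sketch, not part of the commit: the sequence length, 4-bit loading, and the sample prompt are illustrative assumptions.

```python
# Hedged smoke test for the Unsloth install; values below are illustrative, not from the repo.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="mesolitica/malaysian-llama-3-8b-instruct-16k",  # base model used in train_unsloth.sh
    max_seq_length=16384,   # assumption: matches the "16k" in the model name
    load_in_4bit=True,      # keeps the smoke test within a single GPU's memory
)
FastLanguageModel.for_inference(model)  # switch Unsloth kernels to inference mode

messages = [{"role": "user", "content": "Terangkan apa itu kecerdasan buatan."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=64)[0], skip_special_tokens=True))
```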
4 changes: 2 additions & 2 deletions session/llama3/train_unsloth.sh
@@ -1,5 +1,5 @@
WANDB_PROJECT="unsloth-malaysian-llama-3-8b-instruct-16k" \
~/.local/bin/torchrun --nproc_per_node 1 \
~/.local/bin/torchrun --nproc_per_node 4 \
-m train_unsloth \
--model_name_or_path mesolitica/malaysian-llama-3-8b-instruct-16k \
--per_device_train_batch_size 2 \
@@ -9,7 +9,7 @@ WANDB_PROJECT="unsloth-malaysian-llama-3-8b-instruct-16k" \
--do_train \
--do_eval false \
--num_train_epochs 5 \
--dataset 'final-sft' \
--dataset 'final-sft-llama3-packing-32k' \
--logging_steps 1 \
--learning_rate 5e-5 \
--embedding_learning_rate 5e-6 \
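The launcher change above (`--nproc_per_node` 1 to 4) assumes four visible GPUs. A small check before launching, plain PyTorch rather than anything from this repo:

```python
# Sanity check before running train_unsloth.sh: torchrun starts one worker per
# --nproc_per_node, so fewer than 4 visible GPUs will fail at startup.
import torch

n = torch.cuda.device_count()
print(f"visible CUDA devices: {n}")
assert n >= 4, "train_unsloth.sh now launches with --nproc_per_node 4"
```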
20 changes: 20 additions & 0 deletions session/qwen2/train-1.5B.sh
@@ -0,0 +1,20 @@
WANDB_PROJECT="finetune-Qwen2-1.5B" \
torchrun --nproc_per_node 4 \
-m train \
--model_name_or_path Qwen/Qwen2-1.5B \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 32 \
--output_dir finetune-qwen2 \
--bf16 \
--do_train \
--do_eval false \
--num_train_epochs 2 \
--train_file "/home/ubuntu/mosaic-qwen2-4096" \
--logging_steps 1 \
--learning_rate 2e-5 \
--block_size 4096 \
--save_steps 10 \
--save_total_limit 3 \
--gradient_checkpointing true \
--log_level "info" \
--torch_dtype "bfloat16"
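
The `--train_file` here points at a directory rather than a single file; given the mosaicml-streaming dependency installed earlier and the directory name, it is presumably an MDS dataset of pre-packed 4096-token samples. A hedged sketch for inspecting it (the format and field names are assumptions):

```python
# Peek at the packed training data passed as --train_file; assumes MDS shards
# written by mosaicml-streaming. Column names depend on how the shards were built.
from streaming import LocalDataset

dataset = LocalDataset(local="/home/ubuntu/mosaic-qwen2-4096")
print(len(dataset))        # number of packed samples
print(dataset[0].keys())   # e.g. token ids per 4096-token block, if packed that way
```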
11 changes: 0 additions & 11 deletions session/qwen2/train.py
@@ -32,7 +32,6 @@
from typing import Optional

import datasets
import evaluate
import torch
from datasets import load_dataset

@@ -276,15 +275,6 @@ def main():
if os.path.isdir(
training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

# Set seed before initializing model.
set_seed(training_args.seed)
@@ -294,7 +284,6 @@ def main():
"revision": model_args.model_revision,
"token": model_args.token,
"trust_remote_code": model_args.trust_remote_code,
'max_position_embeddings': 32768,
}

if model_args.config_name:
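With the `'max_position_embeddings': 32768` override dropped from `config_kwargs`, the context length now comes from the checkpoint's own config. A simplified sketch of the standard transformers flow these kwargs feed into (illustrative, not copied from train.py):

```python
# Simplified version of how config_kwargs is consumed; revision/token/trust_remote_code
# normally come from model_args in train.py.
from transformers import AutoConfig

config_kwargs = {"trust_remote_code": False}
config = AutoConfig.from_pretrained("Qwen/Qwen2-1.5B", **config_kwargs)
print(config.max_position_embeddings)  # now whatever the checkpoint ships with

# If the longer context were still wanted, it could be set explicitly after loading:
# config.max_position_embeddings = 32768
```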
21 changes: 21 additions & 0 deletions session/translation/end-to-end/nanot5-small.sh
@@ -0,0 +1,21 @@
WANDB_PROJECT="nanot5-small-malaysian-cased-translation-v3" \
torchrun \
--nproc_per_node 4 \
-m run_t5_v2 \
--model_name_or_path mesolitica/nanot5-small-malaysian-cased \
--num_train_epochs 2 \
--eval_steps 1000000000 \
--logging_steps 2 \
--save_steps 1500 \
--save_total_limit 3 \
--do_train \
--train_file malaysian-translation \
--output_dir nanot5-small-malaysian-cased-translation-v3 \
--per_device_train_batch_size=12 \
--per_device_eval_batch_size=3 \
--gradient_accumulation_steps=2 \
--max_source_length 4096 \
--max_target_length 4096 \
--learning_rate 2e-4 \
--gradient_checkpointing true \
--bf16
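
Once the run finishes, the checkpoint in `--output_dir` can be loaded like any seq2seq transformers model. A hedged sketch; the translation prompt prefix is an assumption about how the training pairs were formatted, not something stated in this commit:

```python
# Illustrative inference with the finished checkpoint; the "terjemah ke ..." prefix
# is assumed, adjust to match the actual training data format.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

path = "nanot5-small-malaysian-cased-translation-v3"  # --output_dir above
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSeq2SeqLM.from_pretrained(path)

inputs = tokenizer("terjemah ke Melayu: How are you today?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```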
5 changes: 2 additions & 3 deletions session/translation/end-to-end/run_t5.py
@@ -284,15 +284,14 @@ def main():
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
attn_implementation = 'sdpa',
)
model.config.use_cache = False

model.resize_token_embeddings(len(tokenizer))
print(model)

# Get the language codes for input/target.
source_lang = 'src'
target_lang = 'tgt'

# Temporarily set max_target_length for training.
max_target_length = data_args.max_target_length
@@ -333,7 +332,7 @@ def __len__(self):

train_dataset = DatasetFixed(data_args.train_file)

label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
label_pad_token_id = -100
if data_args.pad_to_max_length:
data_collator = default_data_collator
else:
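Two of the changes above, the hard-coded `label_pad_token_id = -100` and the `attn_implementation = 'sdpa'` kwarg, map onto standard transformers calls. A simplified sketch of that combination (names mirror run_t5.py, but this is illustrative, not the repo's exact code):

```python
# label_pad_token_id=-100 makes the collator pad labels with a value the loss ignores;
# attn_implementation="sdpa" mirrors the kwarg added in this diff.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("mesolitica/nanot5-small-malaysian-cased")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "mesolitica/nanot5-small-malaysian-cased",
    attn_implementation="sdpa",
)
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
)
```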