
Commit

added more finetuning scripts
huseinzol05 committed Jul 19, 2024
1 parent 80ae1e2 commit d166194
Showing 9 changed files with 82 additions and 228 deletions.
2 changes: 1 addition & 1 deletion README-pypi.rst
@@ -56,7 +56,7 @@ If you use our software for research, please cite:
Acknowledgement
----------------

Thanks to `KeyReply <https://www.keyreply.com/>`_ for private V100s cloud and `Mesolitica <https://mesolitica.com/>`_ for private RTXs cloud to train Malaya-Speech models.
Thanks to `KeyReply <https://www.keyreply.com/>`_ for private V100s cloud and `Mesolitica <https://mesolitica.com/>`_ for private RTXs cloud to train Malaya models.

Also, thanks to `Tensorflow Research Cloud <https://www.tensorflow.org/tfrc>`_ for free TPUs access.

13 changes: 13 additions & 0 deletions session/llama3/README.md
@@ -40,4 +40,17 @@ cd EasyContext
pip install --pre torch==2.4.0.dev20240324 --index-url https://download.pytorch.org/whl/nightly/cu118
pip install packaging && pip install ninja && pip install flash-attn --no-build-isolation --no-cache-dir
pip install -r requirements.txt
```

## Unsloth

1. Install dependencies,

```
pip3 install pip -U
pip3 uninstall torch torchvision flash-attn -y
pip3 install torch torchvision
pip3 install mosaicml-streaming
pip3 install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip3 install flash-attn --no-build-isolation
```
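
A quick way to confirm the Unsloth install above works is to load the base checkpoint that `train_unsloth.sh` fine-tunes. This is a minimal sketch, not part of the commit: the sequence length, 4-bit loading, and the sample prompt are illustrative assumptions.

```python
# Hedged smoke test for the Unsloth install; values below are illustrative, not from the repo.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="mesolitica/malaysian-llama-3-8b-instruct-16k",  # base model used in train_unsloth.sh
    max_seq_length=16384,   # assumption: matches the "16k" in the model name
    load_in_4bit=True,      # keeps the smoke test within a single GPU's memory
)
FastLanguageModel.for_inference(model)  # switch Unsloth kernels to inference mode

messages = [{"role": "user", "content": "Terangkan apa itu kecerdasan buatan."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=64)[0], skip_special_tokens=True))
```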
4 changes: 2 additions & 2 deletions session/llama3/train_unsloth.sh
@@ -1,5 +1,5 @@
WANDB_PROJECT="unsloth-malaysian-llama-3-8b-instruct-16k" \
~/.local/bin/torchrun --nproc_per_node 1 \
~/.local/bin/torchrun --nproc_per_node 4 \
-m train_unsloth \
--model_name_or_path mesolitica/malaysian-llama-3-8b-instruct-16k \
--per_device_train_batch_size 2 \
@@ -9,7 +9,7 @@ WANDB_PROJECT="unsloth-malaysian-llama-3-8b-instruct-16k" \
--do_train \
--do_eval false \
--num_train_epochs 5 \
--dataset 'final-sft' \
--dataset 'final-sft-llama3-packing-32k' \
--logging_steps 1 \
--learning_rate 5e-5 \
--embedding_learning_rate 5e-6 \
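The launcher change above (`--nproc_per_node` 1 to 4) assumes four visible GPUs. A small check before launching, plain PyTorch rather than anything from this repo:

```python
# Sanity check before running train_unsloth.sh: torchrun starts one worker per
# --nproc_per_node, so fewer than 4 visible GPUs will fail at startup.
import torch

n = torch.cuda.device_count()
print(f"visible CUDA devices: {n}")
assert n >= 4, "train_unsloth.sh now launches with --nproc_per_node 4"
```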
20 changes: 20 additions & 0 deletions session/qwen2/train-1.5B.sh
@@ -0,0 +1,20 @@
WANDB_PROJECT="finetune-Qwen2-1.5B" \
torchrun --nproc_per_node 4 \
-m train \
--model_name_or_path Qwen/Qwen2-1.5B \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 32 \
--output_dir finetune-qwen2 \
--bf16 \
--do_train \
--do_eval false \
--num_train_epochs 2 \
--train_file "/home/ubuntu/mosaic-qwen2-4096" \
--logging_steps 1 \
--learning_rate 2e-5 \
--block_size 4096 \
--save_steps 10 \
--save_total_limit 3 \
--gradient_checkpointing true \
--log_level "info" \
--torch_dtype "bfloat16"
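
The `--train_file` here points at a directory rather than a single file; given the mosaicml-streaming dependency installed earlier and the directory name, it is presumably an MDS dataset of pre-packed 4096-token samples. A hedged sketch for inspecting it (the format and field names are assumptions):

```python
# Peek at the packed training data passed as --train_file; assumes MDS shards
# written by mosaicml-streaming. Column names depend on how the shards were built.
from streaming import LocalDataset

dataset = LocalDataset(local="/home/ubuntu/mosaic-qwen2-4096")
print(len(dataset))        # number of packed samples
print(dataset[0].keys())   # e.g. token ids per 4096-token block, if packed that way
```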
11 changes: 0 additions & 11 deletions session/qwen2/train.py
@@ -32,7 +32,6 @@
from typing import Optional

import datasets
import evaluate
import torch
from datasets import load_dataset

@@ -276,15 +275,6 @@ def main():
if os.path.isdir(
training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

# Set seed before initializing model.
set_seed(training_args.seed)
@@ -294,7 +284,6 @@ def main():
"revision": model_args.model_revision,
"token": model_args.token,
"trust_remote_code": model_args.trust_remote_code,
'max_position_embeddings': 32768,
}

if model_args.config_name:
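With the `'max_position_embeddings': 32768` override dropped from `config_kwargs`, the context length now comes from the checkpoint's own config. A simplified sketch of the standard transformers flow these kwargs feed into (illustrative, not copied from train.py):

```python
# Simplified version of how config_kwargs is consumed; revision/token/trust_remote_code
# normally come from model_args in train.py.
from transformers import AutoConfig

config_kwargs = {"trust_remote_code": False}
config = AutoConfig.from_pretrained("Qwen/Qwen2-1.5B", **config_kwargs)
print(config.max_position_embeddings)  # now whatever the checkpoint ships with

# If the longer context were still wanted, it could be set explicitly after loading:
# config.max_position_embeddings = 32768
```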
21 changes: 21 additions & 0 deletions session/translation/end-to-end/nanot5-small.sh
@@ -0,0 +1,21 @@
WANDB_PROJECT="nanot5-small-malaysian-cased-translation-v3" \
torchrun \
--nproc_per_node 4 \
-m run_t5_v2 \
--model_name_or_path mesolitica/nanot5-small-malaysian-cased \
--num_train_epochs 2 \
--eval_steps 1000000000 \
--logging_steps 2 \
--save_steps 1500 \
--save_total_limit 3 \
--do_train \
--train_file malaysian-translation \
--output_dir nanot5-small-malaysian-cased-translation-v3 \
--per_device_train_batch_size=12 \
--per_device_eval_batch_size=3 \
--gradient_accumulation_steps=2 \
--max_source_length 4096 \
--max_target_length 4096 \
--learning_rate 2e-4 \
--gradient_checkpointing true \
--bf16
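
Once the run finishes, the checkpoint in `--output_dir` can be loaded like any seq2seq transformers model. A hedged sketch; the translation prompt prefix is an assumption about how the training pairs were formatted, not something stated in this commit:

```python
# Illustrative inference with the finished checkpoint; the "terjemah ke ..." prefix
# is assumed, adjust to match the actual training data format.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

path = "nanot5-small-malaysian-cased-translation-v3"  # --output_dir above
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSeq2SeqLM.from_pretrained(path)

inputs = tokenizer("terjemah ke Melayu: How are you today?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```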
5 changes: 2 additions & 3 deletions session/translation/end-to-end/run_t5.py
@@ -284,15 +284,14 @@ def main():
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
attn_implementation = 'sdpa',
)
model.config.use_cache = False

model.resize_token_embeddings(len(tokenizer))
print(model)

# Get the language codes for input/target.
source_lang = 'src'
target_lang = 'tgt'

# Temporarily set max_target_length for training.
max_target_length = data_args.max_target_length
@@ -333,7 +332,7 @@ def __len__(self):

train_dataset = DatasetFixed(data_args.train_file)

label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
label_pad_token_id = -100
if data_args.pad_to_max_length:
data_collator = default_data_collator
else:
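Two of the changes above, the hard-coded `label_pad_token_id = -100` and the `attn_implementation = 'sdpa'` kwarg, map onto standard transformers calls. A simplified sketch of that combination (names mirror run_t5.py, but this is illustrative, not the repo's exact code):

```python
# label_pad_token_id=-100 makes the collator pad labels with a value the loss ignores;
# attn_implementation="sdpa" mirrors the kwarg added in this diff.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

tokenizer = AutoTokenizer.from_pretrained("mesolitica/nanot5-small-malaysian-cased")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "mesolitica/nanot5-small-malaysian-cased",
    attn_implementation="sdpa",
)
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
)
```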