diff --git a/session/llama3/lora-256-8b.sh b/session/llama3/lora-128-8b.sh
similarity index 78%
rename from session/llama3/lora-256-8b.sh
rename to session/llama3/lora-128-8b.sh
index a07a8ccd..9d318ee5 100644
--- a/session/llama3/lora-256-8b.sh
+++ b/session/llama3/lora-128-8b.sh
@@ -1,18 +1,18 @@
-WANDB_PROJECT=unsloth-Meta-Llama-3.1-8B-Instruct-lora-128-embedding-16k-multipack \
+WANDB_PROJECT=unsloth-Meta-Llama-3.1-8B-Instruct-lora-128-embedding-8k-multipack \
 deepspeed run-instruction-lora-embedding-multipack.py \
 --deepspeed ds_config_zero3.json \
 --model_name_or_path unsloth/Meta-Llama-3.1-8B-Instruct \
---per_device_train_batch_size 1 \
+--per_device_train_batch_size 4 \
 --gradient_accumulation_steps 6 \
---output_dir unsloth-Meta-Llama-3.1-8B-Instruct-lora-128-embedding-16k-multipack \
+--output_dir unsloth-Meta-Llama-3.1-8B-Instruct-lora-128-embedding-8k-multipack \
 --bf16 \
 --do_train \
 --do_eval false \
 --num_train_epochs 5 \
---train_file 'malaysian-llama3.1-24k-language-multipack' \
+--train_file 'malaysian-llama3.1-8k-language-multipack' \
 --logging_steps 1 \
 --learning_rate 2e-5 \
---embedding_learning_rate 5e-6 \
+--embedding_learning_rate 2e-5 \
 --weight_decay 0.01 \
 --block_size 24576 \
 --save_steps 20 \
diff --git a/session/llama3/lora-256-smollm2-360m.sh b/session/llama3/lora-256-smollm2-360m.sh
new file mode 100644
index 00000000..7510d587
--- /dev/null
+++ b/session/llama3/lora-256-smollm2-360m.sh
@@ -0,0 +1,25 @@
+WANDB_PROJECT="lora-embedding-256-HuggingFaceTB-SmolLM2-360M-Instruct-multipack" \
+TORCH_DISTRIBUTED_DEBUG="info" \
+torchrun --nproc_per_node 2 \
+-m run-instruction-lora-embedding-multipack \
+--model_name_or_path HuggingFaceTB/SmolLM2-360M-Instruct \
+--per_device_train_batch_size 6 \
+--gradient_accumulation_steps 4 \
+--output_dir lora-embedding-256-HuggingFaceTB-SmolLM2-360M-Instruct-multipack \
+--bf16 --do_train --do_eval false --num_train_epochs 5 \
+--train_file /home/husein/ssd4/continue-training/packing-4096 \
+--logging_steps 1 \
+--learning_rate 2e-5 \
+--embedding_learning_rate 2e-5 \
+--weight_decay 0.01 \
+--block_size 24576 \
+--save_steps 100 \
+--save_total_limit 3 \
+--gradient_checkpointing true \
+--neftune_noise_alpha 5.0 \
+--torch_dtype bfloat16 \
+--rank 256 \
+--ddp_find_unused_parameters false \
+--include_num_input_tokens_seen true \
+--dataloader_num_workers 3 \
+--dataloader_prefetch_factor 4
\ No newline at end of file
diff --git a/session/translation/end-to-end/nanot5-base-coding.sh b/session/translation/end-to-end/nanot5-base-coding.sh
new file mode 100644
index 00000000..c6a2e492
--- /dev/null
+++ b/session/translation/end-to-end/nanot5-base-coding.sh
@@ -0,0 +1,22 @@
+WANDB_PROJECT="nanot5-base-malaysian-cased-translation-v2-coding" \
+CUDA_VISIBLE_DEVICES=0 \
+python3.10 run_t5_v2.py \
+--model_name_or_path mesolitica/nanot5-base-malaysian-translation-v2 \
+--num_train_epochs 2 \
+--eval_steps 1000000000 \
+--logging_steps 2 \
+--save_steps 200 \
+--save_total_limit 3 \
+--do_train \
+--train_file mosaic-coding \
+--output_dir nanot5-base-malaysian-cased-translation-v2-coding \
+--dataloader_num_workers=10 \
+--per_device_train_batch_size=2 \
+--per_device_eval_batch_size=3 \
+--gradient_accumulation_steps=8 \
+--max_source_length 2048 \
+--max_target_length 2048 \
+--learning_rate 2e-5 \
+--gradient_checkpointing true \
+--weight_decay 0.01 \
+--bf16
\ No newline at end of file
diff --git a/session/translation/end-to-end/nanot5-small-coding.sh b/session/translation/end-to-end/nanot5-small-coding.sh
new file mode 100644
index 00000000..2ca6a368
--- /dev/null
+++ b/session/translation/end-to-end/nanot5-small-coding.sh
@@ -0,0 +1,24 @@
+WANDB_PROJECT="nanot5-small-malaysian-cased-translation-v2-coding" \
+CUDA_VISIBLE_DEVICES=1 \
+torchrun \
+--nproc_per_node 1 \
+-m run_t5_v2 \
+--model_name_or_path mesolitica/nanot5-small-malaysian-translation-v2 \
+--num_train_epochs 2 \
+--eval_steps 1000000000 \
+--logging_steps 2 \
+--save_steps 200 \
+--save_total_limit 3 \
+--do_train \
+--train_file mosaic-coding \
+--output_dir nanot5-small-malaysian-cased-translation-v2-coding \
+--dataloader_num_workers=10 \
+--per_device_train_batch_size=2 \
+--per_device_eval_batch_size=3 \
+--gradient_accumulation_steps=8 \
+--max_source_length 2048 \
+--max_target_length 2048 \
+--learning_rate 2e-5 \
+--gradient_checkpointing true \
+--weight_decay 0.01 \
+--bf16
\ No newline at end of file