From 5dc2dbbb0c9890a1ab52e16daf1e88dc8c3af4db Mon Sep 17 00:00:00 2001
From: tigranfah
Date: Fri, 11 Oct 2024 23:04:09 +0400
Subject: [PATCH] prepare for llama 3.2 1b training, lr=2.5e-4

---
 submitit_train_hparam_tuning.py | 3 ++-
 train_configs/llama3.2_1b.toml  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/submitit_train_hparam_tuning.py b/submitit_train_hparam_tuning.py
index 531a1872..b0a0540d 100644
--- a/submitit_train_hparam_tuning.py
+++ b/submitit_train_hparam_tuning.py
@@ -18,7 +18,8 @@
 
 hparams = {
     # "optimizer.lr": ["1.2e-3", "9e-4", "6e-4", "3e-4"],
-    "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
+    # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"],
+    # "optimizer.lr": ["2.5e-4"],
     # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"],
 }
 
diff --git a/train_configs/llama3.2_1b.toml b/train_configs/llama3.2_1b.toml
index f30ff78d..9754cba6 100644
--- a/train_configs/llama3.2_1b.toml
+++ b/train_configs/llama3.2_1b.toml
@@ -26,13 +26,13 @@ tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B/"
 
 [optimizer]
 name = "AdamW"
-lr = 6.0e-4
+lr = 2.5e-4
 
 [training]
 batch_size = 10
 gradient_accumulation_steps = 12
 seq_len = 2048
-warmup_steps = 1000 # lr scheduler warm up, normally 20% of the train steps
+warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps
 max_norm = 1.0 # grad norm clipping
 steps = 20000
 data_parallel_degree = -1