forked from HabanaAI/Model-References
-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_bert_1.5b_8x.sh
executable file
·58 lines (52 loc) · 1.73 KB
/
run_bert_1.5b_8x.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/bin/bash
#####################################################################################
# Example: Pretraining phase 1 of BERT with 1.5B parameters on multicard i.e 8 cards
#
# Usage: run this script from the directory that contains run_pretraining.py and
#        the scripts/ folder. Every path below is relative, so it is resolved
#        against the current working directory, not against this file's location.
#####################################################################################

# Fail fast: abort on any command error or unset variable instead of launching a
# multi-card job with a half-prepared environment.
set -euo pipefail

# Params: run_pretraining
readonly DATA_DIR=/data/pytorch/bert/pretraining/hdf5_lower_case_1_seq_len_128_max_pred_20_masked_lm_prob_0.15_random_seed_12345_dupe_factor_5/books_wiki_en_corpus
readonly MODEL_CONFIG=./scripts/bert_1.5b_config.json
readonly DS_CONFIG=./scripts/deepspeed_config_bert_1.5b.json
readonly RESULTS_DIR=./results/bert_1.5b
readonly MAX_SEQ_LENGTH=128      # forwarded as --max_seq_length
readonly NUM_STEPS_PER_CP=200    # forwarded as --num_steps_per_checkpoint
readonly MAX_STEPS=155000        # forwarded as --max_steps
readonly RUN_STEPS=-1            # forwarded as --steps_this_run
readonly LR=0.0015               # forwarded as --learning_rate
readonly WARMUP=0.05             # forwarded as --warmup_proportion
readonly CONST=0.25              # forwarded as --constant_proportion
readonly LOG_FREQ=10             # forwarded as --log_freq
readonly MAX_PRED=20             # forwarded as --max_predictions_per_seq

# Params: DeepSpeed
readonly NUM_NODES=1
readonly NGPU_PER_NODE=8

# The training command is kept as an array (one element per argument) so that
# no value is ever re-split or glob-expanded by the shell when it is launched.
CMD=(
  python -u ./run_pretraining.py
  --disable_progress_bar
  --optimizer=lans
  --use_lr_scheduler
  --resume_from_checkpoint
  --do_train
  --bert_model=bert-base-uncased
  --config_file="${MODEL_CONFIG}"
  --json-summary="${RESULTS_DIR}/dllogger.json"
  --output_dir="${RESULTS_DIR}/checkpoints"
  --seed=12439
  --input_dir="${DATA_DIR}"
  --max_seq_length "${MAX_SEQ_LENGTH}"
  --max_predictions_per_seq="${MAX_PRED}"
  --max_steps="${MAX_STEPS}"
  --steps_this_run="${RUN_STEPS}"
  --num_steps_per_checkpoint="${NUM_STEPS_PER_CP}"
  --learning_rate="${LR}"
  --warmup_proportion="${WARMUP}"
  --constant_proportion="${CONST}"
  --scheduler_degree=1.0
  --log_freq="${LOG_FREQ}"
  --deepspeed
  --deepspeed_config="${DS_CONFIG}"
)

# The results directory must exist before launch because the summary and
# checkpoint paths above point inside it; under `set -e` a failure here stops
# the script.
mkdir -p -- "${RESULTS_DIR}"

# --no_python is used because CMD already begins with the interpreter
# (`python -u`). The script's exit status is that of the deepspeed launcher.
deepspeed --num_nodes "${NUM_NODES}" \
  --num_gpus "${NGPU_PER_NODE}" \
  --no_local_rank \
  --no_python \
  "${CMD[@]}"