# run_pangualpha_2_6b_em_f1.yaml
seed: 0
run_mode: 'eval'
output_dir: './output' # Customization is currently not supported; do not change this default value
load_checkpoint: ""
auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model
resume_training: False
# context
context:
  mode: 0 # 0--Graph Mode; 1--Pynative Mode
  device_target: "Ascend"
  enable_graph_kernel: False
  graph_kernel_flags: "--disable_expand_ops=Softmax,Dropout --enable_parallel_fusion=true --reduce_fuse_depth=8 --enable_auto_tensor_inplace=true"
  max_call_depth: 10000
  max_device_memory: "30GB"
  save_graphs: False
  save_graphs_path: "./graph"
  device_id: 0
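# Note: graph_kernel_flags only take effect when enable_graph_kernel is True;
# with enable_graph_kernel: False above they are kept for reference but inert.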
# aicc
remote_save_url: "Please input obs url on AICC platform."
# runner
runner_config:
  epochs: 1
  batch_size: 1
  sink_mode: True
  sink_size: 2
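# sink_mode: True sends data to the Ascend device in batches and runs
# sink_size steps on-device per iteration, reducing host-device interaction.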
runner_wrapper:
  type: MFTrainOneStepCell
  scale_sense:
    type: DynamicLossScaleUpdateCell
    loss_scale_value: 4294967296
    scale_factor: 2
    scale_window: 1000
  use_clip_grad: True
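# Dynamic loss scaling, in brief: the scale starts at 4294967296 (2^32); on a
# gradient overflow it is divided by scale_factor (2), and after scale_window
# (1000) consecutive overflow-free steps it is multiplied by scale_factor again.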
# parallel
use_parallel: False
parallel:
  parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel
  gradients_mean: False
  loss_repeated_mean: True
  full_batch: True
  search_mode: "sharding_propagation"
  enable_parallel_optimizer: True
  strategy_ckpt_save_file: "./ckpt_strategy.ckpt"
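# With use_parallel: False above, this evaluation runs on a single device;
# the parallel settings here presumably only take effect when a distributed
# launch sets use_parallel: True.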
# 1 node 8 device num
parallel_config:
  data_parallel: 8
  model_parallel: 1
  pipeline_stage: 1
  micro_batch_num: 1
  vocab_emb_dp: True
  gradient_aggregation_group: 4
micro_batch_interleave_num: 1
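# Device math: data_parallel * model_parallel * pipeline_stage = 8 * 1 * 1 = 8,
# matching the "1 node 8 device num" comment above.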
# moe
moe_config:
  expert_num: 1
  capacity_factor: 1.05
  aux_loss_factor: 0.05
  num_experts_chosen: 1
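# With expert_num: 1 (and use_moe: False in model_config below) the model is
# dense; the MoE fields here are effectively placeholders.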
# recompute
recompute_config:
  recompute: True
  parallel_optimizer_comm_recompute: False
  mp_comm_recompute: True
  recompute_slice_activation: False
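# recompute: True re-runs forward activations during the backward pass,
# trading extra computation for lower activation memory.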
# autotune
auto_tune: True
filepath_prefix: './autotune'
autotune_per_step: 10
# profile
profile: False
profile_start_step: 1
profile_stop_step: 10
init_start_profile: True
profile_communication: True
profile_memory: True
# Trainer
trainer:
  type: CausalLanguageModelingTrainer
  model_name: 'pangualpha_2_6b'
# If True, evaluate during the training process; if False, do nothing.
# Note that the task trainer must support the _evaluate_in_training function.
do_eval: False
# eval dataset
eval_dataset: &eval_dataset
  data_loader:
    type: MindDataset
    dataset_dir: ""
    shuffle: True
  input_columns: ["input_ids", "labels"]
  eod_reset: False
  num_parallel_workers: 8
  python_multiprocessing: False
  drop_remainder: True
  batch_size: 16
  repeat: 1
  numa_enable: False
  prefetch_size: 1
eval_dataset_task:
  type: CausalLanguageModelDataset
  dataset_config: *eval_dataset
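# The YAML anchor &eval_dataset and alias *eval_dataset above make
# eval_dataset_task.dataset_config reuse the exact same eval_dataset mapping,
# so the dataset is configured once.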
# model
model:
  model_config:
    type: PanguAlphaConfig
    batch_size: 16
    seq_length: 1024
    vocab_size: 40000
    hidden_size: 2560
    ffn_hidden_size: 10240
    num_layers: 32
    num_heads: 32
    pad_token_id: 6
    eod_token_id: 6
    eos_token_id: 9
    post_layernorm_residual: False
    param_init_type: 'float32'
    compute_dtype: 'float16'
    softmax_compute_type: 'float16'
    embedding_dropout_prob: 0.1
    hidden_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    hidden_act: 'fast_gelu'
    use_past: False
    use_moe: False
    expert_num: 1
    per_token_num_experts_chosen: 1
    checkpoint_name_or_path: "pangualpha_2_6b"
    repetition_penalty: 1.5
    max_decode_length: 1024
    max_new_tokens: 32
    top_k: 3
    top_p: 0.0
    do_sample: True
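# Generation settings: do_sample: True with top_k: 3 samples from the three
# highest-probability tokens, repetition_penalty: 1.5 (> 1.0) penalizes
# already-generated tokens, and max_new_tokens: 32 caps each generated answer
# for the EM/F1 task.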
  arch:
    type: PanguAlphaHeadModel
# lr schedule
lr_schedule:
  type: polynomial
  learning_rate: 0.00005
  lr_end: 0.000001
  warmup_steps: 2000
  total_steps: -1 # -1 means it will load the total steps of the dataset
layer_scale: False
layer_decay: 0.65
lr_scale: False
lr_scale_factor: 256
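# The polynomial schedule ramps the learning rate up linearly for warmup_steps,
# then decays it from learning_rate (5e-5) to lr_end (1e-6); with
# run_mode: 'eval' this block is presumably unused.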
# optimizer
optimizer:
  type: FP32StateAdamWeightDecay
  beta1: 0.9
  beta2: 0.95
  eps: 0.00000001 # 1e-8
  weight_decay: 0.1
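# FP32StateAdamWeightDecay keeps the AdamW optimizer states in float32 even
# though compute_dtype is float16, which helps numerical stability; like
# lr_schedule, it only matters when this config is reused for training.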
# callbacks
callbacks:
  - type: MFLossMonitor
  - type: CheckpointMointor
    prefix: "PanguAlpha-2_6b"
    save_checkpoint_steps: 500
    integrated_save: False
    async_save: False
  - type: ObsMonitor
eval_callbacks:
  - type: ObsMonitor
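# Note: "CheckpointMointor" appears to be the identifier as registered by
# MindFormers; keep the spelling as-is, since a corrected "CheckpointMonitor"
# may not resolve.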
# metric
metric:
  type: EmF1Metric
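# EmF1Metric scores predictions against labels with Exact Match (EM) and F1,
# the two metrics this config's filename refers to.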
# processor
processor:
  return_tensors: ms
  tokenizer:
    type: PanguAlphaTokenizer
  type: PanguAlphaProcessor
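# Example launch (a sketch, not verified against this repo's exact layout: it
# assumes the standard MindFormers entry script run_mindformer.py and that
# dataset_dir above points to a prepared MindRecord evaluation set):
#   python run_mindformer.py --config run_pangualpha_2_6b_em_f1.yaml --run_mode eval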