# python.yml: seq2seq code summarization config (forked from CGCL-codes/naturalcc)
# registry
criterion: cross_entropy
optimizer: torch_adam
lr_scheduler: fixed
tokenizer: ~ # default=None
bpe: ~ # default=None
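# '~' is YAML null: leave the component unset and fall back to the framework
# default (None), as the inline default=None notes indicate.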
##################### default args #####################
common:
no_progress_bar: 0 # action='store_true', help='disable progress bar'
log_interval: 500 # type=int, default=100, metavar='N', help='log progress every N batches (when progress bar is disabled)'
log_format: simple # default=None, help='log format to use', choices=['json', 'none', 'simple', 'tqdm']
tensorboard_logdir: '' # default='', help='path to save logs for tensorboard, should match --logdir of running tensorboard (default: no tensorboard logging)'
memory_efficient_fp16: 0 # action='store_true', help='use a memory-efficient version of FP16 training; implies --fp16'
fp16_no_flatten_grads: 1 # action='store_true', help='don\'t flatten FP16 grads tensor'
fp16_init_scale: 128 # 2 ** 7 # default=2 ** 7, type=int, help='default FP16 loss scale'
fp16_scale_window: ~ # type=int, help='number of updates before increasing loss scale'
fp16_scale_tolerance: 0.0 # default=0.0, type=float, help='pct of updates that can overflow before decreasing the loss scale'
min_loss_scale: 1e-4 # default=1e-4, type=float, metavar='D', help='minimum FP16 loss scale, after which training is stopped'
threshold_loss_scale: ~ # type=float, help='threshold FP16 loss scale from below'
empty_cache_freq: 0 # type=int, help='how often to clear the PyTorch CUDA cache (0 to disable)'
task: summarization # multi-modality task name
seed: 1 # default=1, type=int, metavar='N', help='pseudo random number generator seed'
cpu: 0 # action='store_true', help='use CPU instead of CUDA'
fp16: 0 # action='store_true', help='use FP16'
fp16_opt_level: 'O1' # type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html",
bf16: 0 # '--bf16', action='store_true', help='use bfloat16; implies --tpu'
memory_efficient_bf16: 0 # action='store_true', help='use a memory-efficient version of BF16 training; implies --bf16'
server_ip: '' # type=str, default="", help="For distant debugging."
server_port: '' # type=str, default="", help="For distant debugging."
amp: 0 # type=bool, default=False, help="use automatic mixed precision"
amp_batch_retries: 2 # type=int, default=2, help="number of retries of same batch after reducing loss scale with AMP"
amp_init_scale: 128 # 2 ** 7; YAML does not evaluate arithmetic, so the literal value is spelled out # type=int, default=2 ** 7, help="default AMP loss scale"
amp_scale_window: ~ # type=Optional[int], default=None, help="number of updates before increasing AMP loss scale"
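# Hedged sketch of the dynamic loss scaling these flags control (fairseq-style
# semantics): the FP16/AMP scale starts at *_init_scale (128), is halved when
# gradients overflow, grows again after *_scale_window overflow-free updates,
# and training aborts once it falls below min_loss_scale. Illustrative only:
#   scale = 128
#   on overflow:                       scale /= 2
#   after scale_window clean updates:  scale *= 2
#   if scale < min_loss_scale:         stop training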
dataset:
num_workers: 3 # default=1, type=int, metavar='N', help='how many subprocesses to use for data loading'
skip_invalid_size_inputs_valid_test: 1 # action='store_true', help='ignore too long or too short lines in valid and test set'
max_tokens: ~ # type=int, metavar='N', help='maximum number of tokens in a batch'
max_sentences: 64 # default = 512 # '--batch-size', type=int, metavar='N', help='maximum number of sentences in a batch'
required_batch_size_multiple: 1 # default=8, type=int, metavar='N', help='batch size will be a multiplier of this value'
# dataset_impl: raw # choices=get_available_dataset_impl(), help='output dataset implementation'
dataset_impl: mmap # choices=get_available_dataset_impl(), help='output dataset implementation'
train_subset: train # default='train', metavar='SPLIT', help='data subset to use for training (e.g. train, valid, test)'
valid_subset: valid # default='valid', metavar='SPLIT', help='comma separated list of data subsets to use for validation (e.g. train, valid, test)'
validate_interval: 1 # type=int, default=1, metavar='N', help='validate every N epochs'
fixed_validation_seed: ~ # default=None, type=int, metavar='N', help='specified random seed for validation'
disable_validation: 0 # action='store_true',help='disable validation'
max_tokens_valid: ~ # type=int, metavar='N', help='maximum number of tokens in a validation batch (defaults to --max-tokens)'
max_sentences_valid: 2048 # type=int, metavar='N', help='maximum number of sentences in a validation batch (defaults to --max-sentences)'
curriculum: 0 # default=0, type=int, metavar='N', help='don\'t shuffle batches for first N epochs'
gen_subset: test # default='test', metavar='SPLIT', help='data subset to generate (train, valid, test)'
num_shards: 1 # default=1, type=int, metavar='N', help='shard generation over N shards'
shard_id: 0 # default=0, type=int, metavar='ID', help='id of the shard to generate (id < num_shards)'
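# Sharded generation example: to split test-set decoding across two processes,
# run the generator twice with num_shards: 2 and shard_id: 0 / shard_id: 1;
# each process then decodes a disjoint shard of gen_subset.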
distributed_training:
distributed_world_size: 1 # default=max(1, torch.cuda.device_count())
distributed_rank: 0 # default=0, type=int, help='rank of the current worker'
distributed_backend: nccl # default='nccl', type=str, help='distributed backend'
distributed_init_method: ~ # default=None, type=str, help='typically tcp://hostname:port that will be used to establish initial connection'
distributed_port: -1 # default=-1, type=int, help='port number (not required if using --distributed-init-method)'
device_id: 0 # '--local_rank', default=0, type=int, help='which GPU to use (usually configured automatically)'
distributed_no_spawn: 0 # action='store_true', help='do not spawn multiple processes even if multiple GPUs are visible'
ddp_backend: c10d # default='c10d', type=str, choices=['c10d', 'no_c10d'], help='DistributedDataParallel backend'
bucket_cap_mb: 25 # default=25, type=int, metavar='MB', help='bucket size for reduction'
fix_batches_to_gpus: ~ # action='store_true', help='don\'t shuffle batches between GPUs; this reduces overall randomness and may affect precision but avoids the cost of re-reading the data'
find_unused_parameters: 0 # default=False, action='store_true', help='disable unused parameter detection (not applicable to no_c10d ddp-backend)'
fast_stat_sync: 0 # default=False, action='store_true', help='[deprecated] this is now defined per Criterion'
broadcast_buffers: 0 # default=False, action='store_true', help='Copy non-trainable parameters between GPUs, such as batchnorm population statistics'
global_sync_iter: 50 # default=50, type=int, help="Iteration for syncing global model",
warmup_iterations: 0 # default=500, type=int, help="warmup iterations for model to broadcast",
local_rank: -1 # type=int, default=-1, help="For distributed training: local_rank"
block_momentum: 0.875 # default=0.875, type=float, help="block momentum for bmuf",
block_lr: 1 # default=1, type=float, help="block learning rate for bmuf"
use_nbm: 0 # default=False, action="store_true", help="Specify whether you want to use classical BM / Nesterov BM",
average_sync: 0 # default=False, action="store_true", help="Specify whether you want to average the local momentum after each sync",
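# Note: the BMUF knobs above (global_sync_iter, block_momentum, block_lr,
# use_nbm, average_sync) presumably only take effect when optimization.use_bmuf
# is enabled below; with use_bmuf: 0 they stay inert.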
task:
data: ~/python_wan/summarization/data-mmap # help='colon separated path to data directories list, will be iterated upon during epochs in round-robin manner'
# for summarization
source_lang: code_tokens
target_lang: docstring_tokens
load_alignments: 0 #', action='store_true', help='load the binarized alignments'
left_pad_source: 0 #', default='True', type=str, metavar='BOOL', help='pad the source on the left'
left_pad_target: 0 #', default='False', type=str, metavar='BOOL', help='pad the target on the left'
max_source_positions: 400 #', default=1024, type=int, metavar='N', help='max number of tokens in the source sequence'
max_target_positions: 30 #', default=1024, type=int, metavar='N', help='max number of tokens in the target sequence'
upsample_primary: 1 #', default=1, type=int, help='amount to upsample primary dataset'
truncate_source: 1 #', action='store_true', default=False, help='truncate source to max-source-positions'
truncate_target: 1 #', action='store_true', default=False, help='truncate target to max-target-positions'
append_eos_to_target: 1 # default=True, append eos to the end of target sentence for summarization task
# options for reporting BLEU during validation
eval_bleu: 1 #action='store_true', help='evaluation with BLEU scores'
eval_bleu_detok: space #', type=str, default="space", help='detokenizer before computing BLEU (e.g., "moses"); required if using --eval-bleu; use "space" to disable detokenization; see fairseq.data.encoders for other options'
eval_bleu_detok_args: ~ #', type=str, metavar='JSON', help='args for building the tokenizer, if needed'
eval_tokenized_bleu: 0 #', action='store_true', default=False, help='if setting, we compute tokenized BLEU instead of sacrebleu'
eval_bleu_remove_bpe: ~ # ', nargs='?', const='@@ ', default=None, help='remove BPE before computing BLEU'
eval_bleu_args: ~ #', type=str, metavar='JSON', help='generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\''
eval_bleu_print_samples: 0 #', action='store_true', help='print sample generations during validation'
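# Example, taken from the eval_bleu_args help above: to validate with a wider
# beam and a length penalty, set
#   eval_bleu_args: '{"beam": 4, "lenpen": 0.6}'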
model:
arch: seq2seq
# encoder
# encoder embedding
encoder_embed_dim: 512 #', type=int, metavar='N', help='encoder embedding dimension'
encoder_embed: ~ #', type=str, metavar='STR', help='path to pre-trained encoder embedding'
encoder_freeze_embed: 0 #', action='store_true', help='freeze encoder embeddings'
# encoder-LSTM
encoder_hidden_size: 512 #', type=int, metavar='N', help='encoder hidden size'
encoder_layers: 1 #', type=int, metavar='N', help='number of encoder layers'
encoder_bidirectional: 0 #, action='store_true', help='make all layers of encoder bidirectional'
# decoder
decoder_embed_dim: 512 # ', type=int, metavar='N', help='decoder embedding dimension'
decoder_embed: ~ #', type=str, metavar='STR', help='path to pre-trained decoder embedding'
decoder_freeze_embed: ~ #', action='store_true', help='freeze decoder embeddings'
decoder_hidden_size: 512 #', type=int, metavar='N', help='decoder hidden size'
decoder_layers: 1 #', type=int, metavar='N', help='number of decoder layers'
decoder_out_embed_dim: 512 #', type=int, metavar='N', help='decoder output embedding dimension'
decoder_attention: 1 #', type=str, metavar='BOOL', help='decoder attention'
adaptive_softmax_cutoff: ~ #', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. Must be used with adaptive_loss criterion'
share_decoder_input_output_embed: 0 #', default=False, action='store_true', help='share decoder input and output embeddings'
share_all_embeddings: 0 #', default=False, action='store_true', help='share encoder, decoder and output embeddings (requires shared dictionary and embed dim)'
# Granular dropout settings (if not specified these default to --dropout)
encoder_dropout_in: 0.1 #', type=float, metavar='D', help='dropout probability for encoder input embedding'
encoder_dropout_out: 0.2 #', type=float, metavar='D', help='dropout probability for encoder output'
decoder_dropout_in: 0.1 #' type=float, metavar='D', help='dropout probability for decoder input embedding'
decoder_dropout_out: 0.2 #', type=float, metavar='D', help='dropout probability for decoder output'
# network: decoder
# pointer: False # if set to True, add a pointer-generator to the model. TBC
max_source_positions: 400
max_target_positions: 30
optimization:
max_epoch: 200 # '--me', default=0, type=int, metavar='N', help='force stop training at specified epoch'
max_update: 0 # '--mu', default=0, type=int, metavar='N', help='force stop training at specified update'
clip_norm: 25 # default=25, type=float, metavar='NORM', help='clip threshold of gradients'
update_freq: # default='1', metavar='N1,N2,...,N_K', type=lambda uf: eval_str_list(uf, type=int), help='update parameters every N_i batches, when in epoch i'
- 1
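# Effective batch size, assuming fairseq-style gradient accumulation:
#   sentences_per_update = max_sentences * update_freq * distributed_world_size
#   = 64 * 1 * 1 = 64 with the values in this file.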
lrs: # '--learning-rate', default='0.25', type=eval_str_list, metavar='LR_1,LR_2,...,LR_N', help='learning rate for the first N epochs; all epochs >N using LR_N (note: this may be interpreted differently depending on --lr-scheduler)'
- 1e-4
min_lr: -1 # default=-1, type=float, metavar='LR', help='stop training when the learning rate reaches this minimum'
use_bmuf: 0 # default=False, action='store_true', help='specify global optimizer for syncing models on different GPUs/shards'
force_anneal: 0 # '--fa', type=int, metavar='N', help='force annealing at specified epoch'
warmup_updates: 0 # default=0, type=int, metavar='N', help='warmup the learning rate linearly for the first N updates'
lr_shrink: 0.99 # default=0.1, type=float, metavar='LS', help='learning rate shrink factor for annealing, lr_new = (lr * lr_shrink)'
# for cross entropy
sentence_avg: 1 # action='store_true', help='normalize gradients by the number of sentences in a batch (default is to normalize by number of tokens)'
adam:
adam_betas: '(0.9, 0.999)' # default='(0.9, 0.999)', metavar='B', help='betas for Adam optimizer'
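# The betas string is parsed into a Python tuple by the optimizer (fairseq
# does roughly betas=eval(adam_betas)), giving (0.9, 0.999) here.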
adam_eps: 1e-8 # type=float, default=1e-8, metavar='D', help='epsilon for Adam optimizer'
weight_decay: 0.0 # '--wd', default=0.0, type=float, metavar='WD', help='weight decay'
adam_epsilon: 1e-8 # default=1e-8, type=float, help="Epsilon for Adam optimizer."
max_grad_norm: 1.0 # default=1.0, type=float, help="Max gradient norm."
num_train_epochs: 5 # default=5, type=float, help="Total number of training epochs to perform."
max_steps: -1 # default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
warmup_steps: 0 # default=0, type=int, help="Linear warmup over warmup_steps."
gradient_accumulation_steps: 1 # type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass."
checkpoint:
restore_file: checkpoint_last.pt # default='checkpoint_last.pt', help='filename from which to load checkpoint (default: <save-dir>/checkpoint_last.pt'
reset_dataloader: ~ # action='store_true', help='if set, does not reload dataloader state from the checkpoint'
reset_lr_scheduler: ~ # action='store_true', help='if set, does not load lr scheduler state from the checkpoint'
reset_meters: ~ # action='store_true', help='if set, does not load meters from the checkpoint'
reset_optimizer: ~ # action='store_true', help='if set, does not load optimizer state from the checkpoint'
optimizer_overrides: '{}' # default="{}", type=str, metavar='DICT', help='a dictionary used to override optimizer args when loading a checkpoint'
save_interval: 1 # type=int, default=1, metavar='N', help='save a checkpoint every N epochs'
save_interval_updates: 0 # type=int, default=0, metavar='N', help='save a checkpoint (and validate) every N updates'
keep_interval_updates: 0 # type=int, default=-1, metavar='N', help='keep the last N checkpoints saved with --save-interval-updates'
keep_last_epochs: -1 # type=int, default=-1, metavar='N', help='keep last N epoch checkpoints'
keep_best_checkpoints: -1 # type=int, default=-1, metavar='N', help='keep best N checkpoints based on scores'
no_save: 0 # action='store_true', help='don\'t save models or checkpoints'
no_epoch_checkpoints: 1 # action='store_true', help='only store last and best checkpoints'
no_last_checkpoints: 0 # action='store_true', help='don\'t store last checkpoints'
no_save_optimizer_state: ~ # action='store_true', help='don\'t save optimizer-state as part of checkpoint'
best_checkpoint_metric: bleu # type=str, default='loss', help='metric to use for saving "best" checkpoints'
maximize_best_checkpoint_metric: 1 # action='store_true', help='select the largest metric value for saving "best" checkpoints'
patience: 10 # type=int, default=-1, metavar='N', help=('early stop training if valid performance doesn\'t improve for N consecutive validation runs; note that this is influenced by --validate-interval')
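# Early-stopping example with the values here: validation runs every epoch
# (validate_interval: 1 above), so training halts after 10 consecutive epochs
# without improvement in the maximized bleu metric.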
save_dir: ~/python_wan/summarization/data-mmap/seq2seq/checkpoints # type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.",
should_continue: 0 # action="store_true", help="Whether to continue from latest checkpoint in output_dir"
model_name_or_path: ~ # default=None, type=str, help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
cache_dir: ~ # default=None, type=str, help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
logging_steps: 500 # type=int, default=500, help="Log every X updates steps."
save_steps: 2000 # type=int, default=2000, help="Save checkpoint every X updates steps."
save_total_limit: 2 # type=int, default=2, help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
overwrite_output_dir: 0 # action="store_true", help="Overwrite the content of the output directory"
overwrite_cache: 0 # action="store_true", help="Overwrite the cached training and evaluation sets"
eval:
path: ~/python_wan/summarization/data-mmap/seq2seq/checkpoints/checkpoint_best.pt # , metavar='FILE', help='path(s) to model file(s), colon separated'
result_path: ~
remove_bpe: ~ # ', nargs='?', const='@@ ', default=None, help='remove BPE tokens before scoring (can be set to sentencepiece)'
quiet: 0 # ', action='store_true', help='only print final scores'
model_overrides: '{}' # default="{}", type=str, metavar='DICT', help='a dictionary used to override model args at generation '
max_sentences: 64 # evaluation batch size; inference uses far less memory than training, so this can safely be larger than the training batch size
beam: 1 # ', default=5, type=int, metavar='N', help='beam size')
nbest: 1 # ', default=1, type=int, metavar='N', help='number of hypotheses to output')
max_len_a: 0 #', default=0, type=float, metavar='N', help=('generate sequences of maximum length ax + b, where x is the source length'))
max_len_b: 30 #', default=200, type=int, metavar='N', help=('generate sequences of maximum length ax + b, where x is the source length'))
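# Worked example: the generation cap is max_len_a * src_len + max_len_b,
# i.e. 0 * src_len + 30 = 30 tokens, matching max_target_positions above.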
min_len: 1 # ', default=1, type=float, metavar='N', help=('minimum generation length'))
match_source_len: 0 #', default=False, action='store_true', help=('generations should match the source length'))
no_early_stop: 0 # ', action='store_true', help='deprecated')
unnormalized: 0 # ', action='store_true', help='compare unnormalized hypothesis scores')
no_beamable_mm: 0 # ', action='store_true', help='don\'t use BeamableMM in attention layers')
lenpen: 1 # ', default=1, type=float, help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences')
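# Length-penalty sketch (fairseq convention): hypothesis scores are divided by
# length**lenpen before ranking, so lenpen: 1 is plain per-token normalization;
# <1 favors shorter outputs, >1 longer ones (per the inline help).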
unkpen: 0 # ', default=0, type=float, help='unknown word penalty: <0 produces more unks, >0 produces fewer')
replace_unk: ~ # ', nargs='?', const=True, default=None, help='perform unknown replacement (optionally with alignment dictionary)')
sacrebleu: 0 # ', action='store_true', help='score with sacrebleu')
score_reference: 0 # ', action='store_true', help='just score the reference translation')
prefix_size: 0 # ', default=0, type=int, metavar='PS', help='initialize generation by target prefix of given length')
no_repeat_ngram_size: 0 # ', default=0, type=int, metavar='N', help='ngram blocking such that this size ngram cannot be repeated in the generation')
sampling: 0 # ', action='store_true', help='sample hypotheses instead of using beam search')
sampling_topk: -1 # ', default=-1, type=int, metavar='PS', help='sample from top K likely next words instead of all words')
sampling_topp: -1 # ', default=-1.0, type=float, metavar='PS', help='sample from the smallest set whose cumulative probability mass exceeds p for next words')
temperature: 1. #', default=1., type=float, metavar='N', help='temperature for generation')
diverse_beam_groups: -1 # ', default=-1, type=int, metavar='N', help='number of groups for Diverse Beam Search')
diverse_beam_strength: 0.5 # ', default=0.5, type=float, metavar='N', help='strength of diversity penalty for Diverse Beam Search')
diversity_rate: -1.0 # ', default=-1.0, type=float, metavar='N', help='strength of diversity penalty for Diverse Siblings Search')
print_alignment: 0 # ', action='store_true', help='if set, uses attention feedback to compute and print alignment to source tokens')
print_step: 0 # ', action='store_true')
# arguments for iterative refinement generator
iter_decode_eos_penalty: 0.0 # ', default=0.0, type=float, metavar='N', help='if > 0.0, penalizes early stopping in decoding.')
iter_decode_max_iter: 10 # ', default=10, type=int, metavar='N', help='maximum iterations for iterative refinement.')
iter_decode_force_max_iter: 0 #', action='store_true', help='if set, run exactly the maximum number of iterations without early stopping')
iter_decode_with_beam: 1 # ', default=1, type=int, metavar='N', help='if > 1, model will generate translations varying by the lengths.')
iter_decode_with_external_reranker: 0 # ', action='store_true', help='if set, the last checkpoint is assumed to be a reranker that rescores the translations')
retain_iter_history: 0 # ', action='store_true', help='if set, decoding returns the whole history of iterative refinement')
# special decoding format for advanced decoding.
decoding_format: ~ # ', default=None, type=str, choices=['unigram', 'ensemble', 'vote', 'dp', 'bs'])
nltk_bleu: 1 # also report corpus BLEU computed with NLTK
rouge: 1 # also report ROUGE scores