-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_config.yaml
42 lines (37 loc) · 2.19 KB
/
train_config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# General run config
# Top-level settings for the run; each inline comment lists the accepted
# values (or the expected type) followed by a short description.
ppn: 1  # integer - number of MPI processes per node for ML training
ppd: 1  # integer - number of MPI processes per device for ML training
logging: "debug"  # no, debug, verbose-perf - type of logging desired
device: "cpu"  # cpu, cuda, xpu - device to train on
distributed: "ddp"  # horovod, ddp - distributed training library
model: "sgs"  # sgs, quadconv - model ID
data_path: "synthetic"  # string, synthetic - path to training data to load
# integer - number of data samples per rank
# NOTE(review): the previous comment read "111 - 20x mini_batch size", which
# does not match mini_batch: 1 in this file - confirm the intended relation.
num_samples_per_rank: 111
repeatability: false  # true, false - make training deterministic
# Training hyperparameters config
# Each inline comment lists the accepted values (or the expected type)
# followed by a short description.
epochs: 10  # integer - max number of epochs for training
mini_batch: 1  # integer - mini batch size for SGD update
learning_rate: 0.001  # float - serial learning rate
tolerance: 1.0e-8  # float - convergence tolerance of validation loss
validation_split: 0.20  # float - percentage of data kept for validation
optimizer: "Adam"  # Adam - optimizer used for training
# None, Plateau - optimizer scheduler to be used
# The value is quoted, so it loads as the string "None", not a YAML null.
scheduler: "None"
precision: "fp32"  # fp32, fp64, bf16, tf32 - precision for training
mixed_precision: false  # true, false - enable automatic mixed precision
name: "./NNmodel"  # string - name used to save ML model
# true, false - save database contents at the end of training to .rdb file
save_db: false
# Online train config
# The settings below are indented so that they load as children of `online`
# rather than as unrelated top-level keys.
online:
  db_launch: ""  # colocated, clustered - deployment method of SmartSim database
  # integer - number of tensors to grab from database at once;
  # 0 grabs all tensors at once (recommended)
  batch: 0
  simprocs: 1  # int - number of MPI processes simulation is running with
  # int - number of nodes database is sharded across. Always 1 for colocated DB.
  db_nodes: 1
# Model specific config
# Settings for the SGS model; indented so that they load as children of `sgs`
# rather than as unrelated top-level keys.
sgs:
  neurons: 20  # integer - number of neurons in layers for SGS model
  layers: 1  # integer - number of hidden layers for SGS model
  # true, false - compute the model inputs and outputs from raw data
  comp_model_ins_outs: false
# Settings for the QCNN model; indented so that they load as children of
# `quadconv` rather than as unrelated top-level keys.
quadconv:
  mesh_file: ""  # string - path to mesh nodes for QCNN model (offline only)
  channels: 4  # integer - channels for QCNN model
  quadconv_config: ""  # string - path to config file for QCNN model