Skip to content

Commit

Permalink
minor changes to the configuration files
Browse files Browse the repository at this point in the history
  • Loading branch information
zhenghh04 committed Feb 21, 2025
1 parent f8a0bf4 commit aafa316
Show file tree
Hide file tree
Showing 12 changed files with 20 additions and 13 deletions.
7 changes: 5 additions & 2 deletions dlio_benchmark/configs/workload/llama_1t.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@ model:
pipeline: 64
zero_stage: 1
transformer:
vocab_size: 128000
vocab_size: 128256
hidden_size: 25872
ffn_hidden_size: 98304
num_attention_heads: 32
num_kv_heads: 32


framework: pytorch

Expand All @@ -37,7 +40,7 @@ reader:

train:
epochs: 3
computation_time: 5 # 2.44 sec per step
computation_time: 5  # Not an actual measurement; just an interval chosen so that checkpointing happens every 5 seconds
total_training_steps: 5


Expand Down
3 changes: 1 addition & 2 deletions dlio_benchmark/configs/workload/llama_405b.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# 8 node run with 4 GPUs per node and TPSIZE=4 and PPSIZE=8
model:
name: llama_405b
type: transformer
Expand Down Expand Up @@ -39,7 +38,7 @@ reader:

train:
epochs: 3
computation_time: 5 # 2.44 sec per step
computation_time: 5  # Not an actual measurement; just an interval chosen so that checkpointing happens every 5 seconds
total_training_steps: 5

checkpoint:
Expand Down
3 changes: 1 addition & 2 deletions dlio_benchmark/configs/workload/llama_70b.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# we mimic the checkpoint data for megatron-deepspeed
model:
name: llama_70b
type: transformer
Expand Down Expand Up @@ -39,7 +38,7 @@ reader:

train:
epochs: 1
computation_time: 5 # 2.44 sec per step
computation_time: 5  # Not an actual measurement; just an interval chosen so that checkpointing happens every 5 seconds
total_training_steps: 5

checkpoint:
Expand Down
3 changes: 1 addition & 2 deletions dlio_benchmark/configs/workload/llama_70b_zero3.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# we mimic the checkpoint data for megatron-deepspeed
model:
name: llama_70b
type: transformer
Expand Down Expand Up @@ -39,7 +38,7 @@ reader:

train:
epochs: 1
computation_time: 5 # 2.44 sec per step
computation_time: 5  # Not an actual measurement; just an interval chosen so that checkpointing happens every 5 seconds
total_training_steps: 5

checkpoint:
Expand Down
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/llama_7b_zero3.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# 8 node run with 4 GPUs per node and TPSIZE=4 and PPSIZE=8
model:
name: llama_7b_zero3
type: transformer
Expand Down Expand Up @@ -37,9 +36,10 @@ reader:
file_shuffle: seed
sample_shuffle: seed

# The computation time is not validated.
train:
epochs: 1
computation_time: 5  # Not an actual measurement; just an interval chosen so that checkpointing happens every 5 seconds
computation_time: 5
total_training_steps: 5

checkpoint:
Expand Down
1 change: 1 addition & 0 deletions dlio_benchmark/configs/workload/resnet50_a100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
model:
name: resnet50
type: cnn

framework: tensorflow

Expand Down
1 change: 1 addition & 0 deletions dlio_benchmark/configs/workload/resnet50_h100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
model:
name: resnet50
type: cnn

framework: tensorflow

Expand Down
1 change: 1 addition & 0 deletions dlio_benchmark/configs/workload/resnet50_tf.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
model:
name: resnet50
type: cnn

framework: tensorflow

Expand Down
1 change: 1 addition & 0 deletions dlio_benchmark/configs/workload/resnet50_v100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
model:
name: resnet50
type: cnn

framework: tensorflow

Expand Down
3 changes: 2 additions & 1 deletion dlio_benchmark/configs/workload/unet3d_a100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
model:
name: unet3d
type: cnn
model_size: 499153191

framework: pytorch

Expand Down Expand Up @@ -32,7 +34,6 @@ checkpoint:
checkpoint_folder: checkpoints/unet3d
checkpoint_after_epoch: 5
epochs_between_checkpoints: 2
model_size: 499153191

metric:
au: 0.90
3 changes: 2 additions & 1 deletion dlio_benchmark/configs/workload/unet3d_h100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
model:
name: unet3d
type: cnn
model_size: 499153191

framework: pytorch

Expand Down Expand Up @@ -32,7 +34,6 @@ checkpoint:
checkpoint_folder: checkpoints/unet3d
checkpoint_after_epoch: 5
epochs_between_checkpoints: 2
model_size: 499153191

metric:
au: 0.90
3 changes: 2 additions & 1 deletion dlio_benchmark/configs/workload/unet3d_v100.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
model:
name: unet3d
type: cnn
model_size: 499153191

framework: pytorch

Expand Down Expand Up @@ -32,5 +34,4 @@ checkpoint:
checkpoint_folder: checkpoints/unet3d
checkpoint_after_epoch: 5
epochs_between_checkpoints: 2
model_size: 499153191

0 comments on commit aafa316

Please sign in to comment.