minor changes to the configuration files

argonne-lcf · Feb 21, 2025 · aafa316 · aafa316
1 parent f8a0bf4
commit aafa316
Show file tree

Hide file tree

Showing 12 changed files with 20 additions and 13 deletions.
diff --git a/dlio_benchmark/configs/workload/llama_1t.yaml b/dlio_benchmark/configs/workload/llama_1t.yaml
@@ -10,9 +10,12 @@ model:
     pipeline: 64
     zero_stage: 1
   transformer: 
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 25872
     ffn_hidden_size: 98304
+    num_attention_heads: 32
+    num_kv_heads: 32
+
 
 framework: pytorch
 
@@ -37,7 +40,7 @@ reader:
 
 train:
   epochs: 3
-  computation_time: 5 # 2.44 sec per step
+  computation_time: 5 # Not an actual measurement; just sets an interval so that a checkpoint is taken every 5 seconds
   total_training_steps: 5
 
 

diff --git a/dlio_benchmark/configs/workload/llama_405b.yaml b/dlio_benchmark/configs/workload/llama_405b.yaml
@@ -1,4 +1,3 @@
-# 8 node run with 4 GPUs per node and TPSIZE=4 and PPSIZE=8
 model: 
   name: llama_405b
   type: transformer
@@ -39,7 +38,7 @@ reader:
 
 train:
   epochs: 3
-  computation_time: 5 # 2.44 sec per step
+  computation_time: 5 # Not an actual measurement; just sets an interval so that a checkpoint is taken every 5 seconds
   total_training_steps: 5
 
 checkpoint:

diff --git a/dlio_benchmark/configs/workload/llama_70b.yaml b/dlio_benchmark/configs/workload/llama_70b.yaml
@@ -1,4 +1,3 @@
-# we mimic the checkpoint data for megatron-deepspeed
 model: 
   name: llama_70b
   type: transformer
@@ -39,7 +38,7 @@ reader:
 
 train:
   epochs: 1
-  computation_time: 5 # 2.44 sec per step
+  computation_time: 5 # Not an actual measurement; just sets an interval so that a checkpoint is taken every 5 seconds
   total_training_steps: 5
 
 checkpoint:

diff --git a/dlio_benchmark/configs/workload/llama_70b_zero3.yaml b/dlio_benchmark/configs/workload/llama_70b_zero3.yaml
@@ -1,4 +1,3 @@
-# we mimic the checkpoint data for megatron-deepspeed
 model: 
   name: llama_70b
   type: transformer
@@ -39,7 +38,7 @@ reader:
 
 train:
   epochs: 1
-  computation_time: 5 # 2.44 sec per step
+  computation_time: 5 # Not an actual measurement; just sets an interval so that a checkpoint is taken every 5 seconds
   total_training_steps: 5
 
 checkpoint:

diff --git a/dlio_benchmark/configs/workload/llama_7b_zero3.yaml b/dlio_benchmark/configs/workload/llama_7b_zero3.yaml
@@ -1,4 +1,3 @@
-# 8 node run with 4 GPUs per node and TPSIZE=4 and PPSIZE=8
 model:
   name: llama_7b_zero3
   type: transformer
@@ -37,9 +36,10 @@ reader:
   file_shuffle: seed
   sample_shuffle: seed
 
+# The computation_time value below has not been validated.
 train:
   epochs: 1
-  computation_time: 5 # This is not actual measurement. Just set an interval so that checkpoint every 5 seconds
+  computation_time: 5
   total_training_steps: 5
 
 checkpoint:

diff --git a/dlio_benchmark/configs/workload/resnet50_a100.yaml b/dlio_benchmark/configs/workload/resnet50_a100.yaml
@@ -1,5 +1,6 @@
 model: 
   name: resnet50
+  type: cnn
 
 framework: tensorflow
 

diff --git a/dlio_benchmark/configs/workload/resnet50_h100.yaml b/dlio_benchmark/configs/workload/resnet50_h100.yaml
@@ -1,5 +1,6 @@
 model: 
   name: resnet50
+  type: cnn
 
 framework: tensorflow
 

diff --git a/dlio_benchmark/configs/workload/resnet50_tf.yaml b/dlio_benchmark/configs/workload/resnet50_tf.yaml
@@ -1,5 +1,6 @@
 model: 
   name: resnet50
+  type: cnn
 
 framework: tensorflow
 

diff --git a/dlio_benchmark/configs/workload/resnet50_v100.yaml b/dlio_benchmark/configs/workload/resnet50_v100.yaml
@@ -1,5 +1,6 @@
 model: 
   name: resnet50
+  type: cnn
 
 framework: tensorflow
 

diff --git a/dlio_benchmark/configs/workload/unet3d_a100.yaml b/dlio_benchmark/configs/workload/unet3d_a100.yaml
@@ -1,5 +1,7 @@
 model: 
   name: unet3d
+  type: cnn
+  model_size: 499153191
 
 framework: pytorch
 
@@ -32,7 +34,6 @@ checkpoint:
   checkpoint_folder: checkpoints/unet3d
   checkpoint_after_epoch: 5
   epochs_between_checkpoints: 2
-  model_size: 499153191
 
 metric:
   au: 0.90
diff --git a/dlio_benchmark/configs/workload/unet3d_h100.yaml b/dlio_benchmark/configs/workload/unet3d_h100.yaml
@@ -1,5 +1,7 @@
 model: 
   name: unet3d
+  type: cnn
+  model_size: 499153191
 
 framework: pytorch
 
@@ -32,7 +34,6 @@ checkpoint:
   checkpoint_folder: checkpoints/unet3d
   checkpoint_after_epoch: 5
   epochs_between_checkpoints: 2
-  model_size: 499153191
 
 metric:
   au: 0.90
diff --git a/dlio_benchmark/configs/workload/unet3d_v100.yaml b/dlio_benchmark/configs/workload/unet3d_v100.yaml
@@ -1,5 +1,7 @@
 model: 
   name: unet3d
+  type: cnn
+  model_size: 499153191
 
 framework: pytorch
 
@@ -32,5 +34,4 @@ checkpoint:
   checkpoint_folder: checkpoints/unet3d
   checkpoint_after_epoch: 5
   epochs_between_checkpoints: 2
-  model_size: 499153191