
Commit

added support for checkpoint only
zhenghh04 committed Feb 19, 2025
1 parent 4d4ea28 · commit 7c37e3f
Showing 12 changed files with 183 additions and 140 deletions.
6 changes: 3 additions & 3 deletions dlio_benchmark/checkpointing/base_checkpointing.py
@@ -98,7 +98,7 @@ def __init__(self, ext):
                 #logging.info(f"{utcnow()} {self.args.my_rank} [{start_layer}-{end_layer}]:::{layer_index}: {size/1024./1024./1024:.4f} GB ")
                 model_checkpoint_size += size
         if self.args.my_rank == 0:
-            logging.info(f"{utcnow()} Layer states defined! {model_checkpoint_size/1024./1024./1024} GB per rank")
+            logging.debug(f"{utcnow()} Layer states defined! {model_checkpoint_size/1024./1024./1024} GB per rank")

         # optimization state
         self.optimization_state = None
@@ -121,7 +121,7 @@ def __init__(self, ext):
                 optimizer_checkpoint_size += state * get_datatype_size(self.args.optimizer_datatype)
                 self.optimization_state[str(index)] = self.get_tensor(state, self.args.optimizer_datatype)
         if self.args.my_rank == 0:
-            logging.info(f"{utcnow()} Optimizer state defined: {self.checkpoint_size / 1024./1024./1024} GB per rank")
+            logging.debug(f"{utcnow()} Optimizer state defined: {optimizer_checkpoint_size / 1024./1024./1024} GB per rank")
         # layer state


@@ -137,7 +137,7 @@ def __init__(self, ext):
             model_checkpoint_size /= self.data_parallelism
         self.checkpoint_size = model_checkpoint_size + optimizer_checkpoint_size
         if self.args.my_rank == 0:
-            logging.info(f"{utcnow()} Layer size: {model_checkpoint_size} GB")
+            logging.info(f"{utcnow()} Model size: {model_checkpoint_size} GB")
             logging.info(f"{utcnow()} Optimizer state size: {optimizer_checkpoint_size} GB")
             logging.info(f"{utcnow()} Total checkpoint size: {self.checkpoint_size} GB")
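The reworked messages log the model, optimizer, and total checkpoint sizes in GB. As a rough cross-check of the numbers these lines print for the llama_405b workload below, standard transformer parameter counting gives the aggregate sizes; the closed-form formula here is an assumption for illustration only — DLIO itself sums the per-layer tensor sizes as shown above.

# Back-of-the-envelope checkpoint-size estimate (assumed formula; DLIO sums
# per-layer tensors rather than using this closed form).
hidden, ffn, layers = 16384, 53248, 126          # from llama_405b.yaml in this commit
vocab, heads, kv_heads = 128256, 128, 8

attn  = 2 * hidden * hidden + 2 * hidden * (hidden * kv_heads // heads)  # Q, O + grouped K, V
mlp   = 3 * hidden * ffn                                                  # SwiGLU: gate, up, down
embed = 2 * vocab * hidden                                                # input embedding + output head
params = layers * (attn + mlp) + embed                                    # ~4.06e11 parameters

GB = 1024.0 ** 3
model_gb     = params * 2 / GB     # fp16 weights
optimizer_gb = params * 12 / GB    # assumed Adam: fp32 master weights + momentum + variance
print(f"model ~{model_gb:.0f} GB, optimizer ~{optimizer_gb:.0f} GB, total ~{model_gb + optimizer_gb:.0f} GB")

The per-rank figures reported by the debug messages are smaller, since the tensors are divided across the tensor-, pipeline-, and (for optimizer state) data-parallel ranks configured in the workload file.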
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/llama_1t.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_405b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 128
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -44,5 +43,6 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_405b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

10 changes: 5 additions & 5 deletions dlio_benchmark/configs/workload/llama_405b.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_405b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 126
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -11,7 +10,7 @@ model:
     pipeline: 16
     zero_stage: 1
   transformer:
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 16384
     ffn_hidden_size: 53248
     num_attention_heads: 128
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,5 +44,6 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_405b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10
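Across these workload files the pattern is the same: generate_data and train are switched off, checkpoint stays on, and the step-based trigger (steps_between_checkpoints) is replaced by a wall-clock interval plus a fixed count. A checkpoint-only run then amounts to a timed write loop; the sketch below is illustrative only, and the checkpointer object and its checkpoint(epoch, step_number) method are assumed names, not DLIO's actual driver API.

import time

def checkpoint_only(checkpointer, num_checkpoints: int = 10,
                    time_between_checkpoints: float = 5.0) -> None:
    # Minimal sketch of a checkpoint-only phase, assuming a checkpointer object
    # with a checkpoint(epoch, step_number) method (hypothetical name).
    # Writes num_checkpoints checkpoints with no data generation or training,
    # sleeping time_between_checkpoints seconds between consecutive writes.
    for step in range(num_checkpoints):
        checkpointer.checkpoint(epoch=0, step_number=step)
        if step + 1 < num_checkpoints:
            time.sleep(time_between_checkpoints)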

12 changes: 6 additions & 6 deletions dlio_benchmark/configs/workload/llama_70b.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_70b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 80
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -11,17 +10,17 @@ model:
     pipeline: 4
     zero_stage: 1
   transformer:
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 8192
     ffn_hidden_size: 28672
-    num_attention_heads: 64
+    num_attention_heads: 128
     num_kv_heads: 8

 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_70b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

10 changes: 5 additions & 5 deletions dlio_benchmark/configs/workload/llama_70b_zero3.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_70b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 80
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -11,7 +10,7 @@ model:
     pipeline: 1
     zero_stage: 3
   transformer:
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 8192
     ffn_hidden_size: 28672
     num_attention_heads: 128
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_70b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10
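The zero3 variant above keeps the same model shape but switches the parallelism to pipeline: 1 and zero_stage: 3 (the plain llama_70b config uses pipeline: 4 and zero_stage: 1). Under ZeRO-3 the weights, not just the optimizer state, are sharded across the data-parallel group, which is presumably what the model_checkpoint_size /= self.data_parallelism branch in base_checkpointing.py accounts for. A rough per-rank illustration, with an assumed data-parallel size that is not part of these configs:

# Illustrative per-rank sizes under ZeRO-3 (rank count is an assumption).
params = 70e9                       # order of magnitude for a 70B-parameter model
GB = 1024.0 ** 3
data_parallel_ranks = 64            # assumed; not set in the workload files

per_rank_model_gb     = params * 2  / GB / data_parallel_ranks   # fp16 weights, sharded under ZeRO-3
per_rank_optimizer_gb = params * 12 / GB / data_parallel_ranks   # fp32 Adam state, also sharded
print(f"per rank: model ~{per_rank_model_gb:.1f} GB, optimizer ~{per_rank_optimizer_gb:.1f} GB")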
8 changes: 4 additions & 4 deletions dlio_benchmark/configs/workload/llama_7b.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_7b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 32
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_7b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

8 changes: 4 additions & 4 deletions dlio_benchmark/configs/workload/llama_7b_zero3.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_7b_zero3
   type: transformer
-  model_size_bytes: 30102
   num_layers: 32
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_7b_zero3
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

9 changes: 5 additions & 4 deletions dlio_benchmark/configs/workload/llama_8b_zero3.yaml
@@ -1,7 +1,6 @@
 model:
   name: llama_8b_zero3
   type: transformer
-  model_size_bytes: 30102
   num_layers: 32
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -19,8 +18,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -44,4 +43,6 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_8b_zero3
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10
