
Commit

added support for checkpoint only
zhenghh04 committed Feb 19, 2025
1 parent 4d4ea28 · commit 7c37e3f
Showing 12 changed files with 183 additions and 140 deletions.
6 changes: 3 additions & 3 deletions dlio_benchmark/checkpointing/base_checkpointing.py
@@ -98,7 +98,7 @@ def __init__(self, ext):
                 #logging.info(f"{utcnow()} {self.args.my_rank} [{start_layer}-{end_layer}]:::{layer_index}: {size/1024./1024./1024:.4f} GB ")
                 model_checkpoint_size += size
         if self.args.my_rank == 0:
-            logging.info(f"{utcnow()} Layer states defined! {model_checkpoint_size/1024./1024./1024} GB per rank")
+            logging.debug(f"{utcnow()} Layer states defined! {model_checkpoint_size/1024./1024./1024} GB per rank")

         # optimization state
         self.optimization_state = None
@@ -121,7 +121,7 @@ def __init__(self, ext):
                 optimizer_checkpoint_size += state * get_datatype_size(self.args.optimizer_datatype)
                 self.optimization_state[str(index)] = self.get_tensor(state, self.args.optimizer_datatype)
         if self.args.my_rank == 0:
-            logging.info(f"{utcnow()} Optimizer state defined: {self.checkpoint_size / 1024./1024./1024} GB per rank")
+            logging.debug(f"{utcnow()} Optimizer state defined: {optimizer_checkpoint_size / 1024./1024./1024} GB per rank")
         # layer state


@@ -137,7 +137,7 @@ def __init__(self, ext):
             model_checkpoint_size /= self.data_parallelism
         self.checkpoint_size = model_checkpoint_size + optimizer_checkpoint_size
         if self.args.my_rank == 0:
-            logging.info(f"{utcnow()} Layer size: {model_checkpoint_size} GB")
+            logging.info(f"{utcnow()} Model size: {model_checkpoint_size} GB")
             logging.info(f"{utcnow()} Optimizer state size: {optimizer_checkpoint_size} GB")
             logging.info(f"{utcnow()} Total checkpoint size: {self.checkpoint_size} GB")
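The reworked messages log the model, optimizer, and total checkpoint sizes in GB. As a rough cross-check of the numbers these lines print for the llama_405b workload below, standard transformer parameter counting gives the aggregate sizes; the closed-form formula here is an assumption for illustration only — DLIO itself sums the per-layer tensor sizes as shown above.

# Back-of-the-envelope checkpoint-size estimate (assumed formula; DLIO sums
# per-layer tensors rather than using this closed form).
hidden, ffn, layers = 16384, 53248, 126          # from llama_405b.yaml in this commit
vocab, heads, kv_heads = 128256, 128, 8

attn  = 2 * hidden * hidden + 2 * hidden * (hidden * kv_heads // heads)  # Q, O + grouped K, V
mlp   = 3 * hidden * ffn                                                  # SwiGLU: gate, up, down
embed = 2 * vocab * hidden                                                # input embedding + output head
params = layers * (attn + mlp) + embed                                    # ~4.06e11 parameters

GB = 1024.0 ** 3
model_gb     = params * 2 / GB     # fp16 weights
optimizer_gb = params * 12 / GB    # assumed Adam: fp32 master weights + momentum + variance
print(f"model ~{model_gb:.0f} GB, optimizer ~{optimizer_gb:.0f} GB, total ~{model_gb + optimizer_gb:.0f} GB")

The per-rank figures reported by the debug messages are smaller, since the tensors are divided across the tensor-, pipeline-, and (for optimizer state) data-parallel ranks configured in the workload file.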
4 changes: 2 additions & 2 deletions dlio_benchmark/configs/workload/llama_1t.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_405b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 128
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -44,5 +43,6 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_405b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

10 changes: 5 additions & 5 deletions dlio_benchmark/configs/workload/llama_405b.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_405b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 126
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -11,7 +10,7 @@ model:
     pipeline: 16
     zero_stage: 1
   transformer:
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 16384
     ffn_hidden_size: 53248
     num_attention_heads: 128
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,5 +44,6 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_405b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10
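Across these workload files the pattern is the same: generate_data and train are switched off, checkpoint stays on, and the step-based trigger (steps_between_checkpoints) is replaced by a wall-clock interval plus a fixed count. A checkpoint-only run then amounts to a timed write loop; the sketch below is illustrative only, and the checkpointer object and its checkpoint(epoch, step_number) method are assumed names, not DLIO's actual driver API.

import time

def checkpoint_only(checkpointer, num_checkpoints: int = 10,
                    time_between_checkpoints: float = 5.0) -> None:
    # Minimal sketch of a checkpoint-only phase, assuming a checkpointer object
    # with a checkpoint(epoch, step_number) method (hypothetical name).
    # Writes num_checkpoints checkpoints with no data generation or training,
    # sleeping time_between_checkpoints seconds between consecutive writes.
    for step in range(num_checkpoints):
        checkpointer.checkpoint(epoch=0, step_number=step)
        if step + 1 < num_checkpoints:
            time.sleep(time_between_checkpoints)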

12 changes: 6 additions & 6 deletions dlio_benchmark/configs/workload/llama_70b.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_70b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 80
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -11,17 +10,17 @@ model:
     pipeline: 4
     zero_stage: 1
   transformer:
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 8192
     ffn_hidden_size: 28672
-    num_attention_heads: 64
+    num_attention_heads: 128
     num_kv_heads: 8

 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_70b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

10 changes: 5 additions & 5 deletions dlio_benchmark/configs/workload/llama_70b_zero3.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_70b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 80
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -11,7 +10,7 @@ model:
     pipeline: 1
     zero_stage: 3
   transformer:
-    vocab_size: 128000
+    vocab_size: 128256
     hidden_size: 8192
     ffn_hidden_size: 28672
     num_attention_heads: 128
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_70b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10
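The zero3 variant above keeps the same model shape but switches the parallelism to pipeline: 1 and zero_stage: 3 (the plain llama_70b config uses pipeline: 4 and zero_stage: 1). Under ZeRO-3 the weights, not just the optimizer state, are sharded across the data-parallel group, which is presumably what the model_checkpoint_size /= self.data_parallelism branch in base_checkpointing.py accounts for. A rough per-rank illustration, with an assumed data-parallel size that is not part of these configs:

# Illustrative per-rank sizes under ZeRO-3 (rank count is an assumption).
params = 70e9                       # order of magnitude for a 70B-parameter model
GB = 1024.0 ** 3
data_parallel_ranks = 64            # assumed; not set in the workload files

per_rank_model_gb     = params * 2  / GB / data_parallel_ranks   # fp16 weights, sharded under ZeRO-3
per_rank_optimizer_gb = params * 12 / GB / data_parallel_ranks   # fp32 Adam state, also sharded
print(f"per rank: model ~{per_rank_model_gb:.1f} GB, optimizer ~{per_rank_optimizer_gb:.1f} GB")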
8 changes: 4 additions & 4 deletions dlio_benchmark/configs/workload/llama_7b.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_7b
   type: transformer
-  model_size_bytes: 30102
   num_layers: 32
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_7b
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

8 changes: 4 additions & 4 deletions dlio_benchmark/configs/workload/llama_7b_zero3.yaml
@@ -2,7 +2,6 @@
 model:
   name: llama_7b_zero3
   type: transformer
-  model_size_bytes: 30102
   num_layers: 32
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -20,8 +19,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -45,4 +44,5 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_7b_zero3
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10

9 changes: 5 additions & 4 deletions dlio_benchmark/configs/workload/llama_8b_zero3.yaml
@@ -1,7 +1,6 @@
 model:
   name: llama_8b_zero3
   type: transformer
-  model_size_bytes: 30102
   num_layers: 32
   model_datatype: fp16
   optimizer_datatype: fp32
@@ -19,8 +18,8 @@ model:
 framework: pytorch

 workflow:
-  generate_data: True
-  train: True
+  generate_data: False
+  train: False
   checkpoint: True

 dataset:
@@ -44,4 +43,6 @@ train:

 checkpoint:
   checkpoint_folder: checkpoints/llama_8b_zero3
-  steps_between_checkpoints: 1
+  time_between_checkpoints: 5
+  num_checkpoints: 10
