From 9261410e4f9dcad7bd5c63c3ce8bcf27ef2a5c27 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 4 Dec 2023 23:22:33 -0800
Subject: [PATCH 01/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 78dfc3163b..16444cc433 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -73,6 +73,8 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
for _ in range(self.num_local_experts):
expert = MLP(self.config, submodules, is_expert=True)
self.local_experts.append(expert)
+ if args.residual_moe:
+ self.fixed_mlp = MLP(self.config, submodules, is_expert=False)
def gather_indices(self, local_indices):
""" Gather tensors and concatenate along the first dimension."""
@@ -261,16 +263,17 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
- output_total = output_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_total_2 = output_total_2 * max_prob_2
- output_total = output_total + output_total_2
+ output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2)
+ if args.residual_moe:
+ output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states)
+ output_total += output_mlp
output_total = output_total.view(hidden_shape)
if self.add_bias:
- output_bias_total = output_bias_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_bias_total_2 = output_bias_total_2 * max_prob_2
- output_bias_total = output_bias_total + output_bias_total_2
+ output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2)
+ if args.residual_moe:
+ output_bias_total += output_bias_mlp
output_bias_total = output_bias_total.view(hidden_shape)
else:
output_bias_total = None
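A minimal, runnable sketch of the combination rule this patch settles on, assuming the two routed outputs and their router probabilities are already gathered per token (the function and tensor names below are illustrative, not the module's actual variables): the top-2 expert outputs are weighted by their probabilities and renormalized by their sum, and a dense MLP branch is added on top when residual MoE is enabled.

    import torch

    def combine_top2_with_residual(out_1, out_2, prob_1, prob_2, residual_out=None):
        # Probability-weighted top-2 merge, renormalized by the sum of the two
        # selected router probabilities (mirrors the final_route block above).
        combined = (out_1 * prob_1 + out_2 * prob_2) / (prob_1 + prob_2)
        if residual_out is not None:
            # --residual-moe: add the always-on dense MLP output.
            combined = combined + residual_out
        return combined

    tokens, hidden = 8, 16
    out_1, out_2 = torch.randn(tokens, hidden), torch.randn(tokens, hidden)
    probs = torch.rand(tokens, 2).softmax(dim=-1)
    prob_1, prob_2 = probs[:, :1], probs[:, 1:]
    print(combine_top2_with_residual(out_1, out_2, prob_1, prob_2).shape)  # torch.Size([8, 16])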
From f2ecc718b187549e4434bbbfcb8f1eee7bea6460 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Tue, 5 Dec 2023 01:15:18 -0800
Subject: [PATCH 02/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 16444cc433..d242df1605 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -61,6 +61,10 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size()
assert self.config.num_moe_experts % self.expert_parallel_size == 0
+ if layer < "TOTAL_LAYERS" - 2:
+ self.num_local_experts /= 2
+ self.expert_parallel_size /=2
+ "data parallel size" *= 2
self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size
local_expert_indices_offset = (
parallel_state.get_expert_model_parallel_rank() * self.num_local_experts
@@ -68,6 +72,8 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.local_expert_indices = [
local_expert_indices_offset + i for i in range(self.num_local_experts)
]
+ ### if num_loc_exp=exp_par_size=4 and i have 8 GPU's, do some indices identify with same expert?
+ ### this should be accounted for in the lines below
self.local_experts = torch.nn.ModuleList()
for _ in range(self.num_local_experts):
@@ -80,6 +86,7 @@ def gather_indices(self, local_indices):
""" Gather tensors and concatenate along the first dimension."""
group = get_tensor_and_expert_parallel_group()
world_size = torch.distributed.get_world_size(group=group)
+ ### in the example above, is world_size=2/TP?
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return local_indices
@@ -149,6 +156,7 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('routing_gather', log_level=2).start()
+ ### sequence_parallel is when sequence parallel dimension > 1? why should i do this gather when EP_size > 1?
if self.sequence_parallel or (self.expert_parallel_size > 1):
global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe(
hidden_states
@@ -201,6 +209,8 @@ def forward(self, hidden_states):
self.config.timers('routing_loop', log_level=2).start()
for expert_num, expert in enumerate(self.local_experts):
local_expert_index = self.local_expert_indices[expert_num]
+ ### in the example above, local_expert_index could be 7 even when there are 4 experts?
+ ### this means 4 GPUs are idle because local_indices is empty
local_indices = (global_indices == local_expert_index).nonzero()
hidden = global_hidden_states[local_indices, :]
if self.config.timers is not None:
@@ -228,6 +238,7 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('ep_scatter', log_level=2).start()
if self.sequence_parallel or (self.expert_parallel_size > 1):
+ ### what is this? should I apply it to output of self.fixed_mlp too?
output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_total
)
From c7ea711ba94b5ad246f4b2c5d5fe302a79b8e761 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Tue, 5 Dec 2023 15:19:42 -0800
Subject: [PATCH 03/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index d242df1605..06218ba085 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -234,6 +234,8 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('routing_loop').stop()
+ if args.residual_moe:
+ output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states)
if self.config.timers is not None:
self.config.timers('ep_scatter', log_level=2).start()
@@ -242,6 +244,10 @@ def forward(self, hidden_states):
output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_total
)
+ if args.residual_moe:
+ output_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
+ output_mlp
+ )
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_total_2
@@ -256,6 +262,10 @@ def forward(self, hidden_states):
output_bias_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_bias_total_2
)
+ if args.residual_moe:
+ output_bias_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
+ output_bias_mlp
+ )
# bias is duplicated across tensor parallelism ranks;
# reduce scatter reduces bias across tensor parallel_ranks
@@ -277,7 +287,6 @@ def forward(self, hidden_states):
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2)
if args.residual_moe:
- output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states)
output_total += output_mlp
output_total = output_total.view(hidden_shape)
if self.add_bias:
From d5f7af41c5155afc79f363b9e221075790d97d28 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 14:49:32 -0800
Subject: [PATCH 04/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 32 ++++---------------------
1 file changed, 4 insertions(+), 28 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 06218ba085..2fe0ac7bb5 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -61,32 +61,27 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size()
assert self.config.num_moe_experts % self.expert_parallel_size == 0
- if layer < "TOTAL_LAYERS" - 2:
- self.num_local_experts /= 2
- self.expert_parallel_size /=2
- "data parallel size" *= 2
self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size
+ if layer in [5,6]:
+ self.num_local_experts *= 2
+ self.expert_parallel_size /=2
+ print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts)
local_expert_indices_offset = (
parallel_state.get_expert_model_parallel_rank() * self.num_local_experts
)
self.local_expert_indices = [
local_expert_indices_offset + i for i in range(self.num_local_experts)
]
- ### if num_loc_exp=exp_par_size=4 and i have 8 GPU's, do some indices identify with same expert?
- ### this should be accounted for in the lines below
self.local_experts = torch.nn.ModuleList()
for _ in range(self.num_local_experts):
expert = MLP(self.config, submodules, is_expert=True)
self.local_experts.append(expert)
- if args.residual_moe:
- self.fixed_mlp = MLP(self.config, submodules, is_expert=False)
def gather_indices(self, local_indices):
""" Gather tensors and concatenate along the first dimension."""
group = get_tensor_and_expert_parallel_group()
world_size = torch.distributed.get_world_size(group=group)
- ### in the example above, is world_size=2/TP?
# Bypass the function if we are using only 1 GPU.
if world_size == 1:
return local_indices
@@ -156,7 +151,6 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('routing_gather', log_level=2).start()
- ### sequence_parallel is when sequence parallel dimension > 1? why should i do this gather when EP_size > 1?
if self.sequence_parallel or (self.expert_parallel_size > 1):
global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe(
hidden_states
@@ -209,8 +203,6 @@ def forward(self, hidden_states):
self.config.timers('routing_loop', log_level=2).start()
for expert_num, expert in enumerate(self.local_experts):
local_expert_index = self.local_expert_indices[expert_num]
- ### in the example above, local_expert_index could be 7 even when there are 4 experts?
- ### this means 4 GPUs are idle because local_indices is empty
local_indices = (global_indices == local_expert_index).nonzero()
hidden = global_hidden_states[local_indices, :]
if self.config.timers is not None:
@@ -234,20 +226,12 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('routing_loop').stop()
- if args.residual_moe:
- output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states)
-
if self.config.timers is not None:
self.config.timers('ep_scatter', log_level=2).start()
if self.sequence_parallel or (self.expert_parallel_size > 1):
- ### what is this? should I apply it to output of self.fixed_mlp too?
output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_total
)
- if args.residual_moe:
- output_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
- output_mlp
- )
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_total_2
@@ -262,10 +246,6 @@ def forward(self, hidden_states):
output_bias_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
output_bias_total_2
)
- if args.residual_moe:
- output_bias_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe(
- output_bias_mlp
- )
# bias is duplicated across tensor parallelism ranks;
# reduce scatter reduces bias across tensor parallel_ranks
@@ -286,14 +266,10 @@ def forward(self, hidden_states):
self.config.timers('final_route', log_level=2).start()
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2)
- if args.residual_moe:
- output_total += output_mlp
output_total = output_total.view(hidden_shape)
if self.add_bias:
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2)
- if args.residual_moe:
- output_bias_total += output_bias_mlp
output_bias_total = output_bias_total.view(hidden_shape)
else:
output_bias_total = None
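The expert bookkeeping this patch consolidates reduces to a small amount of integer arithmetic. A self-contained sketch, with a plain-integer rank argument standing in for parallel_state.get_expert_model_parallel_rank():

    def local_expert_indices(num_moe_experts, expert_parallel_size, ep_rank):
        # Each expert-parallel rank owns a contiguous slice of the global experts.
        assert num_moe_experts % expert_parallel_size == 0
        num_local_experts = num_moe_experts // expert_parallel_size
        offset = ep_rank * num_local_experts
        return [offset + i for i in range(num_local_experts)]

    # 8 experts split over expert-parallel size 4: rank 2 owns experts [4, 5].
    print(local_expert_indices(8, 4, 2))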
From 35d3ac4b8be934dfa8c834ee45b87dde75a8caf9 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:19:19 -0800
Subject: [PATCH 05/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 1c47e2f716..ceed87a0ff 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -57,12 +57,20 @@ def _build_layers(self, transformer_layer_spec):
# coeff = self.layer_number
# self.norm_factor *= coeff
def build_layer(layer_number):
- layer = TransformerLayer(
+ if layer_number in [1, 2]:
+ layer = TransformerLayer(
config=self.config,
- submodules=transformer_layer_spec.submodules,
+ submodules=gpt_layer_with_transformer_engine_spec.submodules,
layer_number=layer_number,
self_attn_mask_type=self.self_attn_mask_type,
)
+ else:
+ layer = TransformerLayer(
+ config=self.config,
+ submodules=transformer_layer_spec.submodules,
+ layer_number=layer_number,
+ self_attn_mask_type=self.self_attn_mask_type,
+ )
return layer
if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
From 0e7eb2306a194d56869263cab1e8dbc2c692ed3a Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:21:26 -0800
Subject: [PATCH 06/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 2fe0ac7bb5..0536ece542 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -65,6 +65,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
if layer in [5,6]:
self.num_local_experts *= 2
self.expert_parallel_size /=2
+ if torch.distributed.get_rank() == 0:
print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts)
local_expert_indices_offset = (
parallel_state.get_expert_model_parallel_rank() * self.num_local_experts
From 4ad505a4d65d3c0941c456510989f332dd5c55f0 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:25:35 -0800
Subject: [PATCH 07/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index ceed87a0ff..f1484edd00 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -57,13 +57,15 @@ def _build_layers(self, transformer_layer_spec):
# coeff = self.layer_number
# self.norm_factor *= coeff
def build_layer(layer_number):
- if layer_number in [1, 2]:
+ if layer_number in [1, 2, 4]:
layer = TransformerLayer(
- config=self.config,
- submodules=gpt_layer_with_transformer_engine_spec.submodules,
- layer_number=layer_number,
- self_attn_mask_type=self.self_attn_mask_type,
- )
+ config=self.config,
+ submodules=gpt_layer_with_transformer_engine_spec.submodules,
+ layer_number=layer_number,
+ self_attn_mask_type=self.self_attn_mask_type,
+ )
+ if torch.distributed.get_rank() == 0:
+ print('LAYER:', layer, 'NO EXPERTS')
else:
layer = TransformerLayer(
config=self.config,
From 3af0059d967c2cc2fe9c3304ab24ab9baee00091 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:33:24 -0800
Subject: [PATCH 08/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index f1484edd00..3da69e3c08 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -14,6 +14,7 @@
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor
+from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec
class TransformerBlock(MegatronModule):
From 13c54d9a8e4a127b877e6c7a370d49daa177602b Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 15:35:13 -0800
Subject: [PATCH 09/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 3da69e3c08..307e488802 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -66,7 +66,7 @@ def build_layer(layer_number):
self_attn_mask_type=self.self_attn_mask_type,
)
if torch.distributed.get_rank() == 0:
- print('LAYER:', layer, 'NO EXPERTS')
+ print('LAYER:', layer_number, 'NO EXPERTS')
else:
layer = TransformerLayer(
config=self.config,
From df2eeb21dfa158a3e5293c8b1caf0822f6c08ae9 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:23:00 -0800
Subject: [PATCH 10/84] Update arguments.py
---
megatron/arguments.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 9ffe2897a3..82f379f19c 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -628,6 +628,8 @@ def _add_network_size_args(parser):
dest='bert_binary_head')
group.add_argument('--num-experts', type=int, default=None,
help='Number of Experts in Switch Transformer (None means no Switch)')
+ group.add_argument('--kebab', nargs='+', type=int,
+ help='Number of experts for each layer (`1` means dense layer)')
group.add_argument('--routing-mode', type=str, default='sinkhorn',
choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'],
help='Mode of the expert routing.')
From 1f07db5c7f5399bb8fa65f62d410698c5d4ea125 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:23:45 -0800
Subject: [PATCH 11/84] Update initialize.py
---
megatron/initialize.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/initialize.py b/megatron/initialize.py
index 77e9569cab..0571c86fc4 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -95,7 +95,7 @@ def finish_mpu_init():
dir_path = os.path.join(args.router_profiling_path)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
-
+ print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
# No continuation function
return None
From b9d64756ff211a6720b1e80c70f9aa4b30a25a47 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:27:49 -0800
Subject: [PATCH 12/84] Update initialize.py
---
megatron/initialize.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megatron/initialize.py b/megatron/initialize.py
index 0571c86fc4..1eb2dbcc7c 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -96,8 +96,8 @@ def finish_mpu_init():
if not os.path.exists(dir_path):
os.makedirs(dir_path)
print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
- # No continuation function
- return None
+ # No continuation function
+ return None
def _compile_dependencies():
From 1c20628f630c93b434e1abd2e24d2059f08032ec Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:29:40 -0800
Subject: [PATCH 13/84] Update initialize.py
---
megatron/initialize.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/megatron/initialize.py b/megatron/initialize.py
index 1eb2dbcc7c..43dcbb454a 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -95,9 +95,9 @@ def finish_mpu_init():
dir_path = os.path.join(args.router_profiling_path)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
- print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
- # No continuation function
- return None
+ print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
+ # No continuation function
+ return None
def _compile_dependencies():
From 413bf438349db1992162ebec6173bd80077231c4 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:30:00 -0800
Subject: [PATCH 14/84] Update initialize.py
---
megatron/initialize.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/megatron/initialize.py b/megatron/initialize.py
index 43dcbb454a..1eb2dbcc7c 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -95,9 +95,9 @@ def finish_mpu_init():
dir_path = os.path.join(args.router_profiling_path)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
- print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
- # No continuation function
- return None
+ print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
+ # No continuation function
+ return None
def _compile_dependencies():
From 9237b34ac2493f1ea222558b2079d85e19220de5 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:32:31 -0800
Subject: [PATCH 15/84] Update initialize.py
---
megatron/initialize.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/initialize.py b/megatron/initialize.py
index 1eb2dbcc7c..6b7f84b71f 100644
--- a/megatron/initialize.py
+++ b/megatron/initialize.py
@@ -95,7 +95,7 @@ def finish_mpu_init():
dir_path = os.path.join(args.router_profiling_path)
if not os.path.exists(dir_path):
os.makedirs(dir_path)
- print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab)
+
# No continuation function
return None
From 01dc1e91cee27cb1c724aeb9fe8254cccc090a9f Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:34:35 -0800
Subject: [PATCH 16/84] Update arguments.py
---
megatron/arguments.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 82f379f19c..3dcb6362c6 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -628,7 +628,7 @@ def _add_network_size_args(parser):
dest='bert_binary_head')
group.add_argument('--num-experts', type=int, default=None,
help='Number of Experts in Switch Transformer (None means no Switch)')
- group.add_argument('--kebab', nargs='+', type=int,
+ group.add_argument('--moe-layers', nargs='+', type=int,
help='Number of experts for each layer (`1` means dense layer)')
group.add_argument('--routing-mode', type=str, default='sinkhorn',
choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'],
From 68b99a03c9dadc87da58c5a2ff368dfc229ceb53 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:51:28 -0800
Subject: [PATCH 17/84] Update arguments.py
---
megatron/arguments.py | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 3dcb6362c6..0acdb3187b 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -390,8 +390,13 @@ def validate_args(args, defaults={}):
# MoE Spec check
if args.num_experts is not None:
assert args.model_spec is None, "Model Spec must be None when using MoEs"
+ assert args.num_experts > 1, "--num-experts should be greater than 1."
if args.use_balancing_loss is not None:
assert (args.routing_mode == 'top1' or args.routing_mode == 'top2'), "Need --routing-mode = 'top1' or 'top2' if setting --use-balancing-loss."
+ if args.moe_layers is not None:
+ import math
+ assert sum(args.moe_layers) == args.num_layers, "--moe-layers doesn't sum up to --num-layers."
+ assert min(x for x in args.moe_layers if x != 1) > 2, "Experts per layer should be greater than 2."
# Expert parallelism check
if args.expert_model_parallel_size > 1:
@@ -401,6 +406,8 @@ def validate_args(args, defaults={}):
if args.tensor_model_parallel_size > 1:
assert args.sequence_parallel, \
"When using expert parallelism and tensor parallelism, sequence parallelism must be used."
+ if args.moe_layers is not None:
+ assert all(x % args.expert_model_parallel_size == 0 for x in args.moe_layers if x != 1), "Experts per layer should be multiple of --expert-model-parallel-size."
# Print arguments.
_print_args("arguments", args)
@@ -628,8 +635,9 @@ def _add_network_size_args(parser):
dest='bert_binary_head')
group.add_argument('--num-experts', type=int, default=None,
help='Number of Experts in Switch Transformer (None means no Switch)')
- group.add_argument('--moe-layers', nargs='+', type=int,
- help='Number of experts for each layer (`1` means dense layer)')
+ group.add_argument('--moe-layers', nargs='+', type=int, default=None,
+ help='Number of experts for each layer (`1` means dense layer). '
+ 'Does not support pipeline parallelism.')
group.add_argument('--routing-mode', type=str, default='sinkhorn',
choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'],
help='Mode of the expert routing.')
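The checks added here can be read as a small standalone validator. A sketch assuming the final form the checks take in this series (a later patch replaces the sum check with a length check); the function and argument names are illustrative:

    def validate_moe_layers(moe_layers, num_layers, expert_model_parallel_size):
        assert len(moe_layers) == num_layers, "one entry per transformer layer"
        for n in moe_layers:
            if n == 1:
                continue  # 1 marks a dense layer
            assert n > 2, "MoE layers need more than 2 experts"
            assert n % expert_model_parallel_size == 0, \
                "experts per layer must divide evenly across expert-parallel ranks"

    validate_moe_layers([1, 8, 16, 8, 1], num_layers=5, expert_model_parallel_size=4)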
From 28bc3a3b6c1ea8209e2b86c2f0d1b8a074fdbf80 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:55:06 -0800
Subject: [PATCH 18/84] Update pretrain_gpt.py
---
pretrain_gpt.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index c4e65679e6..9129beb0bb 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -52,7 +52,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
if args.model_spec is not None:
transformer_layer_spec = import_module(args.model_spec)
else:
- if args.num_experts is None:
+ if (args.num_experts is None) and (args.moe_layers is None):
transformer_layer_spec = gpt_layer_with_transformer_engine_spec
else:
transformer_layer_spec = gpt_layer_with_transformer_engine_spec_moe
From b819330dc2a599a2b77409df4c66c208999e16cc Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:57:26 -0800
Subject: [PATCH 19/84] Update arguments.py
---
megatron/arguments.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 0acdb3187b..1def6cf44f 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -397,6 +397,7 @@ def validate_args(args, defaults={}):
import math
assert sum(args.moe_layers) == args.num_layers, "--moe-layers doesn't sum up to --num-layers."
assert min(x for x in args.moe_layers if x != 1) > 2, "Experts per layer should be greater than 2."
+ assert args.use_mcore_models == True, "--moe-layers supported only with --use-mcore-models."
# Expert parallelism check
if args.expert_model_parallel_size > 1:
From 56ad1208c801ea373a7b22d967b6883270cf7c83 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 16:59:58 -0800
Subject: [PATCH 20/84] Update transformer_block.py
---
.../core/transformer/transformer_block.py | 37 ++++++++++++-------
1 file changed, 23 insertions(+), 14 deletions(-)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 307e488802..c125cc163e 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -58,22 +58,31 @@ def _build_layers(self, transformer_layer_spec):
# coeff = self.layer_number
# self.norm_factor *= coeff
def build_layer(layer_number):
- if layer_number in [1, 2, 4]:
- layer = TransformerLayer(
- config=self.config,
- submodules=gpt_layer_with_transformer_engine_spec.submodules,
- layer_number=layer_number,
- self_attn_mask_type=self.self_attn_mask_type,
- )
- if torch.distributed.get_rank() == 0:
- print('LAYER:', layer_number, 'NO EXPERTS')
+ args = get_args()
+ if args.moe_layers:
+ if args.moe_layers[layer_numer-1] == 1:
+ layer = TransformerLayer(
+ config=self.config,
+ submodules=gpt_layer_with_transformer_engine_spec.submodules,
+ layer_number=layer_number,
+ self_attn_mask_type=self.self_attn_mask_type,
+ )
+ if torch.distributed.get_rank() == 0:
+ print('LAYER:', layer_number, 'NO EXPERTS')
+ else:
+ layer = TransformerLayer(
+ config=self.config,
+ submodules=transformer_layer_spec.submodules,
+ layer_number=layer_number,
+ self_attn_mask_type=self.self_attn_mask_type,
+ )
else:
layer = TransformerLayer(
- config=self.config,
- submodules=transformer_layer_spec.submodules,
- layer_number=layer_number,
- self_attn_mask_type=self.self_attn_mask_type,
- )
+ config=self.config,
+ submodules=transformer_layer_spec.submodules,
+ layer_number=layer_number,
+ self_attn_mask_type=self.self_attn_mask_type,
+ )
return layer
if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None:
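Stripped of the TransformerLayer construction, the spec selection introduced here is a per-layer lookup. A toy sketch in which dense_spec and moe_spec stand in for gpt_layer_with_transformer_engine_spec and the MoE layer spec:

    def pick_layer_spec(layer_number, moe_layers, dense_spec, moe_spec):
        # A --moe-layers entry of 1 means a dense layer; anything else gets experts.
        if moe_layers and moe_layers[layer_number - 1] == 1:
            return dense_spec
        return moe_spec

    specs = [pick_layer_spec(n + 1, [1, 8, 16, 8, 1], "dense", "moe") for n in range(5)]
    print(specs)  # ['dense', 'moe', 'moe', 'moe', 'dense']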
From 8ce18aa4d22a5853dd90168f1b6e79849bb0b164 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:05:45 -0800
Subject: [PATCH 21/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 0536ece542..0c5d81ed70 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -49,8 +49,11 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
args = get_args()
self.config: TransformerConfig = config
-
- self.router = torch.nn.Linear(self.config.hidden_size, self.config.num_moe_experts)
+ if args.moe_experts:
+ self.num_moe_experts = args.moe_experts[layer-1]
+ else:
+ self.num_moe_experts = self.config.num_moe_experts
+ self.router = torch.nn.Linear(self.config.hidden_size, self.num_moe_experts)
self.add_bias = config.add_bias_linear
self.routing = args.routing_mode # 'sinkhorn', 'top1', 'top2', 'sinkhorn_top2'
self.layer = layer
@@ -60,11 +63,8 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.router_activation = torch.sigmoid
self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size()
- assert self.config.num_moe_experts % self.expert_parallel_size == 0
- self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size
- if layer in [5,6]:
- self.num_local_experts *= 2
- self.expert_parallel_size /=2
+ assert self.num_moe_experts % self.expert_parallel_size == 0
+ self.num_local_experts = self.num_moe_experts // self.expert_parallel_size
if torch.distributed.get_rank() == 0:
print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts)
local_expert_indices_offset = (
@@ -101,7 +101,7 @@ def forward(self, hidden_states):
args = get_args()
hidden_shape = hidden_states.shape
route = self.router(hidden_states)
- route = route.view(-1, self.config.num_moe_experts)
+ route = route.view(-1, self.num_moe_experts)
if self.config.timers is not None:
self.config.timers('routing_block1', log_level=2).start()
@@ -173,14 +173,14 @@ def forward(self, hidden_states):
if (args.use_balancing_loss is not None) and self.training:
if hasattr(args, 'l_aux'):
me = torch.mean(route, dim=0)
- mask1 = F.one_hot(global_indices, num_classes=self.config.num_moe_experts)
+ mask1 = F.one_hot(global_indices, num_classes=self.num_moe_experts)
ce = torch.mean(mask1.float(), dim=0)
- args.l_aux += torch.sum(me * ce) * self.config.num_moe_experts
+ args.l_aux += torch.sum(me * ce) * self.num_moe_experts
if self.routing == 'top2':
me_2 = torch.mean(masked_route, dim=0)
- mask1 = F.one_hot(global_indices_2, num_classes=self.config.num_moe_experts)
+ mask1 = F.one_hot(global_indices_2, num_classes=self.num_moe_experts)
ce_2 = torch.mean(mask1.float(), dim=0)
- args.l_aux += torch.sum(me_2 * ce_2) * self.config.num_moe_experts
+ args.l_aux += torch.sum(me_2 * ce_2) * self.num_moe_experts
# Collect token count for each expert and save to file
if self.router_profiling_interval and (args.curr_iteration % self.router_profiling_interval == 0) and args.curr_iteration > 0:
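The balancing-loss hunk above only swaps config.num_moe_experts for the per-layer count; the loss itself is the usual Switch-style auxiliary term. A minimal sketch with illustrative names (a softmax router is assumed here purely for the example), computing the mean router probability per expert times the mean fraction of tokens routed to it:

    import torch
    import torch.nn.functional as F

    def balancing_loss(route_probs, expert_indices, num_experts):
        me = route_probs.mean(dim=0)                                   # [num_experts]
        ce = F.one_hot(expert_indices, num_classes=num_experts).float().mean(dim=0)
        return torch.sum(me * ce) * num_experts

    route_probs = torch.rand(32, 4).softmax(dim=-1)
    expert_indices = route_probs.argmax(dim=-1)
    print(balancing_loss(route_probs, expert_indices, num_experts=4))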
From 0875fc50447fe7d300780346b23154eac783be23 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:09:31 -0800
Subject: [PATCH 22/84] Update arguments.py
---
megatron/arguments.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 1def6cf44f..959b23e93f 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -394,8 +394,7 @@ def validate_args(args, defaults={}):
if args.use_balancing_loss is not None:
assert (args.routing_mode == 'top1' or args.routing_mode == 'top2'), "Need --routing-mode = 'top1' or 'top2' if setting --use-balancing-loss."
if args.moe_layers is not None:
- import math
- assert sum(args.moe_layers) == args.num_layers, "--moe-layers doesn't sum up to --num-layers."
+ assert len(args.moe_layers) == args.num_layers, "length of --moe-layers should equal --num-layers."
assert min(x for x in args.moe_layers if x != 1) > 2, "Experts per layer should be greater than 2."
assert args.use_mcore_models == True, "--moe-layers supported only with --use-mcore-models."
From 4989c58a57784ecd16c878c3dfb9d7c5d5711bba Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:10:54 -0800
Subject: [PATCH 23/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index c125cc163e..2818bad46d 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -15,6 +15,7 @@
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor
from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec
+from megatron import get_args
class TransformerBlock(MegatronModule):
From ef743f4824e3e147cd12141edfa509ae4fc2b81c Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:12:27 -0800
Subject: [PATCH 24/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 2818bad46d..858241cbf0 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -61,7 +61,7 @@ def _build_layers(self, transformer_layer_spec):
def build_layer(layer_number):
args = get_args()
if args.moe_layers:
- if args.moe_layers[layer_numer-1] == 1:
+ if args.moe_layers[layer_number-1] == 1:
layer = TransformerLayer(
config=self.config,
submodules=gpt_layer_with_transformer_engine_spec.submodules,
From 25056d5d810f8caacb0457ece899538e077a30e4 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:13:37 -0800
Subject: [PATCH 25/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 0c5d81ed70..fa7aba6d9b 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -49,7 +49,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
args = get_args()
self.config: TransformerConfig = config
- if args.moe_experts:
+ if args.moe_layers:
self.num_moe_experts = args.moe_experts[layer-1]
else:
self.num_moe_experts = self.config.num_moe_experts
From 7dad1589658e4114ac9fbfb3d1802151af718463 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:14:56 -0800
Subject: [PATCH 26/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index fa7aba6d9b..dec9e7a09d 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -50,7 +50,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.config: TransformerConfig = config
if args.moe_layers:
- self.num_moe_experts = args.moe_experts[layer-1]
+ self.num_moe_experts = args.moe_layers[layer-1]
else:
self.num_moe_experts = self.config.num_moe_experts
self.router = torch.nn.Linear(self.config.hidden_size, self.num_moe_experts)
From 2aee703ba971f267d9b2f45e08017f0c047fd75f Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:20:18 -0800
Subject: [PATCH 27/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index dec9e7a09d..99546296e3 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -206,6 +206,8 @@ def forward(self, hidden_states):
local_expert_index = self.local_expert_indices[expert_num]
local_indices = (global_indices == local_expert_index).nonzero()
hidden = global_hidden_states[local_indices, :]
+ if torch.distributed.get_rank():
+ print('LAYER:', self.layer, 'local_expert_index', local_expert_index)
if self.config.timers is not None:
self.config.timers('expert_fwd', log_level=2).start()
output, output_bias = expert(hidden)
From babbebba5ba9c6c96dba19f101aec2a18aba9ed4 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:23:29 -0800
Subject: [PATCH 28/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 99546296e3..0717ee88d0 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -206,7 +206,7 @@ def forward(self, hidden_states):
local_expert_index = self.local_expert_indices[expert_num]
local_indices = (global_indices == local_expert_index).nonzero()
hidden = global_hidden_states[local_indices, :]
- if torch.distributed.get_rank():
+ if torch.distributed.get_rank() == 0:
print('LAYER:', self.layer, 'local_expert_index', local_expert_index)
if self.config.timers is not None:
self.config.timers('expert_fwd', log_level=2).start()
From 179d2a443743f7da93cb238d781794afa17cc88d Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:25:50 -0800
Subject: [PATCH 29/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 0717ee88d0..f533a040ac 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -65,8 +65,6 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
assert self.num_moe_experts % self.expert_parallel_size == 0
self.num_local_experts = self.num_moe_experts // self.expert_parallel_size
- if torch.distributed.get_rank() == 0:
- print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts)
local_expert_indices_offset = (
parallel_state.get_expert_model_parallel_rank() * self.num_local_experts
)
@@ -206,8 +204,6 @@ def forward(self, hidden_states):
local_expert_index = self.local_expert_indices[expert_num]
local_indices = (global_indices == local_expert_index).nonzero()
hidden = global_hidden_states[local_indices, :]
- if torch.distributed.get_rank() == 0:
- print('LAYER:', self.layer, 'local_expert_index', local_expert_index)
if self.config.timers is not None:
self.config.timers('expert_fwd', log_level=2).start()
output, output_bias = expert(hidden)
From 253c87c0d5b5611c487fd2ba08caad1ac08a0aa1 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:26:08 -0800
Subject: [PATCH 30/84] Update transformer_block.py
---
megatron/core/transformer/transformer_block.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 858241cbf0..56772b2aae 100644
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -68,8 +68,6 @@ def build_layer(layer_number):
layer_number=layer_number,
self_attn_mask_type=self.self_attn_mask_type,
)
- if torch.distributed.get_rank() == 0:
- print('LAYER:', layer_number, 'NO EXPERTS')
else:
layer = TransformerLayer(
config=self.config,
From a058ac2489c4ec7f44d2ae975ed2eaecfcb03cb5 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 17:38:44 -0800
Subject: [PATCH 31/84] Update README.md
---
README.md | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/README.md b/README.md
index cb65ec09f9..2b57e81343 100644
--- a/README.md
+++ b/README.md
@@ -89,6 +89,12 @@ A sample plot for `top2` routing mode (obtained from a tiny toy model) is:
+## Varying expert number across layers
+
+To set a different number of experts across layers, use the flag `--moe-layers` followed by a sequence of numbers corresponding to the number of experts per layer. For example, in a model with 5 layers, one can write `--moe-layers 1 8 16 8 1`.
+
+This flag does not currently support pipeline parallelism. Also, for MoE layers, each of these numbers should be a multiple of `--expert-model-parallel-size` and greater than 2. For a dense layer, the number should be set to 1.
+
# NVIDIA Megatron-LM (copied from upstream)
Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision.
From cec343707de6063ada09f78dac64e4f1e9c5521c Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:24:47 -0800
Subject: [PATCH 32/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index c2592bf7c8..1591f7278b 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -37,7 +37,7 @@ class MLP(MegatronModule):
"""
def __init__(
- self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False
+ self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, ffn_hidden_ratio = 1
):
super().__init__(config=config)
@@ -51,7 +51,7 @@ def __init__(
self.linear_fc1 = build_module(
submodules.linear_fc1,
self.config.hidden_size,
- ffn_hidden_size,
+ ffn_hidden_size * ffn_hidden_ratio,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
@@ -72,7 +72,7 @@ def glu(x):
self.linear_fc2 = build_module(
submodules.linear_fc2,
- self.config.ffn_hidden_size,
+ self.config.ffn_hidden_size * ffn_hidden_ratio,
self.config.hidden_size,
config=self.config,
init_method=self.config.output_layer_init_method,
@@ -86,6 +86,7 @@ def forward(self, hidden_states):
# [s, b, 4 * h/p]
intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
+ print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape)
if self.config.bias_gelu_fusion:
assert self.config.add_bias_linear is True
From 381e899a84061ad4982108c3c916919889154a4b Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:26:44 -0800
Subject: [PATCH 33/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 77545502f4..aee8d9a744 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -75,7 +75,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.local_experts = torch.nn.ModuleList()
for _ in range(self.num_local_experts):
- expert = MLP(self.config, submodules, is_expert=True)
+ expert = MLP(self.config, submodules, is_expert=True, ffn_hidden_ratio = args.ffn_hidden_ratios[layer-1])
self.local_experts.append(expert)
def gather_indices(self, local_indices):
From cd4b5b1d86d54a253cdc3299a738c867f43f7226 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:46:21 -0800
Subject: [PATCH 34/84] Update transformer_layer.py
---
megatron/core/transformer/transformer_layer.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py
index dc4946c5f5..117a5d218e 100644
--- a/megatron/core/transformer/transformer_layer.py
+++ b/megatron/core/transformer/transformer_layer.py
@@ -108,7 +108,7 @@ def __init__(
if submodules.mlp.module == SwitchMLP:
self.mlp = build_module(submodules.mlp, config=self.config, layer=layer_number)
else:
- self.mlp = build_module(submodules.mlp, config=self.config)
+ self.mlp = build_module(submodules.mlp, config=self.config, layer=layer_number)
## [Module 9: BiasDropoutFusion]
self.mlp_bda = build_module(submodules.mlp_bda)
From 3c0a58f52f176b2103b26e3171834696a63538bb Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:47:41 -0800
Subject: [PATCH 35/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 1591f7278b..02abbe42fb 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -37,11 +37,13 @@ class MLP(MegatronModule):
"""
def __init__(
- self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, ffn_hidden_ratio = 1
+ self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, layer=None
):
super().__init__(config=config)
self.config: TransformerConfig = config
+ if layer:
+ ffn_ratio = ffn_hidden_ratio[layer-1]
# If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
ffn_hidden_size = self.config.ffn_hidden_size
@@ -51,7 +53,7 @@ def __init__(
self.linear_fc1 = build_module(
submodules.linear_fc1,
self.config.hidden_size,
- ffn_hidden_size * ffn_hidden_ratio,
+ ffn_hidden_size * ffn_ratio,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
@@ -72,7 +74,7 @@ def glu(x):
self.linear_fc2 = build_module(
submodules.linear_fc2,
- self.config.ffn_hidden_size * ffn_hidden_ratio,
+ self.config.ffn_hidden_size * ffn_ratio,
self.config.hidden_size,
config=self.config,
init_method=self.config.output_layer_init_method,
From 2644ff8cbb098b603477f8f990f8882a6b83b215 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:48:30 -0800
Subject: [PATCH 36/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index aee8d9a744..8941c7b6ef 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -75,7 +75,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N
self.local_experts = torch.nn.ModuleList()
for _ in range(self.num_local_experts):
- expert = MLP(self.config, submodules, is_expert=True, ffn_hidden_ratio = args.ffn_hidden_ratios[layer-1])
+ expert = MLP(self.config, submodules, is_expert=True, layer=layer)
self.local_experts.append(expert)
def gather_indices(self, local_indices):
From 911ceeba6ff3ce7c85af654c58a29f0d324feff6 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:51:53 -0800
Subject: [PATCH 37/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 02abbe42fb..730eaa6224 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -11,6 +11,7 @@
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
+from megatron import get_args
@dataclass
@@ -40,10 +41,13 @@ def __init__(
self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, layer=None
):
super().__init__(config=config)
-
+
+ args = get_args()
self.config: TransformerConfig = config
- if layer:
- ffn_ratio = ffn_hidden_ratio[layer-1]
+ if layer and args.ffn_hidden_ratio:
+ ffn_ratio = args.ffn_hidden_ratio[layer-1]
+ else:
+ ffn_ratio = 1
# If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
ffn_hidden_size = self.config.ffn_hidden_size
From 7d901059d90166c8409b8831c01148462f3c9632 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 22:56:33 -0800
Subject: [PATCH 38/84] Update arguments.py
---
megatron/arguments.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 7e12d0d2c1..ca341e6035 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -638,6 +638,9 @@ def _add_network_size_args(parser):
group.add_argument('--moe-layers', nargs='+', type=int, default=None,
help='Number of experts for each layer (`1` means dense layer). '
'Does not support pipeline parallelism.')
+ group.add_argument('--ffn-hidden-ratio', nargs='+', type=int, default=None,
+ help='Ratio of MLP intermediate layer over embedding dimension (4 is default). '
+ 'It can be different in each layer.')
group.add_argument('--routing-mode', type=str, default='sinkhorn',
choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'],
help='Mode of the expert routing.')
From 0ff85c76bed4512ead462c6f403a717c467a740d Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 23:10:22 -0800
Subject: [PATCH 39/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 730eaa6224..001f953ff3 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -44,20 +44,23 @@ def __init__(
args = get_args()
self.config: TransformerConfig = config
+
if layer and args.ffn_hidden_ratio:
- ffn_ratio = args.ffn_hidden_ratio[layer-1]
+ ffn_hidden_size_1 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1]
+ ffn_hidden_size_2 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1]
else:
- ffn_ratio = 1
-
+ ffn_hidden_size_1 = self.config.ffn_hidden_size
+ ffn_hidden_size_2 = self.config.ffn_hidden_size
+
# If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
- ffn_hidden_size = self.config.ffn_hidden_size
if self.config.gated_linear_unit:
- ffn_hidden_size *= 2
+ ffn_hidden_size_1 *= 2
+
self.linear_fc1 = build_module(
submodules.linear_fc1,
self.config.hidden_size,
- ffn_hidden_size * ffn_ratio,
+ ffn_hidden_size_1,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
@@ -78,7 +81,7 @@ def glu(x):
self.linear_fc2 = build_module(
submodules.linear_fc2,
- self.config.ffn_hidden_size * ffn_ratio,
+ ffn_hidden_size_2,
self.config.hidden_size,
config=self.config,
init_method=self.config.output_layer_init_method,
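The size arithmetic this patch introduces for the two expert MLP projections can be checked in isolation. A sketch under the assumption that a per-layer ratio overrides the global ffn_hidden_size and that gated linear units double only the first projection, as in the hunk above:

    def ffn_projection_sizes(hidden_size, ffn_hidden_size, ratio=None, gated_linear_unit=False):
        # fc1 output width and fc2 input width for one layer.
        size_fc1 = hidden_size * ratio if ratio else ffn_hidden_size
        size_fc2 = hidden_size * ratio if ratio else ffn_hidden_size
        if gated_linear_unit:
            size_fc1 *= 2  # fc1 produces the gate and value halves
        return size_fc1, size_fc2

    print(ffn_projection_sizes(1024, 4096))                                   # (4096, 4096)
    print(ffn_projection_sizes(1024, 4096, ratio=2, gated_linear_unit=True))  # (4096, 2048)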
From e91b58711da3f9f04761ecb8af09fc1fd5747aa4 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 23:12:42 -0800
Subject: [PATCH 40/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 001f953ff3..dd7bbf434c 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -95,7 +95,8 @@ def forward(self, hidden_states):
# [s, b, 4 * h/p]
intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
- print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape)
+ if torch.distribute.get_rank() == 0:
+ print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape)
if self.config.bias_gelu_fusion:
assert self.config.add_bias_linear is True
From 7f7d9abbc1a82709e3b6b01ad3dcec8ba7df3da2 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 23:13:26 -0800
Subject: [PATCH 41/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index dd7bbf434c..2bb5e0d4fa 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -49,8 +49,8 @@ def __init__(
ffn_hidden_size_1 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1]
ffn_hidden_size_2 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1]
else:
- ffn_hidden_size_1 = self.config.ffn_hidden_size
- ffn_hidden_size_2 = self.config.ffn_hidden_size
+ ffn_hidden_size_1 = self.config.ffn_hidden_size
+ ffn_hidden_size_2 = self.config.ffn_hidden_size
# If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf
if self.config.gated_linear_unit:
From 1f052fcdb7538587fa6ddd0f2383c231f3f45c9c Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 23:32:02 -0800
Subject: [PATCH 42/84] Update README.md
---
README.md | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index f23be16c68..3e823a437a 100644
--- a/README.md
+++ b/README.md
@@ -91,12 +91,14 @@ A sample plot for `top2` routing mode (obtained from a tiny toy model) is:
-## Varying expert number across layers
+## Varying expert number and MLP hidden dimension across layers
-To set a different number of experts across layers, use the flag `--moe-layers` followed by a sequence of numbers corresponding to the number of experts per layer. For example, in a model with 5 layers, one can write `--moe-layers 1 8 16 8 1`.
+To set a different number of experts across layers, use the flag `--moe-layers` followed by a sequence of integers corresponding to the number of experts per layer. For example, in a model with 5 layers, one can write `--moe-layers 1 8 16 8 1`.
This flag does not currently support pipeline parallelism. Also, for MoE layers, each of these numbers should be a multiple of `--expert-model-parallel-size` and greater than 2. For a dense layer, the number should be set to 1.
+To change the hidden dimension of MLPs across layers, use the flag `--ffn-hidden-ratio` followed by a sequence of integers corresponding to the ratio between the MLP hidden dimension and the model's embedding dimension. Without this flag, this value defaults to 4 for all layers (unless `--ffn-hidden-size` is used). For example, for a model with 5 layers, one can write `--ffn-hidden-ratio 4 4 2 4 4`.
+
# NVIDIA Megatron-LM (copied from upstream)
Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision.
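
As a rough illustration of how the `--ffn-hidden-ratio` flag described in the README patch above could map onto per-layer MLP widths, here is a minimal standalone sketch; the helper name and defaults are illustrative, not the repository's argument handling.

    # Minimal sketch (not the repo's argument parsing): how a per-layer
    # --ffn-hidden-ratio list could translate into MLP hidden sizes.
    def ffn_hidden_sizes(hidden_size, num_layers, ffn_hidden_ratio=None,
                         default_ffn_hidden_size=None):
        """Return one FFN hidden size per layer."""
        if ffn_hidden_ratio is not None:
            assert len(ffn_hidden_ratio) == num_layers
            return [hidden_size * r for r in ffn_hidden_ratio]
        if default_ffn_hidden_size is not None:
            return [default_ffn_hidden_size] * num_layers
        return [4 * hidden_size] * num_layers  # default ratio of 4

    # Example matching the README: 5 layers, --ffn-hidden-ratio 4 4 2 4 4
    print(ffn_hidden_sizes(hidden_size=1024, num_layers=5,
                           ffn_hidden_ratio=[4, 4, 2, 4, 4]))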
From 25dd9a863bd8562e268e3f02559389b6e09fed27 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 23:36:01 -0800
Subject: [PATCH 43/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 2bb5e0d4fa..e6f8f56f2d 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -44,7 +44,7 @@ def __init__(
args = get_args()
self.config: TransformerConfig = config
-
+ self.layer = layer
if layer and args.ffn_hidden_ratio:
ffn_hidden_size_1 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1]
ffn_hidden_size_2 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1]
@@ -95,8 +95,8 @@ def forward(self, hidden_states):
# [s, b, 4 * h/p]
intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
- if torch.distribute.get_rank() == 0:
- print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape)
+ if torch.distributed.get_rank() == 0:
+ print('LAYER:', self.layer, 'DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape)
if self.config.bias_gelu_fusion:
assert self.config.add_bias_linear is True
From e0f37f1c1e72e71370b3771984eb1ab7d27aa882 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 6 Dec 2023 23:39:17 -0800
Subject: [PATCH 44/84] Update mlp.py
---
megatron/core/transformer/mlp.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index e6f8f56f2d..c8dedc3e06 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -95,8 +95,6 @@ def forward(self, hidden_states):
# [s, b, 4 * h/p]
intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states)
- if torch.distributed.get_rank() == 0:
- print('LAYER:', self.layer, 'DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape)
if self.config.bias_gelu_fusion:
assert self.config.add_bias_linear is True
From 7e4a6ae3af39a65002dffa8262270968c0278162 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 14 Dec 2023 13:46:35 -0800
Subject: [PATCH 45/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 24 +++++++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 8941c7b6ef..0052ee5606 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -262,18 +262,36 @@ def forward(self, hidden_states):
self.config.timers('ep_scatter').stop()
+ #if self.config.timers is not None:
+ # self.config.timers('final_route', log_level=2).start()
+ #if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
+ # output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2)
+ #output_total = output_total.view(hidden_shape)
+ #if self.add_bias:
+ # if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
+ # output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2)
+ # output_bias_total = output_bias_total.view(hidden_shape)
+ #else:
+ # output_bias_total = None
+ #if self.config.timers is not None:
+ # self.config.timers('final_route').stop()
+
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
+ output_total = output_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2)
+ output_total_2 = output_total_2 * max_prob_2
+ output_total = output_total + output_total_2
output_total = output_total.view(hidden_shape)
if self.add_bias:
+ output_bias_total = output_bias_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2)
+ output_bias_total_2 = output_bias_total_2 * max_prob_2
+ output_bias_total = output_bias_total + output_bias_total_2
output_bias_total = output_bias_total.view(hidden_shape)
else:
output_bias_total = None
if self.config.timers is not None:
self.config.timers('final_route').stop()
-
+
return output_total, output_bias_total
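
Patches 45-50 toggle between renormalizing the top-2 mixture by `max_prob + max_prob_2` and simply summing the probability-weighted expert outputs. A small standalone sketch of the two combination rules, on dummy tensors with illustrative names:

    import torch

    tokens, hidden = 8, 16
    out_1 = torch.randn(tokens, hidden)      # output of top-1 expert per token
    out_2 = torch.randn(tokens, hidden)      # output of top-2 expert per token
    p_1 = torch.rand(tokens, 1)              # router prob of the top-1 expert
    p_2 = torch.rand(tokens, 1) * p_1        # router prob of the top-2 expert (smaller)

    # Unnormalized weighting (what this patch restores):
    combined = out_1 * p_1 + out_2 * p_2

    # Renormalized weighting (what the earlier version used):
    combined_norm = (out_1 * p_1 + out_2 * p_2) / (p_1 + p_2)

    print(combined.shape, combined_norm.shape)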
From 42be8d47f97f4dabf641dda525f2fec825532db0 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 14 Dec 2023 13:50:41 -0800
Subject: [PATCH 46/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 18 +++---------------
1 file changed, 3 insertions(+), 15 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 0052ee5606..d34f81c01d 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -276,22 +276,10 @@ def forward(self, hidden_states):
#if self.config.timers is not None:
# self.config.timers('final_route').stop()
- if self.config.timers is not None:
- self.config.timers('final_route', log_level=2).start()
+
output_total = output_total * max_prob
- if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_total_2 = output_total_2 * max_prob_2
- output_total = output_total + output_total_2
output_total = output_total.view(hidden_shape)
- if self.add_bias:
- output_bias_total = output_bias_total * max_prob
- if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_bias_total_2 = output_bias_total_2 * max_prob_2
- output_bias_total = output_bias_total + output_bias_total_2
- output_bias_total = output_bias_total.view(hidden_shape)
- else:
- output_bias_total = None
- if self.config.timers is not None:
- self.config.timers('final_route').stop()
+ output_bias_total = None
+
return output_total, output_bias_total
From f787add8ea48c4a71d875fb84a3c2142b13d1d45 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 14 Dec 2023 14:05:08 -0800
Subject: [PATCH 47/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 28 +++++++++++++------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index d34f81c01d..f8e6f2e4ef 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -262,19 +262,21 @@ def forward(self, hidden_states):
self.config.timers('ep_scatter').stop()
- #if self.config.timers is not None:
- # self.config.timers('final_route', log_level=2).start()
- #if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- # output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2)
- #output_total = output_total.view(hidden_shape)
- #if self.add_bias:
- # if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- # output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2)
- # output_bias_total = output_bias_total.view(hidden_shape)
- #else:
- # output_bias_total = None
- #if self.config.timers is not None:
- # self.config.timers('final_route').stop()
+ if self.config.timers is not None:
+ self.config.timers('final_route', log_level=2).start()
+ output_total = output_total * max_prob
+ if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
+ output_total = (output_total + output_total_2 * max_prob_2)
+ output_total = output_total.view(hidden_shape)
+ if self.add_bias:
+ output_bias_total = output_bias_total * max_prob
+ if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
+ output_bias_total = (output_bias_total + output_bias_total_2 * max_prob_2)
+ output_bias_total = output_bias_total.view(hidden_shape)
+ else:
+ output_bias_total = None
+ if self.config.timers is not None:
+ self.config.timers('final_route').stop()
output_total = output_total * max_prob
From 03b50ec28886d7605abf387dc215227f6d68a3de Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 14 Dec 2023 20:23:33 -0800
Subject: [PATCH 48/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 84e66014e6..540fed7073 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -148,6 +148,7 @@ def forward(
args = get_args()
if args.use_balancing_loss is not None:
args.l_aux = 0.0
+ args.l_router = 0.0
hidden_states = self.decoder(
hidden_states=decoder_input,
attention_mask=attention_mask,
@@ -171,6 +172,7 @@ def forward(
loss = self.compute_language_model_loss(labels, logits)
if args.use_balancing_loss is not None:
loss += args.use_balancing_loss * args.l_aux
+ loss += args.l_router
return loss
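
Patch 48 follows the same pattern already used for the balancing loss: each MoE layer adds its term into a shared accumulator on `args`, and the model adds the accumulated totals to the language-model loss once per forward pass. A self-contained sketch of that accumulate-then-add pattern, with illustrative names:

    import torch

    class Accumulators:
        l_aux = 0.0      # load-balancing loss, summed over MoE layers
        l_router = 0.0   # router regularizer, summed over MoE layers

    def moe_layer_contribution(acc, aux, router_term):
        # each MoE layer calls this during its forward pass
        acc.l_aux = acc.l_aux + aux
        acc.l_router = acc.l_router + router_term

    acc = Accumulators()
    for _ in range(3):  # pretend 3 MoE layers ran in this forward pass
        moe_layer_contribution(acc, torch.tensor(0.01), torch.tensor(0.02))

    lm_loss = torch.tensor(2.5)
    balancing_coeff = 0.1  # analogous to --use-balancing-loss
    total = lm_loss + balancing_coeff * acc.l_aux + acc.l_router
    print(total)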
From 02969e7237cc227be58eda5719294254d5c38bd8 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 14 Dec 2023 20:33:18 -0800
Subject: [PATCH 49/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 15 ++++++---------
1 file changed, 6 insertions(+), 9 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index f8e6f2e4ef..d13bfa7fda 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -167,7 +167,10 @@ def forward(self, hidden_states):
self.config.timers('routing_gather').stop()
-
+ # Evaluate router loss
+ if hasattr(args, 'l_router') and self.training:
+ args.l_router -= torch.sum(route * torch.log(route + 1e-9))
+
# Evaluate balancing loss.
if (args.use_balancing_loss is not None) and self.training:
if hasattr(args, 'l_aux'):
@@ -264,9 +267,9 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
- output_total = output_total * max_prob
+ # output_total = output_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_total = (output_total + output_total_2 * max_prob_2)
+ output_total = (output_total * max_prob + output_total_2 * max_prob_2)
output_total = output_total.view(hidden_shape)
if self.add_bias:
output_bias_total = output_bias_total * max_prob
@@ -277,11 +280,5 @@ def forward(self, hidden_states):
output_bias_total = None
if self.config.timers is not None:
self.config.timers('final_route').stop()
-
-
- output_total = output_total * max_prob
- output_total = output_total.view(hidden_shape)
- output_bias_total = None
-
return output_total, output_bias_total
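
The `l_router` term wired in above (and removed again in patches 68-70) is an entropy-style penalty on the routing probabilities, `-sum(route * log(route + 1e-9))`; adding it to the loss pushes the router toward sharper, lower-entropy assignments. A tiny standalone example:

    import torch

    route = torch.softmax(torch.randn(6, 4), dim=-1)   # [tokens, experts]
    l_router = -torch.sum(route * torch.log(route + 1e-9))
    print(l_router)   # larger when routing is closer to uniform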
From fb61494f3d8cdff67b9e0fa20cd2c3a2c3846aa8 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 18 Dec 2023 19:40:01 -0600
Subject: [PATCH 50/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index d13bfa7fda..c54b14ca67 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -269,7 +269,7 @@ def forward(self, hidden_states):
self.config.timers('final_route', log_level=2).start()
# output_total = output_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
- output_total = (output_total * max_prob + output_total_2 * max_prob_2)
+ output_total = (output_total + output_total_2 * max_prob_2)
output_total = output_total.view(hidden_shape)
if self.add_bias:
output_bias_total = output_bias_total * max_prob
From e26ca5794044d9c031a1206d1fb5bbed59be4af2 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Tue, 19 Dec 2023 14:25:45 -0600
Subject: [PATCH 51/84] Update training.py
---
megatron/training.py | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/megatron/training.py b/megatron/training.py
index c4dfd19605..4c951a44c5 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -50,6 +50,8 @@
from megatron.model.vision.knn_monitor import compute_feature_bank
from megatron.eval_harness import Evaluator
+global prev_params
+prev_params = [[] for i in range(1000)]
def print_datetime(string):
"""Note that this call will sync across all ranks."""
@@ -455,6 +457,16 @@ def train_step(forward_step_func, data_iterator,
unwrapped_model.cancel_gradients_last_layer(args.curr_iteration)
# Update parameters.
+
+ # print("JUST BEFORE STEP:" )
+ if args.curr_iteration % 10 == 0 and torch.distributed.get_rank() == 0:
+ for i,(n, p) in enumerate(model[0].named_parameters()):
+ if len(prev_params[i]) == 0:
+ prev_params[i] = p.detach().clone()
+ param_diff = p - prev_params[i]
+ grad_sum = str(p.grad.sum().item()) if p.grad is not None else "NO GRAD!"
+ print(args.curr_iteration, n, p.shape, torch.norm(p).item(), torch.norm(param_diff).item())
+ prev_params[i] = p.detach().clone()
if args.enable_manual_profiling: torch.cuda.nvtx.range_push(f"Optimizer step")
timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time)
update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers)
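
The debugging hook added to `train_step` snapshots every parameter and prints its norm and the norm of its change every 10 iterations (it is effectively disabled again in patch 59). A self-contained sketch of the same parameter-drift check; `ParamDriftMonitor` is an illustrative helper, not code from the repository:

    import torch

    class ParamDriftMonitor:
        def __init__(self, every=10):
            self.every = every
            self.prev = {}

        def report(self, iteration, model):
            if iteration % self.every != 0:
                return
            for name, p in model.named_parameters():
                prev = self.prev.get(name)
                diff = torch.norm(p - prev).item() if prev is not None else float('nan')
                print(iteration, name, tuple(p.shape), torch.norm(p).item(), diff)
                self.prev[name] = p.detach().clone()

    # Usage sketch:
    model = torch.nn.Linear(4, 4)
    monitor = ParamDriftMonitor(every=10)
    monitor.report(0, model)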
From 83bcde19e2a6b90252394d687a1660a10c9b59b2 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 03:25:34 -0600
Subject: [PATCH 52/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index c54b14ca67..36dd131a57 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -264,10 +264,10 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('ep_scatter').stop()
-
+ print('THIS IS WHERE I'M PRINTING')
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
- # output_total = output_total * max_prob
+ output_total = output_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total = (output_total + output_total_2 * max_prob_2)
output_total = output_total.view(hidden_shape)
From a8ac36955ec682e5caa801bc95425104af97e923 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 03:31:51 -0600
Subject: [PATCH 53/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 36dd131a57..74e18e7c78 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -264,7 +264,7 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('ep_scatter').stop()
- print('THIS IS WHERE I'M PRINTING')
+ print('THIS IS WHERE IM PRINTING')
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
output_total = output_total * max_prob
From c2edfbb0395c6145de9c56912c36a6f616c360c0 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 03:33:55 -0600
Subject: [PATCH 54/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 74e18e7c78..60249fa6bf 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -264,7 +264,6 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('ep_scatter').stop()
- print('THIS IS WHERE IM PRINTING')
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
output_total = output_total * max_prob
From 74b0c7baad49f003c4b8e6cfe47370213fa3f9cc Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:20:43 -0600
Subject: [PATCH 55/84] Add print statement of data shape
---
pretrain_gpt.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index 9129beb0bb..d0d8a214d3 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -178,6 +178,7 @@ def forward_step(data_iterator, model: GPTModel):
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch-generator').stop()
+ print('SHAPE OF DATA AFTER GET_BATCH:', tokens.shape, labels.shape)
if args.enable_manual_profiling: torch.cuda.nvtx.range_pop()
if args.enable_manual_profiling: torch.cuda.nvtx.range_push(f"Forward pass")
From 8f7bb1cfba6f6fdb4065bdc3465347c0d19988dd Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:24:13 -0600
Subject: [PATCH 56/84] Add print statement of data shape
---
megatron/core/models/gpt/gpt_model.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 540fed7073..7182cfa30a 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -129,7 +129,9 @@ def forward(
if decoder_input is not None:
pass
elif self.pre_process:
+ print('DATA SHAPE BEFORE EMBEDDING:', input_ids.shape, labels.shape)
decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
+ print('DATA SHAPE AFTER EMBEDDING:', input_ids.shape, labels.shape)
else:
# intermediate stage of pipeline
# decoder will get hidden_states from encoder.input_tensor
From 714cd8d44ce80d99bb41ce11cc4cb28561ac0da1 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:29:40 -0600
Subject: [PATCH 57/84] Update transformer_layer.py
---
megatron/core/transformer/transformer_layer.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py
index 117a5d218e..1bccb81e6e 100644
--- a/megatron/core/transformer/transformer_layer.py
+++ b/megatron/core/transformer/transformer_layer.py
@@ -175,9 +175,11 @@ def forward(
# TODO: could we move `bias_dropout_add_exec_handler` itself
# inside the module provided in the `bias_dropout_add_spec` module?
with self.bias_dropout_add_exec_handler():
+ print('SHAPE OF DATA BEFORE ATTENTION:', hidden_states.shape)
hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
attention_output_with_bias, residual, self.config.hidden_dropout
)
+ print('SHAPE OF DATA AFTER ATTENTION:', hidden_states.shape)
# Residual connection.
residual = hidden_states
From 5fd134fa3f792e593cbde4d5dd06c2b7ed8d734b Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:38:42 -0600
Subject: [PATCH 58/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 7182cfa30a..3105fc5c86 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -151,6 +151,7 @@ def forward(
if args.use_balancing_loss is not None:
args.l_aux = 0.0
args.l_router = 0.0
+ print('SHAPE OF DATA BEFORE DECODER IN GPT_MODEL.PY:' decoder_input.shape)
hidden_states = self.decoder(
hidden_states=decoder_input,
attention_mask=attention_mask,
From d95239c810d85efca20cb879f39e0c9f0ec6e599 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:39:35 -0600
Subject: [PATCH 59/84] Update training.py
---
megatron/training.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/training.py b/megatron/training.py
index 4c951a44c5..d85e22128f 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -459,7 +459,7 @@ def train_step(forward_step_func, data_iterator,
# Update parameters.
# print("JUST BEFORE STEP:" )
- if args.curr_iteration % 10 == 0 and torch.distributed.get_rank() == 0:
+ if args.curr_iteration % 10 == 0 and torch.distributed.get_rank() == 0 and 1 == 0:
for i,(n, p) in enumerate(model[0].named_parameters()):
if len(prev_params[i]) == 0:
prev_params[i] = p.detach().clone()
From 8ac39c915450ebdb6a7d980290736d6908f2ce80 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 11:41:31 -0600
Subject: [PATCH 60/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 3105fc5c86..205ce94351 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -131,7 +131,7 @@ def forward(
elif self.pre_process:
print('DATA SHAPE BEFORE EMBEDDING:', input_ids.shape, labels.shape)
decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
- print('DATA SHAPE AFTER EMBEDDING:', input_ids.shape, labels.shape)
+ print('DATA SHAPE AFTER EMBEDDING:', decoder_input.shape, labels.shape)
else:
# intermediate stage of pipeline
# decoder will get hidden_states from encoder.input_tensor
@@ -151,7 +151,7 @@ def forward(
if args.use_balancing_loss is not None:
args.l_aux = 0.0
args.l_router = 0.0
- print('SHAPE OF DATA BEFORE DECODER IN GPT_MODEL.PY:' decoder_input.shape)
+ print('SHAPE OF DATA BEFORE DECODER IN GPTMODELPY:', decoder_input.shape)
hidden_states = self.decoder(
hidden_states=decoder_input,
attention_mask=attention_mask,
From 687687e48da96d60fde5efe17c41cd6c80edeec2 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Wed, 20 Dec 2023 12:33:47 -0600
Subject: [PATCH 61/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 205ce94351..fc999605b0 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -168,6 +168,7 @@ def forward(
output_weight = self.shared_embedding_or_output_weight()
logits, _ = self.output_layer(hidden_states, weight=output_weight)
+ print('SHAPES BEFORE LOSS COMPUTE:', logits.shape, labels.shape)
if labels is None:
# [s b h] => [b s h]
return logits.transpose(0, 1).contiguous()
From fff647099ed16a97c544e9bbcb26564aa725ada3 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 21 Dec 2023 02:14:25 -0600
Subject: [PATCH 62/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index fc999605b0..1fe7548e02 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -105,7 +105,10 @@ def __init__(
)
if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
+ print('THIS CONDITION TRUE')
self.initialize_last_stage_with_word_embeddings()
+ else:
+ print('THIS CONDITION FALSE')
def forward(
self,
From fac5f796d55c5ed1ed8781d7fc817965226ce7c3 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 21 Dec 2023 02:28:49 -0600
Subject: [PATCH 63/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 1fe7548e02..79f8695f44 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -132,9 +132,9 @@ def forward(
if decoder_input is not None:
pass
elif self.pre_process:
- print('DATA SHAPE BEFORE EMBEDDING:', input_ids.shape, labels.shape)
decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
- print('DATA SHAPE AFTER EMBEDDING:', decoder_input.shape, labels.shape)
+ if torch.distributed.get_rank() == 0:
+ print(self.embedding.word_embeddings.weight.data, self.output_layer.weight.data)
else:
# intermediate stage of pipeline
# decoder will get hidden_states from encoder.input_tensor
@@ -154,7 +154,7 @@ def forward(
if args.use_balancing_loss is not None:
args.l_aux = 0.0
args.l_router = 0.0
- print('SHAPE OF DATA BEFORE DECODER IN GPTMODELPY:', decoder_input.shape)
+
hidden_states = self.decoder(
hidden_states=decoder_input,
attention_mask=attention_mask,
From b157390ebe7925c0a5c966ec9ff16acf31573603 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 21 Dec 2023 03:04:18 -0600
Subject: [PATCH 64/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 79f8695f44..832b6f9e3d 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -133,8 +133,6 @@ def forward(
pass
elif self.pre_process:
decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids)
- if torch.distributed.get_rank() == 0:
- print(self.embedding.word_embeddings.weight.data, self.output_layer.weight.data)
else:
# intermediate stage of pipeline
# decoder will get hidden_states from encoder.input_tensor
From 214e0d0cc9ef5b44e39bad97f6fb7c7d41d54b43 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 21 Dec 2023 03:31:21 -0600
Subject: [PATCH 65/84] Update pretrain_gpt.py
---
pretrain_gpt.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index d0d8a214d3..9129beb0bb 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -178,7 +178,6 @@ def forward_step(data_iterator, model: GPTModel):
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
timers('batch-generator').stop()
- print('SHAPE OF DATA AFTER GET_BATCH:', tokens.shape, labels.shape)
if args.enable_manual_profiling: torch.cuda.nvtx.range_pop()
if args.enable_manual_profiling: torch.cuda.nvtx.range_push(f"Forward pass")
From 0597d752a59233ffeb9bcd21867eadfd5747d25a Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 21 Dec 2023 03:31:59 -0600
Subject: [PATCH 66/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 4 ----
1 file changed, 4 deletions(-)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 832b6f9e3d..9bec0f8516 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -105,10 +105,7 @@ def __init__(
)
if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process):
- print('THIS CONDITION TRUE')
self.initialize_last_stage_with_word_embeddings()
- else:
- print('THIS CONDITION FALSE')
def forward(
self,
@@ -169,7 +166,6 @@ def forward(
output_weight = self.shared_embedding_or_output_weight()
logits, _ = self.output_layer(hidden_states, weight=output_weight)
- print('SHAPES BEFORE LOSS COMPUTE:', logits.shape, labels.shape)
if labels is None:
# [s b h] => [b s h]
return logits.transpose(0, 1).contiguous()
From 2b017d1794df1fdc991a76d314a766fa88e7a218 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Thu, 21 Dec 2023 03:32:31 -0600
Subject: [PATCH 67/84] Update transformer_layer.py
---
megatron/core/transformer/transformer_layer.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py
index 1bccb81e6e..117a5d218e 100644
--- a/megatron/core/transformer/transformer_layer.py
+++ b/megatron/core/transformer/transformer_layer.py
@@ -175,11 +175,9 @@ def forward(
# TODO: could we move `bias_dropout_add_exec_handler` itself
# inside the module provided in the `bias_dropout_add_spec` module?
with self.bias_dropout_add_exec_handler():
- print('SHAPE OF DATA BEFORE ATTENTION:', hidden_states.shape)
hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)(
attention_output_with_bias, residual, self.config.hidden_dropout
)
- print('SHAPE OF DATA AFTER ATTENTION:', hidden_states.shape)
# Residual connection.
residual = hidden_states
From 09b122861a71574f74f89c8ba142c1c68b7e28c1 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 00:04:08 -0400
Subject: [PATCH 68/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 60249fa6bf..e0c1640c91 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -168,8 +168,8 @@ def forward(self, hidden_states):
# Evaluate router loss
- if hasattr(args, 'l_router') and self.training:
- args.l_router -= torch.sum(route * torch.log(route + 1e-9))
+ # if hasattr(args, 'l_router') and self.training:
+ # args.l_router -= torch.sum(route * torch.log(route + 1e-9))
# Evaluate balancing loss.
if (args.use_balancing_loss is not None) and self.training:
From 3800c0ef2f3828aad17b6186489b13ccea0ea991 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 00:13:16 -0400
Subject: [PATCH 69/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 5 -----
1 file changed, 5 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index e0c1640c91..93a71350e4 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -166,11 +166,6 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('routing_gather').stop()
-
- # Evaluate router loss
- # if hasattr(args, 'l_router') and self.training:
- # args.l_router -= torch.sum(route * torch.log(route + 1e-9))
-
# Evaluate balancing loss.
if (args.use_balancing_loss is not None) and self.training:
if hasattr(args, 'l_aux'):
From 5a583a2cfb80662eb7d4147cedef85e11266a227 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 00:13:42 -0400
Subject: [PATCH 70/84] Update gpt_model.py
---
megatron/core/models/gpt/gpt_model.py | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 9bec0f8516..3cebde66d4 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -148,7 +148,6 @@ def forward(
args = get_args()
if args.use_balancing_loss is not None:
args.l_aux = 0.0
- args.l_router = 0.0
hidden_states = self.decoder(
hidden_states=decoder_input,
@@ -173,8 +172,7 @@ def forward(
loss = self.compute_language_model_loss(labels, logits)
if args.use_balancing_loss is not None:
loss += args.use_balancing_loss * args.l_aux
- loss += args.l_router
-
+
return loss
def shared_embedding_or_output_weight(self) -> Tensor:
From 5123488f30c8fcced6b4f5b03af62b5aa58fa132 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 01:02:10 -0400
Subject: [PATCH 71/84] Update fused_softmax.py
---
megatron/model/fused_softmax.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 9bacf33740..e94328df95 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -145,8 +145,10 @@ def forward(self, input, mask):
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
+ print('TEST_fused')
return self.forward_fused_softmax(input, mask)
else:
+ print('TEST1_notfused')
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
From 0d6f7c94a9615e5fd412847e523057e68a9c579e Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 01:25:32 -0400
Subject: [PATCH 72/84] Update fused_softmax.py
---
megatron/model/fused_softmax.py | 2 --
1 file changed, 2 deletions(-)
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index e94328df95..9bacf33740 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -145,10 +145,8 @@ def forward(self, input, mask):
assert input.dim() == 4
if self.is_kernel_available(mask, *input.size()):
- print('TEST_fused')
return self.forward_fused_softmax(input, mask)
else:
- print('TEST1_notfused')
return self.forward_torch_softmax(input, mask)
def is_kernel_available(self, mask, b, np, sq, sk):
From f8c8de9e71939cf948fbba60ce975bc2da3fda6d Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 01:26:47 -0400
Subject: [PATCH 73/84] Update fused_softmax.py
---
megatron/model/fused_softmax.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 9bacf33740..69927b2757 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -196,7 +196,8 @@ def forward_torch_softmax(self, input, mask):
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
- probs = torch.nn.Softmax(dim=-1)(mask_output)
+ print('TRYING -2')
+ probs = torch.nn.Softmax(dim=-2)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
From 2bfb7c08ac71fb442490913682e9e6bcff600646 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 01:27:59 -0400
Subject: [PATCH 74/84] Update fused_softmax.py
---
megatron/model/fused_softmax.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 69927b2757..29078ab749 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -196,7 +196,6 @@ def forward_torch_softmax(self, input, mask):
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
- print('TRYING -2')
probs = torch.nn.Softmax(dim=-2)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
From 36a23764f15f1f6199cf108d167d44c2caa343e2 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Fri, 29 Dec 2023 11:31:43 -0400
Subject: [PATCH 75/84] Update fused_softmax.py
---
megatron/model/fused_softmax.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
index 29078ab749..9bacf33740 100644
--- a/megatron/model/fused_softmax.py
+++ b/megatron/model/fused_softmax.py
@@ -196,7 +196,7 @@ def forward_torch_softmax(self, input, mask):
if self.scale is not None:
input = input * self.scale
mask_output = self.mask_func(input, mask) if mask is not None else input
- probs = torch.nn.Softmax(dim=-2)(mask_output)
+ probs = torch.nn.Softmax(dim=-1)(mask_output)
if self.input_in_float16 and self.softmax_in_fp32:
if self.input_in_fp16:
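
Patches 73-75 briefly switch the fallback softmax from `dim=-1` to `dim=-2` and then revert. For attention scores shaped `[b, np, sq, sk]`, `dim=-1` normalizes each query's weights over the keys (the standard choice), while `dim=-2` normalizes over the queries. A small sketch of the difference:

    import torch

    scores = torch.randn(1, 1, 3, 5)                 # [b, np, sq, sk]
    probs_keys = torch.softmax(scores, dim=-1)       # standard attention
    probs_queries = torch.softmax(scores, dim=-2)    # the experimental variant

    print(probs_keys.sum(dim=-1))    # ones: each query row sums to 1
    print(probs_queries.sum(dim=-2)) # ones: each key column sums to 1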
From 7048fc1e3b4fa5fb7de36cc31951a2a94b055521 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 12:12:20 -0600
Subject: [PATCH 76/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 93a71350e4..0866028abc 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -99,7 +99,7 @@ def gather_indices(self, local_indices):
def forward(self, hidden_states):
args = get_args()
hidden_shape = hidden_states.shape
- route = self.router(hidden_states)
+ route = self.router(hidden_states.detach())
route = route.view(-1, self.num_moe_experts)
if self.config.timers is not None:
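
Routing on `hidden_states.detach()` (reverted in the next patch) lets the router train on the current representations without sending any gradient back into the layers that produced them. A minimal standalone illustration with stand-in tensors:

    import torch

    hidden = torch.randn(4, 8, requires_grad=True)   # stand-in for hidden_states
    router = torch.nn.Linear(8, 2)                   # stand-in for self.router

    logits = router(hidden.detach())                 # the patch 76 behaviour
    logits.sum().backward()

    print(router.weight.grad is not None)  # True: router weights get gradients
    print(hidden.grad)                     # None: backbone receives no gradient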
From 431fa63f4e5ab32459578e89cc7826a364b6ffdb Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 15:43:26 -0600
Subject: [PATCH 77/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 0866028abc..93a71350e4 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -99,7 +99,7 @@ def gather_indices(self, local_indices):
def forward(self, hidden_states):
args = get_args()
hidden_shape = hidden_states.shape
- route = self.router(hidden_states.detach())
+ route = self.router(hidden_states)
route = route.view(-1, self.num_moe_experts)
if self.config.timers is not None:
From fc5390c278b84fcf95b2e6cd503769c7a0ddb710 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 19:47:43 -0600
Subject: [PATCH 78/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 93a71350e4..f49281f6a7 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -261,7 +261,7 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
- output_total = output_total * max_prob
+ output_total = output_total * max_prob / (max_prob.detach())
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total = (output_total + output_total_2 * max_prob_2)
output_total = output_total.view(hidden_shape)
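
The `max_prob / max_prob.detach()` variant tried here (and reverted in the next patch) leaves the forward pass unchanged, since the ratio is exactly 1, while still letting gradients reach the gate probability. A small sketch with dummy tensors and illustrative names:

    import torch

    expert_out = torch.randn(4, 8)
    max_prob = torch.rand(4, 1, requires_grad=True)

    scaled = expert_out * max_prob / max_prob.detach()
    print(torch.allclose(scaled, expert_out))   # True: forward pass unchanged

    scaled.sum().backward()
    print(max_prob.grad is not None)            # True: gate still gets a gradient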
From 5791a87bd0d853da0f26e5bb3b2a319539e58b5d Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 19:55:50 -0600
Subject: [PATCH 79/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index f49281f6a7..93a71350e4 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -261,7 +261,7 @@ def forward(self, hidden_states):
if self.config.timers is not None:
self.config.timers('final_route', log_level=2).start()
- output_total = output_total * max_prob / (max_prob.detach())
+ output_total = output_total * max_prob
if self.routing == 'top2' or self.routing == 'sinkhorn_top2':
output_total = (output_total + output_total_2 * max_prob_2)
output_total = output_total.view(hidden_shape)
From 03aecf6b104ddfd668116773687f91f4c1980413 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 21:59:44 -0600
Subject: [PATCH 80/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 93a71350e4..9a571c75fa 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -27,11 +27,15 @@ def sinkhorn(cost, tol=0.0001):
eps = 0.00000001
error = 1e9
d1_old = d1
+ t = 0
while error > tol:
+ t += 1
d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps)
d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
error = torch.mean(torch.abs(d1_old - d1))
d1_old = d1
+ if t > 1:
+ print('NUMBER OF STEPS:', t)
return d1 * cost * d0.unsqueeze(1)
def save_token_count(token_count, layer, iteration, router_profiling_path):
From 0e5b34fa3889e877fe09856919d5c8ca22584adc Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 22:11:23 -0600
Subject: [PATCH 81/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 9a571c75fa..ed415e41fc 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -27,16 +27,15 @@ def sinkhorn(cost, tol=0.0001):
eps = 0.00000001
error = 1e9
d1_old = d1
- t = 0
while error > tol:
- t += 1
d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps)
d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
error = torch.mean(torch.abs(d1_old - d1))
d1_old = d1
- if t > 1:
- print('NUMBER OF STEPS:', t)
- return d1 * cost * d0.unsqueeze(1)
+ # return d1 * cost * d0.unsqueeze(1)
+ route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True)
+ route = (1/self.num_moe_experts) * (route / torch.sum(route, dim=0, keepdim=True))
+ return route
def save_token_count(token_count, layer, iteration, router_profiling_path):
token_count_list = token_count.cpu().tolist()
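
Patches 81-83 temporarily replace the iterative Sinkhorn routine with a single-shot, doubly normalized softmax over the router logits (patch 83 swaps the stray `self.num_moe_experts` for `cost.size(1)`). A standalone sketch of that closed-form variant:

    import torch

    def one_shot_route(cost):
        # cost: [tokens, experts] router logits
        route = torch.softmax(2.0 * cost, dim=0)
        route = route / torch.sum(route, dim=1, keepdim=True)                        # per-token normalization
        route = (1.0 / cost.size(1)) * route / torch.sum(route, dim=0, keepdim=True) # per-expert normalization
        return route

    route = one_shot_route(torch.randn(16, 4))
    print(route.sum(dim=0))   # each expert column sums to 1 / num_experts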
From 704726d7b8d82ce2730793bcdbe3556824516705 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 22:12:25 -0600
Subject: [PATCH 82/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index ed415e41fc..5317702129 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -19,7 +19,7 @@
def sinkhorn(cost, tol=0.0001):
"Sinkhorn based MoE routing function"
- cost = torch.exp(2.0 * cost)
+ """cost = torch.exp(2.0 * cost)
d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
# d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype)
d1 = 1 / (cost.size(1) * torch.sum(cost, 0))
@@ -32,7 +32,7 @@ def sinkhorn(cost, tol=0.0001):
d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
error = torch.mean(torch.abs(d1_old - d1))
d1_old = d1
- # return d1 * cost * d0.unsqueeze(1)
+ return d1 * cost * d0.unsqueeze(1)"""
route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True)
route = (1/self.num_moe_experts) * (route / torch.sum(route, dim=0, keepdim=True))
return route
From 277afeb213d633761bed45ef2ebfb4587bb8365b Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 22:16:15 -0600
Subject: [PATCH 83/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 5317702129..6d28da9a4b 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -34,7 +34,7 @@ def sinkhorn(cost, tol=0.0001):
d1_old = d1
return d1 * cost * d0.unsqueeze(1)"""
route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True)
- route = (1/self.num_moe_experts) * (route / torch.sum(route, dim=0, keepdim=True))
+ route = (1/cost.size(1)) * (route / torch.sum(route, dim=0, keepdim=True))
return route
def save_token_count(token_count, layer, iteration, router_profiling_path):
From 4a31d0ff5463183c3dd786c776b61b00014c5498 Mon Sep 17 00:00:00 2001
From: pglorio <85982602+pglorio@users.noreply.github.com>
Date: Mon, 1 Jan 2024 22:30:15 -0600
Subject: [PATCH 84/84] Update switch_mlp.py
---
megatron/core/transformer/switch_mlp.py | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py
index 6d28da9a4b..93a71350e4 100644
--- a/megatron/core/transformer/switch_mlp.py
+++ b/megatron/core/transformer/switch_mlp.py
@@ -19,7 +19,7 @@
def sinkhorn(cost, tol=0.0001):
"Sinkhorn based MoE routing function"
- """cost = torch.exp(2.0 * cost)
+ cost = torch.exp(2.0 * cost)
d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
# d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype)
d1 = 1 / (cost.size(1) * torch.sum(cost, 0))
@@ -32,10 +32,7 @@ def sinkhorn(cost, tol=0.0001):
d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
error = torch.mean(torch.abs(d1_old - d1))
d1_old = d1
- return d1 * cost * d0.unsqueeze(1)"""
- route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True)
- route = (1/cost.size(1)) * (route / torch.sum(route, dim=0, keepdim=True))
- return route
+ return d1 * cost * d0.unsqueeze(1)
def save_token_count(token_count, layer, iteration, router_profiling_path):
token_count_list = token_count.cpu().tolist()
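
The final patch restores the iterative Sinkhorn routing. For reference, a self-contained version of that routine as it appears in the diffs, runnable on a dummy cost matrix:

    import torch

    def sinkhorn(cost, tol=1e-4):
        """Sinkhorn-based MoE routing: alternately rescale rows and columns
        of exp(2 * cost) until the scaling vectors stop changing."""
        cost = torch.exp(2.0 * cost)                      # [tokens, experts]
        d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
        d1 = 1 / (cost.size(1) * torch.sum(cost, 0))
        eps, error, d1_old = 1e-8, 1e9, d1
        while error > tol:
            d0 = (1 / d0.size(0)) / (torch.sum(d1 * cost, 1) + eps)
            d1 = (1 / d1.size(0)) / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
            error = torch.mean(torch.abs(d1_old - d1))
            d1_old = d1
        return d1 * cost * d0.unsqueeze(1)

    print(sinkhorn(torch.randn(16, 4)).sum(dim=0))  # expert columns roughly balanced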