From 9261410e4f9dcad7bd5c63c3ce8bcf27ef2a5c27 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 4 Dec 2023 23:22:33 -0800 Subject: [PATCH 01/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 78dfc3163b..16444cc433 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -73,6 +73,8 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N for _ in range(self.num_local_experts): expert = MLP(self.config, submodules, is_expert=True) self.local_experts.append(expert) + if args.residual_moe: + self.fixed_mlp = MLP(self.config, submodules, is_expert=False) def gather_indices(self, local_indices): """ Gather tensors and concatenate along the first dimension.""" @@ -261,16 +263,17 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() - output_total = output_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_total_2 = output_total_2 * max_prob_2 - output_total = output_total + output_total_2 + output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2) + if args.residual_moe: + output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states) + output_total += output_mlp output_total = output_total.view(hidden_shape) if self.add_bias: - output_bias_total = output_bias_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_bias_total_2 = output_bias_total_2 * max_prob_2 - output_bias_total = output_bias_total + output_bias_total_2 + output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2) + if args.residual_moe: + output_bias_total += output_bias_mlp output_bias_total = output_bias_total.view(hidden_shape) else: output_bias_total = None From f2ecc718b187549e4434bbbfcb8f1eee7bea6460 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Tue, 5 Dec 2023 01:15:18 -0800 Subject: [PATCH 02/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 16444cc433..d242df1605 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -61,6 +61,10 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() assert self.config.num_moe_experts % self.expert_parallel_size == 0 + if layer < "TOTAL_LAYERS" - 2: + self.num_local_experts /= 2 + self.expert_parallel_size /=2 + "data parallel size" *= 2 self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size local_expert_indices_offset = ( parallel_state.get_expert_model_parallel_rank() * self.num_local_experts @@ -68,6 +72,8 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.local_expert_indices = [ local_expert_indices_offset + i for i in range(self.num_local_experts) ] + ### if num_loc_exp=exp_par_size=4 and i have 8 GPU's, do some indices identify with same expert? 
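[Editor's note on the indexing question above: with `num_local_experts = num_moe_experts // expert_parallel_size`, each expert-parallel rank owns a disjoint slice of global expert indices, so no two ranks inside one expert-parallel group build the same expert; replicas only appear across the other parallel dimensions. A standalone sketch of that arithmetic, with illustrative numbers that are not taken from the patch:]

```python
# Minimal sketch of the expert-index bookkeeping in SwitchMLP.__init__ above.
# Assumes num_moe_experts % expert_parallel_size == 0, as the assert enforces.
def local_expert_indices(num_moe_experts, expert_parallel_size, ep_rank):
    """Global expert indices owned by one expert-parallel rank."""
    num_local_experts = num_moe_experts // expert_parallel_size
    offset = ep_rank * num_local_experts
    return [offset + i for i in range(num_local_experts)]

# Example: 8 experts split over an expert-parallel group of size 2.
for rank in range(2):
    print(rank, local_expert_indices(8, 2, rank))
# rank 0 -> [0, 1, 2, 3], rank 1 -> [4, 5, 6, 7]: the index sets are disjoint.
```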
+ ### this should be accounted for in the lines below self.local_experts = torch.nn.ModuleList() for _ in range(self.num_local_experts): @@ -80,6 +86,7 @@ def gather_indices(self, local_indices): """ Gather tensors and concatenate along the first dimension.""" group = get_tensor_and_expert_parallel_group() world_size = torch.distributed.get_world_size(group=group) + ### in the example above, is world_size=2/TP? # Bypass the function if we are using only 1 GPU. if world_size == 1: return local_indices @@ -149,6 +156,7 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('routing_gather', log_level=2).start() + ### sequence_parallel is when sequence parallel dimension > 1? why should i do this gather when EP_size > 1? if self.sequence_parallel or (self.expert_parallel_size > 1): global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe( hidden_states @@ -201,6 +209,8 @@ def forward(self, hidden_states): self.config.timers('routing_loop', log_level=2).start() for expert_num, expert in enumerate(self.local_experts): local_expert_index = self.local_expert_indices[expert_num] + ### in the example above, local_expert_index could be 7 even when there are 4 experts? + ### this means 4 GPUs are idle because local_indices is empty local_indices = (global_indices == local_expert_index).nonzero() hidden = global_hidden_states[local_indices, :] if self.config.timers is not None: @@ -228,6 +238,7 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('ep_scatter', log_level=2).start() if self.sequence_parallel or (self.expert_parallel_size > 1): + ### what is this? should I apply it to output of self.fixed_mlp too? output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_total ) From c7ea711ba94b5ad246f4b2c5d5fe302a79b8e761 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Tue, 5 Dec 2023 15:19:42 -0800 Subject: [PATCH 03/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index d242df1605..06218ba085 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -234,6 +234,8 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('routing_loop').stop() + if args.residual_moe: + output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states) if self.config.timers is not None: self.config.timers('ep_scatter', log_level=2).start() @@ -242,6 +244,10 @@ def forward(self, hidden_states): output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_total ) + if args.residual_moe: + output_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( + output_mlp + ) if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_total_2 @@ -256,6 +262,10 @@ def forward(self, hidden_states): output_bias_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_bias_total_2 ) + if args.residual_moe: + output_bias_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( + output_bias_mlp + ) # bias is duplicated across tensor parallelism ranks; # reduce scatter reduces bias across tensor parallel_ranks @@ -277,7 
+287,6 @@ def forward(self, hidden_states): if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2) if args.residual_moe: - output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states) output_total += output_mlp output_total = output_total.view(hidden_shape) if self.add_bias: From d5f7af41c5155afc79f363b9e221075790d97d28 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 14:49:32 -0800 Subject: [PATCH 04/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 32 ++++--------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 06218ba085..2fe0ac7bb5 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -61,32 +61,27 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() assert self.config.num_moe_experts % self.expert_parallel_size == 0 - if layer < "TOTAL_LAYERS" - 2: - self.num_local_experts /= 2 - self.expert_parallel_size /=2 - "data parallel size" *= 2 self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size + if layer in [5,6]: + self.num_local_experts *= 2 + self.expert_parallel_size /=2 + print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts) local_expert_indices_offset = ( parallel_state.get_expert_model_parallel_rank() * self.num_local_experts ) self.local_expert_indices = [ local_expert_indices_offset + i for i in range(self.num_local_experts) ] - ### if num_loc_exp=exp_par_size=4 and i have 8 GPU's, do some indices identify with same expert? - ### this should be accounted for in the lines below self.local_experts = torch.nn.ModuleList() for _ in range(self.num_local_experts): expert = MLP(self.config, submodules, is_expert=True) self.local_experts.append(expert) - if args.residual_moe: - self.fixed_mlp = MLP(self.config, submodules, is_expert=False) def gather_indices(self, local_indices): """ Gather tensors and concatenate along the first dimension.""" group = get_tensor_and_expert_parallel_group() world_size = torch.distributed.get_world_size(group=group) - ### in the example above, is world_size=2/TP? # Bypass the function if we are using only 1 GPU. if world_size == 1: return local_indices @@ -156,7 +151,6 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('routing_gather', log_level=2).start() - ### sequence_parallel is when sequence parallel dimension > 1? why should i do this gather when EP_size > 1? if self.sequence_parallel or (self.expert_parallel_size > 1): global_hidden_states = tensor_parallel.gather_from_sequence_parallel_region_to_moe( hidden_states @@ -209,8 +203,6 @@ def forward(self, hidden_states): self.config.timers('routing_loop', log_level=2).start() for expert_num, expert in enumerate(self.local_experts): local_expert_index = self.local_expert_indices[expert_num] - ### in the example above, local_expert_index could be 7 even when there are 4 experts? 
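[Editor's note on the dispatch loop these removed comments discuss: `local_indices` gathers only the tokens routed to the local expert, and an empty selection is harmless — the expert just runs on a zero-row batch. A minimal plain-PyTorch sketch, with illustrative shapes and a simplified gather rather than the exact indexing in the patch:]

```python
import torch

# Tokens routed to one local expert are gathered by index; an expert that
# receives no tokens processes an empty [0, hidden] batch and produces an
# empty output, so the rank does no useful work for it but does not fail.
hidden_size, num_tokens = 16, 10
global_hidden_states = torch.randn(num_tokens, hidden_size)
global_indices = torch.randint(0, 4, (num_tokens,))   # router picked experts 0..3
expert = torch.nn.Linear(hidden_size, hidden_size)    # stand-in for the expert MLP

for local_expert_index in (2, 7):                     # 7 is never selected here
    local_indices = (global_indices == local_expert_index).nonzero()
    hidden = global_hidden_states[local_indices.squeeze(-1), :]
    output = expert(hidden)
    print(local_expert_index, hidden.shape, output.shape)
```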
- ### this means 4 GPUs are idle because local_indices is empty local_indices = (global_indices == local_expert_index).nonzero() hidden = global_hidden_states[local_indices, :] if self.config.timers is not None: @@ -234,20 +226,12 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('routing_loop').stop() - if args.residual_moe: - output_mlp, output_bias_mlp = self.fixed_mlp(global_hidden_states) - if self.config.timers is not None: self.config.timers('ep_scatter', log_level=2).start() if self.sequence_parallel or (self.expert_parallel_size > 1): - ### what is this? should I apply it to output of self.fixed_mlp too? output_total = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_total ) - if args.residual_moe: - output_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( - output_mlp - ) if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_total_2 @@ -262,10 +246,6 @@ def forward(self, hidden_states): output_bias_total_2 = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( output_bias_total_2 ) - if args.residual_moe: - output_bias_mlp = tensor_parallel.reduce_scatter_to_sequence_parallel_region_from_moe( - output_bias_mlp - ) # bias is duplicated across tensor parallelism ranks; # reduce scatter reduces bias across tensor parallel_ranks @@ -286,14 +266,10 @@ def forward(self, hidden_states): self.config.timers('final_route', log_level=2).start() if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2) - if args.residual_moe: - output_total += output_mlp output_total = output_total.view(hidden_shape) if self.add_bias: if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2) - if args.residual_moe: - output_bias_total += output_bias_mlp output_bias_total = output_bias_total.view(hidden_shape) else: output_bias_total = None From 35d3ac4b8be934dfa8c834ee45b87dde75a8caf9 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:19:19 -0800 Subject: [PATCH 05/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 1c47e2f716..ceed87a0ff 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -57,12 +57,20 @@ def _build_layers(self, transformer_layer_spec): # coeff = self.layer_number # self.norm_factor *= coeff def build_layer(layer_number): - layer = TransformerLayer( + if layer_number in [1, 2]: + layer = TransformerLayer( config=self.config, - submodules=transformer_layer_spec.submodules, + submodules=gpt_layer_with_transformer_engine_spec.submodules, layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type, ) + else: + layer = TransformerLayer( + config=self.config, + submodules=transformer_layer_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) return layer if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: From 0e7eb2306a194d56869263cab1e8dbc2c692ed3a Mon Sep 17 
00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:21:26 -0800 Subject: [PATCH 06/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 2fe0ac7bb5..0536ece542 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -65,6 +65,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N if layer in [5,6]: self.num_local_experts *= 2 self.expert_parallel_size /=2 + if torch.distributed.get_rank() == 0: print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts) local_expert_indices_offset = ( parallel_state.get_expert_model_parallel_rank() * self.num_local_experts From 4ad505a4d65d3c0941c456510989f332dd5c55f0 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:25:35 -0800 Subject: [PATCH 07/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index ceed87a0ff..f1484edd00 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -57,13 +57,15 @@ def _build_layers(self, transformer_layer_spec): # coeff = self.layer_number # self.norm_factor *= coeff def build_layer(layer_number): - if layer_number in [1, 2]: + if layer_number in [1, 2, 4]: layer = TransformerLayer( - config=self.config, - submodules=gpt_layer_with_transformer_engine_spec.submodules, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, - ) + config=self.config, + submodules=gpt_layer_with_transformer_engine_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + if torch.distributed.get_rank() == 0: + print('LAYER:', layer, 'NO EXPERTS') else: layer = TransformerLayer( config=self.config, From 3af0059d967c2cc2fe9c3304ab24ab9baee00091 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:33:24 -0800 Subject: [PATCH 08/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index f1484edd00..3da69e3c08 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -14,6 +14,7 @@ from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor +from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec class TransformerBlock(MegatronModule): From 13c54d9a8e4a127b877e6c7a370d49daa177602b Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 15:35:13 -0800 Subject: [PATCH 09/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 
3da69e3c08..307e488802 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -66,7 +66,7 @@ def build_layer(layer_number): self_attn_mask_type=self.self_attn_mask_type, ) if torch.distributed.get_rank() == 0: - print('LAYER:', layer, 'NO EXPERTS') + print('LAYER:', layer_number, 'NO EXPERTS') else: layer = TransformerLayer( config=self.config, From df2eeb21dfa158a3e5293c8b1caf0822f6c08ae9 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:23:00 -0800 Subject: [PATCH 10/84] Update arguments.py --- megatron/arguments.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index 9ffe2897a3..82f379f19c 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -628,6 +628,8 @@ def _add_network_size_args(parser): dest='bert_binary_head') group.add_argument('--num-experts', type=int, default=None, help='Number of Experts in Switch Transformer (None means no Switch)') + group.add_argument('--kebab', nargs='+', type=int, + help='Number of experts for each layer (`1` means dense layer)') group.add_argument('--routing-mode', type=str, default='sinkhorn', choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'], help='Mode of the expert routing.') From 1f07db5c7f5399bb8fa65f62d410698c5d4ea125 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:23:45 -0800 Subject: [PATCH 11/84] Update initialize.py --- megatron/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 77e9569cab..0571c86fc4 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -95,7 +95,7 @@ def finish_mpu_init(): dir_path = os.path.join(args.router_profiling_path) if not os.path.exists(dir_path): os.makedirs(dir_path) - + print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) # No continuation function return None From b9d64756ff211a6720b1e80c70f9aa4b30a25a47 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:27:49 -0800 Subject: [PATCH 12/84] Update initialize.py --- megatron/initialize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 0571c86fc4..1eb2dbcc7c 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -96,8 +96,8 @@ def finish_mpu_init(): if not os.path.exists(dir_path): os.makedirs(dir_path) print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) - # No continuation function - return None + # No continuation function + return None def _compile_dependencies(): From 1c20628f630c93b434e1abd2e24d2059f08032ec Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:29:40 -0800 Subject: [PATCH 13/84] Update initialize.py --- megatron/initialize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 1eb2dbcc7c..43dcbb454a 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -95,9 +95,9 @@ def finish_mpu_init(): dir_path = os.path.join(args.router_profiling_path) if not os.path.exists(dir_path): os.makedirs(dir_path) - print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) - # No continuation function - return None + print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) + # No continuation function + return None def _compile_dependencies(): From 
413bf438349db1992162ebec6173bd80077231c4 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:30:00 -0800 Subject: [PATCH 14/84] Update initialize.py --- megatron/initialize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 43dcbb454a..1eb2dbcc7c 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -95,9 +95,9 @@ def finish_mpu_init(): dir_path = os.path.join(args.router_profiling_path) if not os.path.exists(dir_path): os.makedirs(dir_path) - print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) - # No continuation function - return None + print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) + # No continuation function + return None def _compile_dependencies(): From 9237b34ac2493f1ea222558b2079d85e19220de5 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:32:31 -0800 Subject: [PATCH 15/84] Update initialize.py --- megatron/initialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/initialize.py b/megatron/initialize.py index 1eb2dbcc7c..6b7f84b71f 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -95,7 +95,7 @@ def finish_mpu_init(): dir_path = os.path.join(args.router_profiling_path) if not os.path.exists(dir_path): os.makedirs(dir_path) - print('KEBABBBBBBBBBBBBBBBBBBBBBB:', args.kebab) + # No continuation function return None From 01dc1e91cee27cb1c724aeb9fe8254cccc090a9f Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:34:35 -0800 Subject: [PATCH 16/84] Update arguments.py --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 82f379f19c..3dcb6362c6 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -628,7 +628,7 @@ def _add_network_size_args(parser): dest='bert_binary_head') group.add_argument('--num-experts', type=int, default=None, help='Number of Experts in Switch Transformer (None means no Switch)') - group.add_argument('--kebab', nargs='+', type=int, + group.add_argument('--moe-layers', nargs='+', type=int, help='Number of experts for each layer (`1` means dense layer)') group.add_argument('--routing-mode', type=str, default='sinkhorn', choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'], From 68b99a03c9dadc87da58c5a2ff368dfc229ceb53 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:51:28 -0800 Subject: [PATCH 17/84] Update arguments.py --- megatron/arguments.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 3dcb6362c6..0acdb3187b 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -390,8 +390,13 @@ def validate_args(args, defaults={}): # MoE Spec check if args.num_experts is not None: assert args.model_spec is None, "Model Spec must be None when using MoEs" + assert args.num_experts > 1, "--num-experts should be greater than 2." if args.use_balancing_loss is not None: assert (args.routing_mode == 'top1' or args.routing_mode == 'top2'), "Need --routing-mode = 'top1' or 'top2' if setting --use-balancing-loss." + if args.moe_layers is not None: + import math + assert sum(args.moe_layers) == args.num_layers, "--moe-layers doesn't sum up to --num-layers." 
+ assert min(x for x in args.moe_layers if x != 1) > 2, "Experts per layer should be greater than 2." # Expert parallelism check if args.expert_model_parallel_size > 1: @@ -401,6 +406,8 @@ def validate_args(args, defaults={}): if args.tensor_model_parallel_size > 1: assert args.sequence_parallel, \ "When using expert parallelism and tensor parallelism, sequence parallelism must be used." + if args.moe_layers is not None: + assert all(x % args.expert_model_parallel_size == 0 for x in args.moe_layers if x != 1), "Experts per layer should be multiple of --expert-model-parallel-size." # Print arguments. _print_args("arguments", args) @@ -628,8 +635,9 @@ def _add_network_size_args(parser): dest='bert_binary_head') group.add_argument('--num-experts', type=int, default=None, help='Number of Experts in Switch Transformer (None means no Switch)') - group.add_argument('--moe-layers', nargs='+', type=int, - help='Number of experts for each layer (`1` means dense layer)') + group.add_argument('--moe-layers', nargs='+', type=int, default=None, + help='Number of experts for each layer (`1` means dense layer). ' + 'Does not support pipeline parallelism.') group.add_argument('--routing-mode', type=str, default='sinkhorn', choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'], help='Mode of the expert routing.') From 28bc3a3b6c1ea8209e2b86c2f0d1b8a074fdbf80 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:55:06 -0800 Subject: [PATCH 18/84] Update pretrain_gpt.py --- pretrain_gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index c4e65679e6..9129beb0bb 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -52,7 +52,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat if args.model_spec is not None: transformer_layer_spec = import_module(args.model_spec) else: - if args.num_experts is None: + if (args.num_experts is None) and (args.moe_layers is None): transformer_layer_spec = gpt_layer_with_transformer_engine_spec else: transformer_layer_spec = gpt_layer_with_transformer_engine_spec_moe From b819330dc2a599a2b77409df4c66c208999e16cc Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:57:26 -0800 Subject: [PATCH 19/84] Update arguments.py --- megatron/arguments.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index 0acdb3187b..1def6cf44f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -397,6 +397,7 @@ def validate_args(args, defaults={}): import math assert sum(args.moe_layers) == args.num_layers, "--moe-layers doesn't sum up to --num-layers." assert min(x for x in args.moe_layers if x != 1) > 2, "Experts per layer should be greater than 2." + assert args.use_mcore_models == True, "--moe-layers supported only with --use-mcore-models." 
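[Editor's note: a self-contained sketch of the `--moe-layers` flag and the checks these patches add, using the later `len(...)`-based form of the length check; the flag names follow the patches, everything else is illustrative rather than the repository's actual validation code:]

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--num-layers', type=int, required=True)
parser.add_argument('--expert-model-parallel-size', type=int, default=1)
parser.add_argument('--moe-layers', nargs='+', type=int, default=None,
                    help='Number of experts for each layer (1 means a dense layer).')

def validate_moe_layers(args):
    if args.moe_layers is None:
        return
    assert len(args.moe_layers) == args.num_layers, \
        "length of --moe-layers should equal --num-layers."
    assert all(x > 2 for x in args.moe_layers if x != 1), \
        "Experts per MoE layer should be greater than 2."
    assert all(x % args.expert_model_parallel_size == 0
               for x in args.moe_layers if x != 1), \
        "Experts per MoE layer should be a multiple of --expert-model-parallel-size."

# Example from the README patches: 5 layers, dense-MoE-MoE-MoE-dense.
args = parser.parse_args('--num-layers 5 --expert-model-parallel-size 2 '
                         '--moe-layers 1 8 16 8 1'.split())
validate_moe_layers(args)   # passes: layers 2-4 are MoE with 8, 16, 8 experts
```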
# Expert parallelism check if args.expert_model_parallel_size > 1: From 56ad1208c801ea373a7b22d967b6883270cf7c83 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:59:58 -0800 Subject: [PATCH 20/84] Update transformer_block.py --- .../core/transformer/transformer_block.py | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 307e488802..c125cc163e 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -58,22 +58,31 @@ def _build_layers(self, transformer_layer_spec): # coeff = self.layer_number # self.norm_factor *= coeff def build_layer(layer_number): - if layer_number in [1, 2, 4]: - layer = TransformerLayer( - config=self.config, - submodules=gpt_layer_with_transformer_engine_spec.submodules, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, - ) - if torch.distributed.get_rank() == 0: - print('LAYER:', layer_number, 'NO EXPERTS') + args = get_args() + if args.moe_layers: + if args.moe_layers[layer_numer-1] == 1: + layer = TransformerLayer( + config=self.config, + submodules=gpt_layer_with_transformer_engine_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) + if torch.distributed.get_rank() == 0: + print('LAYER:', layer_number, 'NO EXPERTS') + else: + layer = TransformerLayer( + config=self.config, + submodules=transformer_layer_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) else: layer = TransformerLayer( - config=self.config, - submodules=transformer_layer_spec.submodules, - layer_number=layer_number, - self_attn_mask_type=self.self_attn_mask_type, - ) + config=self.config, + submodules=transformer_layer_spec.submodules, + layer_number=layer_number, + self_attn_mask_type=self.self_attn_mask_type, + ) return layer if parallel_state.get_virtual_pipeline_model_parallel_world_size() is not None: From 8ce18aa4d22a5853dd90168f1b6e79849bb0b164 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:05:45 -0800 Subject: [PATCH 21/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 0536ece542..0c5d81ed70 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -49,8 +49,11 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N args = get_args() self.config: TransformerConfig = config - - self.router = torch.nn.Linear(self.config.hidden_size, self.config.num_moe_experts) + if args.moe_experts: + self.num_moe_experts = args.moe_experts[layer-1] + else: + self.num_moe_experts = self.config.num_moe_experts + self.router = torch.nn.Linear(self.config.hidden_size, self.num_moe_experts) self.add_bias = config.add_bias_linear self.routing = args.routing_mode # 'sinkhorn', 'top1', 'top2', 'sinkhorn_top2' self.layer = layer @@ -60,11 +63,8 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.router_activation = torch.sigmoid self.expert_parallel_size = parallel_state.get_expert_model_parallel_world_size() - assert self.config.num_moe_experts % self.expert_parallel_size == 0 - 
self.num_local_experts = self.config.num_moe_experts // self.expert_parallel_size - if layer in [5,6]: - self.num_local_experts *= 2 - self.expert_parallel_size /=2 + assert self.num_moe_experts % self.expert_parallel_size == 0 + self.num_local_experts = self.num_moe_experts // self.expert_parallel_size if torch.distributed.get_rank() == 0: print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts) local_expert_indices_offset = ( @@ -101,7 +101,7 @@ def forward(self, hidden_states): args = get_args() hidden_shape = hidden_states.shape route = self.router(hidden_states) - route = route.view(-1, self.config.num_moe_experts) + route = route.view(-1, self.num_moe_experts) if self.config.timers is not None: self.config.timers('routing_block1', log_level=2).start() @@ -173,14 +173,14 @@ def forward(self, hidden_states): if (args.use_balancing_loss is not None) and self.training: if hasattr(args, 'l_aux'): me = torch.mean(route, dim=0) - mask1 = F.one_hot(global_indices, num_classes=self.config.num_moe_experts) + mask1 = F.one_hot(global_indices, num_classes=self.num_moe_experts) ce = torch.mean(mask1.float(), dim=0) - args.l_aux += torch.sum(me * ce) * self.config.num_moe_experts + args.l_aux += torch.sum(me * ce) * self.num_moe_experts if self.routing == 'top2': me_2 = torch.mean(masked_route, dim=0) - mask1 = F.one_hot(global_indices_2, num_classes=self.config.num_moe_experts) + mask1 = F.one_hot(global_indices_2, num_classes=self.num_moe_experts) ce_2 = torch.mean(mask1.float(), dim=0) - args.l_aux += torch.sum(me_2 * ce_2) * self.config.num_moe_experts + args.l_aux += torch.sum(me_2 * ce_2) * self.num_moe_experts # Collect token count for each expert and save to file if self.router_profiling_interval and (args.curr_iteration % self.router_profiling_interval == 0) and args.curr_iteration > 0: From 0875fc50447fe7d300780346b23154eac783be23 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:09:31 -0800 Subject: [PATCH 22/84] Update arguments.py --- megatron/arguments.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 1def6cf44f..959b23e93f 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -394,8 +394,7 @@ def validate_args(args, defaults={}): if args.use_balancing_loss is not None: assert (args.routing_mode == 'top1' or args.routing_mode == 'top2'), "Need --routing-mode = 'top1' or 'top2' if setting --use-balancing-loss." if args.moe_layers is not None: - import math - assert sum(args.moe_layers) == args.num_layers, "--moe-layers doesn't sum up to --num-layers." + assert len(args.moe_layers) == args.num_layers, "length of --moe-layers should equal --num-layers." assert min(x for x in args.moe_layers if x != 1) > 2, "Experts per layer should be greater than 2." assert args.use_mcore_models == True, "--moe-layers supported only with --use-mcore-models." 
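[Editor's note: a toy, single-process sketch of the per-layer wiring these patches converge on — the router's output width and the local expert count both follow the per-layer entry of the experts-per-layer list (the attribute is still spelled `args.moe_experts` in this patch and is renamed to `args.moe_layers` a few patches later). Variable names here are illustrative, not the module's real API:]

```python
import torch

hidden_size = 32
expert_parallel_size = 2
moe_layers = [1, 8, 16, 8, 1]          # e.g. --moe-layers 1 8 16 8 1

for layer, n_experts in enumerate(moe_layers, start=1):
    if n_experts == 1:
        print(f"layer {layer}: dense MLP, no router")
        continue
    assert n_experts % expert_parallel_size == 0
    router = torch.nn.Linear(hidden_size, n_experts)          # per-layer router width
    num_local_experts = n_experts // expert_parallel_size     # experts on this rank
    print(f"layer {layer}: router out={router.out_features}, "
          f"{num_local_experts} local experts per expert-parallel rank")
```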
From 4989c58a57784ecd16c878c3dfb9d7c5d5711bba Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:10:54 -0800 Subject: [PATCH 23/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index c125cc163e..2818bad46d 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -15,6 +15,7 @@ from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules from megatron.core.utils import make_sharded_tensor_for_checkpoint, make_viewless_tensor from megatron.core.models.gpt.gpt_layer_specs import gpt_layer_with_transformer_engine_spec +from megatron import get_args class TransformerBlock(MegatronModule): From ef743f4824e3e147cd12141edfa509ae4fc2b81c Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:12:27 -0800 Subject: [PATCH 24/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 2818bad46d..858241cbf0 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -61,7 +61,7 @@ def _build_layers(self, transformer_layer_spec): def build_layer(layer_number): args = get_args() if args.moe_layers: - if args.moe_layers[layer_numer-1] == 1: + if args.moe_layers[layer_number-1] == 1: layer = TransformerLayer( config=self.config, submodules=gpt_layer_with_transformer_engine_spec.submodules, From 25056d5d810f8caacb0457ece899538e077a30e4 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:13:37 -0800 Subject: [PATCH 25/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 0c5d81ed70..fa7aba6d9b 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -49,7 +49,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N args = get_args() self.config: TransformerConfig = config - if args.moe_experts: + if args.moe_layers: self.num_moe_experts = args.moe_experts[layer-1] else: self.num_moe_experts = self.config.num_moe_experts From 7dad1589658e4114ac9fbfb3d1802151af718463 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:14:56 -0800 Subject: [PATCH 26/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index fa7aba6d9b..dec9e7a09d 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -50,7 +50,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.config: TransformerConfig = config if args.moe_layers: - self.num_moe_experts = args.moe_experts[layer-1] + self.num_moe_experts = args.moe_layers[layer-1] else: self.num_moe_experts = self.config.num_moe_experts self.router = 
torch.nn.Linear(self.config.hidden_size, self.num_moe_experts) From 2aee703ba971f267d9b2f45e08017f0c047fd75f Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:20:18 -0800 Subject: [PATCH 27/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index dec9e7a09d..99546296e3 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -206,6 +206,8 @@ def forward(self, hidden_states): local_expert_index = self.local_expert_indices[expert_num] local_indices = (global_indices == local_expert_index).nonzero() hidden = global_hidden_states[local_indices, :] + if torch.distributed.get_rank(): + print('LAYER:', self.layer, 'local_expert_index', local_expert_index) if self.config.timers is not None: self.config.timers('expert_fwd', log_level=2).start() output, output_bias = expert(hidden) From babbebba5ba9c6c96dba19f101aec2a18aba9ed4 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:23:29 -0800 Subject: [PATCH 28/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 99546296e3..0717ee88d0 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -206,7 +206,7 @@ def forward(self, hidden_states): local_expert_index = self.local_expert_indices[expert_num] local_indices = (global_indices == local_expert_index).nonzero() hidden = global_hidden_states[local_indices, :] - if torch.distributed.get_rank(): + if torch.distributed.get_rank() == 0: print('LAYER:', self.layer, 'local_expert_index', local_expert_index) if self.config.timers is not None: self.config.timers('expert_fwd', log_level=2).start() From 179d2a443743f7da93cb238d781794afa17cc88d Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:25:50 -0800 Subject: [PATCH 29/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 0717ee88d0..f533a040ac 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -65,8 +65,6 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N assert self.num_moe_experts % self.expert_parallel_size == 0 self.num_local_experts = self.num_moe_experts // self.expert_parallel_size - if torch.distributed.get_rank() == 0: - print('LAYER:', layer, 'NUM LOCAL EXPERTS:', self.num_local_experts) local_expert_indices_offset = ( parallel_state.get_expert_model_parallel_rank() * self.num_local_experts ) @@ -206,8 +204,6 @@ def forward(self, hidden_states): local_expert_index = self.local_expert_indices[expert_num] local_indices = (global_indices == local_expert_index).nonzero() hidden = global_hidden_states[local_indices, :] - if torch.distributed.get_rank() == 0: - print('LAYER:', self.layer, 'local_expert_index', local_expert_index) if self.config.timers is not None: self.config.timers('expert_fwd', log_level=2).start() output, output_bias = expert(hidden) From 253c87c0d5b5611c487fd2ba08caad1ac08a0aa1 Mon Sep 17 00:00:00 2001 From: pglorio 
<85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:26:08 -0800 Subject: [PATCH 30/84] Update transformer_block.py --- megatron/core/transformer/transformer_block.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py index 858241cbf0..56772b2aae 100644 --- a/megatron/core/transformer/transformer_block.py +++ b/megatron/core/transformer/transformer_block.py @@ -68,8 +68,6 @@ def build_layer(layer_number): layer_number=layer_number, self_attn_mask_type=self.self_attn_mask_type, ) - if torch.distributed.get_rank() == 0: - print('LAYER:', layer_number, 'NO EXPERTS') else: layer = TransformerLayer( config=self.config, From a058ac2489c4ec7f44d2ae975ed2eaecfcb03cb5 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 17:38:44 -0800 Subject: [PATCH 31/84] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index cb65ec09f9..2b57e81343 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,12 @@ A sample plot for `top2` routing mode (obtained from a tiny toy model) is: Token Counts +## Varying expert number across layers + +To set different number of experts across layers use the flag `--moe-layers` followed by a sequence of numbers corresponding to the number of experts per layer. For example, in a model with 5 layers, one can write `--moe-layers 1 8 16 8 1`. + +This flag does not currently support pipeline parallelism. Also, for MoE layers, each of these numbers should be multiple of `--expert-model-parallel-size` and greater than 2. For a dense layer, the number should be set to 1. + # NVIDIA Megatron-LM (copied from upstream) Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision. 
From cec343707de6063ada09f78dac64e4f1e9c5521c Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:24:47 -0800 Subject: [PATCH 32/84] Update mlp.py --- megatron/core/transformer/mlp.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index c2592bf7c8..1591f7278b 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -37,7 +37,7 @@ class MLP(MegatronModule): """ def __init__( - self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False + self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, ffn_hidden_ratio = 1 ): super().__init__(config=config) @@ -51,7 +51,7 @@ def __init__( self.linear_fc1 = build_module( submodules.linear_fc1, self.config.hidden_size, - ffn_hidden_size, + ffn_hidden_size * ffn_hidden_ratio, config=self.config, init_method=self.config.init_method, gather_output=False, @@ -72,7 +72,7 @@ def glu(x): self.linear_fc2 = build_module( submodules.linear_fc2, - self.config.ffn_hidden_size, + self.config.ffn_hidden_size * ffn_hidden_ratio, self.config.hidden_size, config=self.config, init_method=self.config.output_layer_init_method, @@ -86,6 +86,7 @@ def forward(self, hidden_states): # [s, b, 4 * h/p] intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) + print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape) if self.config.bias_gelu_fusion: assert self.config.add_bias_linear is True From 381e899a84061ad4982108c3c916919889154a4b Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:26:44 -0800 Subject: [PATCH 33/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 77545502f4..aee8d9a744 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -75,7 +75,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.local_experts = torch.nn.ModuleList() for _ in range(self.num_local_experts): - expert = MLP(self.config, submodules, is_expert=True) + expert = MLP(self.config, submodules, is_expert=True, ffn_hidden_ratio = args.ffn_hidden_ratios[layer-1]) self.local_experts.append(expert) def gather_indices(self, local_indices): From cd4b5b1d86d54a253cdc3299a738c867f43f7226 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:46:21 -0800 Subject: [PATCH 34/84] Update transformer_layer.py --- megatron/core/transformer/transformer_layer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index dc4946c5f5..117a5d218e 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -108,7 +108,7 @@ def __init__( if submodules.mlp.module == SwitchMLP: self.mlp = build_module(submodules.mlp, config=self.config, layer=layer_number) else: - self.mlp = build_module(submodules.mlp, config=self.config) + self.mlp = build_module(submodules.mlp, config=self.config, layer=layer_number) ## [Module 9: BiasDropoutFusion] self.mlp_bda = build_module(submodules.mlp_bda) From 3c0a58f52f176b2103b26e3171834696a63538bb Mon Sep 17 
00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:47:41 -0800 Subject: [PATCH 35/84] Update mlp.py --- megatron/core/transformer/mlp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 1591f7278b..02abbe42fb 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -37,11 +37,13 @@ class MLP(MegatronModule): """ def __init__( - self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, ffn_hidden_ratio = 1 + self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, layer=None ): super().__init__(config=config) self.config: TransformerConfig = config + if layer: + ffn_ratio = ffn_hidden_ratio[layer-1] # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf ffn_hidden_size = self.config.ffn_hidden_size @@ -51,7 +53,7 @@ def __init__( self.linear_fc1 = build_module( submodules.linear_fc1, self.config.hidden_size, - ffn_hidden_size * ffn_hidden_ratio, + ffn_hidden_size * ffn_ratio, config=self.config, init_method=self.config.init_method, gather_output=False, @@ -72,7 +74,7 @@ def glu(x): self.linear_fc2 = build_module( submodules.linear_fc2, - self.config.ffn_hidden_size * ffn_hidden_ratio, + self.config.ffn_hidden_size * ffn_ratio, self.config.hidden_size, config=self.config, init_method=self.config.output_layer_init_method, From 2644ff8cbb098b603477f8f990f8882a6b83b215 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:48:30 -0800 Subject: [PATCH 36/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index aee8d9a744..8941c7b6ef 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -75,7 +75,7 @@ def __init__(self, config: TransformerConfig, submodules: MLPSubmodules, layer=N self.local_experts = torch.nn.ModuleList() for _ in range(self.num_local_experts): - expert = MLP(self.config, submodules, is_expert=True, ffn_hidden_ratio = args.ffn_hidden_ratios[layer-1]) + expert = MLP(self.config, submodules, is_expert=True, layer=layer) self.local_experts.append(expert) def gather_indices(self, local_indices): From 911ceeba6ff3ce7c85af654c58a29f0d324feff6 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:51:53 -0800 Subject: [PATCH 37/84] Update mlp.py --- megatron/core/transformer/mlp.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 02abbe42fb..730eaa6224 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -11,6 +11,7 @@ from megatron.core.transformer.spec_utils import ModuleSpec, build_module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint +from megatron import get_args @dataclass @@ -40,10 +41,13 @@ def __init__( self, config: TransformerConfig, submodules: MLPSubmodules, is_expert: bool = False, layer=None ): super().__init__(config=config) - + + args = get_args() self.config: TransformerConfig = config - if layer: - ffn_ratio = ffn_hidden_ratio[layer-1] + if 
layer and args.ffn_hidden_ratio: + ffn_ratio = args.ffn_hidden_ratio[layer-1] + else: + ffn_ratio = 1 # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf ffn_hidden_size = self.config.ffn_hidden_size From 7d901059d90166c8409b8831c01148462f3c9632 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 22:56:33 -0800 Subject: [PATCH 38/84] Update arguments.py --- megatron/arguments.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index 7e12d0d2c1..ca341e6035 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -638,6 +638,9 @@ def _add_network_size_args(parser): group.add_argument('--moe-layers', nargs='+', type=int, default=None, help='Number of experts for each layer (`1` means dense layer). ' 'Does not support pipeline parallelism.') + group.add_argument('--ffn-hidden-ratio', nargs='+', type=int, default=None, + help='Ratio of MLP intermediate layer over embedding dimension (4 is default). ' + 'It can be different in each layer.') group.add_argument('--routing-mode', type=str, default='sinkhorn', choices=['sinkhorn', 'top1', 'top2', 'sinkhorn_top2'], help='Mode of the expert routing.') From 0ff85c76bed4512ead462c6f403a717c467a740d Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:10:22 -0800 Subject: [PATCH 39/84] Update mlp.py --- megatron/core/transformer/mlp.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 730eaa6224..001f953ff3 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -44,20 +44,23 @@ def __init__( args = get_args() self.config: TransformerConfig = config + if layer and args.ffn_hidden_ratio: - ffn_ratio = args.ffn_hidden_ratio[layer-1] + ffn_hidden_size_1 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1] + ffn_hidden_size_2 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1] else: - ffn_ratio = 1 - + ffn_hidden_size_1 = self.config.ffn_hidden_size + ffn_hidden_size_2 = self.config.ffn_hidden_size + # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf - ffn_hidden_size = self.config.ffn_hidden_size if self.config.gated_linear_unit: - ffn_hidden_size *= 2 + ffn_hidden_size_1 *= 2 + self.linear_fc1 = build_module( submodules.linear_fc1, self.config.hidden_size, - ffn_hidden_size * ffn_ratio, + ffn_hidden_size_1, config=self.config, init_method=self.config.init_method, gather_output=False, @@ -78,7 +81,7 @@ def glu(x): self.linear_fc2 = build_module( submodules.linear_fc2, - self.config.ffn_hidden_size * ffn_ratio, + ffn_hidden_size_2, self.config.hidden_size, config=self.config, init_method=self.config.output_layer_init_method, From e91b58711da3f9f04761ecb8af09fc1fd5747aa4 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:12:42 -0800 Subject: [PATCH 40/84] Update mlp.py --- megatron/core/transformer/mlp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 001f953ff3..dd7bbf434c 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -95,7 +95,8 @@ def forward(self, hidden_states): # [s, b, 4 * h/p] intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) - 
print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape) + if torch.distribute.get_rank() == 0: + print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape) if self.config.bias_gelu_fusion: assert self.config.add_bias_linear is True From 7f7d9abbc1a82709e3b6b01ad3dcec8ba7df3da2 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:13:26 -0800 Subject: [PATCH 41/84] Update mlp.py --- megatron/core/transformer/mlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index dd7bbf434c..2bb5e0d4fa 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -49,8 +49,8 @@ def __init__( ffn_hidden_size_1 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1] ffn_hidden_size_2 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1] else: - ffn_hidden_size_1 = self.config.ffn_hidden_size - ffn_hidden_size_2 = self.config.ffn_hidden_size + ffn_hidden_size_1 = self.config.ffn_hidden_size + ffn_hidden_size_2 = self.config.ffn_hidden_size # If this is a gated linear unit we double the output width, see https://arxiv.org/pdf/2002.05202.pdf if self.config.gated_linear_unit: From 1f052fcdb7538587fa6ddd0f2383c231f3f45c9c Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:32:02 -0800 Subject: [PATCH 42/84] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f23be16c68..3e823a437a 100644 --- a/README.md +++ b/README.md @@ -91,12 +91,14 @@ A sample plot for `top2` routing mode (obtained from a tiny toy model) is: Token Counts -## Varying expert number across layers +## Varying expert number and MLP hidden dimension across layers -To set different number of experts across layers use the flag `--moe-layers` followed by a sequence of numbers corresponding to the number of experts per layer. For example, in a model with 5 layers, one can write `--moe-layers 1 8 16 8 1`. +To set different number of experts across layers use the flag `--moe-layers` followed by a sequence of integers corresponding to the number of experts per layer. For example, in a model with 5 layers, one can write `--moe-layers 1 8 16 8 1`. This flag does not currently support pipeline parallelism. Also, for MoE layers, each of these numbers should be multiple of `--expert-model-parallel-size` and greater than 2. For a dense layer, the number should be set to 1. +To change the hidden dimension of MLP's across layers, use the flag `--ffn-hidden-ratio` followed by a sequence of integers corresponding to the ratio between the hidden dimension and the model's embedding dimension. Without this flag, this value is set by default to 4 for all layers (unless `--ff-hidden-size` is used). For example, for a model with 5 layers, one can write `--ffn-hidden-ratio 4 4 2 4 4`. + # NVIDIA Megatron-LM (copied from upstream) Megatron ([1](https://arxiv.org/pdf/1909.08053.pdf), [2](https://arxiv.org/pdf/2104.04473.pdf), and [3](https://arxiv.org/pdf/2205.05198)) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. 
We developed efficient, model-parallel ([tensor](https://arxiv.org/pdf/1909.08053.pdf), [sequence](https://arxiv.org/pdf/2205.05198), and [pipeline](https://arxiv.org/pdf/2104.04473.pdf)), and multi-node pre-training of transformer based models such as [GPT](https://arxiv.org/abs/2005.14165), [BERT](https://arxiv.org/pdf/1810.04805.pdf), and [T5](https://arxiv.org/abs/1910.10683) using mixed precision. From 25dd9a863bd8562e268e3f02559389b6e09fed27 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:36:01 -0800 Subject: [PATCH 43/84] Update mlp.py --- megatron/core/transformer/mlp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index 2bb5e0d4fa..e6f8f56f2d 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -44,7 +44,7 @@ def __init__( args = get_args() self.config: TransformerConfig = config - + self.layer = layer if layer and args.ffn_hidden_ratio: ffn_hidden_size_1 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1] ffn_hidden_size_2 = self.config.hidden_size * args.ffn_hidden_ratio[layer-1] @@ -95,8 +95,8 @@ def forward(self, hidden_states): # [s, b, 4 * h/p] intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) - if torch.distribute.get_rank() == 0: - print('DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape) + if torch.distributed.get_rank() == 0: + print('LAYER:', self.layer, 'DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape) if self.config.bias_gelu_fusion: assert self.config.add_bias_linear is True From e0f37f1c1e72e71370b3771984eb1ab7d27aa882 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 6 Dec 2023 23:39:17 -0800 Subject: [PATCH 44/84] Update mlp.py --- megatron/core/transformer/mlp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py index e6f8f56f2d..c8dedc3e06 100644 --- a/megatron/core/transformer/mlp.py +++ b/megatron/core/transformer/mlp.py @@ -95,8 +95,6 @@ def forward(self, hidden_states): # [s, b, 4 * h/p] intermediate_parallel, bias_parallel = self.linear_fc1(hidden_states) - if torch.distributed.get_rank() == 0: - print('LAYER:', self.layer, 'DIMENSION OF INTERMEDIATE FFN LAYER:', intermediate_parallel.shape) if self.config.bias_gelu_fusion: assert self.config.add_bias_linear is True From 7e4a6ae3af39a65002dffa8262270968c0278162 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:46:35 -0800 Subject: [PATCH 45/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 8941c7b6ef..0052ee5606 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -262,18 +262,36 @@ def forward(self, hidden_states): self.config.timers('ep_scatter').stop() + #if self.config.timers is not None: + # self.config.timers('final_route', log_level=2).start() + #if self.routing == 'top2' or self.routing == 'sinkhorn_top2': + # output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2) + #output_total = output_total.view(hidden_shape) + #if self.add_bias: + # if self.routing == 'top2' or self.routing == 'sinkhorn_top2': 
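The switch_mlp.py patches around this point flip between two ways of combining the outputs of the two selected experts: a probability-normalized mixture (kept here as a commented-out copy) and the unnormalized form used by the active code. A toy sketch of the difference, on standalone tensors rather than the Megatron code path:

```python
import torch

torch.manual_seed(0)
tokens, hidden = 4, 8
out_1 = torch.randn(tokens, hidden)            # output of the top-1 expert per token
out_2 = torch.randn(tokens, hidden)            # output of the top-2 expert per token
max_prob = torch.rand(tokens, 1)               # router prob of the best expert
max_prob_2 = torch.rand(tokens, 1) * max_prob  # second choice never larger

# Unnormalized mixture (the active code path): the overall scale depends on
# how much probability mass the two selected experts received.
unnormalized = out_1 * max_prob + out_2 * max_prob_2

# Normalized mixture (the commented-out variant): dividing by the summed
# probabilities makes the result a convex combination of the expert outputs.
normalized = (out_1 * max_prob + out_2 * max_prob_2) / (max_prob + max_prob_2)

print(unnormalized.norm(dim=-1))  # shrinks when max_prob + max_prob_2 is small
print(normalized.norm(dim=-1))    # stays comparable to a single expert output
```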
+ # output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2) + # output_bias_total = output_bias_total.view(hidden_shape) + #else: + # output_bias_total = None + #if self.config.timers is not None: + # self.config.timers('final_route').stop() + if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() + output_total = output_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2) + output_total_2 = output_total_2 * max_prob_2 + output_total = output_total + output_total_2 output_total = output_total.view(hidden_shape) if self.add_bias: + output_bias_total = output_bias_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2) + output_bias_total_2 = output_bias_total_2 * max_prob_2 + output_bias_total = output_bias_total + output_bias_total_2 output_bias_total = output_bias_total.view(hidden_shape) else: output_bias_total = None if self.config.timers is not None: self.config.timers('final_route').stop() - + return output_total, output_bias_total From 42be8d47f97f4dabf641dda525f2fec825532db0 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:50:41 -0800 Subject: [PATCH 46/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 0052ee5606..d34f81c01d 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -276,22 +276,10 @@ def forward(self, hidden_states): #if self.config.timers is not None: # self.config.timers('final_route').stop() - if self.config.timers is not None: - self.config.timers('final_route', log_level=2).start() + output_total = output_total * max_prob - if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_total_2 = output_total_2 * max_prob_2 - output_total = output_total + output_total_2 output_total = output_total.view(hidden_shape) - if self.add_bias: - output_bias_total = output_bias_total * max_prob - if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_bias_total_2 = output_bias_total_2 * max_prob_2 - output_bias_total = output_bias_total + output_bias_total_2 - output_bias_total = output_bias_total.view(hidden_shape) - else: - output_bias_total = None - if self.config.timers is not None: - self.config.timers('final_route').stop() + output_bias_total = None + return output_total, output_bias_total From f787add8ea48c4a71d875fb84a3c2142b13d1d45 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 14 Dec 2023 14:05:08 -0800 Subject: [PATCH 47/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 28 +++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index d34f81c01d..f8e6f2e4ef 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -262,19 +262,21 @@ def forward(self, hidden_states): self.config.timers('ep_scatter').stop() - #if self.config.timers is not None: - # 
self.config.timers('final_route', log_level=2).start() - #if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - # output_total = (output_total * max_prob + output_total_2 * max_prob_2) / (max_prob + max_prob_2) - #output_total = output_total.view(hidden_shape) - #if self.add_bias: - # if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - # output_bias_total = (output_bias_total * max_prob + output_bias_total_2 * max_prob_2) / (max_prob + max_prob_2) - # output_bias_total = output_bias_total.view(hidden_shape) - #else: - # output_bias_total = None - #if self.config.timers is not None: - # self.config.timers('final_route').stop() + if self.config.timers is not None: + self.config.timers('final_route', log_level=2).start() + output_total = output_total * max_prob + if self.routing == 'top2' or self.routing == 'sinkhorn_top2': + output_total = (output_total + output_total_2 * max_prob_2) + output_total = output_total.view(hidden_shape) + if self.add_bias: + output_bias_total = output_bias_total * max_prob + if self.routing == 'top2' or self.routing == 'sinkhorn_top2': + output_bias_total = (output_bias_total + output_bias_total_2 * max_prob_2) + output_bias_total = output_bias_total.view(hidden_shape) + else: + output_bias_total = None + if self.config.timers is not None: + self.config.timers('final_route').stop() output_total = output_total * max_prob From 03b50ec28886d7605abf387dc215227f6d68a3de Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 14 Dec 2023 20:23:33 -0800 Subject: [PATCH 48/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 84e66014e6..540fed7073 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -148,6 +148,7 @@ def forward( args = get_args() if args.use_balancing_loss is not None: args.l_aux = 0.0 + args.l_router = 0.0 hidden_states = self.decoder( hidden_states=decoder_input, attention_mask=attention_mask, @@ -171,6 +172,7 @@ def forward( loss = self.compute_language_model_loss(labels, logits) if args.use_balancing_loss is not None: loss += args.use_balancing_loss * args.l_aux + loss += args.l_router return loss From 02969e7237cc227be58eda5719294254d5c38bd8 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 14 Dec 2023 20:33:18 -0800 Subject: [PATCH 49/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index f8e6f2e4ef..d13bfa7fda 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -167,7 +167,10 @@ def forward(self, hidden_states): self.config.timers('routing_gather').stop() - + # Evaluate router loss + if hasattr(args, 'l_router') and self.training: + args.l_router -= torch.sum(route * torch.log(route + 1e-9)) + # Evaluate balancing loss. 
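The `l_router` term accumulated above is minus the sum of route * log(route + 1e-9), i.e. the summed entropy of the per-token routing distribution, which gpt_model.py later adds to the loss. A small standalone illustration of what that term penalizes, assuming `route` is a [tokens, experts] softmax as in the surrounding code:

```python
import torch

def router_entropy(route: torch.Tensor) -> torch.Tensor:
    # route: [num_tokens, num_experts], each row a softmax over experts.
    # Same quantity that l_router accumulates per MoE layer:
    # -sum(route * log(route + 1e-9)), the summed per-token routing entropy.
    return -torch.sum(route * torch.log(route + 1e-9))

peaked = torch.tensor([[0.97, 0.01, 0.01, 0.01]])
uniform = torch.full((1, 4), 0.25)
print(router_entropy(peaked))   # ~0.17: confident routing, small added loss
print(router_entropy(uniform))  # ~1.39: uncertain routing, larger added loss
# Adding this term to the training loss therefore pushes the router toward
# low-entropy (more decisive) expert assignments.
```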
if (args.use_balancing_loss is not None) and self.training: if hasattr(args, 'l_aux'): @@ -264,9 +267,9 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() - output_total = output_total * max_prob + # output_total = output_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_total = (output_total + output_total_2 * max_prob_2) + output_total = (output_total * max_prob + output_total_2 * max_prob_2) output_total = output_total.view(hidden_shape) if self.add_bias: output_bias_total = output_bias_total * max_prob @@ -277,11 +280,5 @@ def forward(self, hidden_states): output_bias_total = None if self.config.timers is not None: self.config.timers('final_route').stop() - - - output_total = output_total * max_prob - output_total = output_total.view(hidden_shape) - output_bias_total = None - return output_total, output_bias_total From fb61494f3d8cdff67b9e0fa20cd2c3a2c3846aa8 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 18 Dec 2023 19:40:01 -0600 Subject: [PATCH 50/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index d13bfa7fda..c54b14ca67 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -269,7 +269,7 @@ def forward(self, hidden_states): self.config.timers('final_route', log_level=2).start() # output_total = output_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': - output_total = (output_total * max_prob + output_total_2 * max_prob_2) + output_total = (output_total + output_total_2 * max_prob_2) output_total = output_total.view(hidden_shape) if self.add_bias: output_bias_total = output_bias_total * max_prob From e26ca5794044d9c031a1206d1fb5bbed59be4af2 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Tue, 19 Dec 2023 14:25:45 -0600 Subject: [PATCH 51/84] Update training.py --- megatron/training.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/megatron/training.py b/megatron/training.py index c4dfd19605..4c951a44c5 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -50,6 +50,8 @@ from megatron.model.vision.knn_monitor import compute_feature_bank from megatron.eval_harness import Evaluator +global prev_params +prev_params = [[] for i in range(1000)] def print_datetime(string): """Note that this call will sync across all ranks.""" @@ -455,6 +457,16 @@ def train_step(forward_step_func, data_iterator, unwrapped_model.cancel_gradients_last_layer(args.curr_iteration) # Update parameters. + + # print("JUST BEFORE STEP:" ) + if args.curr_iteration % 10 == 0 and torch.distributed.get_rank() == 0: + for i,(n, p) in enumerate(model[0].named_parameters()): + if len(prev_params[i]) == 0: + prev_params[i] = p.detach().clone() + param_diff = p - prev_params[i] + grad_sum = str(p.grad.sum().item()) if p.grad is not None else "NO GRAD!" 
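The training.py block above is a quick parameter-drift monitor: every 10 iterations, on rank 0, it prints each parameter's norm, the norm of its change since the previous snapshot, and whether a gradient is present. A self-contained, single-process sketch of the same idea (the toy model and loop below are placeholders, not Megatron code):

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))  # placeholder model
prev_params = {}

def log_param_drift(iteration: int, interval: int = 10) -> None:
    """Print each parameter's norm, drift since the last snapshot, and grad sum."""
    if iteration % interval != 0:
        return
    for name, p in model.named_parameters():
        prev = prev_params.get(name)
        drift = (p - prev).norm().item() if prev is not None else float("nan")
        grad_sum = p.grad.sum().item() if p.grad is not None else "NO GRAD!"
        print(iteration, name, tuple(p.shape), p.norm().item(), drift, grad_sum)
        prev_params[name] = p.detach().clone()

opt = torch.optim.SGD(model.parameters(), lr=0.1)
for it in range(30):
    loss = model(torch.randn(4, 8)).pow(2).mean()
    opt.zero_grad()
    loss.backward()
    log_param_drift(it)  # inspect drift just before the optimizer step, as in the patch
    opt.step()
```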
+ print(args.curr_iteration, n, p.shape, torch.norm(p).item(), torch.norm(param_diff).item()) + prev_params[i] = p.detach().clone() if args.enable_manual_profiling: torch.cuda.nvtx.range_push(f"Optimizer step") timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time) update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers) From 83bcde19e2a6b90252394d687a1660a10c9b59b2 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 03:25:34 -0600 Subject: [PATCH 52/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index c54b14ca67..36dd131a57 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -264,10 +264,10 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('ep_scatter').stop() - + print('THIS IS WHERE I'M PRINTING') if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() - # output_total = output_total * max_prob + output_total = output_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total = (output_total + output_total_2 * max_prob_2) output_total = output_total.view(hidden_shape) From a8ac36955ec682e5caa801bc95425104af97e923 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 03:31:51 -0600 Subject: [PATCH 53/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 36dd131a57..74e18e7c78 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -264,7 +264,7 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('ep_scatter').stop() - print('THIS IS WHERE I'M PRINTING') + print('THIS IS WHERE IM PRINTING') if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() output_total = output_total * max_prob From c2edfbb0395c6145de9c56912c36a6f616c360c0 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 03:33:55 -0600 Subject: [PATCH 54/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 74e18e7c78..60249fa6bf 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -264,7 +264,6 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('ep_scatter').stop() - print('THIS IS WHERE IM PRINTING') if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() output_total = output_total * max_prob From 74b0c7baad49f003c4b8e6cfe47370213fa3f9cc Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:20:43 -0600 Subject: [PATCH 55/84] print statement of data shape --- pretrain_gpt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 9129beb0bb..d0d8a214d3 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -178,6 +178,7 @@ def forward_step(data_iterator, model: 
GPTModel): tokens, labels, loss_mask, attention_mask, position_ids = get_batch( data_iterator) timers('batch-generator').stop() + print('SHAPE OF DATA AFTER GET_BATCH:', tokens.shape, labels.shape) if args.enable_manual_profiling: torch.cuda.nvtx.range_pop() if args.enable_manual_profiling: torch.cuda.nvtx.range_push(f"Forward pass") From 8f7bb1cfba6f6fdb4065bdc3465347c0d19988dd Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:24:13 -0600 Subject: [PATCH 56/84] print statement of data shape --- megatron/core/models/gpt/gpt_model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 540fed7073..7182cfa30a 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -129,7 +129,9 @@ def forward( if decoder_input is not None: pass elif self.pre_process: + print('DATA SHAPE BEFORE EMBEDDING:', input_ids.shape, labels.shape) decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) + print('DATA SHAPE AFTER EMBEDDING:', input_ids.shape, labels.shape) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor From 714cd8d44ce80d99bb41ce11cc4cb28561ac0da1 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:29:40 -0600 Subject: [PATCH 57/84] Update transformer_layer.py --- megatron/core/transformer/transformer_layer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 117a5d218e..1bccb81e6e 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -175,9 +175,11 @@ def forward( # TODO: could we move `bias_dropout_add_exec_handler` itself # inside the module provided in the `bias_dropout_add_spec` module? with self.bias_dropout_add_exec_handler(): + print('SHAPE OF DATA BEFORE ATTENTION:', hidden_states.shape) hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( attention_output_with_bias, residual, self.config.hidden_dropout ) + print('SHAPE OF DATA AFTER ATTENTION:', hidden_states.shape) # Residual connection. 
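The shape prints added here (and stripped out again a few patches later) fire on every rank and every microbatch, which floods the logs under data and tensor parallelism. A hedged alternative is a rank-0, print-once tracer; the helper below is hypothetical and not part of Megatron:

```python
import torch

_seen = set()

def trace_shape(tag: str, tensor: torch.Tensor, once: bool = True) -> None:
    """Print `tag: shape` on rank 0 only, optionally just the first time it is hit."""
    if torch.distributed.is_initialized() and torch.distributed.get_rank() != 0:
        return
    if once and tag in _seen:
        return
    _seen.add(tag)
    print(f"{tag}: {tuple(tensor.shape)}")

# Usage at the points instrumented in the patches above (names illustrative):
#   trace_shape("tokens after get_batch", tokens)
#   trace_shape("decoder_input after embedding", decoder_input)
#   trace_shape("hidden_states after attention", hidden_states)
```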
residual = hidden_states From 5fd134fa3f792e593cbde4d5dd06c2b7ed8d734b Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:38:42 -0600 Subject: [PATCH 58/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 7182cfa30a..3105fc5c86 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -151,6 +151,7 @@ def forward( if args.use_balancing_loss is not None: args.l_aux = 0.0 args.l_router = 0.0 + print('SHAPE OF DATA BEFORE DECODER IN GPT_MODEL.PY:' decoder_input.shape) hidden_states = self.decoder( hidden_states=decoder_input, attention_mask=attention_mask, From d95239c810d85efca20cb879f39e0c9f0ec6e599 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:39:35 -0600 Subject: [PATCH 59/84] Update training.py --- megatron/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/training.py b/megatron/training.py index 4c951a44c5..d85e22128f 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -459,7 +459,7 @@ def train_step(forward_step_func, data_iterator, # Update parameters. # print("JUST BEFORE STEP:" ) - if args.curr_iteration % 10 == 0 and torch.distributed.get_rank() == 0: + if args.curr_iteration % 10 == 0 and torch.distributed.get_rank() == 0 and 1 == 0: for i,(n, p) in enumerate(model[0].named_parameters()): if len(prev_params[i]) == 0: prev_params[i] = p.detach().clone() From 8ac39c915450ebdb6a7d980290736d6908f2ce80 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 11:41:31 -0600 Subject: [PATCH 60/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 3105fc5c86..205ce94351 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -131,7 +131,7 @@ def forward( elif self.pre_process: print('DATA SHAPE BEFORE EMBEDDING:', input_ids.shape, labels.shape) decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - print('DATA SHAPE AFTER EMBEDDING:', input_ids.shape, labels.shape) + print('DATA SHAPE AFTER EMBEDDING:', decoder_input.shape, labels.shape) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor @@ -151,7 +151,7 @@ def forward( if args.use_balancing_loss is not None: args.l_aux = 0.0 args.l_router = 0.0 - print('SHAPE OF DATA BEFORE DECODER IN GPT_MODEL.PY:' decoder_input.shape) + print('SHAPE OF DATA BEFORE DECODER IN GPTMODELPY:', decoder_input.shape) hidden_states = self.decoder( hidden_states=decoder_input, attention_mask=attention_mask, From 687687e48da96d60fde5efe17c41cd6c80edeec2 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Wed, 20 Dec 2023 12:33:47 -0600 Subject: [PATCH 61/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 205ce94351..fc999605b0 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -168,6 +168,7 @@ def forward( output_weight = 
self.shared_embedding_or_output_weight() logits, _ = self.output_layer(hidden_states, weight=output_weight) + print('SHAPES BEFORE LOSS COMPUTE:', logits.shape, labels.shape) if labels is None: # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() From fff647099ed16a97c544e9bbcb26564aa725ada3 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 21 Dec 2023 02:14:25 -0600 Subject: [PATCH 62/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index fc999605b0..1fe7548e02 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -105,7 +105,10 @@ def __init__( ) if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): + print('THIS CONDITION TRUE') self.initialize_last_stage_with_word_embeddings() + else: + print('THIS CONDITION FALSE') def forward( self, From fac5f796d55c5ed1ed8781d7fc817965226ce7c3 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 21 Dec 2023 02:28:49 -0600 Subject: [PATCH 63/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 1fe7548e02..79f8695f44 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -132,9 +132,9 @@ def forward( if decoder_input is not None: pass elif self.pre_process: - print('DATA SHAPE BEFORE EMBEDDING:', input_ids.shape, labels.shape) decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - print('DATA SHAPE AFTER EMBEDDING:', decoder_input.shape, labels.shape) + if torch.distributed.get_rank() == 0: + print(self.embedding.word_embeddings.weight.data, self.output_layer.weight.data) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor @@ -154,7 +154,7 @@ def forward( if args.use_balancing_loss is not None: args.l_aux = 0.0 args.l_router = 0.0 - print('SHAPE OF DATA BEFORE DECODER IN GPTMODELPY:', decoder_input.shape) + hidden_states = self.decoder( hidden_states=decoder_input, attention_mask=attention_mask, From b157390ebe7925c0a5c966ec9ff16acf31573603 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 21 Dec 2023 03:04:18 -0600 Subject: [PATCH 64/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 79f8695f44..832b6f9e3d 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -133,8 +133,6 @@ def forward( pass elif self.pre_process: decoder_input = self.embedding(input_ids=input_ids, position_ids=position_ids) - if torch.distributed.get_rank() == 0: - print(self.embedding.word_embeddings.weight.data, self.output_layer.weight.data) else: # intermediate stage of pipeline # decoder will get hidden_states from encoder.input_tensor From 214e0d0cc9ef5b44e39bad97f6fb7c7d41d54b43 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 21 Dec 2023 03:31:21 -0600 Subject: [PATCH 65/84] Update pretrain_gpt.py --- pretrain_gpt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pretrain_gpt.py b/pretrain_gpt.py 
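The prints toggled in the gpt_model.py patches above are effectively a check that the input embedding and the output projection share weights. A hedged way to verify tying without dumping whole tensors (the attribute paths in the usage comment follow the patch but are otherwise illustrative):

```python
import torch

def report_weight_tying(embedding_weight: torch.Tensor,
                        output_weight: torch.Tensor) -> None:
    """Say whether two weight tensors share storage, merely match, or diverged."""
    if embedding_weight.data_ptr() == output_weight.data_ptr():
        print("tied: same underlying storage")
    elif torch.equal(embedding_weight, output_weight):
        print("equal values but separate storage (tying may break after an update)")
    else:
        diff = (embedding_weight - output_weight).norm().item()
        print(f"not tied: ||difference|| = {diff:.4e}")

# e.g. report_weight_tying(model.embedding.word_embeddings.weight,
#                          model.output_layer.weight)
```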
index d0d8a214d3..9129beb0bb 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -178,7 +178,6 @@ def forward_step(data_iterator, model: GPTModel): tokens, labels, loss_mask, attention_mask, position_ids = get_batch( data_iterator) timers('batch-generator').stop() - print('SHAPE OF DATA AFTER GET_BATCH:', tokens.shape, labels.shape) if args.enable_manual_profiling: torch.cuda.nvtx.range_pop() if args.enable_manual_profiling: torch.cuda.nvtx.range_push(f"Forward pass") From 0597d752a59233ffeb9bcd21867eadfd5747d25a Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 21 Dec 2023 03:31:59 -0600 Subject: [PATCH 66/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 832b6f9e3d..9bec0f8516 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -105,10 +105,7 @@ def __init__( ) if self.share_embeddings_and_output_weights and (self.pre_process or self.post_process): - print('THIS CONDITION TRUE') self.initialize_last_stage_with_word_embeddings() - else: - print('THIS CONDITION FALSE') def forward( self, @@ -169,7 +166,6 @@ def forward( output_weight = self.shared_embedding_or_output_weight() logits, _ = self.output_layer(hidden_states, weight=output_weight) - print('SHAPES BEFORE LOSS COMPUTE:', logits.shape, labels.shape) if labels is None: # [s b h] => [b s h] return logits.transpose(0, 1).contiguous() From 2b017d1794df1fdc991a76d314a766fa88e7a218 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Thu, 21 Dec 2023 03:32:31 -0600 Subject: [PATCH 67/84] Update transformer_layer.py --- megatron/core/transformer/transformer_layer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py index 1bccb81e6e..117a5d218e 100644 --- a/megatron/core/transformer/transformer_layer.py +++ b/megatron/core/transformer/transformer_layer.py @@ -175,11 +175,9 @@ def forward( # TODO: could we move `bias_dropout_add_exec_handler` itself # inside the module provided in the `bias_dropout_add_spec` module? with self.bias_dropout_add_exec_handler(): - print('SHAPE OF DATA BEFORE ATTENTION:', hidden_states.shape) hidden_states = self.self_attn_bda(self.training, self.config.bias_dropout_fusion)( attention_output_with_bias, residual, self.config.hidden_dropout ) - print('SHAPE OF DATA AFTER ATTENTION:', hidden_states.shape) # Residual connection. residual = hidden_states From 09b122861a71574f74f89c8ba142c1c68b7e28c1 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 00:04:08 -0400 Subject: [PATCH 68/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 60249fa6bf..e0c1640c91 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -168,8 +168,8 @@ def forward(self, hidden_states): # Evaluate router loss - if hasattr(args, 'l_router') and self.training: - args.l_router -= torch.sum(route * torch.log(route + 1e-9)) + # if hasattr(args, 'l_router') and self.training: + # args.l_router -= torch.sum(route * torch.log(route + 1e-9)) # Evaluate balancing loss. 
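For context on the `l_aux` term kept here: gpt_model.py adds `use_balancing_loss * l_aux` to the language-model loss, but the per-layer formula itself is not visible in these patches. The sketch below is the common Switch-Transformer-style balancing loss, shown as an assumption rather than this repo's exact definition:

```python
import torch

def switch_balancing_loss(route: torch.Tensor) -> torch.Tensor:
    """Common load-balancing auxiliary loss for a [tokens, experts] softmax.

    f_e: fraction of tokens whose argmax is expert e (hard dispatch share).
    p_e: mean router probability assigned to expert e.
    Loss = num_experts * sum_e f_e * p_e, minimized when both are uniform.
    """
    num_experts = route.size(1)
    top1 = route.argmax(dim=1)
    f = torch.bincount(top1, minlength=num_experts).float() / route.size(0)
    p = route.mean(dim=0)
    return num_experts * torch.sum(f * p)

route = torch.softmax(torch.randn(64, 8), dim=-1)
print(switch_balancing_loss(route))  # close to 1 when routing is well balanced
```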
if (args.use_balancing_loss is not None) and self.training: From 3800c0ef2f3828aad17b6186489b13ccea0ea991 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 00:13:16 -0400 Subject: [PATCH 69/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index e0c1640c91..93a71350e4 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -166,11 +166,6 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('routing_gather').stop() - - # Evaluate router loss - # if hasattr(args, 'l_router') and self.training: - # args.l_router -= torch.sum(route * torch.log(route + 1e-9)) - # Evaluate balancing loss. if (args.use_balancing_loss is not None) and self.training: if hasattr(args, 'l_aux'): From 5a583a2cfb80662eb7d4147cedef85e11266a227 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 00:13:42 -0400 Subject: [PATCH 70/84] Update gpt_model.py --- megatron/core/models/gpt/gpt_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py index 9bec0f8516..3cebde66d4 100644 --- a/megatron/core/models/gpt/gpt_model.py +++ b/megatron/core/models/gpt/gpt_model.py @@ -148,7 +148,6 @@ def forward( args = get_args() if args.use_balancing_loss is not None: args.l_aux = 0.0 - args.l_router = 0.0 hidden_states = self.decoder( hidden_states=decoder_input, @@ -173,8 +172,7 @@ def forward( loss = self.compute_language_model_loss(labels, logits) if args.use_balancing_loss is not None: loss += args.use_balancing_loss * args.l_aux - loss += args.l_router - + return loss def shared_embedding_or_output_weight(self) -> Tensor: From 5123488f30c8fcced6b4f5b03af62b5aa58fa132 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 01:02:10 -0400 Subject: [PATCH 71/84] Update fused_softmax.py --- megatron/model/fused_softmax.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 9bacf33740..e94328df95 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -145,8 +145,10 @@ def forward(self, input, mask): assert input.dim() == 4 if self.is_kernel_available(mask, *input.size()): + print('TEST_fused') return self.forward_fused_softmax(input, mask) else: + print('TEST1_notfused') return self.forward_torch_softmax(input, mask) def is_kernel_available(self, mask, b, np, sq, sk): From 0d6f7c94a9615e5fd412847e523057e68a9c579e Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 01:25:32 -0400 Subject: [PATCH 72/84] Update fused_softmax.py --- megatron/model/fused_softmax.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index e94328df95..9bacf33740 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -145,10 +145,8 @@ def forward(self, input, mask): assert input.dim() == 4 if self.is_kernel_available(mask, *input.size()): - print('TEST_fused') return self.forward_fused_softmax(input, mask) else: - print('TEST1_notfused') return self.forward_torch_softmax(input, mask) def is_kernel_available(self, mask, b, 
np, sq, sk): From f8c8de9e71939cf948fbba60ce975bc2da3fda6d Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 01:26:47 -0400 Subject: [PATCH 73/84] Update fused_softmax.py --- megatron/model/fused_softmax.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 9bacf33740..69927b2757 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -196,7 +196,8 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale mask_output = self.mask_func(input, mask) if mask is not None else input - probs = torch.nn.Softmax(dim=-1)(mask_output) + print('TRYING -2') + probs = torch.nn.Softmax(dim=-2)(mask_output) if self.input_in_float16 and self.softmax_in_fp32: if self.input_in_fp16: From 2bfb7c08ac71fb442490913682e9e6bcff600646 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 01:27:59 -0400 Subject: [PATCH 74/84] Update fused_softmax.py --- megatron/model/fused_softmax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 69927b2757..29078ab749 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -196,7 +196,6 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale mask_output = self.mask_func(input, mask) if mask is not None else input - print('TRYING -2') probs = torch.nn.Softmax(dim=-2)(mask_output) if self.input_in_float16 and self.softmax_in_fp32: From 36a23764f15f1f6199cf108d167d44c2caa343e2 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:31:43 -0400 Subject: [PATCH 75/84] Update fused_softmax.py --- megatron/model/fused_softmax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 29078ab749..9bacf33740 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -196,7 +196,7 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale mask_output = self.mask_func(input, mask) if mask is not None else input - probs = torch.nn.Softmax(dim=-2)(mask_output) + probs = torch.nn.Softmax(dim=-1)(mask_output) if self.input_in_float16 and self.softmax_in_fp32: if self.input_in_fp16: From 7048fc1e3b4fa5fb7de36cc31951a2a94b055521 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 12:12:20 -0600 Subject: [PATCH 76/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 93a71350e4..0866028abc 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -99,7 +99,7 @@ def gather_indices(self, local_indices): def forward(self, hidden_states): args = get_args() hidden_shape = hidden_states.shape - route = self.router(hidden_states) + route = self.router(hidden_states.detach()) route = route.view(-1, self.num_moe_experts) if self.config.timers is not None: From 431fa63f4e5ab32459578e89cc7826a364b6ffdb Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 15:43:26 -0600 Subject: 
[PATCH 77/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 0866028abc..93a71350e4 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -99,7 +99,7 @@ def gather_indices(self, local_indices): def forward(self, hidden_states): args = get_args() hidden_shape = hidden_states.shape - route = self.router(hidden_states.detach()) + route = self.router(hidden_states) route = route.view(-1, self.num_moe_experts) if self.config.timers is not None: From fc5390c278b84fcf95b2e6cd503769c7a0ddb710 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 19:47:43 -0600 Subject: [PATCH 78/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 93a71350e4..f49281f6a7 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -261,7 +261,7 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() - output_total = output_total * max_prob + output_total = output_total * max_prob / (max_prob.detach()) if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total = (output_total + output_total_2 * max_prob_2) output_total = output_total.view(hidden_shape) From 5791a87bd0d853da0f26e5bb3b2a319539e58b5d Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 19:55:50 -0600 Subject: [PATCH 79/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index f49281f6a7..93a71350e4 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -261,7 +261,7 @@ def forward(self, hidden_states): if self.config.timers is not None: self.config.timers('final_route', log_level=2).start() - output_total = output_total * max_prob / (max_prob.detach()) + output_total = output_total * max_prob if self.routing == 'top2' or self.routing == 'sinkhorn_top2': output_total = (output_total + output_total_2 * max_prob_2) output_total = output_total.view(hidden_shape) From 03aecf6b104ddfd668116773687f91f4c1980413 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 21:59:44 -0600 Subject: [PATCH 80/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 93a71350e4..9a571c75fa 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -27,11 +27,15 @@ def sinkhorn(cost, tol=0.0001): eps = 0.00000001 error = 1e9 d1_old = d1 + t = 0 while error > tol: + t += 1 d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps) d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) error = torch.mean(torch.abs(d1_old - d1)) d1_old = d1 + if t > 1: + print('NUMBER OF STEPS:', t) return d1 * cost * d0.unsqueeze(1) def save_token_count(token_count, layer, iteration, router_profiling_path): From 
0e5b34fa3889e877fe09856919d5c8ca22584adc Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 22:11:23 -0600 Subject: [PATCH 81/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 9a571c75fa..ed415e41fc 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -27,16 +27,15 @@ def sinkhorn(cost, tol=0.0001): eps = 0.00000001 error = 1e9 d1_old = d1 - t = 0 while error > tol: - t += 1 d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps) d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) error = torch.mean(torch.abs(d1_old - d1)) d1_old = d1 - if t > 1: - print('NUMBER OF STEPS:', t) - return d1 * cost * d0.unsqueeze(1) + # return d1 * cost * d0.unsqueeze(1) + route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True) + route = (1/self.num_moe_experts) * (route / torch.sum(route, dim=0, keepdim=True)) + return route def save_token_count(token_count, layer, iteration, router_profiling_path): token_count_list = token_count.cpu().tolist() From 704726d7b8d82ce2730793bcdbe3556824516705 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 22:12:25 -0600 Subject: [PATCH 82/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index ed415e41fc..5317702129 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -19,7 +19,7 @@ def sinkhorn(cost, tol=0.0001): "Sinkhorn based MoE routing function" - cost = torch.exp(2.0 * cost) + """cost = torch.exp(2.0 * cost) d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) # d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) d1 = 1 / (cost.size(1) * torch.sum(cost, 0)) @@ -32,7 +32,7 @@ def sinkhorn(cost, tol=0.0001): d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) error = torch.mean(torch.abs(d1_old - d1)) d1_old = d1 - # return d1 * cost * d0.unsqueeze(1) + return d1 * cost * d0.unsqueeze(1)""" route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True) route = (1/self.num_moe_experts) * (route / torch.sum(route, dim=0, keepdim=True)) return route From 277afeb213d633761bed45ef2ebfb4587bb8365b Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 22:16:15 -0600 Subject: [PATCH 83/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 5317702129..6d28da9a4b 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -34,7 +34,7 @@ def sinkhorn(cost, tol=0.0001): d1_old = d1 return d1 * cost * d0.unsqueeze(1)""" route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True) - route = (1/self.num_moe_experts) * (route / torch.sum(route, dim=0, keepdim=True)) + route = (1/cost.size(1)) * (route / torch.sum(route, dim=0, keepdim=True)) return route def 
save_token_count(token_count, layer, iteration, router_profiling_path): From 4a31d0ff5463183c3dd786c776b61b00014c5498 Mon Sep 17 00:00:00 2001 From: pglorio <85982602+pglorio@users.noreply.github.com> Date: Mon, 1 Jan 2024 22:30:15 -0600 Subject: [PATCH 84/84] Update switch_mlp.py --- megatron/core/transformer/switch_mlp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/megatron/core/transformer/switch_mlp.py b/megatron/core/transformer/switch_mlp.py index 6d28da9a4b..93a71350e4 100644 --- a/megatron/core/transformer/switch_mlp.py +++ b/megatron/core/transformer/switch_mlp.py @@ -19,7 +19,7 @@ def sinkhorn(cost, tol=0.0001): "Sinkhorn based MoE routing function" - """cost = torch.exp(2.0 * cost) + cost = torch.exp(2.0 * cost) d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype) # d1 = torch.ones(cost.size(1), device=cost.device, dtype=cost.dtype) d1 = 1 / (cost.size(1) * torch.sum(cost, 0)) @@ -32,10 +32,7 @@ def sinkhorn(cost, tol=0.0001): d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps) error = torch.mean(torch.abs(d1_old - d1)) d1_old = d1 - return d1 * cost * d0.unsqueeze(1)""" - route = torch.softmax(2.0 * cost, dim=0) / torch.sum(torch.softmax(2.0 * cost, dim=0), dim=1, keepdim=True) - route = (1/cost.size(1)) * (route / torch.sum(route, dim=0, keepdim=True)) - return route + return d1 * cost * d0.unsqueeze(1) def save_token_count(token_count, layer, iteration, router_profiling_path): token_count_list = token_count.cpu().tolist()
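The series ends with `sinkhorn` restored to its original iterative form. For experimentation outside Megatron, here is a self-contained copy of that routine plus a toy check of the marginals it enforces; the check itself is illustrative and not from the repo:

```python
import torch

def sinkhorn(cost, tol=0.0001):
    """Sinkhorn normalization, as restored by the final patch above."""
    cost = torch.exp(2.0 * cost)
    d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
    d1 = 1 / (cost.size(1) * torch.sum(cost, 0))
    eps = 1e-8
    error = 1e9
    d1_old = d1
    while error > tol:
        d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps)
        d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
        error = torch.mean(torch.abs(d1_old - d1))
        d1_old = d1
    return d1 * cost * d0.unsqueeze(1)

# Toy check with [tokens, experts] router logits: the iterated scaling drives
# each column sum toward 1/num_experts (balanced expert load) and each row sum
# toward 1/num_tokens. The one-shot softmax replacement tried in the
# intermediate patches only approximates this double normalization.
logits = torch.randn(16, 4)
route = sinkhorn(logits)
print(route.sum(dim=0))  # per-expert mass, roughly 0.25 each
print(route.sum(dim=1))  # per-token mass, roughly 1/16 each
```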