From a3e55c11644ff0ef2bd8179c136cac83d275e245 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:01:40 +0800 Subject: [PATCH 01/48] Update transformer.py add alibi position embedding --- megatron/model/transformer.py | 143 ++++++++++++++++++++++++++++------ 1 file changed, 120 insertions(+), 23 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index cd6a9dd444..2f98781500 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -23,10 +23,23 @@ except ImportError: rearrange = None +# try: +# from flash_attn.flash_attn_interface import flash_attn_unpadded_func +# from flash_attn.flash_attn_triton import flash_attn_func +# except ImportError: +# flash_attn_unpadded_func = None +# flash_attn_func = None + try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func + from flash_attn.flash_attn_triton import flash_attn_func except ImportError: - flash_attn_unpadded_func = None + try: + from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func + from flash_attn.flash_attn_triton import flash_attn_func + except ImportError: + flash_attn_unpadded_func = None + flash_attn_func = None """ We use the following notation throughout this file: h: hidden size @@ -267,12 +280,14 @@ def forward(self, query_layer, key_layer, key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) + # preallocting input tensor: [b * np, sq, sk] # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( - (output_size[0]*output_size[1], output_size[2], output_size[3]), + (output_size[0]*output_size[1], + output_size[2], + output_size[3]), query_layer.dtype, "mpu") - # Raw attention scores. [b * np, sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, query_layer.transpose(0, 1), # [b * np, sq, hn] @@ -287,8 +302,7 @@ def forward(self, query_layer, key_layer, # =========================== # attention scores and attention mask [b, np, sq, sk] - attention_probs = self.scale_mask_softmax(attention_scores, - attention_mask) + attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -356,7 +370,7 @@ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0, self.softmax_scale = softmax_scale self.dropout_p = attention_dropout - def forward(self, q, k, v): + def forward(self, q, k, v, bias=None): """Implements the multihead softmax attention. Arguments --------- @@ -368,8 +382,8 @@ def forward(self, q, k, v): batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] + # q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] - q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q.device) @@ -387,13 +401,17 @@ def forward(self, q, k, v): device=q.device) self.dropout_p = 0 - output = flash_attn_unpadded_func( - q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, - self.dropout_p, - softmax_scale=self.softmax_scale, causal=is_causal - ) - - output = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) + if bias is None: + output = flash_attn_unpadded_func( + q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k, + self.dropout_p, + softmax_scale=self.softmax_scale, causal=is_causal + ) + else: + output = flash_attn_func( + q, k, v, bias, self.softmax_scale, is_causal + ) + # output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) return output @@ -497,7 +515,7 @@ def custom_forward(*inputs): value_layer = inputs[2] attention_mask = inputs[3] output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask) + value_layer, attention_mask) return output_ q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \ @@ -581,7 +599,6 @@ def forward(self, hidden_states, attention_mask, (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) - # ================================== # Adjust key and value for inference # ================================== @@ -658,9 +675,9 @@ def forward(self, hidden_states, attention_mask, for x in (query_layer, key_layer, value_layer)] if not self.sequence_parallel: with tensor_parallel.get_cuda_rng_tracker().fork(): - context_layer = self.core_attention_flash(q, k, v) + context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) else: - context_layer = self.core_attention_flash(q, k, v) + context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() # ================= @@ -794,7 +811,7 @@ def forward(self, hidden_states, attention_mask, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb) - + # Residual connection. if self.apply_residual_connection_post_layernorm: residual = layernorm_output @@ -995,7 +1012,9 @@ def __init__(self, init_method, output_layer_init_method, self.input_tensor = None self.drop_path_rate = drop_path_rate self.transformer_impl = args.transformer_impl - + self.position_embedding_type = args.position_embedding_type + self.num_attention_heads = args.num_attention_heads + self.params_dtype = args.params_dtype # Store activation checkpoiting flag. self.recompute_granularity = args.recompute_granularity self.recompute_method = args.recompute_method @@ -1222,11 +1241,91 @@ def set_input_tensor(self, input_tensor): forward_step_func""" self.input_tensor = input_tensor + @staticmethod + def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size): + # Copied from bigscience-workshop/Megatron-DeepSpeed + # Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + """Returns tensor shaped + (1, num_attention_heads_per_partition, 1, max_seq_len), + """ + + def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes( + 2 * closest_power_of_2, + )[0::2][:n - closest_power_of_2] + ) + + slopes = torch.Tensor(get_slopes(num_attention_heads)) + alibi = ( + slopes.unsqueeze(1).unsqueeze(1) + * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1) + ) + + # Select the part of the tensor that corresponds to our tensor + # parallel index. 
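+        # Example: with num_attention_heads = 8, get_slopes(8) returns
+        # [2**-1, 2**-2, ..., 2**-8], and `alibi` above has shape
+        # [num_attention_heads, 1, max_seq_len], where row h holds
+        # slopes[h] * [0, 1, ..., max_seq_len - 1].  With a tensor model
+        # parallel world size of 2, the reshape/index below keeps heads
+        # 0-3 on rank 0 and heads 4-7 on rank 1.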
+        tp_world_size = mpu.get_tensor_model_parallel_world_size()
+        tp_index = mpu.get_tensor_model_parallel_rank()
+        # [num_attention_heads / tp_world_size, 1, max_seq_len]
+        alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index]
+
+        # [max_seq_len, max_seq_len], -inf strictly above the main diagonal
+        alibi_mask = torch.triu(
+            torch.full((max_seq_len, max_seq_len), float("-inf")), 1)
+        # [num_attention_heads / tp_world_size, max_seq_len, max_seq_len]
+        alibi_mask = alibi_mask.unsqueeze(0) + alibi
+        return alibi_mask
+
     def forward(self, hidden_states, attention_mask,
                 encoder_output=None, enc_dec_attn_mask=None,
                 inference_params=None, rotary_pos_emb=None):
         # hidden_states: [s, b, h]
-
+        if self.position_embedding_type == "alibi":
+            # Fold the ALiBi bias into attention_mask; the flash-attention
+            # path receives it as `bias`.
+            seq_len = hidden_states.shape[0]
+            if self.sequence_parallel:
+                seq_len = seq_len * mpu.get_tensor_model_parallel_world_size()
+            alibi_mask = self._build_alibi_tensor(
+                seq_len,
+                self.num_attention_heads,
+                hidden_states.shape[1],
+            ).to(torch.cuda.current_device())
+            if self.params_dtype is torch.float16:
+                alibi_mask = alibi_mask.to(torch.float16)
+            elif self.params_dtype is torch.bfloat16:
+                alibi_mask = alibi_mask.to(torch.bfloat16)
+            if attention_mask is not None:
+                if len(attention_mask.shape) == 2:
+                    expanded_mask = attention_mask.to(alibi_mask.dtype)
+                    expanded_mask = torch.tril(
+                        torch.gt(expanded_mask[:, :, None] * expanded_mask[:, None, :], 0)
+                    ) * torch.eq(expanded_mask[:, :, None] - expanded_mask[:, None, :], 0)
+                else:
+                    expanded_mask = attention_mask
+                bsz = hidden_states.shape[1]
+                src_len, tgt_len = alibi_mask.size()[-2:]
+                expanded_mask = (
+                    expanded_mask.unsqueeze(1)
+                    .expand(bsz, 1, src_len, tgt_len)
+                    .to(alibi_mask.dtype)
+                )
+                inverted_mask = 1.0 - expanded_mask
+                inverted_mask = inverted_mask.masked_fill(
+                    inverted_mask.to(torch.bool), torch.finfo(alibi_mask.dtype).min
+                )
+                attention_mask = inverted_mask + alibi_mask.unsqueeze(0)
+            else:
+                attention_mask = alibi_mask
         # Checks.
         if inference_params:
             assert self.recompute_granularity is None, \
@@ -1296,7 +1395,6 @@ def forward(self, hidden_states, attention_mask,
             forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention
         else:
             forward_kwargs['rotary_pos_emb'] = rotary_pos_emb
-
         for index in range(self.num_layers):
             layer = self._get_layer(index)
@@ -1312,5 +1410,4 @@ def forward(self, hidden_states, attention_mask,
         # Final layer norm.
         if self.post_process and self.post_layer_norm:
             hidden_states = self.final_layernorm(hidden_states)
-
         return hidden_states

From cf7c168043d0f1318e39265136bbfd66779895d6 Mon Sep 17 00:00:00 2001
From: qyccc <910845102@qq.com>
Date: Fri, 1 Dec 2023 14:08:58 +0800
Subject: [PATCH 02/48] Create pretrain_baichuan.py

---
 pretrain_baichuan.py | 122 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 pretrain_baichuan.py

diff --git a/pretrain_baichuan.py b/pretrain_baichuan.py
new file mode 100644
index 0000000000..f5fc978a2f
--- /dev/null
+++ b/pretrain_baichuan.py
@@ -0,0 +1,122 @@
+# Copyright (c) 2023, ALIBABA CORPORATION. All rights reserved.
+ +"""Pretrain Baichuan""" +import os + +import torch +from functools import partial +from megatron import get_args +from megatron import print_rank_0 +from megatron import get_timers +from megatron import get_tokenizer +from megatron.core import tensor_parallel +from megatron.core.enums import ModelType +from megatron.data.gpt_dataset import build_train_valid_test_datasets +from megatron.model import BaichuanModel +from megatron.training import pretrain +from megatron.utils import get_ltor_masks_and_position_ids +from megatron.utils import average_losses_across_data_parallel_group + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + + print_rank_0('building Baichuan model ...') + model = BaichuanModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process + ) + return model + + +def get_batch(data_iterator): + """Generate a batch""" + args = get_args() + tokenizer = get_tokenizer() + + # Items and their type. + keys = ['text'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = tensor_parallel.broadcast_data(keys, data, datatype) + + # Unpack. + tokens_ = data_b['text'].long() + labels = tokens_[:, 1:].contiguous() + tokens = tokens_[:, :-1].contiguous() + + # Get the masks and postition ids. + attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( + tokens, + tokenizer.eod, + args.reset_position_ids, + args.reset_attention_mask, + args.eod_mask_loss) + + return tokens, labels, loss_mask, attention_mask, position_ids + +def loss_func(loss_mask, output_tensor): + loss = output_tensor.float() + + # Reduce loss for logging. + averaged_loss = average_losses_across_data_parallel_group([loss]) + + return loss, {'lm loss': averaged_loss[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + timers = get_timers() + + # Get the batch. 
+ timers('batch-generator', log_level=2).start() + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for Baichuan ...') + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + seq_length=args.seq_length, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + train_data_prefix=args.train_data_path, + valid_data_prefix=args.valid_data_path, + test_data_prefix=args.test_data_path) + print_rank_0("> finished creating Baichuan datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1' + # os.environ['NCCL_NET_GDR_READ'] = '0' + # os.environ['NCCL_NET_GDR_LEVEL'] = '0' + os.environ['NCCL_MIN_NCHANNELS'] = '16' + + pretrain(train_valid_test_datasets_provider, model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'} + ) From 9ffe45ca431cbbdc1e62a5e917f09a55499926a2 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:12:14 +0800 Subject: [PATCH 03/48] Create baichuan_model.py --- megatron/model/baichuan_model.py | 169 +++++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 megatron/model/baichuan_model.py diff --git a/megatron/model/baichuan_model.py b/megatron/model/baichuan_model.py new file mode 100644 index 0000000000..0ffc016b2f --- /dev/null +++ b/megatron/model/baichuan_model.py @@ -0,0 +1,169 @@ +# Copyright (c) 2023, ALIBABA CORPORATION. All rights reserved. + + +"""Baichuan model.""" + +import torch + +from megatron import get_args +from megatron.core import tensor_parallel +from .module import MegatronModule + +from .enums import AttnMaskType +from .language_model import parallel_lm_logits +from .language_model import get_language_model +from .utils import init_method_normal +from .utils import scaled_init_method_normal + + +def post_language_model_processing(lm_output, labels, logit_weights, + parallel_output, + fp16_lm_cross_entropy): + + # Output. 
Format [s b h] + output = parallel_lm_logits( + lm_output, + logit_weights, + parallel_output) + + if labels is None: + # [s b h] => [b s h] + return output.transpose(0,1).contiguous() + else: + # [b s] => [s b] + labels = labels.transpose(0,1).contiguous() + if fp16_lm_cross_entropy: + assert output.dtype == torch.half + loss = tensor_parallel.vocab_parallel_cross_entropy(output, labels) + else: + loss = tensor_parallel.vocab_parallel_cross_entropy(output.float(), labels) + + # [s b] => [b, s] + loss = loss.transpose(0,1).contiguous() + return loss + + +class BaichuanModel(MegatronModule): + """Baichuan Language model.""" + + def __init__(self, + num_tokentypes=0, + parallel_output=True, + pre_process=True, + post_process=True): + args = get_args() + super(BaichuanModel, self).__init__(share_word_embeddings=not args.untie_embeddings_and_output_weights) + + self.parallel_output = parallel_output + self.pre_process = pre_process + self.post_process = post_process + self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy + self.untie_embeddings_and_output_weights = args.untie_embeddings_and_output_weights + self.sequence_parallel = args.sequence_parallel + self.padded_vocab_size = args.padded_vocab_size + + self.language_model, self._language_model_key = get_language_model( + num_tokentypes=num_tokentypes, + add_pooler=False, + encoder_attn_mask_type=AttnMaskType.causal, + init_method=init_method_normal(args.init_method_std), + scaled_init_method=scaled_init_method_normal(args.init_method_std, + args.num_layers), + pre_process=self.pre_process, + post_process=self.post_process) + + self.causal_lm = args.causal_lm + + if not args.untie_embeddings_and_output_weights and not self.causal_lm: + self.initialize_word_embeddings(init_method_normal) + + if self.causal_lm and self.post_process: + self.lm_head = torch.nn.Linear(args.hidden_size, args.padded_vocab_size, bias=False) + + def set_input_tensor(self, input_tensor): + """See megatron.model.transformer.set_input_tensor()""" + self.language_model.set_input_tensor(input_tensor) + + def _causal_lm_process(self, lm_output, labels): + if self.sequence_parallel: + lm_output = tensor_parallel.gather_from_sequence_parallel_region(lm_output, False) + lm_output = lm_output.transpose(0, 1) + logits = self.lm_head(lm_output) + + if labels is None: + return logits + else: + loss = None + # [invalid] Shift so that tokens < n predict n + # Do not need to shift here + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., :-1].contiguous() + # Flatten the tokens + loss_fct = torch.nn.CrossEntropyLoss(ignore_index=0) + shift_logits = shift_logits.view(-1, self.padded_vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + return loss + + def forward(self, input_ids, position_ids, attention_mask, + ret_input_ids=None, ret_position_ids=None, ret_attn_mask=None, + labels=None, tokentype_ids=None, inference_params=None): + lm_output = self.language_model( + input_ids, + position_ids, + attention_mask, + ret_input_ids=ret_input_ids, + ret_position_ids=ret_position_ids, + ret_attn_mask=ret_attn_mask, + inference_params=inference_params) + + if self.post_process: + if self.causal_lm: + return self._causal_lm_process(lm_output=lm_output, labels=labels) + else: + return post_language_model_processing( + lm_output, labels, + self.language_model.output_layer.weight if self.untie_embeddings_and_output_weights else 
self.word_embeddings_weight(), + self.parallel_output, + self.fp16_lm_cross_entropy) + else: + return lm_output + + def state_dict_for_save_checkpoint(self, prefix='', keep_vars=False): + + state_dict_ = {} + state_dict_[self._language_model_key] \ + = self.language_model.state_dict_for_save_checkpoint( + prefix=prefix, keep_vars=keep_vars) + # Save word_embeddings. + if (self.post_process + and not self.pre_process + and not self.untie_embeddings_and_output_weights + and not self.causal_lm): + state_dict_[self._word_embeddings_for_head_key] \ + = self.word_embeddings.state_dict(prefix=prefix, + keep_vars=keep_vars) + if self.post_process and self.causal_lm: + state_dict_['lm_head'] = self.lm_head.state_dict() + + return state_dict_ + + def load_state_dict(self, state_dict, strict=True): + """Customized load.""" + + if self.causal_lm and self.post_process: + self.lm_head.load_state_dict(state_dict['lm_head'], strict=strict) + + # Load word_embeddings. + if self.post_process and \ + not self.pre_process \ + and not self.untie_embeddings_and_output_weights \ + and not self.causal_lm: + self.word_embeddings.load_state_dict( + state_dict[self._word_embeddings_for_head_key], strict=strict) + if self._language_model_key in state_dict: + state_dict = state_dict[self._language_model_key] + self.language_model.load_state_dict(state_dict, strict=strict) From 852f44ba9f6c4bf8c2e43f98d17d6ac0bcf10829 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:15:52 +0800 Subject: [PATCH 04/48] Create baichuan_checkpoint_conversion.py --- .../baichuan_checkpoint_conversion.py | 876 ++++++++++++++++++ 1 file changed, 876 insertions(+) create mode 100644 tools/checkpoint_conversion/baichuan_checkpoint_conversion.py diff --git a/tools/checkpoint_conversion/baichuan_checkpoint_conversion.py b/tools/checkpoint_conversion/baichuan_checkpoint_conversion.py new file mode 100644 index 0000000000..292b241000 --- /dev/null +++ b/tools/checkpoint_conversion/baichuan_checkpoint_conversion.py @@ -0,0 +1,876 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import shutil +import json +import os +import re +import sys +import types + +import torch + +from transformers import AutoConfig +from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint + + +def add_checkpointing_args(parser): + parser.add_argument("--megatron-path", type=str, default=None, help="Base directory of Megatron repository") + parser.add_argument( + "--convert_checkpoint_from_megatron_to_transformers", + action="store_true", + help=( + "If True, convert a Megatron checkpoint to a Transformers checkpoint. " + "If False, convert a Transformers checkpoint to a Megatron checkpoint." 
+ ), + ) + parser.add_argument( + "--load_path", + type=str, + required=True, + help="Path to the checkpoint to convert.", + ) + parser.add_argument( + "--save_path", + type=str, + required=True, + help="Path to the converted checkpoint.", + ) + parser.add_argument( + "--config_path", + type=str, + help="Path to the converted checkpoint.", + default="", + ) + parser.add_argument("--print-checkpoint-structure", action="store_true") + + return parser + + +def add_megatron_checkpoint_args(parser): + parser.add_argument( + "--target_tensor_model_parallel_size", + type=int, + default=1, + help=( + "The tensor model parallel size of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--target_pipeline_model_parallel_size", + type=int, + default=1, + help=( + "The pipeline model parallel size of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--target_data_parallel_size", + type=int, + default=1, + help=( + "The data parallel size of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--target_params_dtype", + type=str, + default="fp32", + help=( + "The dtype of the converted checkpoint. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--make_vocab_size_divisible_by", + type=int, + default=128, + help=( + "Pad the vocab size to be divisible by this value. " + "This is added for computational efficieny reasons. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + parser.add_argument( + "--use_distributed_optimizer", + action="store_true", + help=( + "If True, use the distributed optimizer. " + "Only used when converting a Transformers checkpoint to a Megatron checkpoint." + ), + ) + return parser + + +def add_transformers_checkpoint_args(parser): + parser.add_argument( + "--tokenizer_name", + type=str, + default=None, + help=( + "The name of the pre-trained tokenizer to save. " + "If not None, the tokenizer will be saved. " + "Only used when converting a Megatron checkpoint to a Transformers checkpoint." + ), + ) + parser.add_argument( + "--max_shard_size", + type=str, + default="60GB", + help=( + "The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size " + "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). " + "Only used when converting a Megatron checkpoint to a Transformers checkpoint." + ), + ) + + return parser + + +# The simple map of names for "automated" rules. 
+megatron_to_transformers = { + "self_attention.dense": ".self_attn.o_proj.", + "mlp.dense_4h_to_h": ".mlp.down_proj.", +} + +tensor_parallel_params = [ + # megatron-lm layers to merge across tp ranks + "self_attention.query_key_value.weight", + "self_attention.query_key_value.bias", + "self_attention.dense.weight", + "mlp.dense_h_to_4h.weight", + "mlp.dense_h_to_4h.bias", + "mlp.dense_4h_to_h.weight", + # deprecated + "attention.query_key_value.weight", + "attention.query_key_value.bias", + "attention.dense.weight", + # transformers layers to split across tp ranks + "attn.c_attn.weight", + "attn.c_attn.bias", + "attn.c_proj.weight", + "mlp.c_fc.weight", + "mlp.c_fc.bias", + "mlp.c_proj.weight", + 'self_attn.q_proj.weight', + 'self_attn.k_proj.weight', + 'self_attn.v_proj.weight', + 'self_attn.o_proj.weight', + 'mlp.down_proj.weight', + 'mlp.up_proj.weight', + 'mlp.gate_proj.weight', + 'self_attn.W_pack.weight' +] + + +def recursive_print(name, val, spaces=0): + """ + Recursively print the structure of a checkpoint. This function is taken from `convert_megatron_gpt2_checkpoint.py` + + Args: + name (str): the name of the current tensor parameter + val (Tuple(int)): the shape of the current tensor parameter + spaces (int): the number of spaces to print before the output for a nested structure + """ + # Format the message. + if name is None: + msg = None + else: + fmt = "." * max(0, spaces - 2) + "# {:" + str(50 - spaces) + "s}" + msg = fmt.format(name) + + # Print and recurse (if needed). + if isinstance(val, dict): + if msg is not None: + print(msg) + for k in val.keys(): + recursive_print(k, val[k], spaces + 2) + elif isinstance(val, torch.Tensor): + print(msg, ":", val.size()) + else: + print(msg, ":", val) + + +def merge_transformers_sharded_states(path, num_checkpoints): + """ + Merge sharded checkpoints from transformers into a single checkpoint. + + Args: + path (str): the path to the sharded checkpoints + num_checkpoints (int): the number of checkpoints to merge + """ + state_dict = {} + for i in range(1, num_checkpoints + 1): + print('loading', i, ':', num_checkpoints + 1) + checkpoint_path = os.path.join(path, f"pytorch_model-{i:05d}-of-{num_checkpoints:05d}.bin") + if not os.path.exists(checkpoint_path): + checkpoint_path = os.path.join(path, f"pytorch_model-{i}-of-{num_checkpoints}.bin") + assert os.path.exists(checkpoint_path), f"Cannot find checkpoint {checkpoint_path}" + current_chunk = torch.load(checkpoint_path, map_location="cpu") + state_dict.update(current_chunk) + return state_dict + + +def get_megatron_sharded_states(load_path, tp_size, pp_size, pp_rank): + """ + Get sharded checkpoints from NVIDIA Megatron-LM checkpoint based on the provided tensor parallel size, pipeline + parallel size and pipeline parallel rank. 
+ + Args: + args (argparse.Namespace): the arguments to the script + tp_size (int): the tensor parallel size + pp_size (int): the pipeline parallel size + pp_rank (int): the pipeline parallel rank + """ + tp_state_dicts = [] + for i in range(tp_size): + possible_sub_dir_names = [ + f"mp_rank_{i:02d}" if pp_size == 1 else f"mp_rank_{i:02d}_{pp_rank:03d}", + f"mp_rank_{i:02d}_dp_000" if pp_size == 1 else f"mp_rank_{i:02d}_{pp_rank:03d}_dp_000" + ] + sub_dir_name = None + for p in possible_sub_dir_names: + if os.path.exists(os.path.join(load_path, p)): + sub_dir_name = p + break + assert sub_dir_name is not None, f"Cannot find sub dir in {possible_sub_dir_names}" + checkpoint_path = os.path.join(load_path, sub_dir_name, 'model_optim_rng.pt') + state_dict = torch.load(checkpoint_path, map_location="cpu") + tp_state_dicts.append(state_dict) + return tp_state_dicts + + +def get_element_from_dict_by_path(d, path): + """ + Get element from dictionary by path. If element is not present, recursively add empty dictionaries. + + Args: + d (dict): the dictionary to get the element from + path (list): the path to the element which is delimited by "." + """ + path = path.split(".") + for k in path: + if k not in d: + d[k] = {} + d = d[k] + return d + + +def copy_tokenizer(args): + os.makedirs(args.save_path, exist_ok=True) + tokenizer_dir = args.load_path + if os.path.exists(os.path.join(args.load_path, 'tokenizer')): + tokenizer_dir = os.path.join(args.load_path, 'tokenizer') + file_list = os.listdir(tokenizer_dir) + for f in file_list: + if 'token' in f: + shutil.copyfile(os.path.join(tokenizer_dir, f), os.path.join(args.save_path, f)) + + +def convert_checkpoint_from_megatron_to_transformers(args): + """ + Convert NVIDIA Megatron-LM checkpoint to HuggingFace Transformers checkpoint. This handles Megatron checkpoints + with different tensor parallelism and pipeline parallelism sizes. It saves the converted checkpoint into shards + using HuggingFace Transformers checkpoint sharding functionality. 
+ + Args: + args (argparse.Namespace): the arguments to the script + """ + # Search in directory above this + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + if args.megatron_path is not None: + sys.path.insert(0, args.megatron_path) + + # Load Megatron-LM checkpoint arguments from the state dict + sub_dirs = os.listdir(args.load_path) + release = False + if 'latest_checkpointed_iteration.txt' in sub_dirs: + with open(os.path.join(args.load_path, 'latest_checkpointed_iteration.txt')) as f: + latest_ckpt = f.readline().strip() + print(f"latest checkpoint: {latest_ckpt}") + if isinstance(latest_ckpt, bytearray): + latest_ckpt = latest_ckpt.decode("utf-8") + try: + iteration = int(latest_ckpt) + except ValueError: + release = (latest_ckpt == "release") + if not release: + raise ValueError(f"Invalid latest checkpoint: {latest_ckpt}") + for sub_dir in sub_dirs: + if latest_ckpt in sub_dir: + latest_ckpt = sub_dir + break + else: + raise ValueError('Cannot find latest ckpt!') + possible_state_paths = [os.path.join(args.load_path, latest_ckpt), + os.path.join(args.load_path, latest_ckpt, + 'iter_' + str(iteration) if not release else 'release')] + state_path = None + for p in possible_state_paths: + if os.path.exists(p): + state_path = p + print(f"Loading Megatron-LM checkpoint arguments from: {state_path}") + break + assert state_path is not None, f"Cannot find state path in {possible_state_paths}" + possible_sub_dirs = ["mp_rank_00", "mp_rank_00_000", "mp_rank_00_dp_000", "mp_rank_00_000_dp_000"] + state_dirs = os.listdir(state_path) + for sub_dir in possible_sub_dirs: + if sub_dir in state_dirs: + rank0_checkpoint_path = os.path.join(state_path, sub_dir, 'model_optim_rng.pt') + break + print(f"Loading Megatron-LM checkpoint arguments from: {rank0_checkpoint_path}") + state_dict = torch.load(rank0_checkpoint_path, map_location="cpu") + megatron_args = state_dict.get("args", None) + if megatron_args is None: + raise ValueError( + "Megatron-LM checkpoint does not contain arguments. This utility only supports Megatron-LM checkpoints" + " containing all the megatron arguments. This is because it loads all config related to model" + " architecture, the tensor and pipeline model parallel size from the checkpoint insead of user having to" + " manually specify all the details. Please save Megatron-LM checkpoint along with all the megatron" + " arguments to use this utility." 
+ ) + + # Create Transformers GPT2 config from Megatron-LM arguments + vocab_size = megatron_args.padded_vocab_size + + # params dtype + if args.target_params_dtype == "fp16": + dtype = torch.float16 + elif args.target_params_dtype == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + + # config = AutoConfig.from_pretrained(args.config_path) + config = AutoConfig.from_pretrained(args.config_path, trust_remote_code=True) + config.bos_token_id = 1 + config.eos_token_id = 2 + config.hidden_act = 'silu' + config.hidden_size = megatron_args.hidden_size + config.intermediate_size = megatron_args.ffn_hidden_size + config.initializer_range = 0.02 + config.model_max_length = megatron_args.max_position_embeddings + config.model_type = 'baichuan' + config.num_attention_heads = megatron_args.num_attention_heads + config.num_hidden_layers = megatron_args.num_layers + config.pad_token_id = 0 + config.rms_norm_eps = 1e-6 + config.torch_dtype = 'bfloat16' + config.transformers_version = '4.29.2' + config.use_cache = True + config.vocab_size = vocab_size + config.architectures = ["BaichuanForCausalLM"] + + output_state_dict = {} + + tp_size = megatron_args.tensor_model_parallel_size + pp_size = megatron_args.pipeline_model_parallel_size + + # The regex to extract layer names. + layer_re = re.compile(r"layers\.(\d+)\.([a-z0-9_.]+)\.([a-z]+)") + + # Convert. + print("Converting") + + # Embeddings + print("Converting embeddings") + tp_state_dicts = get_megatron_sharded_states(state_path, tp_size, pp_size, 0) + + # Convert and store the word embeddings. + word_embeddings = torch.cat( + [ + get_element_from_dict_by_path( + tp_state_dicts[tp_rank], "model.language_model.embedding.word_embeddings.weight" + ) + for tp_rank in range(tp_size) + ], + dim=0, + ) + word_embeddings = word_embeddings[:vocab_size].to(dtype).clone().detach().contiguous() + output_state_dict["model.embed_tokens.weight"] = word_embeddings + + # Transformer Layers + print("Converting transformer layers") + # The hidden_size per head. + hidden_size_per_head = config.hidden_size // config.num_attention_heads + num_layers = config.num_hidden_layers // pp_size + layer_idx = 0 + for pp_rank in range(pp_size): + if pp_size > 0: + print(f"Converting pipeline parallel rank {pp_rank}") + tp_state_dicts = get_megatron_sharded_states(state_path, tp_size, pp_size, pp_rank) + # The transformer. + path = "model.language_model.encoder" + + # Extract the layers. + for key, val in get_element_from_dict_by_path(tp_state_dicts[0], path).items(): + # Match the name. + m = layer_re.match(key) + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + layer_idx = int(m.group(1)) + pp_rank * num_layers + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? + weight_or_bias = m.group(3) + # The name of the layer. + layer_name = f"model.layers.{layer_idx}" + + if op_name + "." + weight_or_bias not in tensor_parallel_params: + params = val.to(dtype) + else: + dim = 1 if op_name in ["self_attention.dense", "mlp.dense_4h_to_h"] else 0 + params = torch.cat( + [val] + + [ + get_element_from_dict_by_path(tp_state_dicts[tp_rank], f"{path}")[key] + for tp_rank in range(1, tp_size) + ], + dim=dim, + ).to(dtype) + + # For layernorm(s), simply store the layer norm. + if op_name.endswith("layernorm"): + ln_name = "input_layernorm" if op_name.startswith("input") else "post_attention_layernorm" + output_state_dict[layer_name + "." + ln_name + "." 
+ weight_or_bias] = params + + # Split QKV packed weights + elif op_name == "self_attention.query_key_value" and weight_or_bias == "weight": + params_per_tp = params.chunk(dim=0, chunks=megatron_args.tensor_model_parallel_size) + q = torch.empty(0) + k = torch.empty(0) + v = torch.empty(0) + for t in params_per_tp: + qp, kp, vp = t.chunk(3) + q = torch.cat([q, qp]) + k = torch.cat([k, kp]) + v = torch.cat([v, vp]) + output_state_dict[layer_name + ".self_attn.W_pack.weight"] = torch.cat([q, k, v], dim=0).to(dtype).clone().detach().contiguous() + # output_state_dict[layer_name + ".self_attn.q_proj.weight"] = q.to(dtype).clone().detach().contiguous() + # output_state_dict[layer_name + ".self_attn.k_proj.weight"] = k.to(dtype).clone().detach().contiguous() + # output_state_dict[layer_name + ".self_attn.v_proj.weight"] = v.to(dtype).clone().detach().contiguous() + + elif op_name == "mlp.dense_h_to_4h" and weight_or_bias == "weight": + params_per_tp = params.chunk(dim=0, chunks=megatron_args.tensor_model_parallel_size) + gate = torch.empty(0) + up = torch.empty(0) + for t in params_per_tp: + gatep, upp = t.chunk(2) + gate = torch.cat([gate, gatep]) + up = torch.cat([up, upp]) + output_state_dict[layer_name + ".mlp.gate_proj.weight"] = gate.to(dtype).clone().detach().contiguous() + output_state_dict[layer_name + ".mlp.up_proj.weight"] = up.to(dtype).clone().detach().contiguous() + + # Transpose the weights. + elif weight_or_bias == "weight": + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "weight"] = params + + # Copy the bias. + elif weight_or_bias == "bias": + out_name = megatron_to_transformers[op_name] + output_state_dict[layer_name + out_name + "bias"] = params + + # rotary_base = 10000 + # inv_freq = 1.0 / (rotary_base ** (torch.arange(0, hidden_size_per_head, 2).float() / hidden_size_per_head)) + # output_state_dict[layer_name + '.self_attn.rotary_emb.inv_freq'] = inv_freq + + if config.num_hidden_layers != (layer_idx + 1): + raise ValueError(f"Expected {config.num_hidden_layers} layers but found {layer_idx + 1}") + + # The final layernorm. + print("Converting final layernorm") + params = get_element_from_dict_by_path(tp_state_dicts[0], str(path)) + output_state_dict["model.norm.weight"] = params["final_layernorm.weight"].to(dtype) + + # For LM head, transformers' wants the matrix to weight embeddings. + print("Converting LM head") + output_state_dict["lm_head.weight"] = state_dict['model']['lm_head']['weight'].to(dtype) + + # It should be done! + print("Conversion from Megatron-LM to Transformers is done!") + + # Print the structure of converted state dict. + if args.print_checkpoint_structure: + recursive_print(None, output_state_dict) + + # Save tokenizer based on args + copy_tokenizer(args=args) + + # Store the config to file. + print("Saving config") + config.save_pretrained(args.save_path) + + # Store the state_dict to file. 
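+    # A value such as "60GB" is forwarded to shard_checkpoint() unchanged;
+    # an all-digit string such as "10000000000" is interpreted as a byte count.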
+ max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size + shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size) + + # Save the model + for shard_file, shard in shards.items(): + torch.save(shard, os.path.join(args.save_path, shard_file)) + + if index is None: + print(f"Model weights saved in {os.path.join(args.save_path, WEIGHTS_NAME)}") + else: + save_index_file = os.path.join(args.save_path, WEIGHTS_INDEX_NAME) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + print( + f"The model is bigger than the maximum size per checkpoint ({args.max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + + +def convert_checkpoint_from_transformers_to_megatron(args): + """ + Convert a checkpoint from HuggingFace Transformers to Megatron-LM. This allows converted checkpoints with variable + tensor parallelism and pipeline parallelism sizes. It takes as input a checkpoint from HuggingFace Transformers + which can have multiple shards. + + Args: + args (argparse.Namespace): the arguments to the script + + """ + os.makedirs(args.save_path, exist_ok=True) + # Search in directory above this + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + if args.megatron_path is not None: + print("args.megatron_path", args.megatron_path) + sys.path.insert(0, args.megatron_path) + try: + from megatron.tokenizer.tokenizer import _vocab_size_with_padding + from megatron.fs_utils import create_read_file_system + # except ModuleNotFoundError: + except Exception as e: + print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. 
Exiting.") + print(e) + exit(1) + + # load the transformers model state dict and config + sub_dirs = [x for x in os.listdir(args.load_path) if x.startswith("pytorch_model")] + if len(sub_dirs) == 1: + checkpoint_name = "pytorch_model.bin" + state_dict = torch.load(os.path.join(args.load_path, checkpoint_name), map_location="cpu") + else: + num_checkpoints = len(sub_dirs) - 1 + state_dict = merge_transformers_sharded_states(args.load_path, num_checkpoints) + + config = AutoConfig.from_pretrained(args.load_path, trust_remote_code=True) + + # Saving the tracker file + tracker_filepath = os.path.join(args.save_path, "latest_checkpointed_iteration.txt") + with open(tracker_filepath, "w") as f: + f.write("release") + + # create `release` dir in args.load_path + release_dir = os.path.join(args.save_path, "release") + os.makedirs(release_dir, exist_ok=True) + + # megatron args + megatron_args = { + "orig_vocab_size": config.vocab_size, + "max_position_embeddings": config.model_max_length, + "hidden_size": config.hidden_size, + "num_layers": config.num_hidden_layers, + "num_attention_heads": config.num_attention_heads, + "ffn_hidden_size": config.intermediate_size, + "tensor_model_parallel_size": args.target_tensor_model_parallel_size, + "pipeline_model_parallel_size": args.target_pipeline_model_parallel_size, + "data_parallel_size": args.target_data_parallel_size, + "make_vocab_size_divisible_by": args.make_vocab_size_divisible_by, + "rank": 0, + "tokenizer_type": "GPT2BPETokenizer", + } + + margs = types.SimpleNamespace() + for k, v in megatron_args.items(): + setattr(margs, k, v) + + # params dtype + if args.target_params_dtype == "fp16": + dtype = torch.float16 + elif args.target_params_dtype == "bf16": + dtype = torch.bfloat16 + else: + dtype = torch.float32 + setattr(margs, "params_dtype", dtype) + + # save dummy optim state dict + dummy_optim_state_dict = {} + dummy_optim_state_dict["optimizer"] = { + "step": 0, + "param_groups": [ + { + "lr": 0.0, + "beta1": 0.0, + "beta2": 0.0, + "eps": 0.0, + "weight_decay": 0.0, + "correct_bias": False, + "params": [], + } + ], + } + if args.use_distributed_optimizer: + for i in range(args.target_pipeline_model_parallel_size): + for j in range(args.target_tensor_model_parallel_size): + for k in range(args.target_data_parallel_size): + if args.target_pipeline_model_parallel_size == 1: + checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}" + else: + checkpoint_dir = f"mp_rank_{j:02d}_{i:03d}_{k:03d}" + checkpoint_dir = os.path.join(release_dir, checkpoint_dir) + os.makedirs(checkpoint_dir, exist_ok=True) + torch.save( + dummy_optim_state_dict, + os.path.join(checkpoint_dir, "optim.pt"), + ) + + # Convert. 
+ print("Converting") + output_state_dict = [] + for i in range(args.target_tensor_model_parallel_size): + output_state_dict.append({}) + + # Embedding layer + print("converting embedding layer") + # pos_embedding = state_dict["transformer.wpe.weight"].to(dtype) + word_embedding = state_dict["model.embed_tokens.weight"].to(dtype) + orig_vocab_size = config.vocab_size + padded_vocab_size = _vocab_size_with_padding(orig_vocab_size, margs) + setattr(margs, "padded_vocab_size", padded_vocab_size) + # Cut out extra padding we don't need + if orig_vocab_size > padded_vocab_size: + full_word_embed = word_embedding[0:padded_vocab_size, :] + # Expanding embedding to larger size by replicating final entry + elif orig_vocab_size < padded_vocab_size: + padding_size = padded_vocab_size - orig_vocab_size + full_word_embed = torch.cat((word_embedding, word_embedding[-1].unsqueeze(0).expand(padding_size, -1))) + # Same size! + else: + full_word_embed = word_embedding + + # Split into new tensor model parallel sizes + out_word_embed = torch.chunk(full_word_embed, args.target_tensor_model_parallel_size, dim=0) + for i in range(args.target_tensor_model_parallel_size): + word_emb_dict = get_element_from_dict_by_path( + output_state_dict[i], "model.language_model.embedding.word_embeddings" + ) + word_emb_dict["weight"] = out_word_embed[i] + + # Transformer layers + print("converting transformer layers") + if config.num_hidden_layers % args.target_tensor_model_parallel_size != 0: + raise ValueError( + f"Number of layers ({config.num_hidden_layers}) must be divisible by number of tensor parallelism" + f" ({args.target_tensor_model_parallel_size})" + ) + num_layers = config.num_hidden_layers // args.target_pipeline_model_parallel_size + + layer_re = re.compile(r"model.layers\.(\d+)\.([a-zA-Z0-9_.]+)\.([A-Za-z]+)") + # The number of heads. + heads = config.num_attention_heads + # The hidden_size per head. + hidden_size_per_head = config.hidden_size // config.num_attention_heads + weight_or_bias = "weight" + print("args.target_pipeline_model_parallel_size", args.target_pipeline_model_parallel_size) + for pp_rank in range(args.target_pipeline_model_parallel_size): + layer_offset = pp_rank * num_layers + if pp_rank > 0: + output_state_dict = [] + for i in range(args.target_tensor_model_parallel_size): + output_state_dict.append({}) + print("num_layers", num_layers) + for layer in range(num_layers): + pp_layer_id = layer + layer_offset + layers_to_copy = [ + layer_name + for layer_name in state_dict.keys() + if layer_name.startswith(f"model.layers.{pp_layer_id}.") + ] + qkv_weight_to_combine = {} + mlp_weight_to_combine = {} + for layer_name in layers_to_copy: + m = layer_re.match(layer_name) + # Stop if that's not a layer + if m is None: + break + + # The index of the layer. + _ = int(m.group(1)) + # The name of the operation. + op_name = m.group(2) + # Is it a weight or a bias? 
+ weight_or_bias = m.group(3) + params = state_dict[layer_name].to(dtype) + # handle layernorm + if op_name.endswith("layernorm"): + # out_name = "input_layernorm" if op_name.endswith("1") else "post_attention_layernorm" + out_name = op_name + layer_name = f"layers.{layer}.{out_name}.{weight_or_bias}" + + elif 'self_attn.o_proj' in op_name and weight_or_bias == 'weight': + layer_name = f"layers.{layer}.self_attention.dense.{weight_or_bias}" + + # handle attention K, V, Q weights + elif 'self_attn.W_pack' in op_name and weight_or_bias == 'weight': + params_tmp = params.unflatten(0, (3, config.hidden_size)) + q_weights = params_tmp[0].chunk(args.target_tensor_model_parallel_size, dim=0) + k_weights = params_tmp[1].chunk(args.target_tensor_model_parallel_size, dim=0) + v_weights = params_tmp[2].chunk(args.target_tensor_model_parallel_size, dim=0) + result_weights = [] + for idx in range(len(q_weights)): + partition_weight = torch.cat([q_weights[idx], k_weights[idx], v_weights[idx]]) + print("partition_weight", partition_weight.shape) + result_weights.append(partition_weight) + + params = torch.cat(result_weights) + layer_name = f"layers.{layer}.self_attention.query_key_value.{weight_or_bias}" + # elif op_name.startswith("self_attn") and weight_or_bias == "weight": + # # transformers stores D X (3*D) but Megatron-LM expects (3*D) X D. + # # params = params.transpose(0, 1).contiguous() + # assert (len(qkv_weight_to_combine) != 3) + # if 'q_proj' in op_name: + # qkv_weight_to_combine['q_proj'] = params + # elif 'k_proj' in op_name: + # qkv_weight_to_combine['k_proj'] = params + # elif 'v_proj' in op_name: + # qkv_weight_to_combine['v_proj'] = params + # + # if len(qkv_weight_to_combine) == 3: + # q_weights = qkv_weight_to_combine['q_proj'].chunk(args.target_tensor_model_parallel_size, dim=0) + # k_weights = qkv_weight_to_combine['k_proj'].chunk(args.target_tensor_model_parallel_size, dim=0) + # v_weights = qkv_weight_to_combine['v_proj'].chunk(args.target_tensor_model_parallel_size, dim=0) + # result_weights = [] + # for idx in range(len(q_weights)): + # partition_weight = torch.cat([q_weights[idx], k_weights[idx], v_weights[idx]]) + # result_weights.append(partition_weight) + # + # params = torch.cat(result_weights) + # layer_name = f"layers.{layer}.self_attention.query_key_value.{weight_or_bias}" + # else: + # continue + + elif op_name.startswith("mlp") and weight_or_bias == "weight": + if 'down_proj' in op_name: + layer_name = f"layers.{layer}.mlp.dense_4h_to_h.{weight_or_bias}" + elif 'gate_proj' in op_name: + assert (len(mlp_weight_to_combine) != 2) + mlp_weight_to_combine['gate_proj'] = params + elif 'up_proj' in op_name: + assert (len(mlp_weight_to_combine) != 2) + mlp_weight_to_combine['up_proj'] = params + + if 'down_proj' not in op_name and len(mlp_weight_to_combine) == 2: + gate_weights = mlp_weight_to_combine['gate_proj'].chunk(args.target_tensor_model_parallel_size, + dim=0) + up_weights = mlp_weight_to_combine['up_proj'].chunk(args.target_tensor_model_parallel_size, + dim=0) + result_weights = [] + for idx in range(len(gate_weights)): + partition_weight = torch.cat([gate_weights[idx], up_weights[idx]]) + result_weights.append(partition_weight) + + params = torch.cat(result_weights) + layer_name = f"layers.{layer}.mlp.dense_h_to_4h.{weight_or_bias}" + elif 'down_proj' not in op_name and len(mlp_weight_to_combine) < 2: + continue + + else: + continue + + if op_name + "." 
+ weight_or_bias in tensor_parallel_params: + dim = 1 if op_name in [ + "self_attn.o_proj", "mlp.down_proj"] else 0 + params = torch.chunk( + params, args.target_tensor_model_parallel_size, dim=dim) + + for i in range(args.target_tensor_model_parallel_size): + params_dict = get_element_from_dict_by_path( + output_state_dict[i], "model.language_model.encoder") + params_dict[layer_name] = ( + params[i].clone().detach().contiguous() if ( + op_name + "." + weight_or_bias in tensor_parallel_params) + else params.clone().detach().contiguous() + ) + + if pp_rank == args.target_pipeline_model_parallel_size - 1: + # handle final layernorm + params = state_dict[f"model.norm.weight"].to(dtype) + layer_name = f"final_layernorm.{weight_or_bias}" + for i in range(args.target_tensor_model_parallel_size): + params_dict = get_element_from_dict_by_path( + output_state_dict[i], "model.language_model.encoder") + params_dict[layer_name] = params.clone().detach().contiguous() + + # add the LM head + for i in range(args.target_tensor_model_parallel_size): + params_dict = get_element_from_dict_by_path( + output_state_dict[i], "model.lm_head") + params_dict["weight"] = state_dict['lm_head.weight'].to( + dtype).clone().detach().contiguous() + + # saving the state dict as per the tp_rank and pp_rank + for tp_rank in range(args.target_tensor_model_parallel_size): + output_state_dict[tp_rank]["checkpoint_version"] = 3.0 + output_state_dict[tp_rank]["args"] = margs + checkpoint_dir = ( + f"mp_rank_{tp_rank:02d}" + if args.target_pipeline_model_parallel_size == 1 + else f"mp_rank_{tp_rank:02d}_{pp_rank:03d}" + ) + if args.use_distributed_optimizer: + checkpoint_name = "model_optim_rng.pt" + else: + checkpoint_name = "model_optim_rng.pt" + output_state_dict[tp_rank]["optimizer"] = dummy_optim_state_dict["optimizer"] + checkpoint_dir = os.path.join(release_dir, checkpoint_dir) + os.makedirs(checkpoint_dir, exist_ok=True) + checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name) + if args.print_checkpoint_structure: + print( + f"Checkpoint structure of model state dict shard belonging to TP rank {tp_rank} and PP rank" + f" {pp_rank}:" + ) + recursive_print(None, output_state_dict[tp_rank]) + torch.save(output_state_dict[tp_rank], checkpoint_path) + + copy_tokenizer(args=args) + + +def main(): + parser = argparse.ArgumentParser() + parser = add_checkpointing_args(parser) + parser = add_megatron_checkpoint_args(parser) + parser = add_transformers_checkpoint_args(parser) + args = parser.parse_args() + + if args.convert_checkpoint_from_megatron_to_transformers: + convert_checkpoint_from_megatron_to_transformers(args) + else: + convert_checkpoint_from_transformers_to_megatron(args) + + +if __name__ == "__main__": + main() From e08a890f126dabc597c073b0dce04c02aaaef9eb Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:18:17 +0800 Subject: [PATCH 05/48] Create baichuan_hf_to_megatron.sh --- tools/checkpoint_conversion/baichuan_hf_to_megatron.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tools/checkpoint_conversion/baichuan_hf_to_megatron.sh diff --git a/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh b/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh new file mode 100644 index 0000000000..5bcce3d175 --- /dev/null +++ b/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh @@ -0,0 +1,10 @@ +python baichuan_checkpoint_conversion.py \ +--load_path "/data/share_user/quyincen/megatron_model/hf_model/60132_model" \ +--save_path 
"/data/share_user/quyincen/megatron_model/mg_model" \ +--target_tensor_model_parallel_size 2 \ +--target_pipeline_model_parallel_size 1 \ +--target_data_parallel_size 4 \ +--target_params_dtype "bf16" \ +--make_vocab_size_divisible_by 1 \ +--print-checkpoint-structure \ +--megatron-path "/data/share_user/quyincen/Megatron-LLaMA" \ From 19460d8fde36bf59b184fa5723f488e9aaa69b9f Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:19:44 +0800 Subject: [PATCH 06/48] Update baichuan_hf_to_megatron.sh --- tools/checkpoint_conversion/baichuan_hf_to_megatron.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh b/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh index 5bcce3d175..9b021a6fce 100644 --- a/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh +++ b/tools/checkpoint_conversion/baichuan_hf_to_megatron.sh @@ -1,10 +1,10 @@ python baichuan_checkpoint_conversion.py \ ---load_path "/data/share_user/quyincen/megatron_model/hf_model/60132_model" \ ---save_path "/data/share_user/quyincen/megatron_model/mg_model" \ +--load_path "PATH_TO_CHECKPOINT_DOWNLOADED_FROM_HUGGINGFACE" \ +--save_path "PATH_TO_SAVE_CONVERTED_CHECKPOINT" \ --target_tensor_model_parallel_size 2 \ --target_pipeline_model_parallel_size 1 \ ---target_data_parallel_size 4 \ ---target_params_dtype "bf16" \ +--target_data_parallel_size 16 \ +--target_params_dtype "fp16" \ --make_vocab_size_divisible_by 1 \ --print-checkpoint-structure \ ---megatron-path "/data/share_user/quyincen/Megatron-LLaMA" \ +--megatron-path "PATH_TO_MEGATRON_SOURCE_CODE" From 642599f426796c7ec754d0901e66dbe9ec069466 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:21:04 +0800 Subject: [PATCH 07/48] Create baichuan_megatron_to_hf.sh --- tools/checkpoint_conversion/baichuan_megatron_to_hf.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 tools/checkpoint_conversion/baichuan_megatron_to_hf.sh diff --git a/tools/checkpoint_conversion/baichuan_megatron_to_hf.sh b/tools/checkpoint_conversion/baichuan_megatron_to_hf.sh new file mode 100644 index 0000000000..3f47fe7e77 --- /dev/null +++ b/tools/checkpoint_conversion/baichuan_megatron_to_hf.sh @@ -0,0 +1,8 @@ +python tools/checkpoint_conversion/baichuan_checkpoint_conversion.py \ +--convert_checkpoint_from_megatron_to_transformers \ +--load_path "PATH_TO_CHECKPOINT_GENERATED_BY_THIS_REPO" \ +--save_path "PATH_TO_SAVE_CONVERTED_CHECKPOINT" \ +--target_params_dtype "fp16" \ +--make_vocab_size_divisible_by 1 \ +--print-checkpoint-structure \ +--megatron-path "PATH_TO_MEGATRON_SOURCE_CODE" From af89fdac550172df48e7e6c760d12261fc40dda8 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:22:38 +0800 Subject: [PATCH 08/48] Update README.md --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4ca840392e..c5f6618983 100644 --- a/README.md +++ b/README.md @@ -88,16 +88,24 @@ You can use the same launching method as in [Megatron-LM Usage](./original_READM This tool helps convert the format of paramters between Megatron-LLaMA/Megatron-LM and Huggingface format. 
**HuggingFace to Megatron-LLaMA** - +For LLaMa: +``` +sh tools/checkpoint_conversion/hf_to_megatron.sh +``` +For Baichuan: ``` -sh tools/checkpoint_conversion/hf_to_megatron.sh +sh tools/checkpoint_conversion/baichuan_hf_to_megatron.sh ``` **Megatron-LLaMA to HuggingFace** - +For LLaMa: ``` sh tools/checkpoint_conversion/megatron_to_hf.sh ``` +For Baichuan: +``` +sh tools/checkpoint_conversion/baichuan_megatron_to_hf.sh +``` #### B. Launching scripts From 46ae7dde59106b8deaae180d19c903bbd0c8d03b Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:23:00 +0800 Subject: [PATCH 09/48] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c5f6618983..96f986c5cf 100644 --- a/README.md +++ b/README.md @@ -88,6 +88,7 @@ You can use the same launching method as in [Megatron-LM Usage](./original_READM This tool helps convert the format of paramters between Megatron-LLaMA/Megatron-LM and Huggingface format. **HuggingFace to Megatron-LLaMA** + For LLaMa: ``` sh tools/checkpoint_conversion/hf_to_megatron.sh @@ -98,6 +99,7 @@ sh tools/checkpoint_conversion/baichuan_hf_to_megatron.sh ``` **Megatron-LLaMA to HuggingFace** + For LLaMa: ``` sh tools/checkpoint_conversion/megatron_to_hf.sh From eb1a2580bf347dbb337fb24d8f2aec2c0a919177 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:27:06 +0800 Subject: [PATCH 10/48] Update README_zh.md --- README_zh.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README_zh.md b/README_zh.md index be02c94214..88965302ca 100644 --- a/README_zh.md +++ b/README_zh.md @@ -85,17 +85,27 @@ Megatron-LLaMA使用方式与Megatron-LM基本一致,详细信息请参考[Meg **HuggingFace to Megatron-LLaMA** +LLama: ``` sh tools/checkpoint_conversion/hf_to_megatron.sh ``` +Baichuan: +``` +sh tools/checkpoint_conversion/baichuan_hf_to_megatron.sh +``` 完成训练后,将训练产出的权重转换成HuggingFace支持的格式,方便后续使用: **Megatron-LLaMA to HuggingFace** +LLama: ``` sh tools/checkpoint_conversion/megatron_to_hf.sh ``` +Baichuan: +``` +sh tools/checkpoint_conversion/baichuan_megatron_to_hf.sh +``` ### B. 
LLaMA训练脚本 From 8330e8647c2dae9563d98b3577fe260a7ce1d008 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:28:31 +0800 Subject: [PATCH 11/48] Update README_zh.md --- README_zh.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README_zh.md b/README_zh.md index 88965302ca..3a41a7d9e9 100644 --- a/README_zh.md +++ b/README_zh.md @@ -85,7 +85,7 @@ Megatron-LLaMA使用方式与Megatron-LM基本一致,详细信息请参考[Meg **HuggingFace to Megatron-LLaMA** -LLama: +LLaMA: ``` sh tools/checkpoint_conversion/hf_to_megatron.sh ``` @@ -98,7 +98,7 @@ sh tools/checkpoint_conversion/baichuan_hf_to_megatron.sh **Megatron-LLaMA to HuggingFace** -LLama: +LLaMA: ``` sh tools/checkpoint_conversion/megatron_to_hf.sh ``` From 86d996b4c96b9d7a9e743a155063561a3aedf37b Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:28:50 +0800 Subject: [PATCH 12/48] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 96f986c5cf..b6bf855bf5 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ This tool helps convert the format of paramters between Megatron-LLaMA/Megatron- **HuggingFace to Megatron-LLaMA** -For LLaMa: +For LLaMA: ``` sh tools/checkpoint_conversion/hf_to_megatron.sh ``` @@ -100,7 +100,7 @@ sh tools/checkpoint_conversion/baichuan_hf_to_megatron.sh **Megatron-LLaMA to HuggingFace** -For LLaMa: +For LLaMA: ``` sh tools/checkpoint_conversion/megatron_to_hf.sh ``` From 6ccfb61ae27f4dd8307f346a9343aabe41586a89 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Fri, 1 Dec 2023 14:31:38 +0800 Subject: [PATCH 13/48] Update arguments.py --- megatron/arguments.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index 513366c7d3..8578934258 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -526,6 +526,9 @@ def _add_network_size_args(parser): 'attention. This is set to ' ' args.hidden_size // args.num_attention_heads ' 'if not provided.') + group.add_argument('--position-embedding-type', type=str, default='learned_absolute', + choices=['learned_absolute', 'rope', 'alibi'], + help='Position embedding type.') group.add_argument('--max-position-embeddings', type=int, default=None, help='Maximum number of position embeddings to use. ' 'This is the size of position embedding.') From b0ac4d39dea5f7689f5c1f9ab4297162e3f34aef Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Mon, 11 Dec 2023 10:30:54 +0800 Subject: [PATCH 14/48] add collate_fn --- megatron/training.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index d5e5aad575..59b657f666 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -56,6 +56,7 @@ def pretrain(train_valid_test_dataset_provider, forward_step_func, process_non_loss_data_func=None, extra_args_provider=None, + collator=None, args_defaults={}): """Main training program. @@ -118,10 +119,15 @@ def pretrain(train_valid_test_dataset_provider, # Data stuff. 
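The `collator` argument added here is expected to be a callable (typically a class) that `pretrain` instantiates once and later hands to the DataLoader as its `collate_fn`. A minimal, hypothetical collator is sketched below; the class and field names are assumptions for illustration, not part of this patch.
```python
# Hypothetical collator usable as pretrain(..., collator=PadCollator).
# It is instantiated once (data_collator = collator()) and then called with a list
# of dataset samples, returning the batched tensors the forward step expects.
import torch

class PadCollator:
    def __init__(self, pad_id: int = 0):
        self.pad_id = pad_id

    def __call__(self, samples):
        max_len = max(len(s["input_ids"]) for s in samples)
        batch = torch.full((len(samples), max_len), self.pad_id, dtype=torch.long)
        for i, s in enumerate(samples):
            ids = torch.as_tensor(s["input_ids"], dtype=torch.long)
            batch[i, : ids.numel()] = ids
        return {"input_ids": batch}

# Usage example: pads the shorter sample with pad_id.
print(PadCollator()([{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}])["input_ids"])
```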
timers('train/valid/test-data-iterators-setup', log_level=0).start( barrier=True) + + data_collator = None + if collator is not None: + data_collator = collator() + if args.virtual_pipeline_model_parallel_size is not None: all_data_iterators = [ build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + train_valid_test_dataset_provider, data_collator) for _ in range(len(model)) ] train_data_iterator = [data_iterators[0] @@ -133,7 +139,7 @@ def pretrain(train_valid_test_dataset_provider, else: train_data_iterator, valid_data_iterator, test_data_iterator \ = build_train_valid_test_data_iterators( - train_valid_test_dataset_provider) + train_valid_test_dataset_provider, data_collator) timers('train/valid/test-data-iterators-setup').stop() print_datetime('after dataloaders are built') @@ -899,7 +905,7 @@ def cyclic_iter(iter): def build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider): + build_train_valid_test_datasets_provider, data_collator=None): """XXX""" args = get_args() @@ -942,10 +948,10 @@ def build_train_valid_test_data_loaders( # Build dataloders. train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples) + train_ds, args.consumed_train_samples, data_collator) valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples) - test_dataloader = build_pretraining_data_loader(test_ds, 0) + valid_ds, args.consumed_valid_samples, data_collator) + test_dataloader = build_pretraining_data_loader(test_ds, 0, data_collator) # Flags to know if we need to do training/validation/testing. do_train = train_dataloader is not None and args.train_iters > 0 @@ -956,11 +962,13 @@ def build_train_valid_test_data_loaders( [int(do_train), int(do_valid), int(do_test)]) else: flags = torch.cuda.LongTensor([0, 0, 0]) - # Broadcast num tokens. - torch.distributed.broadcast(flags, - mpu.get_tensor_model_parallel_src_rank(), - group=mpu.get_tensor_model_parallel_group()) + torch.distributed.barrier() + print("begin brodcast ", flags) + print("mpu.get_tensor_model_parallel_src_rank()" + str(mpu.get_tensor_model_parallel_src_rank())) + # torch.distributed.broadcast(flags, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) + torch.distributed.broadcast(flags, 0) + print("brodcast done") args.do_train = flags[0].item() args.do_valid = flags[1].item() args.do_test = flags[2].item() @@ -969,24 +977,25 @@ def build_train_valid_test_data_loaders( def build_train_valid_test_data_iterators( - build_train_valid_test_datasets_provider): + build_train_valid_test_datasets_provider, data_collator=None): args = get_args() # Build loaders. train_dataloader, valid_dataloader, test_dataloader = \ build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider) - + build_train_valid_test_datasets_provider, data_collator) + print("dataloader done") # Build iterators. 
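As a side note on the iterators built below: with `--dataloader-type cyclic` (the setting used by the Baichuan example script later in this series) the finite DataLoader is wrapped so iteration never raises StopIteration. A rough equivalent of that wrapper:
```python
# Rough sketch of the 'cyclic' dataloader behaviour referenced below.
def cyclic_iter(iterable):
    while True:
        for item in iterable:
            yield item

it = cyclic_iter([1, 2, 3])
print([next(it) for _ in range(7)])  # [1, 2, 3, 1, 2, 3, 1]
```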
dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic'] + assert dl_type in ['single', 'cyclic'], "dl_type not in 'single', 'cyclic' " if train_dataloader is not None: train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ else iter(cyclic_iter(train_dataloader)) else: train_data_iterator = None + print("train_data_iterator done") if valid_dataloader is not None: valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ From 09d01a8a92a332c2020032b7274788c43455312b Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Mon, 11 Dec 2023 10:32:31 +0800 Subject: [PATCH 15/48] Update training.py --- megatron/training.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 59b657f666..b313e83924 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -964,11 +964,8 @@ def build_train_valid_test_data_loaders( flags = torch.cuda.LongTensor([0, 0, 0]) # Broadcast num tokens. torch.distributed.barrier() - print("begin brodcast ", flags) - print("mpu.get_tensor_model_parallel_src_rank()" + str(mpu.get_tensor_model_parallel_src_rank())) # torch.distributed.broadcast(flags, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) torch.distributed.broadcast(flags, 0) - print("brodcast done") args.do_train = flags[0].item() args.do_valid = flags[1].item() args.do_test = flags[2].item() @@ -985,7 +982,6 @@ def build_train_valid_test_data_iterators( train_dataloader, valid_dataloader, test_dataloader = \ build_train_valid_test_data_loaders( build_train_valid_test_datasets_provider, data_collator) - print("dataloader done") # Build iterators. dl_type = args.dataloader_type assert dl_type in ['single', 'cyclic'], "dl_type not in 'single', 'cyclic' " @@ -995,7 +991,6 @@ def build_train_valid_test_data_iterators( else iter(cyclic_iter(train_dataloader)) else: train_data_iterator = None - print("train_data_iterator done") if valid_dataloader is not None: valid_data_iterator = iter(valid_dataloader) if dl_type == 'single' \ From 68c77a5f6690203dc47bab06e5bd0fbec0718d28 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Mon, 11 Dec 2023 10:35:11 +0800 Subject: [PATCH 16/48] add collate_fn --- megatron/data/data_samplers.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 8dec2c1922..5f096ef503 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -11,7 +11,7 @@ from megatron.core import mpu -def build_pretraining_data_loader(dataset, consumed_samples): +def build_pretraining_data_loader(dataset, consumed_samples, data_collator=None): """Buld dataloader given an input dataset.""" if dataset is None: @@ -40,10 +40,13 @@ def build_pretraining_data_loader(dataset, consumed_samples): args.dataloader_type)) # Torch dataloader. - return torch.utils.data.DataLoader(dataset, + dataloader = torch.utils.data.DataLoader(dataset, + collate_fn=data_collator, batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True) + return dataloader + class MegatronPretrainingSampler: @@ -136,8 +139,8 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, # Sanity checks. 
assert self.total_samples > 0, \ 'no sample to consume: {}'.format(self.total_samples) - assert self.micro_batch_size > 0 - assert data_parallel_size > 0 + assert self.micro_batch_size > 0, 'self.micro_batch_size > 0' + assert data_parallel_size > 0, 'data_parallel_size > 0' assert self.data_parallel_rank < data_parallel_size, \ 'data_parallel_rank should be smaller than data size: {}, ' \ '{}'.format(self.data_parallel_rank, data_parallel_size) From 4c582c0c97aa9263cb22b431165dc64bd263a739 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Mon, 11 Dec 2023 11:15:45 +0800 Subject: [PATCH 17/48] add alibi --- megatron/model/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index cf3727c02b..ae8a0cc49b 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -31,6 +31,11 @@ def attention_mask_func(attention_scores, attention_mask): return attention_scores +def alibi_mask_func(attention_scores, attention_mask): + attention_scores = attention_scores + attention_mask + return attention_scores + + def get_linear_layer(rows, columns, init_method): """Simple linear layer with weight initialization.""" layer = torch.nn.Linear(rows, columns) From 3d6b633b19e843ac57591486d6fbdca0c86e4395 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Mon, 11 Dec 2023 11:20:27 +0800 Subject: [PATCH 18/48] Update tokenizer.py --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index d87a7c98d9..0932f4d10b 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -551,7 +551,7 @@ def __init__(self, tokenizer_name_or_path, vocab_extra_ids=0): hf_tokenizer_kwargs = {} if vocab_extra_ids > 0: hf_tokenizer_kwargs["additional_special_tokens"] = [f"" for _id in range(vocab_extra_ids)] - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, **hf_tokenizer_kwargs) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True,**hf_tokenizer_kwargs) self.tokenizer.pad_token = self.tokenizer.eos_token self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} From 9b709dc4a45b89e7b20463c20ca3c4a4550f7f18 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Mon, 11 Dec 2023 14:59:36 +0800 Subject: [PATCH 19/48] Update transformer.py --- megatron/model/transformer.py | 213 ++++++++++++++++++++++------------ 1 file changed, 140 insertions(+), 73 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 2f98781500..177888603a 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -16,7 +16,7 @@ from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.rotary_pos_embedding import apply_rotary_pos_emb -from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu +from megatron.model.utils import attention_mask_func, alibi_mask_func, openai_gelu, erf_gelu try: from einops import rearrange @@ -56,6 +56,48 @@ hyperparameters: transformer hyperparameters """ +def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + 
return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes( + 2 * closest_power_of_2, + )[0::2][:n - closest_power_of_2] + ) + +def _fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float("-inf")).type_as(t) + + +def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): + _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = _future_mask.unsqueeze(0) + alibi + new_future_mask = _future_mask.to(tensor) + return new_future_mask[: tensor.shape[0] * attn_heads, :maxpos, :maxpos] + + +def _gen_alibi_mask(num_attention_heads, max_seq_len): + slopes = torch.Tensor(get_slopes(num_attention_heads)) + alibi = ( + slopes.unsqueeze(1).unsqueeze(1) + * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1) + ) + # alibi = alibi.unsqueeze(0) + alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_seq_len, max_seq_len])), 1) # [max_seq_len, max_seq_len] + alibi_mask = alibi_mask.unsqueeze(0) + alibi # [num_attention_heads, max_seq_len, max_seq_len] + return alibi_mask + + class DropPath(MegatronModule): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -247,11 +289,16 @@ def __init__(self, layer_number, coeff = self.layer_number self.norm_factor *= coeff + cur_mask_func = attention_mask_func + self.position_embedding_type = args.position_embedding_type + if args.position_embedding_type == "alibi": + cur_mask_func = alibi_mask_func + self.scale_mask_softmax = FusedScaleMaskSoftmax( self.fp16, self.bf16, self.attn_mask_type, args.masked_softmax_fusion, - attention_mask_func, + cur_mask_func, self.attention_softmax_in_fp32, coeff) @@ -266,7 +313,7 @@ def forward(self, query_layer, key_layer, # =================================== # Raw attention scores. [b, np, s, s] # =================================== - + q_len = query_layer.size(0) # [b, np, sq, sk] output_size = (query_layer.size(1), query_layer.size(2), @@ -302,6 +349,11 @@ def forward(self, query_layer, key_layer, # =========================== # attention scores and attention mask [b, np, sq, sk] + if q_len == 1 and self.position_embedding_type == "alibi": # inference with cache + if len(attention_mask.size()) == 4: + attention_mask = attention_mask[:, :, -1:, :] + else: + attention_mask = attention_mask[:, -1:, :] attention_probs = self.scale_mask_softmax(attention_scores, attention_mask) # This is actually dropping out entire tokens to attend to, which might @@ -409,7 +461,7 @@ def forward(self, q, k, v, bias=None): ) else: output = flash_attn_func( - q, k, v, bias, self.softmax_scale, is_causal + q, k, v, bias, is_causal, self.softmax_scale ) # output = rearrange(output, '(b s) ... 
-> b s ...', b=batch_size) return output @@ -435,6 +487,7 @@ def __init__(self, init_method, self.sequence_parallel = args.sequence_parallel self.use_flash_attn = args.use_flash_attn + self.position_embedding_type = args.position_embedding_type if self.use_flash_attn: if flash_attn_unpadded_func is None: raise ImportError('FlashAttention is not installed, please install with ' @@ -489,10 +542,24 @@ def __init__(self, init_method, self.core_attention = CoreAttention(self.layer_number, self.attn_mask_type) self.checkpoint_core_attention = args.recompute_granularity == 'selective' + + self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_partition = core.utils.divide(projection_size, + world_size) + self.hidden_size_per_attention_head = core.utils.divide( + projection_size, args.num_attention_heads) + self.num_attention_heads_per_partition = core.utils.divide( + args.num_attention_heads, world_size) + coeff = None + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + if self.apply_query_key_layer_scaling: + coeff = self.layer_number + self.norm_factor *= coeff if self.use_flash_attn: self.core_attention_flash = FlashSelfAttention( - causal=True, attention_dropout=args.attention_dropout + causal=True, softmax_scale=(1.0/self.norm_factor), attention_dropout=args.attention_dropout ) # Output. @@ -541,7 +608,6 @@ def forward(self, hidden_states, attention_mask, encoder_output=None, inference_params=None, rotary_pos_emb=None): # hidden_states: [sq, b, h] - # ================================================= # Pre-allocate memory for key-values for inference. # ================================================= @@ -569,7 +635,6 @@ def forward(self, hidden_states, attention_mask, # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) query_layer, key_layer, value_layer = tensor_parallel.split_tensor_along_last_dim(mixed_x_layer, 3) - # Changed layout, for compatibility with CKPT conversion new_tensor_shape = query_layer.size()[:-1] + \ (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) @@ -577,7 +642,6 @@ def forward(self, hidden_states, attention_mask, query_layer = query_layer.view(new_tensor_shape) key_layer = key_layer.view(new_tensor_shape) value_layer = value_layer.view(new_tensor_shape) - else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -602,7 +666,6 @@ def forward(self, hidden_states, attention_mask, # ================================== # Adjust key and value for inference # ================================== - # duplicate the pos_emb for self attention if rotary_pos_emb is not None: if isinstance(rotary_pos_emb, tuple): @@ -626,8 +689,6 @@ def forward(self, hidden_states, attention_mask, :sequence_end, batch_start:batch_end, ...] value_layer = inference_value_memory[ :sequence_end, batch_start:batch_end, ...] - - # adjust the key rotary positional embedding if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb @@ -662,7 +723,6 @@ def forward(self, hidden_states, attention_mask, # absolute positional embedding. 
# otherwise, only relative positional embedding takes effect # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) - if not self.use_flash_attn: if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( @@ -673,19 +733,17 @@ def forward(self, hidden_states, attention_mask, else: q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() for x in (query_layer, key_layer, value_layer)] + cur_mask = None if not self.sequence_parallel: with tensor_parallel.get_cuda_rng_tracker().fork(): context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) else: context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() - # ================= # Output. [sq, b, h] # ================= - output, bias = self.dense(context_layer) - return output, bias @@ -801,7 +859,6 @@ def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None, rotary_pos_emb=None): # hidden_states: [s, b, h] - # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) # Self attention. @@ -811,13 +868,11 @@ def forward(self, hidden_states, attention_mask, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb) - # Residual connection. if self.apply_residual_connection_post_layernorm: residual = layernorm_output else: residual = hidden_states - if self.drop_path is None: # jit scripting for a nn.module (with dropout) is not # trigerring the fusion kernel. For now, we use two @@ -844,10 +899,8 @@ def forward(self, hidden_states, attention_mask, p=self.hidden_dropout, training=self.training) layernorm_input = residual + self.drop_path(out) - # Layer norm post the self attention. layernorm_output = self.post_attention_layernorm(layernorm_input) - if self.layer_type == LayerType.decoder: attention_output, attention_bias = \ self.inter_attention(layernorm_output, @@ -874,7 +927,7 @@ def forward(self, hidden_states, attention_mask, # MLP. mlp_output, mlp_bias = self.mlp(layernorm_output) - + # torch.distributed.barrier() # Second residual connection. if self.apply_residual_connection_post_layernorm: residual = layernorm_output @@ -908,7 +961,6 @@ def forward(self, hidden_states, attention_mask, p=self.hidden_dropout, training=self.training) output = residual + self.drop_path(out) - return output @@ -1021,9 +1073,10 @@ def __init__(self, init_method, output_layer_init_method, self.recompute_num_layers = args.recompute_num_layers self.distribute_saved_activations = \ args.distribute_saved_activations and not args.sequence_parallel - + self.first_run = True + self.max_cache_pos = args.max_position_embeddings self.sequence_parallel = args.sequence_parallel - + self.alibi_mask = None # Transformer Engine Init. 
if self.transformer_impl == 'transformer_engine': global transformer_engine @@ -1241,48 +1294,54 @@ def set_input_tensor(self, input_tensor): forward_step_func""" self.input_tensor = input_tensor - @staticmethod - def _build_alibi_tensor(max_seq_len, num_attention_heads, batch_size): + def _build_alibi_tensor(self, tensor, max_seq_len, num_attention_heads): # Copied from bigscience-workshop/Megatron-DeepSpeed # Based on https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 """Returns tensor shaped (1, num_attention_heads_per_partition, 1, max_seq_len), """ - - def get_slopes(n): - def get_slopes_power_of_2(n): - start = (2 ** (-2 ** -(math.log2(n) - 3))) - ratio = start - return [start * ratio ** i for i in range(n)] - - if math.log2(n).is_integer(): - return get_slopes_power_of_2(n) - else: - closest_power_of_2 = 2 ** math.floor(math.log2(n)) - return ( - get_slopes_power_of_2(closest_power_of_2) - + get_slopes( - 2 * closest_power_of_2, - )[0::2][:n - closest_power_of_2] + if self.training: + slopes = torch.Tensor(get_slopes(num_attention_heads)) + position_point = ( + torch.arange(max_seq_len) - max_seq_len + 1 + ) + position_point = ( + position_point.unsqueeze(0) + .unsqueeze(0) + .expand(num_attention_heads, max_seq_len, -1) + ) + diag = torch.diag(position_point[0]) + position_point = position_point - diag.unsqueeze(0).unsqueeze(0).transpose( + -1, -2 + ) + alibi = slopes.unsqueeze(1).unsqueeze(1) * position_point + mask = _buffered_future_mask( + tensor, max_seq_len, alibi, num_attention_heads + ) + else: + if self.first_run: + self.first_run = False + self.register_buffer( + "future_mask", + _gen_alibi_mask(num_attention_heads, self.max_cache_pos).to( + tensor + ), + persistent=False, ) + if max_seq_len > self.max_cache_pos: + self.max_cache_pos = max_seq_len + self.register_buffer( + "future_mask", + _gen_alibi_mask(num_attention_heads, self.max_cache_pos).to( + tensor + ), + persistent=False, + ) + mask = self.future_mask[ + : num_attention_heads, :max_seq_len, :max_seq_len + ] + return mask - slopes = torch.Tensor(get_slopes(num_attention_heads)) - alibi = ( - slopes.unsqueeze(1).unsqueeze(1) - * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( - num_attention_heads, -1, -1) - ) - - # Select the part of the tensor that corresponds to our tensor - # parallel index. 
- tp_world_size = mpu.get_tensor_model_parallel_world_size() - tp_index = mpu.get_tensor_model_parallel_rank() - alibi = alibi.reshape((tp_world_size, -1, *alibi.shape[1:]))[tp_index] # [num_attention_heads/world, 1, max_seq_len] - - # alibi = alibi.unsqueeze(0) - alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_pos, max_pos])), 1) # [max_seq_len, max_seq_len] - alibi_mask = alibi_mask.unsqueeze(0) + alibi # [num_attention_heads/world, max_seq_len, max_seq_len] - return alibi_mask def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, @@ -1294,17 +1353,27 @@ def forward(self, hidden_states, attention_mask, seq_len = hidden_states.shape[0] if self.sequence_parallel: seq_len = seq_len * mpu.get_tensor_model_parallel_world_size() - alibi_mask = self._build_alibi_tensor( - seq_len, - self.num_attention_heads, - hidden_states.shape[1], - ).to(torch.cuda.current_device()) + if self.training: + if ( + self.alibi_mask is None + or self.alibi_mask.shape[-1] != seq_len + ): + self.alibi_mask = self._build_alibi_tensor( + hidden_states, seq_len, self.num_attention_heads + ).to(torch.cuda.current_device()) + alibi_mask = self.alibi_mask + else: + alibi_mask = self._build_alibi_tensor(hidden_states, seq_len, self.num_attention_heads).to(torch.cuda.current_device()) if self.params_dtype is torch.float16: alialibi_maskbi = alibi_mask.to(torch.float16) elif self.params_dtype is torch.bfloat16: - alibi_mask = alibi_mask.to(torch.bfloat16) + alibi_mask = alibi_mask.to(torch.bfloat16) # [head, seq_len, seq_len] + # Select the part of the tensor that corresponds to our tensor + # parallel index. + tp_world_size = mpu.get_tensor_model_parallel_world_size() + tp_index = mpu.get_tensor_model_parallel_rank() + alibi_mask = alibi_mask.reshape((tp_world_size, -1, *alibi_mask.shape[1:]))[tp_index] # [num_attention_heads/world, seq_len, max_seq_len] if attention_mask is not None: - print("attention_mask", attention_mask.shape) if len(attention_mask.shape) == 2: expanded_mask = attention_mask.to(alibi_mask.dtype) expanded_mask = torch.tril( @@ -1312,18 +1381,17 @@ def forward(self, hidden_states, attention_mask, ) * torch.eq(expanded_mask[:, :, None] - expanded_mask[:, None, :], 0) else: expanded_mask = attention_mask - bsz = inputs_embeds.size(0) + bsz = hidden_states.size(1) src_len, tgt_len = alibi_mask.size()[-2:] expanded_mask = ( - expanded_mask.unsqueeze(1) + expanded_mask .expand(bsz, 1, src_len, tgt_len) .to(alibi_mask.dtype) ) - inverted_mask = 1.0 - expanded_mask - inverted_mask = inverted_mask.masked_fill( - inverted_mask.to(torch.bool), torch.finfo(alibi_mask.dtype).min + expanded_mask = expanded_mask.masked_fill( + expanded_mask.to(torch.bool), torch.finfo(alibi_mask.dtype).min ) - attention_mask = inverted_mask + alibi_mask.unsqueeze(0) + attention_mask = expanded_mask + alibi_mask.unsqueeze(0) # [batch_size, head_size, seq_len, seq_len] else: attention_mask = alibi_mask # Checks. @@ -1406,7 +1474,6 @@ def forward(self, hidden_states, attention_mask, # Skip counter update for eval and activation checkpointing if torch.is_grad_enabled() and self.training: self.microbatch_count += 1 - # Final layer norm. 
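To make the ALiBi construction above concrete, here is a small self-contained sketch mirroring `_gen_alibi_mask` for a power-of-two head count. It is an illustration with toy sizes, not the patch's code path, which additionally recenters the bias during training and shards it per tensor-parallel rank.
```python
# Standalone illustration of the ALiBi bias: per-head slope times token position,
# plus -inf above the diagonal for causality. Shape: [num_heads, seq_len, seq_len].
import math
import torch

def get_slopes_power_of_2(n):
    start = 2 ** (-2 ** -(math.log2(n) - 3))       # for n = 8 heads: 1/2
    return [start * start ** i for i in range(n)]  # 1/2, 1/4, ..., 1/256

num_heads, seq_len = 8, 5
slopes = torch.tensor(get_slopes_power_of_2(num_heads))
alibi = slopes[:, None, None] * torch.arange(seq_len)[None, None, :]
causal = torch.triu(torch.full((seq_len, seq_len), float("-inf")), 1)
alibi_mask = causal.unsqueeze(0) + alibi  # added to attention scores before softmax
print(alibi_mask[0])  # head 0: bias 0.5*j on allowed positions j <= i, -inf above the diagonal
```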
if self.post_process and self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) From a167ea2eca5132c0ff23788b75603998f2d622e1 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Thu, 14 Dec 2023 14:35:47 +0800 Subject: [PATCH 20/48] Create Baichuan_13_standalone.sh --- examples/Baichuan_13_standalone.sh | 90 ++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 examples/Baichuan_13_standalone.sh diff --git a/examples/Baichuan_13_standalone.sh b/examples/Baichuan_13_standalone.sh new file mode 100644 index 0000000000..c137377896 --- /dev/null +++ b/examples/Baichuan_13_standalone.sh @@ -0,0 +1,90 @@ +DATASET_1="" +DATASET_2="" +DATASET_3="" +DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}" + +TP_SIZE=2 +PP_SIZE=1 +WORLD_SIZE=8 +MICRO_BATCH_SIZE=2 +# The int is the number of micro steps of gradient accumulation +GLOBAL_BATCH_SIZE=$((($WORLD_SIZE * $MICRO_BATCH_SIZE) / ($TP_SIZE * $PP_SIZE) * 8)) +# GLOBAL_BATCH_SIZE=128 + +JOB_NAME="LLaMA_tp${TP_SIZE}_pp${PP_SIZE}_mbs${MICRO_BATCH_SIZE}_gpus${WORLD_SIZE}" + +LOAD_CHECKPOINT_PATH="PATH TO THE MODEL CHECKPOINT" +SAVE_CHECKPOINT_PATH="PATH TO SAVE MODEL CHECKPOINT" +TOKENIZER_PATH="PATH OR NAME FOR PRETRAINED TOKENIZER" +TENSORBOARD_DIR="TENSORBOARD DIRECTORY" + +TRAIN_ITERS=1000 +EVAL_ITERS=10 +EVAL_INTERVAL=1000 +SAVE_INTERVAL=100 +LOG_INTERVAL=1 + +# Setting --tensorboard-queue-size to 1 significantly slows down the training +options=" \ + --finetune \ + --sequence-parallel \ + --tensor-model-parallel-size ${TP_SIZE} \ + --pipeline-model-parallel-size ${PP_SIZE} \ + --num-layers 40 \ + --hidden-size 5120 \ + --num-attention-heads 40 \ + --seq-length 2048 \ + --max-position-embeddings 2048 \ + --no-position-embedding \ + --position-embedding-type alibi \ + --swiglu \ + --ffn-hidden-size 13696 \ + --disable-bias-linear \ + --RMSNorm \ + --attention-dropout 0 \ + --hidden-dropout 0 \ + --layernorm-epsilon 1e-6 \ + --causal-lm \ + --tokenizer-type PretrainedFromHF \ + --tokenizer-name-or-path $TOKENIZER_PATH \ + --make-vocab-size-divisible-by 1 \ + --init-method-std 0.01 \ + --micro-batch-size ${MICRO_BATCH_SIZE} \ + --global-batch-size ${GLOBAL_BATCH_SIZE} \ + --train-iters ${TRAIN_ITERS} \ + --lr 6.0e-5 \ + --lr-decay-iters 10 \ + --lr-warmup-iters 5 \ + --min-lr 6.0e-6 \ + --override-opt_param-scheduler \ + --lr-decay-style cosine \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --clip-grad 1.0 \ + --weight-decay 0.1 \ + --overlapped-distributed-optimizer \ + --reduce-bucket-size=2e8 \ + --no-gradient-accumulation-fusion \ + --dataloader-type cyclic \ + --data-impl mmap \ + --data-path ${DATASET} \ + --split 98,2,0 \ + --eval-interval ${EVAL_INTERVAL} \ + --eval-iters ${EVAL_ITERS} \ + --save-interval ${SAVE_INTERVAL} \ + --save ${SAVE_CHECKPOINT_PATH} \ + --load ${LOAD_CHECKPOINT_PATH} \ + --no-load-optim \ + --log-interval ${LOG_INTERVAL} \ + --tensorboard-dir ${TENSORBOARD_DIR} \ + --tensorboard-queue-size 1000 \ + --log-timers-to-tensorboard \ + --log-batch-size-to-tensorboard \ + --log-validation-ppl-to-tensorboard \ + --job-name ${JOB_NAME} \ + --bf16 \ + --recompute-activations \ + --recompute-granularity selective \ + --use-flash-attn" + +torchrun --nproc_per_node=8 --master_port=29500 pretrain_llama.py ${options} From d5fbd89d8d046314af8c863e9243b3f5b67cc790 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Thu, 14 Dec 2023 14:37:34 +0800 Subject: [PATCH 21/48] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md 
b/README.md index b6bf855bf5..62b1d69e01 100644 --- a/README.md +++ b/README.md @@ -113,9 +113,14 @@ sh tools/checkpoint_conversion/baichuan_megatron_to_hf.sh **Single-node launching** +For LLaMA: ``` sh examples/LLaMA/LLaMA_13_standalone.sh ``` +For Baichuan: +``` +sh examples/Baichuan_13_standalone.sh +``` **Distributed launching** From 448de2baeaaab7c3141d2f24bc504d16f97106e1 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Thu, 14 Dec 2023 22:05:56 +0800 Subject: [PATCH 22/48] Update __init__.py --- megatron/model/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/model/__init__.py b/megatron/model/__init__.py index 168b65cae6..bfcdaff78a 100644 --- a/megatron/model/__init__.py +++ b/megatron/model/__init__.py @@ -6,6 +6,7 @@ from .bert_model import BertModel from .gpt_model import GPTModel from .llama_model import LLaMAModel +from .baichuan_model import BaichuanModel from .t5_model import T5Model from .language_model import get_language_model from .module import Float16Module From e2af6cad4bccabd6917a7f95fc6ad9b00fdf5405 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Thu, 14 Dec 2023 22:25:54 +0800 Subject: [PATCH 23/48] solve Unsupported gpu architecture 'compute_90' --- megatron/fused_kernels/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index dcbf24cb3f..d1a488bcf2 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -23,9 +23,9 @@ def load(args): if int(bare_metal_major) >= 11: cc_flag.append('-gencode') cc_flag.append('arch=compute_80,code=sm_80') - if int(bare_metal_minor) >= 7: - cc_flag.append('-gencode') - cc_flag.append('arch=compute_90,code=sm_90') + # if int(bare_metal_minor) >= 7: + # cc_flag.append('-gencode') + # cc_flag.append('arch=compute_90,code=sm_90') # Build path srcpath = pathlib.Path(__file__).parent.absolute() From aad5993756ca31cf48d279e0236b5efd5c517eda Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 15:32:00 +0800 Subject: [PATCH 24/48] Update Baichuan_13_standalone.sh --- examples/Baichuan_13_standalone.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Baichuan_13_standalone.sh b/examples/Baichuan_13_standalone.sh index c137377896..bc754a9c03 100644 --- a/examples/Baichuan_13_standalone.sh +++ b/examples/Baichuan_13_standalone.sh @@ -87,4 +87,4 @@ options=" \ --recompute-granularity selective \ --use-flash-attn" -torchrun --nproc_per_node=8 --master_port=29500 pretrain_llama.py ${options} +torchrun --nproc_per_node=8 --master_port=29500 pretrain_baichuan.py ${options} From d9140f413d65788a34f9cd916eb331a01c39f69e Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 15:33:39 +0800 Subject: [PATCH 25/48] Update arguments.py --- megatron/arguments.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 8578934258..8c00d4a483 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -526,12 +526,12 @@ def _add_network_size_args(parser): 'attention. 
This is set to ' ' args.hidden_size // args.num_attention_heads ' 'if not provided.') - group.add_argument('--position-embedding-type', type=str, default='learned_absolute', - choices=['learned_absolute', 'rope', 'alibi'], - help='Position embedding type.') group.add_argument('--max-position-embeddings', type=int, default=None, help='Maximum number of position embeddings to use. ' 'This is the size of position embedding.') + group.add_argument('--position-embedding-type', type=str, default='learned_absolute', + choices=['learned_absolute', 'rope', 'alibi'], + help='Position embedding type.') group.add_argument('--use-rotary-position-embeddings', action='store_true', help='Use rotary positional embeddings or not') group.add_argument('--rotary-percent', type=float, default=1.0, From 48538ce67a2992f041318444ac4020fd8afce6e0 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 15:57:33 +0800 Subject: [PATCH 26/48] Update transformer.py --- megatron/model/transformer.py | 57 +++++++++++++++-------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 177888603a..a36c73fe94 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -16,7 +16,7 @@ from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.rotary_pos_embedding import apply_rotary_pos_emb -from megatron.model.utils import attention_mask_func, alibi_mask_func, openai_gelu, erf_gelu +from megatron.model.utils import attention_mask_func, alibi_mask_func, get_slopes, openai_gelu, erf_gelu try: from einops import rearrange @@ -32,13 +32,18 @@ try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func - from flash_attn.flash_attn_triton import flash_attn_func except ImportError: try: from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func - from flash_attn.flash_attn_triton import flash_attn_func except ImportError: flash_attn_unpadded_func = None + +try: + from flash_attn.flash_attn_triton import flash_attn_func +except ImportError: + try: + from flash_attn.flash_attn_triton import flash_attn_func + except ImportError: flash_attn_func = None """ We use the following notation throughout this file: @@ -56,22 +61,6 @@ hyperparameters: transformer hyperparameters """ -def get_slopes(n): - def get_slopes_power_of_2(n): - start = (2 ** (-2 ** -(math.log2(n) - 3))) - ratio = start - return [start * ratio ** i for i in range(n)] - - if math.log2(n).is_integer(): - return get_slopes_power_of_2(n) - else: - closest_power_of_2 = 2 ** math.floor(math.log2(n)) - return ( - get_slopes_power_of_2(closest_power_of_2) - + get_slopes( - 2 * closest_power_of_2, - )[0::2][:n - closest_power_of_2] - ) def _fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" @@ -86,7 +75,7 @@ def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): def _gen_alibi_mask(num_attention_heads, max_seq_len): - slopes = torch.Tensor(get_slopes(num_attention_heads)) + slopes = torch.Tensor(get_slopes(num_attention_heads), device=torch.cuda.current_device()) alibi = ( slopes.unsqueeze(1).unsqueeze(1) * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( @@ -313,6 +302,7 @@ def forward(self, query_layer, key_layer, # =================================== # Raw attention scores. 
[b, np, s, s] # =================================== + q_len = query_layer.size(0) # [b, np, sq, sk] output_size = (query_layer.size(1), @@ -327,12 +317,9 @@ def forward(self, query_layer, key_layer, key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1) - # preallocting input tensor: [b * np, sq, sk] # preallocting input tensor: [b * np, sq, sk] matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor( - (output_size[0]*output_size[1], - output_size[2], - output_size[3]), + (output_size[0]*output_size[1], output_size[2], output_size[3]), query_layer.dtype, "mpu") matmul_result = torch.baddbmm( @@ -434,7 +421,7 @@ def forward(self, q, k, v, bias=None): batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] - # q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q.device) @@ -552,14 +539,15 @@ def __init__(self, init_method, self.num_attention_heads_per_partition = core.utils.divide( args.num_attention_heads, world_size) - coeff = None - self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - if self.apply_query_key_layer_scaling: - coeff = self.layer_number - self.norm_factor *= coeff + # coeff = None + # self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + # if self.apply_query_key_layer_scaling: + # coeff = self.layer_number + # self.norm_factor *= coeff + # Thoese for softmax_scale. Current instances doesn't need for now. if self.use_flash_attn: self.core_attention_flash = FlashSelfAttention( - causal=True, softmax_scale=(1.0/self.norm_factor), attention_dropout=args.attention_dropout + causal=True, attention_dropout=args.attention_dropout ) # Output. @@ -582,7 +570,7 @@ def custom_forward(*inputs): value_layer = inputs[2] attention_mask = inputs[3] output_ = self.core_attention(query_layer, key_layer, - value_layer, attention_mask) + value_layer, attention_mask) return output_ q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \ @@ -927,7 +915,7 @@ def forward(self, hidden_states, attention_mask, # MLP. mlp_output, mlp_bias = self.mlp(layernorm_output) - # torch.distributed.barrier() + # Second residual connection. if self.apply_residual_connection_post_layernorm: residual = layernorm_output @@ -1463,6 +1451,7 @@ def forward(self, hidden_states, attention_mask, forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention else: forward_kwargs['rotary_pos_emb'] = rotary_pos_emb + for index in range(self.num_layers): layer = self._get_layer(index) @@ -1474,7 +1463,9 @@ def forward(self, hidden_states, attention_mask, # Skip counter update for eval and activation checkpointing if torch.is_grad_enabled() and self.training: self.microbatch_count += 1 + # Final layer norm. 
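For reference, the computation the flash-attention bias path is meant to approximate can be written unfused as below. This is a sketch for checking intent, not the Triton kernel itself; shapes follow the `[batch, seq, heads, head_dim]` layout used just before the kernel call, and the bias stands in for the ALiBi mask.
```python
# Unfused reference of attention with an additive bias (sketch only).
import math
import torch

def attention_with_bias(q, k, v, bias):
    # q, k, v: [b, s, h, d]; bias broadcastable to [b, h, s_q, s_k]
    scale = 1.0 / math.sqrt(q.shape[-1])
    scores = torch.einsum("bqhd,bkhd->bhqk", q, k) * scale + bias
    return torch.einsum("bhqk,bkhd->bqhd", scores.softmax(dim=-1), v)

b, s, h, d = 1, 4, 2, 8
q = k = v = torch.randn(b, s, h, d)
causal_bias = torch.triu(torch.full((s, s), float("-inf")), 1)  # causal part only, for brevity
print(attention_with_bias(q, k, v, causal_bias).shape)  # torch.Size([1, 4, 2, 8])
```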
if self.post_process and self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) + return hidden_states From 531c2e4d8ade679c18b8cf05a66b4d6a222dcc60 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 15:58:27 +0800 Subject: [PATCH 27/48] Update utils.py --- megatron/model/utils.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index ae8a0cc49b..2ffa36aa64 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -36,6 +36,24 @@ def alibi_mask_func(attention_scores, attention_mask): return attention_scores +def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return ( + get_slopes_power_of_2(closest_power_of_2) + + get_slopes( + 2 * closest_power_of_2, + )[0::2][:n - closest_power_of_2] + ) + + def get_linear_layer(rows, columns, init_method): """Simple linear layer with weight initialization.""" layer = torch.nn.Linear(rows, columns) From ca53b5ea7df259ddcaf58a5abddb6f71a7a25e2f Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 16:01:20 +0800 Subject: [PATCH 28/48] Update tokenizer.py --- megatron/tokenizer/tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 0932f4d10b..801ca6c1c8 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -545,13 +545,13 @@ def additional_special_tokens_ids(self): class _AutoTokenizer(AbstractTokenizer): """AutoTokenizer for Hf Pretrained model loading.""" - def __init__(self, tokenizer_name_or_path, vocab_extra_ids=0): + def __init__(self, tokenizer_name_or_path, trust_remote_code=False, vocab_extra_ids=0): name = tokenizer_name_or_path super().__init__(name) hf_tokenizer_kwargs = {} if vocab_extra_ids > 0: hf_tokenizer_kwargs["additional_special_tokens"] = [f"" for _id in range(vocab_extra_ids)] - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=True,**hf_tokenizer_kwargs) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, trust_remote_code=trust_remote_code,**hf_tokenizer_kwargs) self.tokenizer.pad_token = self.tokenizer.eos_token self.encoder = self.tokenizer.get_vocab() self.decoder = {v: k for k, v in self.encoder.items()} From b4443d8bacb62d90d7f6c1b9ed2e67d344d35bcc Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 16:03:47 +0800 Subject: [PATCH 29/48] Update tokenizer.py --- megatron/tokenizer/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 801ca6c1c8..9f259c239c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -43,7 +43,7 @@ def build_tokenizer(args): tokenizer = _NullTokenizer(args.vocab_size) elif args.tokenizer_type == 'PretrainedFromHF': assert args.tokenizer_name_or_path is not None - tokenizer = _AutoTokenizer(args.tokenizer_name_or_path, vocab_extra_ids=args.vocab_extra_ids) + tokenizer = _AutoTokenizer(args.tokenizer_name_or_path, trust_remote_code=args.trust_remote_code, vocab_extra_ids=args.vocab_extra_ids) else: raise NotImplementedError('{} tokenizer is not ' 
'implemented.'.format(args.tokenizer_type)) From ae70b9b30b3933c03d76d8c53d4c21b5224db4be Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 16:05:39 +0800 Subject: [PATCH 30/48] Update arguments.py --- megatron/arguments.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/megatron/arguments.py b/megatron/arguments.py index 8c00d4a483..bd3ce37244 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1107,6 +1107,8 @@ def _add_data_args(parser): help='Sentencepiece tokenizer model.') group.add_argument('--tokenizer-name-or-path', type=str, default=None, help='tokenizer model path for PretrainedFromHF.') + group.add_argument('--trust_remote_code', action='store_true', + help='Whether trust remote code when using PretrainedFromHF.') group.add_argument('--data-impl', type=str, default='infer', choices=['lazy', 'cached', 'mmap', 'infer'], help='Implementation of indexed datasets.') From c2e18671d4cc4925c4100a0e8010df5708f4d291 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 16:06:30 +0800 Subject: [PATCH 31/48] Update Baichuan_13_standalone.sh --- examples/Baichuan_13_standalone.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/Baichuan_13_standalone.sh b/examples/Baichuan_13_standalone.sh index bc754a9c03..351504f9ee 100644 --- a/examples/Baichuan_13_standalone.sh +++ b/examples/Baichuan_13_standalone.sh @@ -48,6 +48,7 @@ options=" \ --tokenizer-type PretrainedFromHF \ --tokenizer-name-or-path $TOKENIZER_PATH \ --make-vocab-size-divisible-by 1 \ + --trust_remote_code \ --init-method-std 0.01 \ --micro-batch-size ${MICRO_BATCH_SIZE} \ --global-batch-size ${GLOBAL_BATCH_SIZE} \ From 60f218f0073ec456b2fe5b580cd25d1ec140b1b5 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 19 Dec 2023 16:11:07 +0800 Subject: [PATCH 32/48] Update training.py --- megatron/training.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index b313e83924..30f430b81e 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -963,8 +963,9 @@ def build_train_valid_test_data_loaders( else: flags = torch.cuda.LongTensor([0, 0, 0]) # Broadcast num tokens. 
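On the `--trust_remote_code` flag introduced a couple of patches above: it is forwarded to Hugging Face's `AutoTokenizer`, which only executes tokenizer code shipped inside a model repository (as Baichuan's is) when `trust_remote_code=True`. Outside Megatron the equivalent load looks roughly like this; the path is a placeholder.
```python
# Illustrative effect of --trust-remote-code (path is a placeholder).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "PATH_OR_NAME_FOR_PRETRAINED_TOKENIZER",
    trust_remote_code=True,  # allow the repo's custom tokenizer class to be imported
)
tokenizer.pad_token = tokenizer.eos_token  # mirrors what _AutoTokenizer does above
print(tokenizer("hello world")["input_ids"])
```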
- torch.distributed.barrier() - # torch.distributed.broadcast(flags, mpu.get_tensor_model_parallel_src_rank(), group=mpu.get_tensor_model_parallel_group()) + # torch.distributed.broadcast(flags, + # mpu.get_tensor_model_parallel_src_rank(), + # group=mpu.get_tensor_model_parallel_group()) torch.distributed.broadcast(flags, 0) args.do_train = flags[0].item() args.do_valid = flags[1].item() From 917992c301a4571136d3e9a026ea4eb145d5cb9c Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Wed, 20 Dec 2023 16:22:24 +0800 Subject: [PATCH 33/48] support flash attention v1 --- megatron/model/transformer.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a36c73fe94..3ef187a60e 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -41,10 +41,7 @@ try: from flash_attn.flash_attn_triton import flash_attn_func except ImportError: - try: - from flash_attn.flash_attn_triton import flash_attn_func - except ImportError: - flash_attn_func = None + flash_attn_func = None """ We use the following notation throughout this file: h: hidden size @@ -421,7 +418,8 @@ def forward(self, q, k, v, bias=None): batch_size, seqlen_q = q.shape[0], q.shape[1] seqlen_k = k.shape[1] - q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] + if bias is None: + q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]] cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q.device) @@ -531,13 +529,13 @@ def __init__(self, init_method, self.checkpoint_core_attention = args.recompute_granularity == 'selective' self.apply_query_key_layer_scaling = args.apply_query_key_layer_scaling - world_size = mpu.get_tensor_model_parallel_world_size() + tensor_parallel_size = mpu.get_tensor_model_parallel_world_size() self.hidden_size_per_partition = core.utils.divide(projection_size, - world_size) + tensor_parallel_size) self.hidden_size_per_attention_head = core.utils.divide( projection_size, args.num_attention_heads) self.num_attention_heads_per_partition = core.utils.divide( - args.num_attention_heads, world_size) + args.num_attention_heads, tensor_parallel_size) # coeff = None # self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) @@ -724,9 +722,15 @@ def forward(self, hidden_states, attention_mask, cur_mask = None if not self.sequence_parallel: with tensor_parallel.get_cuda_rng_tracker().fork(): - context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) + if self.position_embedding_type == "alibi": + context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) + else: + context_layer = self.core_attention_flash(q, k, v) else: - context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) + if self.position_embedding_type == "alibi": + context_layer = self.core_attention_flash(q, k, v, bias=attention_mask) + else: + context_layer = self.core_attention_flash(q, k, v) context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() # ================= # Output. 
[sq, b, h] From 3b36b504f28c46b849864984698aa7fc9af696c5 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Wed, 20 Dec 2023 20:39:11 +0800 Subject: [PATCH 34/48] Update Baichuan_13_standalone.sh --- examples/Baichuan_13_standalone.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Baichuan_13_standalone.sh b/examples/Baichuan_13_standalone.sh index 351504f9ee..96cc36b479 100644 --- a/examples/Baichuan_13_standalone.sh +++ b/examples/Baichuan_13_standalone.sh @@ -48,7 +48,7 @@ options=" \ --tokenizer-type PretrainedFromHF \ --tokenizer-name-or-path $TOKENIZER_PATH \ --make-vocab-size-divisible-by 1 \ - --trust_remote_code \ + --trust-remote-code \ --init-method-std 0.01 \ --micro-batch-size ${MICRO_BATCH_SIZE} \ --global-batch-size ${GLOBAL_BATCH_SIZE} \ From 64255e3833bc2341299bf86464fdff6ed1dbe5d9 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Wed, 20 Dec 2023 20:39:40 +0800 Subject: [PATCH 35/48] Update arguments.py --- megatron/arguments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index bd3ce37244..c2c52cc0b0 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -1107,7 +1107,7 @@ def _add_data_args(parser): help='Sentencepiece tokenizer model.') group.add_argument('--tokenizer-name-or-path', type=str, default=None, help='tokenizer model path for PretrainedFromHF.') - group.add_argument('--trust_remote_code', action='store_true', + group.add_argument('--trust-remote-code', action='store_true', help='Whether trust remote code when using PretrainedFromHF.') group.add_argument('--data-impl', type=str, default='infer', choices=['lazy', 'cached', 'mmap', 'infer'], From c6607c260ff7377a9e179c2ade059ff664c9335e Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Wed, 20 Dec 2023 21:19:37 +0800 Subject: [PATCH 36/48] initialize certain Tensors on GPU --- megatron/model/transformer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 3ef187a60e..a604a8eb1c 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -65,17 +65,17 @@ def _fill_with_neg_inf(t): def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): - _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1) + _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1).to(torch.cuda.current_device()) _future_mask = _future_mask.unsqueeze(0) + alibi new_future_mask = _future_mask.to(tensor) return new_future_mask[: tensor.shape[0] * attn_heads, :maxpos, :maxpos] def _gen_alibi_mask(num_attention_heads, max_seq_len): - slopes = torch.Tensor(get_slopes(num_attention_heads), device=torch.cuda.current_device()) + slopes = torch.tensor(get_slopes(num_attention_heads), device=torch.cuda.current_device()) alibi = ( slopes.unsqueeze(1).unsqueeze(1) - * torch.arange(max_seq_len).unsqueeze(0).unsqueeze(0).expand( + * torch.arange(max_seq_len, device=torch.cuda.current_device()).unsqueeze(0).unsqueeze(0).expand( num_attention_heads, -1, -1) ) # alibi = alibi.unsqueeze(0) @@ -1293,10 +1293,10 @@ def _build_alibi_tensor(self, tensor, max_seq_len, num_attention_heads): (1, num_attention_heads_per_partition, 1, max_seq_len), """ if self.training: - slopes = torch.Tensor(get_slopes(num_attention_heads)) + slopes = torch.tensor(get_slopes(num_attention_heads), device=torch.cuda.current_device()) position_point = ( torch.arange(max_seq_len) - 
max_seq_len + 1 - ) + ).to(torch.cuda.current_device()) position_point = ( position_point.unsqueeze(0) .unsqueeze(0) From 39ecc656be62a9c81241bf4fae4f8d07c0a514d3 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Thu, 21 Dec 2023 10:37:37 +0800 Subject: [PATCH 37/48] Update transformer.py --- megatron/model/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index a604a8eb1c..51a2ca7e67 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -79,7 +79,7 @@ def _gen_alibi_mask(num_attention_heads, max_seq_len): num_attention_heads, -1, -1) ) # alibi = alibi.unsqueeze(0) - alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_seq_len, max_seq_len])), 1) # [max_seq_len, max_seq_len] + alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_seq_len, max_seq_len])), 1).to(torch.cuda.current_device()) # [max_seq_len, max_seq_len] alibi_mask = alibi_mask.unsqueeze(0) + alibi # [num_attention_heads, max_seq_len, max_seq_len] return alibi_mask From 975bac6c2f1d414a5405f14bd357d5755aaa3546 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 26 Dec 2023 11:44:01 +0800 Subject: [PATCH 38/48] Update __init__.py --- megatron/fused_kernels/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index d1a488bcf2..dcbf24cb3f 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -23,9 +23,9 @@ def load(args): if int(bare_metal_major) >= 11: cc_flag.append('-gencode') cc_flag.append('arch=compute_80,code=sm_80') - # if int(bare_metal_minor) >= 7: - # cc_flag.append('-gencode') - # cc_flag.append('arch=compute_90,code=sm_90') + if int(bare_metal_minor) >= 7: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_90,code=sm_90') # Build path srcpath = pathlib.Path(__file__).parent.absolute() From 5b8fca10ad267c6d34b33f1fcc76218ea050ebf2 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 26 Dec 2023 11:51:56 +0800 Subject: [PATCH 39/48] Update Baichuan_13_standalone.sh --- examples/Baichuan_13_standalone.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/Baichuan_13_standalone.sh b/examples/Baichuan_13_standalone.sh index 96cc36b479..b2e165fb9d 100644 --- a/examples/Baichuan_13_standalone.sh +++ b/examples/Baichuan_13_standalone.sh @@ -11,7 +11,7 @@ MICRO_BATCH_SIZE=2 GLOBAL_BATCH_SIZE=$((($WORLD_SIZE * $MICRO_BATCH_SIZE) / ($TP_SIZE * $PP_SIZE) * 8)) # GLOBAL_BATCH_SIZE=128 -JOB_NAME="LLaMA_tp${TP_SIZE}_pp${PP_SIZE}_mbs${MICRO_BATCH_SIZE}_gpus${WORLD_SIZE}" +JOB_NAME="Baichuan_tp${TP_SIZE}_pp${PP_SIZE}_mbs${MICRO_BATCH_SIZE}_gpus${WORLD_SIZE}" LOAD_CHECKPOINT_PATH="PATH TO THE MODEL CHECKPOINT" SAVE_CHECKPOINT_PATH="PATH TO SAVE MODEL CHECKPOINT" From d636039ebbc20bc910604a47625f18d205c889c2 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 21:39:55 +0800 Subject: [PATCH 40/48] remove collate_fn --- megatron/data/data_samplers.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 5f096ef503..2722034863 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -2,7 +2,6 @@ """Dataloaders.""" - import random import torch import numpy as np @@ -11,7 +10,7 @@ from megatron.core import mpu -def 
build_pretraining_data_loader(dataset, consumed_samples, data_collator=None): +def build_pretraining_data_loader(dataset, consumed_samples): """Buld dataloader given an input dataset.""" if dataset is None: @@ -37,15 +36,13 @@ def build_pretraining_data_loader(dataset, consumed_samples, data_collator=None) data_sharding=args.data_sharding) else: raise Exception('{} dataloader type is not supported.'.format( - args.dataloader_type)) + args.dataloader_type)) # Torch dataloader. - dataloader = torch.utils.data.DataLoader(dataset, - collate_fn=data_collator, + return torch.utils.data.DataLoader(dataset, batch_sampler=batch_sampler, num_workers=args.num_workers, pin_memory=True) - return dataloader class MegatronPretrainingSampler: @@ -139,8 +136,8 @@ def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size, # Sanity checks. assert self.total_samples > 0, \ 'no sample to consume: {}'.format(self.total_samples) - assert self.micro_batch_size > 0, 'self.micro_batch_size > 0' - assert data_parallel_size > 0, 'data_parallel_size > 0' + assert self.micro_batch_size > 0 + assert data_parallel_size > 0 assert self.data_parallel_rank < data_parallel_size, \ 'data_parallel_rank should be smaller than data size: {}, ' \ '{}'.format(self.data_parallel_rank, data_parallel_size) @@ -160,17 +157,17 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - + g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + * self.micro_batch_size full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) From 0dd0c210031c093502d44bc5a8dc61f425a9ae6d Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 21:56:47 +0800 Subject: [PATCH 41/48] Update indentent in transformer.py --- megatron/model/transformer.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 51a2ca7e67..1c39b57c46 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -594,6 +594,7 @@ def forward(self, hidden_states, attention_mask, encoder_output=None, inference_params=None, rotary_pos_emb=None): # hidden_states: [sq, b, h] + # ================================================= # Pre-allocate memory for key-values for inference. 
# ================================================= @@ -621,6 +622,7 @@ def forward(self, hidden_states, attention_mask, # Attention heads [sq, b, h] --> [sq, b, (np * 3 * hn)] mixed_x_layer, _ = self.query_key_value(hidden_states) query_layer, key_layer, value_layer = tensor_parallel.split_tensor_along_last_dim(mixed_x_layer, 3) + # Changed layout, for compatibility with CKPT conversion new_tensor_shape = query_layer.size()[:-1] + \ (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) @@ -628,6 +630,7 @@ def forward(self, hidden_states, attention_mask, query_layer = query_layer.view(new_tensor_shape) key_layer = key_layer.view(new_tensor_shape) value_layer = value_layer.view(new_tensor_shape) + else: # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)] mixed_kv_layer, _ = self.key_value(encoder_output) @@ -649,9 +652,11 @@ def forward(self, hidden_states, attention_mask, (self.num_attention_heads_per_partition, self.hidden_size_per_attention_head) query_layer = query_layer.view(*new_tensor_shape) + # ================================== # Adjust key and value for inference # ================================== + # duplicate the pos_emb for self attention if rotary_pos_emb is not None: if isinstance(rotary_pos_emb, tuple): @@ -675,6 +680,8 @@ def forward(self, hidden_states, attention_mask, :sequence_end, batch_start:batch_end, ...] value_layer = inference_value_memory[ :sequence_end, batch_start:batch_end, ...] + + # adjust the key rotary positional embedding if rotary_pos_emb is not None: q_pos_emb, k_pos_emb = rotary_pos_emb @@ -709,6 +716,7 @@ def forward(self, hidden_states, attention_mask, # absolute positional embedding. # otherwise, only relative positional embedding takes effect # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb) + if not self.use_flash_attn: if self.checkpoint_core_attention: context_layer = self._checkpointed_attention_forward( @@ -732,10 +740,13 @@ def forward(self, hidden_states, attention_mask, else: context_layer = self.core_attention_flash(q, k, v) context_layer = rearrange(context_layer, 'b s h d -> s b (h d)').contiguous() + # ================= # Output. [sq, b, h] # ================= + output, bias = self.dense(context_layer) + return output, bias @@ -851,6 +862,7 @@ def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None, rotary_pos_emb=None): # hidden_states: [s, b, h] + # Layer norm at the beginning of the transformer layer. layernorm_output = self.input_layernorm(hidden_states) # Self attention. @@ -860,11 +872,13 @@ def forward(self, hidden_states, attention_mask, attention_mask, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb) + # Residual connection. if self.apply_residual_connection_post_layernorm: residual = layernorm_output else: residual = hidden_states + if self.drop_path is None: # jit scripting for a nn.module (with dropout) is not # trigerring the fusion kernel. For now, we use two @@ -891,8 +905,10 @@ def forward(self, hidden_states, attention_mask, p=self.hidden_dropout, training=self.training) layernorm_input = residual + self.drop_path(out) + # Layer norm post the self attention. layernorm_output = self.post_attention_layernorm(layernorm_input) + if self.layer_type == LayerType.decoder: attention_output, attention_bias = \ self.inter_attention(layernorm_output, @@ -919,7 +935,7 @@ def forward(self, hidden_states, attention_mask, # MLP. mlp_output, mlp_bias = self.mlp(layernorm_output) - + # Second residual connection. 
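# An illustrative sketch of the residual pattern used for the attention output
# above and the MLP output below: the skip connection is either the
# un-normalized input (pre-LN, the default) or the layer-norm output, and the
# block's bias is added before dropout. The fused bias_dropout_add and
# drop_path variants in the real code are omitted; names are illustrative.
import torch.nn.functional as F

def residual_add(block_output, block_bias, hidden_states, layernorm_output,
                 post_ln_residual, hidden_dropout, training=True):
    residual = layernorm_output if post_ln_residual else hidden_states
    out = F.dropout(block_output + block_bias, p=hidden_dropout, training=training)
    return residual + out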
if self.apply_residual_connection_post_layernorm: residual = layernorm_output @@ -953,6 +969,7 @@ def forward(self, hidden_states, attention_mask, p=self.hidden_dropout, training=self.training) output = residual + self.drop_path(out) + return output @@ -1339,6 +1356,7 @@ def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, inference_params=None, rotary_pos_emb=None): # hidden_states: [s, b, h] + if self.position_embedding_type == "alibi": # assert not args.use_flash_attn, \ # 'ALiBi does not work with FlashAttention currently' @@ -1455,7 +1473,7 @@ def forward(self, hidden_states, attention_mask, forward_kwargs['checkpoint_core_attention'] = self.checkpoint_core_attention else: forward_kwargs['rotary_pos_emb'] = rotary_pos_emb - + for index in range(self.num_layers): layer = self._get_layer(index) @@ -1467,9 +1485,9 @@ def forward(self, hidden_states, attention_mask, # Skip counter update for eval and activation checkpointing if torch.is_grad_enabled() and self.training: self.microbatch_count += 1 - + # Final layer norm. if self.post_process and self.post_layer_norm: hidden_states = self.final_layernorm(hidden_states) - + return hidden_states From 56684aa5885f9d79c7c3b4b50639ba078397f12f Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:01:58 +0800 Subject: [PATCH 42/48] Update data_samplers.py --- megatron/data/data_samplers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/megatron/data/data_samplers.py b/megatron/data/data_samplers.py index 2722034863..8dec2c1922 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/data/data_samplers.py @@ -2,6 +2,7 @@ """Dataloaders.""" + import random import torch import numpy as np @@ -36,7 +37,7 @@ def build_pretraining_data_loader(dataset, consumed_samples): data_sharding=args.data_sharding) else: raise Exception('{} dataloader type is not supported.'.format( - args.dataloader_type)) + args.dataloader_type)) # Torch dataloader. 
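# A sketch, for illustration only, of how the random batch sampler handed to
# the DataLoader below shards and shuffles indices when data sharding is
# enabled. How current_epoch_samples is derived from consumed_samples is
# simplified here; the other names mirror the sampler code above.
import torch

def sharded_epoch_indices(total_samples, current_epoch_samples, micro_batch_size,
                          data_parallel_rank, data_parallel_size, epoch):
    micro_batch_times_dp = micro_batch_size * data_parallel_size
    # Each data-parallel rank owns one contiguous bucket of whole micro-batches.
    bucket_size = (total_samples // micro_batch_times_dp) * micro_batch_size
    bucket_offset = current_epoch_samples // data_parallel_size
    start_idx = data_parallel_rank * bucket_size
    # The shuffle is seeded by the epoch, so every rank draws the same permutation.
    g = torch.Generator()
    g.manual_seed(epoch)
    random_idx = torch.randperm(bucket_size, generator=g).tolist()
    return [start_idx + x for x in random_idx[bucket_offset:]]

# e.g. 16 samples, micro batch 2, 2 data-parallel ranks, fresh epoch:
# sharded_epoch_indices(16, 0, 2, 0, 2, epoch=0) -> a permutation of 0..7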
return torch.utils.data.DataLoader(dataset, @@ -44,7 +45,6 @@ def build_pretraining_data_loader(dataset, consumed_samples): num_workers=args.num_workers, pin_memory=True) - class MegatronPretrainingSampler: def __init__(self, total_samples, consumed_samples, micro_batch_size, @@ -157,17 +157,17 @@ def __iter__(self): # data sharding and random sampling if self.data_sharding: bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \ - * self.micro_batch_size + * self.micro_batch_size bucket_offset = current_epoch_samples // self.data_parallel_size start_idx = self.data_parallel_rank * bucket_size - + g = torch.Generator() g.manual_seed(self.epoch) random_idx = torch.randperm(bucket_size, generator=g).tolist() idx_range = [start_idx + x for x in random_idx[bucket_offset:]] else: full_bucket_size = (self.total_samples // self.micro_batch_size) \ - * self.micro_batch_size + * self.micro_batch_size full_bucket_offset = current_epoch_samples g = torch.Generator() g.manual_seed(self.epoch) From 451ffab3effca0073e5cb26217c5f31e1780be1c Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:03:52 +0800 Subject: [PATCH 43/48] Update training.py --- megatron/training.py | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/megatron/training.py b/megatron/training.py index 30f430b81e..d5e5aad575 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -56,7 +56,6 @@ def pretrain(train_valid_test_dataset_provider, forward_step_func, process_non_loss_data_func=None, extra_args_provider=None, - collator=None, args_defaults={}): """Main training program. @@ -119,15 +118,10 @@ def pretrain(train_valid_test_dataset_provider, # Data stuff. timers('train/valid/test-data-iterators-setup', log_level=0).start( barrier=True) - - data_collator = None - if collator is not None: - data_collator = collator() - if args.virtual_pipeline_model_parallel_size is not None: all_data_iterators = [ build_train_valid_test_data_iterators( - train_valid_test_dataset_provider, data_collator) + train_valid_test_dataset_provider) for _ in range(len(model)) ] train_data_iterator = [data_iterators[0] @@ -139,7 +133,7 @@ def pretrain(train_valid_test_dataset_provider, else: train_data_iterator, valid_data_iterator, test_data_iterator \ = build_train_valid_test_data_iterators( - train_valid_test_dataset_provider, data_collator) + train_valid_test_dataset_provider) timers('train/valid/test-data-iterators-setup').stop() print_datetime('after dataloaders are built') @@ -905,7 +899,7 @@ def cyclic_iter(iter): def build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider, data_collator=None): + build_train_valid_test_datasets_provider): """XXX""" args = get_args() @@ -948,10 +942,10 @@ def build_train_valid_test_data_loaders( # Build dataloders. train_dataloader = build_pretraining_data_loader( - train_ds, args.consumed_train_samples, data_collator) + train_ds, args.consumed_train_samples) valid_dataloader = build_pretraining_data_loader( - valid_ds, args.consumed_valid_samples, data_collator) - test_dataloader = build_pretraining_data_loader(test_ds, 0, data_collator) + valid_ds, args.consumed_valid_samples) + test_dataloader = build_pretraining_data_loader(test_ds, 0) # Flags to know if we need to do training/validation/testing. 
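# An illustrative sketch of the step that follows: only ranks that actually
# built the dataloaders know which splits are active, so the three booleans
# are packed into a tensor and broadcast across the tensor-model-parallel
# group (the group-scoped broadcast this patch restores). The helper name is
# hypothetical.
import torch
from megatron.core import mpu

def broadcast_split_flags(do_train, do_valid, do_test, have_dataloaders):
    if have_dataloaders:
        flags = torch.cuda.LongTensor([int(do_train), int(do_valid), int(do_test)])
    else:
        flags = torch.cuda.LongTensor([0, 0, 0])
    torch.distributed.broadcast(flags,
                                mpu.get_tensor_model_parallel_src_rank(),
                                group=mpu.get_tensor_model_parallel_group())
    return bool(flags[0].item()), bool(flags[1].item()), bool(flags[2].item())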
do_train = train_dataloader is not None and args.train_iters > 0 @@ -962,11 +956,11 @@ def build_train_valid_test_data_loaders( [int(do_train), int(do_valid), int(do_test)]) else: flags = torch.cuda.LongTensor([0, 0, 0]) + # Broadcast num tokens. - # torch.distributed.broadcast(flags, - # mpu.get_tensor_model_parallel_src_rank(), - # group=mpu.get_tensor_model_parallel_group()) - torch.distributed.broadcast(flags, 0) + torch.distributed.broadcast(flags, + mpu.get_tensor_model_parallel_src_rank(), + group=mpu.get_tensor_model_parallel_group()) args.do_train = flags[0].item() args.do_valid = flags[1].item() args.do_test = flags[2].item() @@ -975,17 +969,18 @@ def build_train_valid_test_data_loaders( def build_train_valid_test_data_iterators( - build_train_valid_test_datasets_provider, data_collator=None): + build_train_valid_test_datasets_provider): args = get_args() # Build loaders. train_dataloader, valid_dataloader, test_dataloader = \ build_train_valid_test_data_loaders( - build_train_valid_test_datasets_provider, data_collator) + build_train_valid_test_datasets_provider) + # Build iterators. dl_type = args.dataloader_type - assert dl_type in ['single', 'cyclic'], "dl_type not in 'single', 'cyclic' " + assert dl_type in ['single', 'cyclic'] if train_dataloader is not None: train_data_iterator = iter(train_dataloader) if dl_type == 'single' \ From b4a21331a58f1d2f8f0879f9d53936e9cb4889b7 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:07:14 +0800 Subject: [PATCH 44/48] Update transformer.py --- megatron/model/transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 1c39b57c46..d6155aada3 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -319,6 +319,7 @@ def forward(self, query_layer, key_layer, (output_size[0]*output_size[1], output_size[2], output_size[3]), query_layer.dtype, "mpu") + # Raw attention scores. [b * np, sq, sk] matmul_result = torch.baddbmm( matmul_input_buffer, query_layer.transpose(0, 1), # [b * np, sq, hn] From 7cf20bde63788b434ea3ecfaaaf6c1eb2d661b64 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:12:29 +0800 Subject: [PATCH 45/48] Update transformer.py --- megatron/model/transformer.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index d6155aada3..988209dde4 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -1083,10 +1083,13 @@ def __init__(self, init_method, output_layer_init_method, self.recompute_num_layers = args.recompute_num_layers self.distribute_saved_activations = \ args.distribute_saved_activations and not args.sequence_parallel + self.first_run = True self.max_cache_pos = args.max_position_embeddings - self.sequence_parallel = args.sequence_parallel self.alibi_mask = None + + self.sequence_parallel = args.sequence_parallel + # Transformer Engine Init. 
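# A sketch of one way the first_run / max_cache_pos / alibi_mask attributes
# set above are typically used; this is an assumption about usage, not the
# forward-path code of this patch. The ALiBi mask is built lazily with the
# _gen_alibi_mask helper (added to megatron/model/utils.py later in this
# series) and regenerated only when a longer sequence arrives.
from megatron.model.utils import _gen_alibi_mask

def get_cached_alibi_mask(self, seq_length):
    # `self` is assumed to be the ParallelTransformer instance configured above.
    if self.first_run or seq_length > self.max_cache_pos:
        self.first_run = False
        self.max_cache_pos = max(self.max_cache_pos, seq_length)
        self.alibi_mask = _gen_alibi_mask(self.num_attention_heads,
                                          self.max_cache_pos).to(self.params_dtype)
    return self.alibi_mask[:, :seq_length, :seq_length]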
if self.transformer_impl == 'transformer_engine': global transformer_engine From 5af1af2fcafb27a73a25d9b6f792f203eaf1f2c5 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:17:26 +0800 Subject: [PATCH 46/48] Update utils.py --- megatron/model/utils.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/megatron/model/utils.py b/megatron/model/utils.py index 2ffa36aa64..0464e298b6 100644 --- a/megatron/model/utils.py +++ b/megatron/model/utils.py @@ -54,6 +54,31 @@ def get_slopes_power_of_2(n): ) +def _fill_with_neg_inf(t): + """FP16-compatible function that fills a tensor with -inf.""" + return t.float().fill_(float("-inf")).type_as(t) + + +def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): + _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1).to(torch.cuda.current_device()) + _future_mask = _future_mask.unsqueeze(0) + alibi + new_future_mask = _future_mask.to(tensor) + return new_future_mask[: tensor.shape[0] * attn_heads, :maxpos, :maxpos] + + +def _gen_alibi_mask(num_attention_heads, max_seq_len): + slopes = torch.tensor(get_slopes(num_attention_heads), device=torch.cuda.current_device()) + alibi = ( + slopes.unsqueeze(1).unsqueeze(1) + * torch.arange(max_seq_len, device=torch.cuda.current_device()).unsqueeze(0).unsqueeze(0).expand( + num_attention_heads, -1, -1) + ) + # alibi = alibi.unsqueeze(0) + alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_seq_len, max_seq_len])), 1).to(torch.cuda.current_device()) # [max_seq_len, max_seq_len] + alibi_mask = alibi_mask.unsqueeze(0) + alibi # [num_attention_heads, max_seq_len, max_seq_len] + return alibi_mask + + def get_linear_layer(rows, columns, init_method): """Simple linear layer with weight initialization.""" layer = torch.nn.Linear(rows, columns) From 7de3e0aa2c543ed7b4c6508f9b393717f7ac9b35 Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:24:17 +0800 Subject: [PATCH 47/48] Update transformer.py --- megatron/model/transformer.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index 988209dde4..f5e0004e68 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -16,7 +16,7 @@ from megatron.model.fused_softmax import FusedScaleMaskSoftmax from megatron.model.fused_bias_gelu import bias_gelu_impl from megatron.model.rotary_pos_embedding import apply_rotary_pos_emb -from megatron.model.utils import attention_mask_func, alibi_mask_func, get_slopes, openai_gelu, erf_gelu +from megatron.model.utils import attention_mask_func, alibi_mask_func, get_slopes, _buffered_future_mask, _gen_alibi_mask, openai_gelu, erf_gelu try: from einops import rearrange @@ -58,32 +58,6 @@ hyperparameters: transformer hyperparameters """ - -def _fill_with_neg_inf(t): - """FP16-compatible function that fills a tensor with -inf.""" - return t.float().fill_(float("-inf")).type_as(t) - - -def _buffered_future_mask(tensor, maxpos, alibi, attn_heads): - _future_mask = torch.triu(_fill_with_neg_inf(torch.zeros([maxpos, maxpos])), 1).to(torch.cuda.current_device()) - _future_mask = _future_mask.unsqueeze(0) + alibi - new_future_mask = _future_mask.to(tensor) - return new_future_mask[: tensor.shape[0] * attn_heads, :maxpos, :maxpos] - - -def _gen_alibi_mask(num_attention_heads, max_seq_len): - slopes = torch.tensor(get_slopes(num_attention_heads), device=torch.cuda.current_device()) - alibi = ( - 
slopes.unsqueeze(1).unsqueeze(1) - * torch.arange(max_seq_len, device=torch.cuda.current_device()).unsqueeze(0).unsqueeze(0).expand( - num_attention_heads, -1, -1) - ) - # alibi = alibi.unsqueeze(0) - alibi_mask = torch.triu(_fill_with_neg_inf(torch.zeros([max_seq_len, max_seq_len])), 1).to(torch.cuda.current_device()) # [max_seq_len, max_seq_len] - alibi_mask = alibi_mask.unsqueeze(0) + alibi # [num_attention_heads, max_seq_len, max_seq_len] - return alibi_mask - - class DropPath(MegatronModule): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -1077,6 +1051,7 @@ def __init__(self, init_method, output_layer_init_method, self.position_embedding_type = args.position_embedding_type self.num_attention_heads = args.num_attention_heads self.params_dtype = args.params_dtype + # Store activation checkpoiting flag. self.recompute_granularity = args.recompute_granularity self.recompute_method = args.recompute_method From 251d6c5193055132246f655a07c8be30e1032d3a Mon Sep 17 00:00:00 2001 From: qyccc <910845102@qq.com> Date: Tue, 2 Jan 2024 22:28:19 +0800 Subject: [PATCH 48/48] Update transformer.py --- megatron/model/transformer.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py index f5e0004e68..425ea7e2cc 100644 --- a/megatron/model/transformer.py +++ b/megatron/model/transformer.py @@ -23,13 +23,6 @@ except ImportError: rearrange = None -# try: -# from flash_attn.flash_attn_interface import flash_attn_unpadded_func -# from flash_attn.flash_attn_triton import flash_attn_func -# except ImportError: -# flash_attn_unpadded_func = None -# flash_attn_func = None - try: from flash_attn.flash_attn_interface import flash_attn_unpadded_func except ImportError: @@ -512,12 +505,6 @@ def __init__(self, init_method, self.num_attention_heads_per_partition = core.utils.divide( args.num_attention_heads, tensor_parallel_size) - # coeff = None - # self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) - # if self.apply_query_key_layer_scaling: - # coeff = self.layer_number - # self.norm_factor *= coeff - # Thoese for softmax_scale. Current instances doesn't need for now. if self.use_flash_attn: self.core_attention_flash = FlashSelfAttention( causal=True, attention_dropout=args.attention_dropout
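To make the shapes concrete, the snippet below is a small CPU-only sketch of the ALiBi mask that patches 46-48 consolidate into megatron/model/utils.py. It re-implements the slope helper locally so it runs without the Megatron codebase and keeps tensors on CPU instead of torch.cuda.current_device(); variable names are illustrative.

import math
import torch

def get_slopes(n):
    # Geometric per-head slopes from the ALiBi paper, with a power-of-two fast path.
    def power_of_2(n):
        start = 2 ** (-2 ** -(math.log2(n) - 3))
        return [start * start ** i for i in range(n)]
    if math.log2(n).is_integer():
        return power_of_2(n)
    closest = 2 ** math.floor(math.log2(n))
    return power_of_2(closest) + get_slopes(2 * closest)[0::2][:n - closest]

num_heads, seq_len = 8, 6
slopes = torch.tensor(get_slopes(num_heads))                            # [h]
bias = slopes[:, None, None] * torch.arange(seq_len)[None, None, :]     # [h, 1, s]
causal = torch.triu(torch.full((seq_len, seq_len), float("-inf")), 1)   # [s, s]
alibi_mask = causal[None, :, :] + bias                                  # [h, s, s]
print(alibi_mask.shape)  # torch.Size([8, 6, 6]); slice [:, :L, :L] for shorter inputs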