diff --git a/configs/common/data/gpt_dataset.py b/configs/common/data/gpt_dataset.py
index 781eef1b0..0bcccec42 100644
--- a/configs/common/data/gpt_dataset.py
+++ b/configs/common/data/gpt_dataset.py
@@ -20,40 +20,40 @@
 dataloader = OmegaConf.create()
 
-dataloader.train = LazyCall(build_nlp_train_val_test_loader)(
-    dataset=[
-        LazyCall(GPT2Dataset)(
-            name="gpt-2",
-            data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
-            indexed_dataset=LazyCall(get_indexed_dataset)(
-                data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
-                data_impl="mmap",
-                skip_warmup=False,
-            ),
-            max_seq_length=1024,
-            seed=1234,
-        ),
-    ],
-    train_val_test_num_samples=None,  # a hint for deferred assignment
-    splits=[[949.0, 50.0, 1.0]],
-    weights=[1.0],
-    num_workers=4,
-)
+# dataloader.train = LazyCall(build_nlp_train_val_test_loader)(
+#     dataset=[
+#         LazyCall(GPT2Dataset)(
+#             name="gpt-2",
+#             data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
+#             indexed_dataset=LazyCall(get_indexed_dataset)(
+#                 data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
+#                 data_impl="mmap",
+#                 skip_warmup=False,
+#             ),
+#             max_seq_length=1024,
+#             seed=1234,
+#         ),
+#     ],
+#     train_val_test_num_samples=None,  # a hint for deferred assignment
+#     splits=[[949.0, 50.0, 1.0]],
+#     weights=[1.0],
+#     num_workers=4,
+# )
 
-dataloader.test = [
-    LazyCall(build_nlp_test_loader)(
-        dataset=LazyCall(GPT2Dataset)(
-            name="gpt-2",
-            data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
-            indexed_dataset=LazyCall(get_indexed_dataset)(
-                data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
-                data_impl="mmap",
-                skip_warmup=False,
-            ),
-            max_seq_length=1024,
-            max_num_samples=10,
-            seed=1234,
-        ),
-        test_batch_size=4,
-    )
-]
+# dataloader.test = [
+#     LazyCall(build_nlp_test_loader)(
+#         dataset=LazyCall(GPT2Dataset)(
+#             name="gpt-2",
+#             data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
+#             indexed_dataset=LazyCall(get_indexed_dataset)(
+#                 data_prefix="/workspace/data/libai_dataset/loss_compara_content_sentence",
+#                 data_impl="mmap",
+#                 skip_warmup=False,
+#             ),
+#             max_seq_length=1024,
+#             max_num_samples=10,
+#             seed=1234,
+#         ),
+#         test_batch_size=4,
+#     )
+# ]
diff --git a/configs/common/optim.py b/configs/common/optim.py
index 7cd5f2cf0..8e062f2bf 100644
--- a/configs/common/optim.py
+++ b/configs/common/optim.py
@@ -7,8 +7,8 @@
     params=LazyCall(get_default_optimizer_params)(
         # params.model is meant to be set to the model object,
         # before instantiating the optimizer.
-        clip_grad_max_norm=1.0,
-        clip_grad_norm_type=2.0,
+        # clip_grad_max_norm=1.0,
+        # clip_grad_norm_type=2.0,
         weight_decay_norm=0.0,
         weight_decay_bias=0.0,
     ),
diff --git a/libai/config/configs b/libai/config/configs
new file mode 120000
index 000000000..4528f6785
--- /dev/null
+++ b/libai/config/configs
@@ -0,0 +1 @@
+/home/zhangxiaoyu/libai/configs
\ No newline at end of file
diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py
index 2481a02fa..b4dcdda66 100644
--- a/libai/inference/generator/generation_utils.py
+++ b/libai/inference/generator/generation_utils.py
@@ -524,7 +524,7 @@ def greedy_search(
             # if eos_token was found in one sentence, set sentence to finished
             if eos_token_id is not None:
                 unfinished_sequences = flow.mul(
-                    unfinished_sequences, (next_tokens != eos_token_id).long()
+                    unfinished_sequences, (next_tokens.to(flow.int32) != eos_token_id).long()
                 )
 
             if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
diff --git a/libai/layers/activation.py b/libai/layers/activation.py
index f394b0dce..636552fae 100644
--- a/libai/layers/activation.py
+++ b/libai/layers/activation.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 from enum import Enum
 from typing import Optional
diff --git a/libai/layers/embedding.py b/libai/layers/embedding.py
index c96e6b3df..f0a5cafb3 100644
--- a/libai/layers/embedding.py
+++ b/libai/layers/embedding.py
@@ -148,8 +148,10 @@ def __init__(
             )
         )
         # Initialize the word embedding
+        if os.getenv("ONEFLOW_LINEAR_EMBEDDING_SKIP_INIT", "0") != "1":
             self.init_method(self.weight)
+
         # FIXME(Lxy): Fill padding_idx is not supported in nd_sbp right now.
         # self._fill_padding_idx_with_zero()
@@ -164,7 +166,6 @@ def forward(self, input_ids):
         input_embeds = flow._C.gather(weight, input_ids, axis=0)
         # Set the embeds sbp from [S(0), P] --> [S(0), B] to get complete embedding results.
         input_embeds = input_embeds.to_global(sbp=dist.get_hidden_sbp())
-
         return input_embeds
 
     def _fill_padding_idx_with_zero(self) -> None:
diff --git a/libai/models/gpt_model.py b/libai/models/gpt_model.py
index 27f6bc8e9..cc65a99bc 100644
--- a/libai/models/gpt_model.py
+++ b/libai/models/gpt_model.py
@@ -203,7 +203,6 @@ def forward(self, input_ids):
         Returns:
             flow.Tensor: logits
         """
-        input_ids = input_ids.to_global(placement=dist.get_layer_placement(0))
 
         input_embeds = self.embeddings(input_ids, 0)
diff --git a/libai/tokenizer/tokenization_base.py b/libai/tokenizer/tokenization_base.py
index d5bd5a9ea..e2e439e04 100644
--- a/libai/tokenizer/tokenization_base.py
+++ b/libai/tokenizer/tokenization_base.py
@@ -783,7 +783,7 @@ def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **kwargs):
         elif is_global:
             sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
             placement = kwargs.get(
-                "placement", flow.placement("cuda", list(range(dist.get_world_size())))
+                "placement", flow.placement("mlu", list(range(dist.get_world_size())))
             )
             return_token_ids = flow.tensor(
                 token_ids, sbp=sbp, placement=placement, dtype=flow.long
diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py
index e7914a0ad..4dd934c6f 100644
--- a/libai/utils/distributed.py
+++ b/libai/utils/distributed.py
@@ -71,7 +71,7 @@ def _init_distributed_env(self, cfg):
             self._world_size = num_gpus_per_node * num_nodes
 
         # Add set device type
-        self._device_type = try_get_key(cfg, "device_type", default="cuda")
+        self._device_type = try_get_key(cfg, "device_type", default="mlu")
 
     def _init_parallel_size(self, cfg):
 
@@ -438,7 +438,7 @@ def convert_to_distributed_default_setting(t):
 def ttol(tensor, pure_local=False, ranks=None):
     """Global tensor to local tensor."""
     if tensor.is_global:
-        placement = tensor.placement if not ranks else flow.placement("cuda", ranks)
+        placement = tensor.placement if not ranks else flow.placement("mlu", ranks)
         if pure_local:
             tensor = tensor.to_global(placement=placement).to_local()
         else:
diff --git a/libai/version.py b/libai/version.py
new file mode 100644
index 000000000..e7d545026
--- /dev/null
+++ b/libai/version.py
@@ -0,0 +1,2 @@
+__version__ = '0.2.0'
+git_version = 'd47fe9c7908c1e7a4604574b2bd58a7e06d20645'
diff --git a/projects/MagicPrompt/configs/gpt2_inference.py b/projects/MagicPrompt/configs/gpt2_inference.py
index 87bd99eec..565fcdc1e 100644
--- a/projects/MagicPrompt/configs/gpt2_inference.py
+++ b/projects/MagicPrompt/configs/gpt2_inference.py
@@ -4,7 +4,9 @@
 from projects.MagicPrompt.gpt2 import GPTModel, GPTForPreTraining
 from configs.common.data.gpt_dataset import tokenization
 from configs.common.train import train
+from configs.common.models.graph import graph
+graph.enabled=True
 
 cfg.update(
     # Model
@@ -56,13 +58,13 @@
     sep_token_id=None,
     decoder_start_token_id=None,
     # train
-    pretrained_model_path="/data/home/magicprompt",
+    pretrained_model_path="./oneflow-model",
 )
 
 model = LazyCall(GPTModel)(cfg=cfg)
 pretrain_model = LazyCall(GPTForPreTraining)(cfg=cfg)
 tokenization.tokenizer = LazyCall(mock_tokenization.GPT2Tokenizer)(
-    vocab_file="/data/home/magicprompt/vocab.json",
-    merges_file="/data/home/magicprompt/merges.txt",
+    vocab_file="./oneflow-model/vocab.json",
+    merges_file="./oneflow-model/merges.txt",
 )
diff --git a/projects/MagicPrompt/configs/gpt2_training.py b/projects/MagicPrompt/configs/gpt2_training.py
index f7458e427..b2cecdb36 100644
--- a/projects/MagicPrompt/configs/gpt2_training.py
+++ b/projects/MagicPrompt/configs/gpt2_training.py
@@ -9,10 +9,12 @@
 from configs.common.train import train
 from configs.common.models.graph import graph
 
+graph.enabled=False
+graph.debug = 2
-vocab_file = "/data/home/magicprompt/vocab.json"
-merge_files = "/data/home/magicprompt/merges.txt"
-train_data_prefix = "/data/home/magicprompt/train/en_train_mmap_text_sentence"
+vocab_file = "./magicprompt/vocab.json"
+merge_files = "./magicprompt/merges.txt"
+train_data_prefix = "./magicprompt/train/en_train_mmap_text_sentence"
 
 tokenization.tokenizer.vocab_file = vocab_file
 tokenization.tokenizer.merges_file = merge_files
@@ -33,9 +35,9 @@
 model.cfg.vocab_size = 50257
 model.cfg.layernorm_epsilon = 1e-5
 model.cfg.use_scaled_init_for_output_weights = True
-model.cfg.bias_gelu_fusion = True
-model.cfg.bias_dropout_fusion = True
-model.cfg.scale_mask_softmax_fusion = True
+model.cfg.bias_gelu_fusion = False
+model.cfg.bias_dropout_fusion = False
+model.cfg.scale_mask_softmax_fusion = False
 model.cfg.apply_query_key_layer_scaling = True
 model.cfg.apply_residual_post_layernorm = False
 model.cfg.amp_enabled = True
@@ -56,14 +58,16 @@
     test_micro_batch_size=4,
     train_epoch=33,
     train_iter=10000,
-    log_period=50,
+    log_period=1,
     amp=dict(enabled=True),
     warmup_ratio=0,
     checkpointer=dict(period=8000, max_to_keep=20),
     dist=dict(
-        data_parallel_size=1,
+        data_parallel_size=4,
         tensor_parallel_size=1,
         pipeline_parallel_size=1,
+        # pipeline_num_layers = 12,
+        # custom_pipeline_stage_id = [0] * 6 + [1] * 6,
         # pipeline_num_layers=model.cfg.hidden_layers,
     ),
     scheduler=LazyCall(WarmupExponentialLR)(
diff --git a/projects/MagicPrompt/pipeline.py b/projects/MagicPrompt/pipeline.py
index 56d1fa0d1..919149be2 100644
--- a/projects/MagicPrompt/pipeline.py
+++ b/projects/MagicPrompt/pipeline.py
@@ -97,7 +97,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict:
         pipeline_parallel=1,
         # pipeline_stage_id=[0] * 6 + [1] * 6,
         # pipeline_num_layers=12,
-        model_path="/path/to/oneflow-model",
+        model_path="oneflow-model/model",
         mode="libai",
     )
diff --git a/tools/train_net.py b/tools/train_net.py
index 458849c75..a7c684910 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -20,6 +20,7 @@
 import numpy as np
 import oneflow as flow
+import oneflow_mlu
 
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
 from libai.config import LazyConfig, default_argument_parser, try_get_key
@@ -36,7 +37,8 @@ def main(args):
     seed_for_rank = cfg.train.seed + flow.env.get_rank()
     flow.manual_seed(seed_for_rank)
-    flow.cuda.manual_seed(seed_for_rank)
+    # flow.cuda.manual_seed(seed_for_rank)
+    flow._oneflow_internal.default_generator("mlu").manual_seed(seed_for_rank)
     np.random.seed(seed_for_rank)
     random.seed(seed_for_rank)