Merge branch 'main' into xren/cp_debug
xrennvidia committed Jul 30, 2024
2 parents 242d77c + 86bfac2 commit c7dff10
Showing 24 changed files with 736 additions and 211 deletions.
59 changes: 58 additions & 1 deletion .github/workflows/cicd-main.yml
@@ -3304,6 +3304,62 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/lora_tuning_tp2

  L2_Megatron_GPT_PEFT_Lora_TP2SP1:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    with:
      RUNNER: self-hosted-azure-gpus-2-h100
      SCRIPT: |
        rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1
        CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \
            trainer.devices=2 \
            trainer.log_every_n_steps=1 \
            trainer.max_epochs=9999 \
            trainer.max_steps=3 \
            trainer.val_check_interval=3 \
            ++trainer.limit_val_batches=2 \
            trainer.precision=bf16 \
            exp_manager.exp_dir=/home/TestData/nlp/lora_tuning_tp2_sp1 \
            +model.mcore_gpt=True \
            model.pipeline_model_parallel_size=1 \
            model.tensor_model_parallel_size=2 \
            model.sequence_parallel=True \
            model.megatron_amp_O2=True \
            model.restore_from_path=/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo \
            +model.fp8=True \
            +model.fp8_params=True \
            +model.fp8_hybrid=True \
            +model.fp8_e4m3=False \
            +model.fp8_interval=1 \
            +model.fp8_margin=0 \
            +model.fp8_amax_history_len=32 \
            +model.fp8_amax_compute_algo=max \
            +model.reduce_amax=False \
            +model.ub_tp_comm_overlap=False \
            +model.tp_comm_overlap_ag=False \
            +model.tp_comm_overlap_rs=False \
            +model.tp_comm_overlap_disable_qkv=True \
            model.peft.peft_scheme='lora' \
            model.peft.lora_tuning.adapter_dim=16 \
            model.peft.lora_tuning.alpha=32 \
            model.peft.lora_tuning.column_init_method="kaiming" \
            +model.peft.lora_tuning.dropout_position='pre' \
            model.peft.lora_tuning.target_modules=['attention'] \
            model.peft.lora_tuning.adapter_dropout=0.1 \
            +model.peft.lora_tuning.a2a_experimental=1 \
            model.answer_only_loss=True \
            model.micro_batch_size=1 \
            model.global_batch_size=1 \
            model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
            model.data.train_ds.concat_sampling_probabilities=[1.0] \
            model.data.train_ds.num_workers=0 \
            model.data.validation_ds.num_workers=0 \
            model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
            model.data.validation_ds.names=[quarel]
      AFTER_SCRIPT: |
        rm -rf /home/TestData/nlp/lora_tuning_tp2_sp1

  L2_Megatron_GPT_Eval:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
@@ -3858,7 +3914,7 @@ jobs:
trainer.precision=16 \
trainer.gradient_clip_val=1.0 \
exp_manager.exp_dir=examples/nlp/language_modeling/t5_pretrain_results \
model.pipeline_model_parallel_split_rank=1 \
model.pipeline_model_parallel_split_rank=0 \
model.seq_length=256 \
model.encoder.num_layers=4 \
model.decoder.num_layers=1 \
@@ -4631,6 +4687,7 @@ jobs:
- L2_Megatron_GPT_Embedding
- L2_Megatron_GPT_PEFT_Lora_PP2_O2
- L2_Megatron_GPT_PEFT_Lora_TP2_O1
- L2_Megatron_GPT_PEFT_Lora_TP2SP1
- L2_Megatron_GPT_Eval
- L2_Megatron_GPT_Eval_PP2
- L2_Megatron_GPT_SFT_Eval_inference_seq_len_greaterThan_training_seq_len
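As background for the FP8 overrides in the new L2_Megatron_GPT_PEFT_Lora_TP2SP1 job above: the fp8_hybrid, fp8_margin, fp8_amax_history_len, and fp8_amax_compute_algo settings typically map onto a Transformer Engine delayed-scaling recipe along the lines sketched below. This is a hedged illustration of the general mechanism, not code from this commit; the layer shapes are placeholders.

# Hedged sketch: how the +model.fp8_* overrides above roughly correspond to a
# Transformer Engine delayed-scaling FP8 recipe. Not taken from this commit.
import torch
import transformer_engine.pytorch as te
from transformer_engine.common.recipe import DelayedScaling, Format

fp8_recipe = DelayedScaling(
    margin=0,                  # fp8_margin=0
    fp8_format=Format.HYBRID,  # fp8_hybrid=True: E4M3 forward, E5M2 backward
    amax_history_len=32,       # fp8_amax_history_len=32
    amax_compute_algo="max",   # fp8_amax_compute_algo=max
)

layer = te.Linear(1024, 1024, bias=True).cuda()   # placeholder layer size
inp = torch.randn(16, 1024, device="cuda")

# GEMMs inside this context run in FP8 with delayed, amax-history-based scaling.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = layer(inp)
out.sum().backward()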
3 changes: 1 addition & 2 deletions Dockerfile.ci
@@ -34,7 +34,7 @@ WORKDIR /workspace
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.13.0
ARG MCORE_TAG=c7a1f82d761577e6ca0338d3521eac82f2aa0904
ARG MCORE_TAG=2bbe55be32e2d478c4b2ce575af1cccb8fc3d9b9
ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
@@ -90,4 +90,3 @@ chmod 777 -R /workspace
EOF

ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"

1 change: 0 additions & 1 deletion examples/llm/megatron_gpt_pretraining.py
@@ -65,7 +65,6 @@ def get_args():
checkpoint_callback = ModelCheckpoint(
    every_n_train_steps=5000,
    enable_nemo_ckpt_io=False,
    async_save=False,
)
callbacks = [checkpoint_callback]

@@ -77,6 +77,11 @@ model:
vocab_file: null
merge_file: null

# embedding-specific arguments
softmax_temp: 0.02 # softmax temp for contrastive loss
global_inbatch_negatives: True # whether to use in-batch negatives from other ranks during training
backprop_type: 'global' # whether to use `global` or `local` backpropagation during training. Refer to Flava paper for details.

# precision
native_amp_init_scale: 4294967296 # 2 ** 32
native_amp_growth_interval: 1000
@@ -93,7 +98,7 @@ model:
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)

## Activation Checkpointing
# NeMo Megatron supports 'selective' activation checkpointing where only the memory intensive part of attention is checkpointed.
# These memory intensive activations are also less compute intensive which makes activation checkpointing more efficient for LLMs (20B+).
@@ -127,7 +132,7 @@ model:
# Path to data must be specified by the user.
data_train: null
data_validation: null
hard_negatives_to_train: 4
hard_negatives_to_train: 4 # number of hard negatives to use per example for training
index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix
data_impl: mmap
splits_string: 900,50,50
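To make the new embedding options above concrete (softmax_temp, global_inbatch_negatives, hard_negatives_to_train), here is a minimal, self-contained sketch of a temperature-scaled contrastive loss with in-batch negatives. It illustrates the general technique only and is not the NeMo implementation; the cross-rank gathering behind "global" in-batch negatives is indicated only in a comment.

# Minimal sketch of a temperature-scaled contrastive loss with in-batch
# negatives. Illustrative only; not the NeMo implementation.
import torch
import torch.nn.functional as F

def contrastive_loss(query_emb, passage_emb, temperature=0.02):
    """query_emb, passage_emb: [batch, dim] embeddings; row i of each forms a positive pair."""
    q = F.normalize(query_emb, dim=-1)
    p = F.normalize(passage_emb, dim=-1)
    # With global in-batch negatives, q and p would first be all-gathered across
    # data-parallel ranks (e.g. via torch.distributed.all_gather), so every rank
    # scores against batch_size * world_size candidates. Mined hard negatives
    # (hard_negatives_to_train) would be appended as extra passage columns.
    logits = q @ p.t() / temperature                    # [batch, batch] similarity matrix
    labels = torch.arange(q.size(0), device=q.device)   # diagonal entries are the positives
    return F.cross_entropy(logits, labels)

loss = contrastive_loss(torch.randn(8, 128), torch.randn(8, 128))
print(loss.item())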
@@ -37,7 +37,7 @@ def main(cfg) -> None:
    model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg)

    assert (
        model_cfg.micro_batch_size * cfg.trainer.devices == model_cfg.global_batch_size
        model_cfg.micro_batch_size * cfg.trainer.devices * cfg.trainer.num_nodes == model_cfg.global_batch_size
    ), "Gradient accumulation is not supported for contrastive learning yet"

    OmegaConf.set_struct(model_cfg, True)
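For example, with model.micro_batch_size=4, trainer.devices=8, and trainer.num_nodes=2, the updated check requires model.global_batch_size to be exactly 64; any larger global batch would imply gradient accumulation, which this script still rejects.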
@@ -0,0 +1,56 @@
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.multiprocessing as mp
from omegaconf.omegaconf import OmegaConf, open_dict

from nemo.collections.nlp.models.information_retrieval.megatron_bert_embedding_model import MegatronBertEmbeddingModel
from nemo.collections.nlp.parts.megatron_trainer_builder import MegatronBertTrainerBuilder
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


@hydra_runner(config_path="conf", config_name="megatron_bert_embedding_config")
def main(cfg) -> None:
    if cfg.model.data.dataloader_type != "LDDL":
        mp.set_start_method("spawn", force=True)

    logging.info("\n\n************** Experiment configuration ***********")
    logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

    trainer = MegatronBertTrainerBuilder(cfg).create_trainer()
    exp_manager(trainer, cfg.exp_manager)

    model_cfg = MegatronBertEmbeddingModel.merge_cfg_with(cfg.restore_from_path, cfg)

    OmegaConf.set_struct(model_cfg, True)
    with open_dict(model_cfg):
        model_cfg.precision = trainer.precision

    logging.info(f"Loading model from {cfg.restore_from_path}")
    model = MegatronBertEmbeddingModel.restore_from(
        restore_path=cfg.restore_from_path,
        trainer=trainer,
        save_restore_connector=NLPSaveRestoreConnector(),
        override_config_path=model_cfg,
        strict=True,
    )

    trainer.test(model)


if __name__ == '__main__':
    main()
35 changes: 1 addition & 34 deletions nemo/collections/llm/__init__.py
@@ -42,24 +42,7 @@
    gpt_data_step,
    gpt_forward_step,
)
from nemo.collections.llm.gpt.model.api import (
    code_gemma_2b,
    code_gemma_7b,
    code_llama_7b,
    code_llama_13b,
    code_llama_34b,
    code_llama_70b,
    gemma,
    gemma_2b,
    gemma_7b,
    llama2_7b,
    llama2_13b,
    llama2_70b,
    llama3_8b,
    llama3_70b,
    mistral,
    mixtral,
)
from nemo.collections.llm.recipes import * # noqa

__all__ = [
    "MockDataModule",
@@ -103,21 +86,5 @@
    "mock",
    "squad",
    "dolly",
    "mistral",
    "mixtral",
    "llama2_7b",
    "llama3_8b",
    "llama2_13b",
    "llama2_70b",
    "llama3_70b",
    "code_llama_7b",
    "code_llama_13b",
    "code_llama_34b",
    "code_llama_70b",
    "gemma",
    "gemma_2b",
    "gemma_7b",
    "code_gemma_2b",
    "code_gemma_7b",
    "peft",
]
125 changes: 0 additions & 125 deletions nemo/collections/llm/gpt/model/api.py

This file was deleted.

13 changes: 13 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -0,0 +1,13 @@
from nemo.collections.llm.recipes import llama2_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, mistral
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim import adam

__all__ = [
    "llama3_8b",
    "llama3_8b_16k",
    "llama3_8b_64k",
    "llama2_7b",
    "mistral",
    "adam",
    "default_log",
]
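A brief, hedged illustration of the new entry point (assuming a NeMo install containing this commit): the per-model recipe modules are now reached through the recipes subpackage rather than the helper functions removed from nemo.collections.llm.__init__ above.

# Hedged sketch: the recipes subpackage re-exports the recipe modules and
# helpers listed in __all__ above. Assumes NeMo is installed with this commit.
from nemo.collections.llm import recipes

print(recipes.__all__)       # ['llama3_8b', 'llama3_8b_16k', ...]
print(recipes.llama3_8b)     # the Llama 3 8B recipe module
print(recipes.default_log)   # logging helper re-exported from recipes.log.default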
