Merge branch 'main' into lazy-export
meatybobby authored Nov 22, 2024
2 parents: 2a0bebc + 9ea442c; commit 9200abc
Showing 662 changed files with 16,514 additions and 2,803 deletions.
124 changes: 92 additions & 32 deletions .github/workflows/cicd-main.yml
@@ -38,6 +38,7 @@ jobs:
outputs:
test_to_run: ${{ steps.test_to_run.outputs.main }}
all: ${{ steps.all.outputs.main }}
event_name: ${{ steps.github-event.outputs.main }}
steps:
- name: Parse test_to_run
id: test_to_run
@@ -47,11 +48,16 @@
- name: Parse all
id: all
run: |
echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT"
echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT"
- name: Infer github event
id: github-event
run: |
echo "main=${{ github.event_name }}" | tee -a "$GITHUB_OUTPUT"
cicd-test-container-build:
if: ${{ github.event.label.name == 'Run CICD' || github.event_name == 'workflow_dispatch' }}
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
if: ${{ github.event.label.name == 'Run CICD' || needs.pre-flight.outputs.event_name == 'workflow_dispatch' }}
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/[email protected]
needs: pre-flight
with:
image-name: nemo_container
dockerfile: Dockerfile.ci
@@ -2943,7 +2949,7 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
@@ -2975,7 +2981,7 @@ jobs:
+model.data.data_impl_kwargs.workers=null \
+model.data.data_impl_kwargs.sort_dataset_paths=False
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_t5_pretraining.py \
python examples/nlp/language_modeling/megatron_t5_pretraining.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=null \
@@ -3398,8 +3404,8 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_t5_eval.py \
--model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
python examples/nlp/language_modeling/megatron_t5_eval.py \
--model_file /home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
--prompt "How do I fix my GPU memory issue? I am seeing <mask> out of memory." \
--tensor_model_parallel_size 1
@@ -3410,7 +3416,7 @@
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
python examples/nlp/language_modeling/tuning/megatron_t5_finetuning.py \
trainer.devices=2 \
trainer.log_every_n_steps=1 \
trainer.max_epochs=9999 \
@@ -3421,7 +3427,7 @@
exp_manager.exp_dir=/tmp/nlp_mcore_t5_lora_tuning_tp2 \
model.pipeline_model_parallel_size=1 \
model.tensor_model_parallel_size=2 \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
model.peft.peft_scheme=lora \
model.answer_only_loss=True \
model.micro_batch_size=1 \
@@ -3433,8 +3439,8 @@
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \
model.data.validation_ds.names=[quarel]
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m.nemo \
python examples/nlp/language_modeling/tuning/megatron_t5_generate.py \
model.restore_from_path=/home/TestData/nlp/megatron_t5/220m/megatron_mcore_t5_220m_padding_attnmasktype.nemo \
model.peft.restore_from_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/megatron_t5_peft_lora_tuning/checkpoints/megatron_t5_peft_lora_tuning.nemo \
model.peft.restore_from_ckpt_name=null \
model.peft.restore_from_hparams_path=null \
@@ -3451,7 +3457,20 @@
inference.repetition_penalty=1.0 \
inference.outfile_path=/tmp/nlp_mcore_t5_lora_tuning_tp2/out.jsonl
# L2: Megatron Mock Data Generation
L2_HF_Transformer_SFT_TE_Acceleration:
needs: [ cicd-test-container-setup ]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_HF_Transformer_SFT_TE_Acceleration') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python examples/llm/sft/hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --model-accelerator te
AFTER_SCRIPT: |
rm -rf nemo_experiments
# L2: Megatron Mock Data Generation
L2_Megatron_Mock_Data_Generation_MockGPTDataset:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -3572,12 +3591,12 @@ jobs:
# timeout-minutes: 10
# container:
# image: nemoci.azurecr.io/nemo_container:${{ github.run_id }}
# options:
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
@@ -3637,12 +3656,12 @@ jobs:
# runs-on: self-hosted-azure
# container:
# image: nemoci.azurecr.io/nemo_container:${{ github.run_id }}
# options:
# options:
# # --user 0:128
# --device=/dev/nvidia0
# --gpus all
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --shm-size=8g
# --env TRANSFORMERS_OFFLINE=0
# --env HYDRA_FULL_ERROR=1
# --volume /mnt/datadrive/TestData:/home/TestData
# steps:
@@ -3852,14 +3871,14 @@ jobs:
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
python tests/collections/llm/megatron_t5_pretraining.py \
--devices=2 \
--max-steps=3 \
--experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document \
--index-mapping-dir=tests/collections/llm/t5_index_mappings/${{ github.run_id }}
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_pretraining.py \
python tests/collections/llm/megatron_t5_pretraining.py \
--devices=2 \
--max-steps=6 \
--experiment-dir=tests/collections/llm/t5_pretrain_results/${{ github.run_id }} \
@@ -3876,11 +3895,11 @@
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--experiment-dir=tests/collections/llm/t5_finetune_results/${{ github.run_id }} \
--checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
--checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_finetune_results/${{ github.run_id }}
@@ -3891,12 +3910,12 @@
with:
RUNNER: self-hosted-azure
SCRIPT: |
NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python tests/collections/llm/megatron_t5_finetuning.py \
python tests/collections/llm/megatron_t5_finetuning.py \
--devices=2 \
--max-steps=250 \
--peft=lora \
--experiment-dir=tests/collections/llm/t5_peft_results/${{ github.run_id }} \
--checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_150steps
--checkpoint-path=/home/TestData/nlp/megatron_t5/220m/nemo2.0_t5_220m_padding_attnmasktype_150steps
AFTER_SCRIPT: |
rm -rf tests/collections/llm/t5_peft_results/${{ github.run_id }}
@@ -4199,6 +4218,34 @@ jobs:
--pp_size 1 \
--mbs 1 --packed
L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
--peft dora \
--tp_size 1 \
--pp_size 1 \
--mbs 1 --packed
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
--peft dora \
--tp_size 1 \
--pp_size 1 \
--mbs 1 --packed
L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4272,7 +4319,7 @@ jobs:
--mbs 1 \
--model mistral \
--dist-opt
L2_NEMO_2_LoRA_MERGE:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NEMO_2_LoRA_MERGE') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/peft/lora_merge.py \
--lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint/ \
--output_path=/tmp/nemo2_lora_merge/${{ github.run_id }}
L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact:
needs: [cicd-test-container-setup]
@@ -4299,7 +4357,7 @@ jobs:
rm -rf /tmp/nemo2_ptq_engine
Nemo_CICD_Test:
needs:
needs:
- pre-flight
- cicd-test-container-setup

@@ -4314,7 +4372,7 @@ jobs:
- L0_Unit_Tests_GPU_Hydra
- L0_Unit_Tests_GPU_Lightning
- L0_Unit_Tests_GPU_Others

- L0_Unit_Tests_CPU_ASR
- L0_Unit_Tests_CPU_Audio
- L0_Unit_Tests_CPU_Common
@@ -4412,7 +4470,8 @@ jobs:
- L2_NeMo_2_GPT_Pretraining_no_transformer_engine
- L2_NeMo_2_GPT_DDP_Param_Parity_check
- L2_NeMo_2_HF_MODEL_IMPORT
- L2_NeMo_2_llama3_pretraining_recipe
- L2_NeMo_2_llama3_pretraining_recipe
- L2_HF_Transformer_SFT_TE_Acceleration
- L2_NeMo_2_SSM_Pretraining
- L2_NeMo_2_SSM_Finetuning
- L2_NeMo_2_T5_Pretraining
@@ -4428,6 +4487,7 @@ jobs:
- L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2
- L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
- L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
- L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
@@ -4450,7 +4510,7 @@ jobs:
- L2_NeMo_2_PTQ_Llama2_FP8
if: always()
runs-on: ubuntu-latest
steps:
steps:
- name: Evaluate conclusion
if: ${{ always() }}
id: pipeline-conclusion
@@ -4464,14 +4524,14 @@
echo "SUCCESS=$SUCCESS" >> $GITHUB_OUTPUT
# This should depend on all the tests so we block/unblock based on all tests passing
- name: Pipeline successful, set exit code to 0
- name: Pipeline successful, set exit code to 0
if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' }}
run: exit 0

- name: Pipeline successful, add PR comment
- name: Pipeline successful, add PR comment
if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'true' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }}
uses: peter-evans/create-or-update-comment@v4
env:
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
REPOSITORY: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
@@ -4490,7 +4550,7 @@ jobs:
- name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
if: ${{ always() && steps.pipeline-conclusion.outputs.FAILED == 'true' && env.SLACK_WEBHOOK != '' }}
env:
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -4583,4 +4643,4 @@ jobs:
- name: "Pipeline not successful, set exit code to 1"
if: ${{ always() && steps.pipeline-conclusion.outputs.SUCCESS == 'false' }}
run: exit 1
run: exit 1
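
For context on the jobs added above (L2_HF_Transformer_SFT_TE_Acceleration, L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED, L2_NEMO_2_LoRA_MERGE): each one is gated the same way as the existing tests. A pre-flight job publishes its results through $GITHUB_OUTPUT, and a test job runs only when its name appears in test_to_run or when the all flag is true. The sketch below is a minimal, illustrative reduction of that pattern, not an excerpt from this workflow; the job name some-test and the hard-coded test list are assumptions made for the example.

jobs:
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      test_to_run: ${{ steps.test_to_run.outputs.main }}
      all: ${{ steps.all.outputs.main }}
      event_name: ${{ steps.github-event.outputs.main }}
    steps:
      - name: Parse test_to_run
        id: test_to_run
        run: |
          # Hypothetical fixed list; the real workflow parses a dispatch input.
          echo 'main=["some-test"]' | tee -a "$GITHUB_OUTPUT"
      - name: Parse all
        id: all
        run: |
          echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT"
      - name: Infer github event
        id: github-event
        run: |
          echo "main=${{ github.event_name }}" | tee -a "$GITHUB_OUTPUT"

  some-test:
    needs: [pre-flight]
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'some-test') || needs.pre-flight.outputs.all == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "Runs only when selected explicitly or when 'all' is requested."

In the same spirit, cicd-test-container-build now reads the event name from needs.pre-flight.outputs.event_name instead of github.event_name directly, which is why it gains an explicit needs: pre-flight dependency.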
2 changes: 1 addition & 1 deletion .github/workflows/monitor-vms.yml
@@ -27,7 +27,7 @@ jobs:
| jq -c '[
.runners[]
| select(.status == "online")
| select(.name | contains("gpu"))
| select(.name | contains("cpu") | not)
| {
"vm": .name,
"n_gpus": [
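
The monitor-vms.yml change inverts the runner filter: instead of keeping online runners whose names contain "gpu", the query now keeps every online runner whose name does not contain "cpu", presumably so GPU machines whose names do not literally include "gpu" are still monitored. A small, self-contained illustration of the new jq filter follows; the step name and the inline runner payload are made up for the example, while the real workflow gets this JSON from the GitHub API.

    steps:
      - name: Filter online GPU runners (illustrative)
        run: |
          # Fake runner payload standing in for the API response.
          echo '{"runners":[{"name":"azure-gpu-01","status":"online"},{"name":"azure-cpu-01","status":"online"},{"name":"azure-gpu-02","status":"offline"}]}' \
            | jq -c '[
                .runners[]
                | select(.status == "online")
                | select(.name | contains("cpu") | not)
                | { "vm": .name }
              ]'
          # Prints: [{"vm":"azure-gpu-01"}]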
3 changes: 2 additions & 1 deletion .github/workflows/release.yml
@@ -23,7 +23,7 @@ on:

jobs:
release:
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.10.0
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -39,3 +39,4 @@ jobs:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
PAT: ${{ secrets.PAT }}
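
Two things change in release.yml: the reusable release template is bumped from v0.10.0 to v0.12.3, and a PAT secret is now forwarded to it. Assuming the surrounding (unshown) lines are the job's secrets: mapping, which is what the ${{ secrets.* }} values suggest, the call would look roughly like this sketch; the template's other inputs are not part of this diff and are abbreviated.

jobs:
  release:
    # Assumed shape of the reusable-workflow call; only the lines visible in the diff are confirmed.
    uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
    with:
      release-ref: ${{ inputs.release-ref }}
      image-name: nemo_container
    secrets:
      TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
      TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
      SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
      PAT: ${{ secrets.PAT }}  # new in this commit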
10 changes: 7 additions & 3 deletions .secrets.baseline
@@ -90,6 +90,10 @@
{
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
},
{
"path": "detect_secrets.filters.common.is_baseline_file",
"filename": ".secrets.baseline"
},
{
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
"min_level": 2
@@ -273,7 +277,7 @@
"filename": "scripts/checkpoint_converters/convert_mistral_7b_hf_to_nemo.py",
"hashed_secret": "e0308bd21bffc156d79208f9ecf130370a015002",
"is_verified": false,
"line_number": 460
"line_number": 471
}
],
"scripts/dataset_processing/nlp/intent_and_slot/assistant_utils.py": [
@@ -1929,7 +1933,7 @@
"filename": "tutorials/speaker_tasks/Speaker_Diarization_Inference.ipynb",
"hashed_secret": "80903ddedcf4ec0a2ee5911cefa7e1ad52419dcc",
"is_verified": false,
"line_number": 989
"line_number": 990
}
],
"tutorials/tools/DefinedCrowd_x_NeMo_ASR_Training_Tutorial.ipynb": [
@@ -2083,5 +2087,5 @@
}
]
},
"generated_at": "2024-10-25T13:43:17Z"
"generated_at": "2024-11-14T09:37:19Z"
}
2 changes: 1 addition & 1 deletion Dockerfile.ci
@@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
ARG MCORE_TAG=aded519cfb1de2abf96f36ca059f992294b7876f
ARG MCORE_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \