Skip to content

Commit

Permalink
Merge branch 'main' into xren/nsys_profiling
Browse files Browse the repository at this point in the history
  • Loading branch information
xrennvidia committed Aug 12, 2024
2 parents 24f4141 + d6cfdc0 commit 2da1e47
Show file tree
Hide file tree
Showing 11 changed files with 160 additions and 182 deletions.
31 changes: 11 additions & 20 deletions .github/workflows/_test_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,30 +34,18 @@ on:
description: Last 2000 characters of the test step's log
value: ${{ jobs.main.outputs.log }}
jobs:
runner-auto-clean:
runs-on: ${{ inputs.RUNNER }}
steps:
- name: Docker system cleanup
run: |
docker system prune -a --filter "until=48h" --force
main:
runs-on: ${{ inputs.RUNNER }}
outputs:
conclusion: ${{ steps.main.conclusion }}
log: ${{ steps.main.outputs.log }}
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
permissions:
actions: write # Required for cancelling workflows
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Docker system cleanup
run: |
docker system prune -a --filter "until=48h" --force
- id: main
name: Run main script
timeout-minutes: ${{ inputs.TIMEOUT }}
Expand All @@ -66,7 +54,7 @@ jobs:
(
set -e
${{ inputs.SCRIPT }}
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.SCRIPT }}'
) 2> >(tee err.log)
EXIT_CODE=$?
Expand All @@ -77,6 +65,9 @@ jobs:
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: failure() && inputs.IS_OPTIONAL == false

- name: after_script
if: always() && inputs.AFTER_SCRIPT != ':'
run: ${{ inputs.AFTER_SCRIPT }}
run: |
docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container_${{ github.run_id }} bash -c '${{ inputs.AFTER_SCRIPT }}'
59 changes: 22 additions & 37 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3098,44 +3098,29 @@ jobs:
L2_Megatron_GPT_Reranker:
needs: [cicd-test-container-setup]
runs-on: self-hosted-azure
timeout-minutes: 10
container:
image: nemoci.azurecr.io/nemo_container_${{ github.run_id }}
options:
# --user 0:128
--device=/dev/nvidia0
--gpus all
--shm-size=8g
--env TRANSFORMERS_OFFLINE=0
--env HYDRA_FULL_ERROR=1
--volume /mnt/datadrive/TestData:/home/TestData
steps:
- name: Checkout repository
uses: actions/checkout@v4
- run: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \
exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \
model.global_batch_size=4 \
model.micro_batch_size=4 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl]
uses: ./.github/workflows/_test_template.yml
with:
RUNNER: self-hosted-azure
SCRIPT: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
rm -rf /home/TestData/nlp/megatron_ir/working_dir
- uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
if: "failure()"
python examples/nlp/information_retrieval/megatron_gpt_reranker_finetuning.py \
exp_manager.exp_dir='/home/TestData/nlp/megatron_ir/working_dir' \
model.global_batch_size=4 \
model.micro_batch_size=4 \
trainer.devices=1 \
trainer.num_nodes=1 \
trainer.max_epochs=null \
trainer.max_steps=20 \
trainer.val_check_interval=10 \
model.restore_from_path='/home/TestData/nlp/megatron_gpt/mcore_45M/megatron_llama.nemo' \
model.peft.lora_tuning.adapter_dim=8 \
model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl] \
model.data.validation_ds.write_embeddings_to_file=True \
model.data.validation_ds.output_file_path_prefix='/home/TestData/nlp/megatron_ir/working_dir/val_embs' \
model.data.train_ds.file_names=[/home/TestData/nlp/megatron_ir/train.jsonl]
AFTER_SCRIPT: |
rm -rf /home/TestData/nlp/megatron_ir/working_dir
L2_Megatron_GPT_Embedding:
needs: [cicd-test-container-setup]
Expand Down
4 changes: 2 additions & 2 deletions docs/source/asr/speech_intent_slot/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ Mixins
.. autoclass:: nemo.collections.asr.parts.mixins.ASRModuleMixin
:show-inheritance:
:members:
:no-index:
:noindex:

.. autoclass:: nemo.collections.asr.parts.mixins.ASRBPEMixin
:show-inheritance:
:members:
:no-index:
:noindex:

4 changes: 2 additions & 2 deletions docs/source/asr/ssl/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@ Mixins
.. autoclass:: nemo.collections.asr.parts.mixins.mixins.ASRModuleMixin
:show-inheritance:
:members:
:no-index:
:noindex:

.. autoclass:: nemo.core.classes.mixins.access_mixins.AccessMixin
:show-inheritance:
:members:
:no-index:
:noindex:



14 changes: 7 additions & 7 deletions docs/source/core/adapters/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Core
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:

-----

Expand All @@ -18,7 +18,7 @@ Core
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:

-----

Expand All @@ -30,15 +30,15 @@ Adapter Networks
:show-inheritance:
:members:
:member-order: bysource
:no-index:
:noindex:

-----

.. autoclass:: nemo.collections.common.parts.adapter_modules.LinearAdapter
:show-inheritance:
:members:
:member-order: bysource
:no-index:
:noindex:

-----

Expand All @@ -51,7 +51,7 @@ Adapter Strategies
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:

-----

Expand All @@ -60,7 +60,7 @@ Adapter Strategies
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:

-----

Expand All @@ -69,4 +69,4 @@ Adapter Strategies
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:
8 changes: 4 additions & 4 deletions docs/source/core/adapters/components.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,15 @@ Adapter modules represent the functional form of the adapter. We discuss an exam
:show-inheritance:
:members:
:member-order: bysource
:no-index:
:noindex:

-----

.. autoclass:: nemo.collections.common.parts.adapter_modules.LinearAdapter
:show-inheritance:
:members:
:member-order: bysource
:no-index:
:noindex:


Insertion Form - Module Adapters
Expand Down Expand Up @@ -72,7 +72,7 @@ We discuss a simple residual additional connection strategy below - that accepts
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:

-----

Expand All @@ -81,7 +81,7 @@ We discuss a simple residual additional connection strategy below - that accepts
:members:
:member-order: bysource
:undoc-members: adapter_module_names
:no-index:
:noindex:

-----

Expand Down
Loading

0 comments on commit 2da1e47

Please sign in to comment.