ci: Run unit tests (#308)

Signed-off-by: Oliver Koenig <[email protected]> Signed-off-by: Terry Kong <[email protected]> Co-authored-by: Terry Kong <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
NVIDIA · Oct 5, 2024 · c92a3bf · c92a3bf
1 parent 378c7cd
commit c92a3bf
Show file tree

Hide file tree

Showing 4 changed files with 114 additions and 5 deletions.
diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml
@@ -0,0 +1,76 @@
+name: ~test template
+
+on:
+  workflow_call:
+    inputs:
+      RUNNER:
+        type: string
+        description: Runner to use for test
+        required: true
+      TIMEOUT:
+        type: number
+        description: Max runtime of the main test step in minutes
+        required: false
+        default: 10
+      SCRIPT:
+        type: string
+        description: Test script to execute with bash inside the test container
+        required: true
+      AFTER_SCRIPT:
+        type: string
+        description: 'Script to run after the main test, even if it failed (the default ":" skips this step)'
+        required: false
+        default: ":"
+      IS_OPTIONAL:
+        type: boolean
+        description: A failure cancels all other tests unless this is set to true
+        required: false
+        default: false
+    outputs:
+      conclusion:
+        description: Conclusion of main test step
+        value: ${{ jobs.main.outputs.conclusion }}
+      log:
+        description: Base64-encoded last 2000 bytes of the main test step's stderr
+        value: ${{ jobs.main.outputs.log }}
+jobs:
+
+  main:
+    runs-on: ${{ inputs.RUNNER }}
+    outputs:
+      conclusion: ${{ steps.main.conclusion }}
+      log: ${{ steps.main.outputs.log }}
+    steps:
+        - name: Docker system cleanup
+          run: |
+            docker system prune -a --filter "until=48h" --force || true
+
+        - name: Docker pull image
+          run: |
+            docker pull nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }}
+
+        - id: main
+          name: Run main script
+          timeout-minutes: ${{ inputs.TIMEOUT }}
+          env:  # hand the script over as data; pasting it between single quotes breaks on any quote it contains
+            SCRIPT: ${{ inputs.SCRIPT }}
+          run: |
+            mkdir -p ${{ github.run_id }}
+            cd ${{ github.run_id }}/
+            set +e  # keep going after a test failure so the log tail below is still exported
+            (
+              set -e
+              docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData:ro nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} bash -c "$SCRIPT"
+            ) 2> >(tee err.log)
+            EXIT_CODE=$?
+            echo "log=$(tail -c 2000 err.log |  base64 -w 0)" >> "$GITHUB_OUTPUT"
+            exit $EXIT_CODE  # report the test's own exit status as the step result
+
+        - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
+          if: failure() && inputs.IS_OPTIONAL == false
+        - name: after_script
+          if: always() && inputs.AFTER_SCRIPT != ':'
+          env:  # same reasoning as SCRIPT in the main step
+            AFTER_SCRIPT: ${{ inputs.AFTER_SCRIPT }}
+          run: |
+            docker run --rm --device=/dev/nvidia0 --gpus all --shm-size=8g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_aligner_container:${{ github.run_id }} bash -c "$AFTER_SCRIPT"
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
@@ -22,12 +22,46 @@ on:
   push:
     branches:
       - 'main'
+  workflow_dispatch:  # manual trigger; lets the caller pick which test jobs run
+    inputs:
+      test_to_run:
+        type: string
+        description: Comma-separated list of tests to run. Use "all" to run the full test suite.
+        required: false
+        default: 'all'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
-jobs:
+jobs: 
+  pre-flight:
+    runs-on: ubuntu-latest
+    outputs:
+      test_to_run: ${{ steps.test_to_run.outputs.main }}
+      all: ${{ steps.all.outputs.main }}
+    steps:
+      - name: Parse test_to_run
+        id: test_to_run
+        env:  # pass the input as data so the shell never word-splits, globs or executes it
+          TEST_TO_RUN: ${{ inputs.test_to_run || 'all' }}
+        run: |
+          echo "main=$(jq -c --raw-input 'split(",") | map(gsub("^\\s+|\\s+$"; ""))' <<< "$TEST_TO_RUN")" | tee -a "$GITHUB_OUTPUT"
+      - name: Parse all
+        id: all
+        run: echo "main=${{ contains(fromJSON(steps.test_to_run.outputs.main), 'all') }}" | tee -a "$GITHUB_OUTPUT"
+
   build-container:
     if: ${{ github.event.label.name == 'Run CICD' || github.ref == 'refs/heads/main' }}
     uses: ./.github/workflows/_build_container.yml
+
+  Unit_Tests:  # runs when named in test_to_run or when the full suite ("all") is selected
+    needs: [build-container, pre-flight]
+    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'Unit_Tests') || needs.pre-flight.outputs.all == 'true'
+    uses: ./.github/workflows/_run_test.yml
+    with:
+      RUNNER: self-hosted
+      TIMEOUT: 20
+      SCRIPT: |
+        cd /opt/NeMo-Aligner/
+        # PYTHONPATH=$(pwd) pytest tests/
diff --git a/tests/test_cai_utils.py b/tests/test_cai_utils.py
@@ -15,9 +15,7 @@
 import os
 import sys
 
-sys.path.append(os.path.abspath("../examples/nlp/cai"))
-
-from cai_utils import ChatTemplateHelper, PromptTemplate, UserAssistantPromptTemplate
+from examples.nlp.cai.cai_utils import ChatTemplateHelper, PromptTemplate, UserAssistantPromptTemplate
 
 
 def test_extra_id_format_single_user_prompt():

diff --git a/tests/test_distributed.py b/tests/test_distributed.py
@@ -14,8 +14,9 @@
 
 import pytest
 import torch
-from nemo_aligner.utils import parallel_state, tensor_parallel
+from megatron.core import tensor_parallel
 
+from nemo_aligner.utils import parallel_state
 from nemo_aligner.utils.distributed import (
     calculate_distributed_entropy,
     from_parallel_logits_to_logprobs,