From 6fff50f52b85dda0686bbe365fcc27be986de4a5 Mon Sep 17 00:00:00 2001 From: Michael Wyatt Date: Tue, 20 Dec 2022 15:45:49 -0800 Subject: [PATCH] Unit tests setup own venv (#2628) add reusable workflow that sets up fresh venv for each test and prints relevant environment info --- .github/workflows/amd.yml | 15 +++----- .github/workflows/formatting.yml | 6 ++-- .github/workflows/nv-accelerate-v100.yml | 14 +++----- .github/workflows/nv-inference.yml | 15 +++----- .github/workflows/nv-lightning-v100.yml | 15 +++----- .github/workflows/nv-megatron.yml | 24 ++++++------- .github/workflows/nv-mii.yml | 29 +++++++++------- .github/workflows/nv-nightly.yml | 16 +++------ .github/workflows/nv-torch-latest-v100.yml | 15 +++----- .github/workflows/nv-torch-nightly-v100.yml | 15 +++----- .github/workflows/nv-torch18-p40.yml | 15 +++----- .github/workflows/nv-torch18-v100.yml | 15 +++----- .github/workflows/nv-transformers-v100.yml | 15 +++----- .github/workflows/pre-compile-ops.yml | 8 ++--- .github/workflows/setup-venv/action.yml | 38 +++++++++++++++++++++ 15 files changed, 110 insertions(+), 145 deletions(-) create mode 100644 .github/workflows/setup-venv/action.yml diff --git a/.github/workflows/amd.yml b/.github/workflows/amd.yml index 84a46c1942ad..da6516c4fe9b 100644 --- a/.github/workflows/amd.yml +++ b/.github/workflows/amd.yml @@ -25,21 +25,14 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - # Runs a single command using the runners shell - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - rocm-smi --showhw - which python - python --version - which hipcc - hipcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 python -c "import torch; print('torch:', torch.__version__, 
torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - sudo apt-get update sudo apt-get install -y libaio-dev - name: Install transformers diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index a800ce566f87..6ebd9c6d1e9f 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -22,10 +22,8 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment - run: | - which python - python --version + - id: setup-venv + uses: ./.github/workflows/setup-venv - name: Install deepspeed run: | diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 44d12e57c80d..1ecf31d35407 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -22,23 +22,17 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index dac0462611d7..f1f43ca55961 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -22,16 +22,11 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which 
nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -41,12 +36,10 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers git rev-parse --short HEAD - pip uninstall --yes transformers pip install . - name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning,inf] ds_report diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index 0c5ad4184f05..a94f6c9375de 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -22,23 +22,17 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision pip install torch==1.9.1+cu111 torchvision==0.10.1+cu111 torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report @@ -49,7 +43,6 @@ jobs: - name: PyTorch Lightning Tests run: | if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi - pip uninstall --yes pytorch-lightning pip install pytorch-lightning pip install "protobuf<4.21.0" cd tests diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml index 3260a7c07a12..6c405086c730 100644 --- a/.github/workflows/nv-megatron.yml +++ 
b/.github/workflows/nv-megatron.yml @@ -22,33 +22,31 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Install + - name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev] + ds_report + + - name: Install apex + run: | pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git - name: Python environment run: | pip list - - name: Unit tests + - name: Megatron unit tests run: | - git clone --branch mrwyattii/add-unit-test https://github.com/microsoft/Megatron-DeepSpeed.git + git clone --branch mrwyattii/fix-deprecated-numpy-types https://github.com/microsoft/Megatron-DeepSpeed.git cd Megatron-DeepSpeed pip install . 
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index fee6b95f275d..11c438421876 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -22,31 +22,34 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Install MII + - name: Install transformers + run: | + git clone https://github.com/huggingface/transformers + cd transformers + # if needed switch to the last known good SHA until transformers@master is fixed + # git checkout 1cc453d33 + git rev-parse --short HEAD + pip install . 
+ + - name: Install deepspeed run: | - pip uninstall --yes deepspeed deepspeed-mii transformers pip install .[dev] - pip install git+https://github.com/huggingface/transformers.git + ds_report - name: Python environment run: | pip list - - name: Unit tests + - name: MII unit tests run: | git clone https://github.com/microsoft/DeepSpeed-MII.git cd DeepSpeed-MII diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 4a0d4f896053..4229bee32893 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -15,16 +15,11 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -36,18 +31,15 @@ jobs: # if needed switch to the last known good SHA until transformers@master is fixed # git checkout 1cc453d33 git rev-parse --short HEAD - pip uninstall --yes transformers pip install . - name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning,inf] ds_report - name: Install lm-eval run: | - pip uninstall --yes lm-eval pip install git+https://github.com/EleutherAI/lm-evaluation-harness # This is required until lm-eval makes a new release. 
v0.2.0 is # broken for latest version of transformers diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 488874a13c4b..65eb21ccca73 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -22,16 +22,11 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -43,12 +38,10 @@ jobs: # if needed switch to the last known good SHA until transformers@master is fixed # git checkout 1cc453d33 git rev-parse --short HEAD - pip uninstall --yes transformers pip install . 
- name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index 5804f65f73f5..625517d59167 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -15,16 +15,11 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu116 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -36,12 +31,10 @@ jobs: # if needed switch to the last known good SHA until transformers@master is fixed # git checkout 1cc453d33 git rev-parse --short HEAD - pip uninstall --yes transformers pip install . 
- name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report diff --git a/.github/workflows/nv-torch18-p40.yml b/.github/workflows/nv-torch18-p40.yml index 6be052f350b4..abed0a0f08f7 100644 --- a/.github/workflows/nv-torch18-p40.yml +++ b/.github/workflows/nv-torch18-p40.yml @@ -22,16 +22,11 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch==1.8.2 torchvision==0.9.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cu101 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -43,12 +38,10 @@ jobs: # if needed switch to the last known good SHA until transformers@master is fixed # git checkout 1cc453d33 git rev-parse --short HEAD - pip uninstall --yes transformers pip install . 
- name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report diff --git a/.github/workflows/nv-torch18-v100.yml b/.github/workflows/nv-torch18-v100.yml index eb7a577d5608..128ff62b120a 100644 --- a/.github/workflows/nv-torch18-v100.yml +++ b/.github/workflows/nv-torch18-v100.yml @@ -22,16 +22,11 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -43,12 +38,10 @@ jobs: # if needed switch to the last known good SHA until transformers@master is fixed # git checkout 1cc453d33 git rev-parse --short HEAD - pip uninstall --yes transformers pip install . 
- name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,1bit,autotuning] ds_report diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 9ed60a71093f..99fcdf1e4565 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -22,25 +22,18 @@ jobs: steps: - uses: actions/checkout@v2 - - name: environment + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Install pytorch run: | - echo "JobID: $AISC_NODE_INSTANCE_ID" - nvidia-smi - which python - python --version - which nvcc - nvcc --version - pip install --upgrade pip - pip uninstall --yes torch torchvision triton pip install torch==1.8.2+cu111 torchvision==0.9.2+cu111 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - sudo apt-get update sudo apt-get install -y libaio-dev - name: Install deepspeed run: | - pip uninstall --yes deepspeed pip install .[dev,autotuning] ds_report diff --git a/.github/workflows/pre-compile-ops.yml b/.github/workflows/pre-compile-ops.yml index 4005d4baf2fc..2ff3bb6a4fc7 100644 --- a/.github/workflows/pre-compile-ops.yml +++ b/.github/workflows/pre-compile-ops.yml @@ -19,14 +19,12 @@ jobs: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 + - id: setup-venv + uses: ./.github/workflows/setup-venv + # Runs a single command using the runners shell - name: environment run: | - nvidia-smi - which python - python --version - which nvcc - nvcc --version python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/setup-venv/action.yml b/.github/workflows/setup-venv/action.yml new file mode 100644 index 000000000000..ea2eed7bbc7a 
--- /dev/null
+++ b/.github/workflows/setup-venv/action.yml
@@ -0,0 +1,38 @@
+name: Create Virtual Environment
+description: "Create a fresh venv, expose it to later steps, and print environment info"
+
+runs:
+  using: "composite"
+  steps:
+    # Refresh system packages, upgrade pip, and install virtualenv on the runner.
+    - id: update-env
+      run: |
+        sudo apt-get update && sudo apt-get -y upgrade
+        pip install --upgrade pip
+        python -m pip install -U virtualenv
+      shell: bash
+    # Build and activate the venv; update-env has already refreshed apt and pip.
+    - id: create-venv
+      run: |
+        python -m venv unit-test-venv
+        source ./unit-test-venv/bin/activate
+        pip install --upgrade pip
+        echo "PATH=$PATH" >> "$GITHUB_ENV" # Make it so venv is inherited for other steps
+      shell: bash
+    - id: print-env
+      run: |
+        which python
+        python --version
+        if [[ -n "${AISC_NODE_INSTANCE_ID}" ]]; then # print the job id only when it is set
+          echo "JobID: ${AISC_NODE_INSTANCE_ID}"
+        fi
+        if command -v nvidia-smi; then
+          nvidia-smi
+          which nvcc
+          nvcc --version
+        elif command -v rocm-smi; then # no nvidia-smi on PATH: try the ROCm tooling
+          rocm-smi --showhw
+          which hipcc
+          hipcc --version
+        fi
+      shell: bash