From 4f53efcdb4bccacd884860fdcbefb2e28dd24243 Mon Sep 17 00:00:00 2001 From: Jirka Date: Sat, 6 Apr 2024 18:12:33 +0200 Subject: [PATCH 1/3] ci/lint: yaml formatting with prettier --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 51705ec238..d942b1b341 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -69,3 +69,11 @@ repos: # hooks: # - id: ruff # args: ["--fix"] + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.1.0 + hooks: + - id: prettier + # https://prettier.io/docs/en/options.html#print-width + files: \.(json|yml|yaml|toml) + args: ["--print-width=120"] From 22dfb63ca361a65eed5bf28fb9ae96d031a6ebc4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 6 Apr 2024 16:13:48 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .azure/docker-build.yml | 50 +++--- .azure/gpu-tests.yml | 219 +++++++++++++------------- .azure/notebook-runs.yml | 97 ++++++------ .codecov.yml | 5 +- .github/workflows/ci-testing.yml | 198 ++++++++++++----------- .github/workflows/labeler.yml | 16 +- .github/workflows/release-nightly.yml | 48 +++--- .github/workflows/release-pypi.yml | 46 +++--- .pre-commit-config.yaml | 12 +- 9 files changed, 346 insertions(+), 345 deletions(-) diff --git a/.azure/docker-build.yml b/.azure/docker-build.yml index dc353f689f..9e210a977e 100644 --- a/.azure/docker-build.yml +++ b/.azure/docker-build.yml @@ -1,6 +1,6 @@ trigger: tags: - include: ['*'] + include: ["*"] branches: include: ["main"] paths: @@ -16,7 +16,7 @@ trigger: pr: branches: - include: ['*'] + include: ["*"] paths: include: - ".azure/docker-build.yml" @@ -29,10 +29,10 @@ pr: - "**/*.md" schedules: -- cron: '0 */2 * * *' - displayName: rebuild dockers for CI every 2 hours - branches: - include: ["main"] + - cron: "0 */2 * * *" + displayName: rebuild dockers for CI every 2 hours + branches: + include: ["main"] jobs: - job: build_push @@ -40,24 +40,30 @@ jobs: #maxParallel: "3" matrix: # CUDA 12.1 - 'cuda 12.1 | torch 2.2 | cudnn FE v1.2': - {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.1"} - 'cuda 12.1 | torch 2.3 /test | cudnn FE v1.2': - {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.3.0', TRITON_VERSION: '2.2.0', TORCH_INSTALL: 'test', CUDNN_FRONTEND: "1.2.1"} - 'cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2': - {CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.1"} + "cuda 12.1 | torch 2.2 | cudnn FE v1.2": + { CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.2.1", TRITON_VERSION: "2.2.0", CUDNN_FRONTEND: "1.2.1" } + "cuda 12.1 | torch 2.3 /test | cudnn FE v1.2": + { + CUDA_VERSION: "12.1.1", + TORCH_VERSION: "2.3.0", + TRITON_VERSION: "2.2.0", + TORCH_INSTALL: "test", + CUDNN_FRONTEND: "1.2.1", + } + "cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2": + { CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND: "1.2.1" } #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" timeoutInMinutes: "95" variables: - UBUNTU_VERSION: '22.04' - PYTHON_VERSION: '3.10' - APEX_CHECKOUT: 'master' - imageRepository: 'pytorchlightning/lightning-thunder' - dockerfilePath: 'dockers/ubuntu-cuda/Dockerfile' - imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}-apex' - pool: 'lit-rtx-3090' + UBUNTU_VERSION: "22.04" + PYTHON_VERSION: "3.10" + APEX_CHECKOUT: "master" + imageRepository: "pytorchlightning/lightning-thunder" + dockerfilePath: "dockers/ubuntu-cuda/Dockerfile" + imageTag: "ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}-apex" + pool: "lit-rtx-3090" workspace: clean: all steps: @@ -84,7 +90,7 @@ jobs: --build-arg APEX_CHECKOUT="$(APEX_CHECKOUT)" \ . --no-cache timeoutInMinutes: "95" - displayName: 'Build base image' + displayName: "Build base image" - bash: | docker image ls | grep $(imageRepository) @@ -95,7 +101,7 @@ jobs: bash -c "cd /workspace && ls -lh . && \ pip install -q . && \ bash azure/sanity-check.sh" - displayName: 'Sanity check' + displayName: "Sanity check" - bash: | set -e @@ -104,7 +110,7 @@ jobs: docker push $(imageRepository):$(imageTag) condition: ne(variables['Build.Reason'], 'PullRequest') timeoutInMinutes: "35" - displayName: 'Push base image' + displayName: "Push base image" #- task: Docker@1 # inputs: diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index df030eaefd..96655a46dc 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -1,6 +1,6 @@ trigger: tags: - include: ['*'] + include: ["*"] branches: include: - "main" @@ -9,34 +9,34 @@ trigger: pr: branches: - include: ['*'] + include: ["*"] jobs: - job: testing strategy: matrix: # CUDA 12.1 - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex' - CUDA_VERSION_MM: '121' - testing: 'distributed' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex' - CUDA_VERSION_MM: '121' - testing: 'distributed' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex' - CUDA_VERSION_MM: '121' - testing: 'distributed' + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex" + CUDA_VERSION_MM: "121" + testing: "distributed" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex" + CUDA_VERSION_MM: "121" + testing: "distributed" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex" + CUDA_VERSION_MM: "121" + testing: "distributed" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" pool: "lit-rtx-3090" @@ -52,102 +52,101 @@ jobs: workspace: clean: all steps: + - bash: | + echo $(DEVICES) + echo "CUDA_VERSION_MM=$CUDA_VERSION_MM" + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version + pip list + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + displayName: "Image info & NVIDIA" - - bash: | - echo $(DEVICES) - echo "CUDA_VERSION_MM=$CUDA_VERSION_MM" - lspci | egrep 'VGA|3D' - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: 'Image info & NVIDIA' + - bash: | + # drop pt from requirements so not to interfere with the existing one + bash .azure/remove-torch-lines.sh requirements/base.txt + cat requirements/base.txt - - bash: | - # drop pt from requirements so not to interfere with the existing one - bash .azure/remove-torch-lines.sh requirements/base.txt - cat requirements/base.txt + # double check on test requirements + pip install -r requirements/test.txt - # double check on test requirements - pip install -r requirements/test.txt + # https://docs.codecov.com/docs/codecov-uploader + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov - # https://docs.codecov.com/docs/codecov-uploader - curl -Os https://uploader.codecov.io/latest/linux/codecov - chmod +x codecov + # install this package + python setup.py develop + displayName: "Install package & ..." - # install this package - python setup.py develop - displayName: 'Install package & ...' + - bash: bash .azure/sanity-check.sh + displayName: "Sanity check / details" - - bash: bash .azure/sanity-check.sh - displayName: 'Sanity check / details' + - bash: | + set -ex + coverage run --source thunder -m \ + pytest thunder/tests/ \ + -m "not standalone" \ + -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ + --timeout=240 \ + --random-order-seed=42 \ + --durations=250 \ + --timeout=240 \ + --numprocesses=9 \ + --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py + # compile coverage results + python -m coverage report + python -m coverage xml + # upload to codecov + ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure + condition: ne(variables['testing'], 'distributed') + timeoutInMinutes: "30" + displayName: "Testing: regular" - - bash: | - set -ex - coverage run --source thunder -m \ - pytest thunder/tests/ \ - -m "not standalone" \ - -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ - --timeout=240 \ - --random-order-seed=42 \ - --durations=250 \ - --timeout=240 \ - --numprocesses=9 \ - --ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure - condition: ne(variables['testing'], 'distributed') - timeoutInMinutes: "30" - displayName: 'Testing: regular' + - bash: | + set -ex + # these test need to run in single thread as they occurs with CUDA OOM + coverage run --source thunder -m \ + pytest \ + thunder/tests/test_networks.py \ + -m "not standalone" \ + -v --durations=0 \ + --random-order-seed=42 \ + --numprocesses=3 + # compile coverage results + python -m coverage report + python -m coverage xml + # upload to codecov + ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + --flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure + condition: ne(variables['testing'], 'distributed') + timeoutInMinutes: "15" + displayName: "Testing: networks" - - bash: | - set -ex - # these test need to run in single thread as they occurs with CUDA OOM - coverage run --source thunder -m \ - pytest \ - thunder/tests/test_networks.py \ - -m "not standalone" \ - -v --durations=0 \ - --random-order-seed=42 \ - --numprocesses=3 - # compile coverage results - python -m coverage report - python -m coverage xml - # upload to codecov - ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - --flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure - condition: ne(variables['testing'], 'distributed') - timeoutInMinutes: "15" - displayName: 'Testing: networks' + #- bash: | + # bash .azure/run_standalone_tests.sh \ + # "thunder/tests" \ + # "-m standalone --ignore=thunder/tests/distributed" + # condition: ne(variables['testing'], 'distributed') + # displayName: 'Testing: standalone' - #- bash: | - # bash .azure/run_standalone_tests.sh \ - # "thunder/tests" \ - # "-m standalone --ignore=thunder/tests/distributed" - # condition: ne(variables['testing'], 'distributed') - # displayName: 'Testing: standalone' - - - bash: | - set -ex - # run all found tests in given past as standalone - bash scripts/run_standalone_tests.sh "thunder/tests/distributed" - # compile coverage results - # TODO: collect and merge reports - # python -m coverage report - # python -m coverage xml - # # upload to codecov - # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ - # --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure - condition: eq(variables['testing'], 'distributed') - timeoutInMinutes: "20" - displayName: 'Testing: distributed' + - bash: | + set -ex + # run all found tests in given past as standalone + bash scripts/run_standalone_tests.sh "thunder/tests/distributed" + # compile coverage results + # TODO: collect and merge reports + # python -m coverage report + # python -m coverage xml + # # upload to codecov + # ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \ + # --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure + condition: eq(variables['testing'], 'distributed') + timeoutInMinutes: "20" + displayName: "Testing: distributed" # todo (mruberry): decide whether this should be here or in another workflow #- bash: | diff --git a/.azure/notebook-runs.yml b/.azure/notebook-runs.yml index f205e5822b..d0cce2cf60 100644 --- a/.azure/notebook-runs.yml +++ b/.azure/notebook-runs.yml @@ -1,6 +1,6 @@ trigger: tags: - include: ['*'] + include: ["*"] branches: include: - "main" @@ -9,18 +9,18 @@ trigger: pr: branches: - include: ['*'] + include: ["*"] jobs: - job: jupyter strategy: matrix: - 'ubuntu22.04 | cuda 12.1 | torch 2.2': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | torch-nightly': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main-apex' - CUDA_VERSION_MM: '121' + "ubuntu22.04 | cuda 12.1 | torch 2.2": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | torch-nightly": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.0-py3.10-pt_main-apex" + CUDA_VERSION_MM: "121" # how long to run the job before automatically cancelling timeoutInMinutes: "45" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -36,47 +36,46 @@ jobs: workspace: clean: all steps: + - bash: | + echo $(DEVICES) + echo "CUDA_VERSION_MM=$CUDA_VERSION_MM" + lspci | egrep 'VGA|3D' + whereis nvidia + nvidia-smi + which python && which pip + python --version + pip --version + pip list + echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" + displayName: "Image info & NVIDIA" - - bash: | - echo $(DEVICES) - echo "CUDA_VERSION_MM=$CUDA_VERSION_MM" - lspci | egrep 'VGA|3D' - whereis nvidia - nvidia-smi - which python && which pip - python --version - pip --version - pip list - echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)" - displayName: 'Image info & NVIDIA' + - bash: | + # drop pt from requirements so not to interfere with the existing one + bash .azure/remove-torch-lines.sh requirements/base.txt + cat requirements/base.txt + # double check on test requirements + pip install -r requirements/notebooks.txt + # install this package + python setup.py develop + displayName: "Install package & ..." - - bash: | - # drop pt from requirements so not to interfere with the existing one - bash .azure/remove-torch-lines.sh requirements/base.txt - cat requirements/base.txt - # double check on test requirements - pip install -r requirements/notebooks.txt - # install this package - python setup.py develop - displayName: 'Install package & ...' + - bash: | + set -ex + bash .azure/sanity-check.sh + displayName: "Sanity check / details" - - bash: | - set -ex - bash .azure/sanity-check.sh - displayName: 'Sanity check / details' - - - bash: | - set -ex - # list all notebooks in this folder - find . -name "*.ipynb" > all.txt - # drop all "./" from beginning of each line - sed -i 's/^\.\///' all.txt - # filter out the ones that are listed in .ignore.ci - grep -Fxv -f .ignore.ci all.txt > ci.txt - # iterate over all listed notebooks and execute them with jupyter - while read -r line; do - echo "Processing $line" - jupyter execute $line --timeout=300 - done <<< $(cat ci.txt) - workingDirectory: 'notebooks/' - displayName: 'Execute notebooks' + - bash: | + set -ex + # list all notebooks in this folder + find . -name "*.ipynb" > all.txt + # drop all "./" from beginning of each line + sed -i 's/^\.\///' all.txt + # filter out the ones that are listed in .ignore.ci + grep -Fxv -f .ignore.ci all.txt > ci.txt + # iterate over all listed notebooks and execute them with jupyter + while read -r line; do + echo "Processing $line" + jupyter execute $line --timeout=300 + done <<< $(cat ci.txt) + workingDirectory: "notebooks/" + displayName: "Execute notebooks" diff --git a/.codecov.yml b/.codecov.yml index a399a32984..9a4832f719 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -2,7 +2,6 @@ # Validation check: # $ curl --data-binary @.codecov.yml https://codecov.io/validate - # https://docs.codecov.io/docs/codecovyml-reference codecov: bot: "codecov-io" @@ -13,7 +12,7 @@ codecov: wait_for_ci: yes coverage: - precision: 0 # 2 = xx.xx%, 0 = xx% + precision: 0 # 2 = xx.xx%, 0 = xx% round: nearest # how coverage is rounded: down/up/nearest range: 40...100 # custom range of coverage colors from red -> yellow -> green status: @@ -36,7 +35,7 @@ coverage: # https://docs.codecov.com/docs/github-checks#disabling-github-checks-patch-annotations github_checks: - annotations: false + annotations: false parsers: gcov: diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml index 492a1754d6..e470d90af7 100644 --- a/.github/workflows/ci-testing.yml +++ b/.github/workflows/ci-testing.yml @@ -1,7 +1,7 @@ name: CI testing # see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the main branch +on: # Trigger the workflow on push or pull request, but only for the main branch push: branches: [main] pull_request: {} @@ -21,120 +21,118 @@ env: TORCH_URL_STABLE: "https://download.pytorch.org/whl/cpu" jobs: - pytester: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: ["ubuntu-22.04", "macOS-12", "windows-2022"] - python-version: ['3.10'] - requires: ['latest', 'nightly'] # , 'oldest' + python-version: ["3.10"] + requires: ["latest", "nightly"] # , 'oldest' include: - - { os: "ubuntu-22.04", python-version: '3.11', requires: 'latest' } + - { os: "ubuntu-22.04", python-version: "3.11", requires: "latest" } # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 steps: - - uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - - name: Setup Ubuntu - if: runner.os == 'ubuntu' - run: | - sudo apt-get install -y graphviz - - - name: Set min. dependencies - if: matrix.requires == 'oldest' - run: | - for fpath in ('requirements/base.txt', 'requirements/test.txt'): - req = open(fpath).read().replace('>=', '==') - open(fpath, 'w').write(req) - shell: python - - - name: Get pip cache dir - id: pip-cache - run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT - - - name: Cache pip - uses: actions/cache@v4 - with: - path: ${{ steps.pip-cache.outputs.dir }} - key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/*.txt') }} - restore-keys: | - ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip- - - name: switch Torch source - run: | - if [[ "${{ matrix.requires }}" == "nightly" ]]; then - echo "TORCH_URL=$TORCH_URL_NIGHTLY" >> $GITHUB_ENV - echo "PIP_EXTRA_FLAG=--pre" >> $GITHUB_ENV - else - echo "TORCH_URL=$TORCH_URL_STABLE" >> $GITHUB_ENV - fi - - - name: Install package & dependencies - run: | - pip --version - pip install -e . -U \ - -r requirements/test.txt \ - --find-links=${TORCH_URL} ${PIP_EXTRA_FLAG} - pip list - shell: bash - - - name: Testing Local - if: matrix.python-version == '3.10' - run: | - coverage run --source thunder -m \ - pytest thunder/tests/ \ - --ignore=thunder/tests/distributed \ - -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ - --random-order-seed=$GITHUB_RUN_ID \ - -n 2 --durations=250 - - - name: Testing Distributed - # run all found tests in given past as standalone - if: matrix.python-version == '3.10' && runner.os == 'Linux' - run: bash scripts/run_standalone_tests.sh "thunder/tests/distributed" - - - name: Testing just a few - if: matrix.python-version == '3.11' - #continue-on-error: true - run: | - python -m pytest \ - thunder/tests/test_interpreter.py \ - -v --durations=50 --cov=thunder - python -m pytest thunder/tests/test_jit_general.py -v --durations=50 --cov=thunder - - - name: Statistics - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: ./coverage.xml - flags: unittests - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Setup Ubuntu + if: runner.os == 'ubuntu' + run: | + sudo apt-get install -y graphviz + + - name: Set min. dependencies + if: matrix.requires == 'oldest' + run: | + for fpath in ('requirements/base.txt', 'requirements/test.txt'): + req = open(fpath).read().replace('>=', '==') + open(fpath, 'w').write(req) + shell: python + + - name: Get pip cache dir + id: pip-cache + run: echo "dir=$(pip cache dir)" >> $GITHUB_OUTPUT + + - name: Cache pip + uses: actions/cache@v4 + with: + path: ${{ steps.pip-cache.outputs.dir }} + key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip-${{ hashFiles('requirements/*.txt') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.requires }}-pip- + - name: switch Torch source + run: | + if [[ "${{ matrix.requires }}" == "nightly" ]]; then + echo "TORCH_URL=$TORCH_URL_NIGHTLY" >> $GITHUB_ENV + echo "PIP_EXTRA_FLAG=--pre" >> $GITHUB_ENV + else + echo "TORCH_URL=$TORCH_URL_STABLE" >> $GITHUB_ENV + fi + + - name: Install package & dependencies + run: | + pip --version + pip install -e . -U \ + -r requirements/test.txt \ + --find-links=${TORCH_URL} ${PIP_EXTRA_FLAG} + pip list + shell: bash + + - name: Testing Local + if: matrix.python-version == '3.10' + run: | + coverage run --source thunder -m \ + pytest thunder/tests/ \ + --ignore=thunder/tests/distributed \ + -v --datefmt="%Y%m%d-%H:%M:%S.%f" \ + --random-order-seed=$GITHUB_RUN_ID \ + -n 2 --durations=250 + + - name: Testing Distributed + # run all found tests in given past as standalone + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: bash scripts/run_standalone_tests.sh "thunder/tests/distributed" + + - name: Testing just a few + if: matrix.python-version == '3.11' + #continue-on-error: true + run: | + python -m pytest \ + thunder/tests/test_interpreter.py \ + -v --durations=50 --cov=thunder + python -m pytest thunder/tests/test_jit_general.py -v --durations=50 --cov=thunder + + - name: Statistics + run: | + coverage report + coverage xml + + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml + flags: unittests + env_vars: OS,PYTHON + name: codecov-umbrella + fail_ci_if_error: false testing-guardian: runs-on: ubuntu-latest needs: pytester if: always() steps: - - run: echo "${{ needs.pytester.result }}" - - name: failing... - if: needs.pytester.result == 'failure' - run: exit 1 - - name: cancelled or skipped... - if: contains(fromJSON('["cancelled", "skipped"]'), needs.pytester.result) - timeout-minutes: 1 - run: sleep 90 + - run: echo "${{ needs.pytester.result }}" + - name: failing... + if: needs.pytester.result == 'failure' + run: exit 1 + - name: cancelled or skipped... + if: contains(fromJSON('["cancelled", "skipped"]'), needs.pytester.result) + timeout-minutes: 1 + run: sleep 90 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index fdcd5251f8..ae7cc9a77f 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -8,11 +8,11 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - # Uploads repository content to the runner - - uses: actions/checkout@v4 - - uses: actions/labeler@v5 - with: - # The path to the label configuration file. - configuration-path: .github/labeling-config.yml - # Whether removing labels when matching files are reverted or no longer changed by the PR - sync-labels: true + # Uploads repository content to the runner + - uses: actions/checkout@v4 + - uses: actions/labeler@v5 + with: + # The path to the label configuration file. + configuration-path: .github/labeling-config.yml + # Whether removing labels when matching files are reverted or no longer changed by the PR + sync-labels: true diff --git a/.github/workflows/release-nightly.yml b/.github/workflows/release-nightly.yml index 441524257e..b80f4a9fc5 100644 --- a/.github/workflows/release-nightly.yml +++ b/.github/workflows/release-nightly.yml @@ -18,30 +18,30 @@ jobs: releasing-nightly: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.10" + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" - - name: Install dependencies - run: python -m pip install --user --upgrade setuptools wheel - - name: Build package - env: - CONVERT_VERSION2NIGHTLY: "1" - run: python setup.py sdist bdist_wheel + - name: Install dependencies + run: python -m pip install --user --upgrade setuptools wheel + - name: Build package + env: + CONVERT_VERSION2NIGHTLY: "1" + run: python setup.py sdist bdist_wheel - # We do this, since failures on test.pypi aren't that bad - - name: Publish to Test PyPI - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' - uses: pypa/gh-action-pypi-publish@v1.8.14 - with: - user: __token__ - password: ${{ secrets.test_pypi_password }} - repository_url: https://test.pypi.org/legacy/ + # We do this, since failures on test.pypi aren't that bad + - name: Publish to Test PyPI + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + uses: pypa/gh-action-pypi-publish@v1.8.14 + with: + user: __token__ + password: ${{ secrets.test_pypi_password }} + repository_url: https://test.pypi.org/legacy/ - - name: Publish distribution 📦 to PyPI - if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' - uses: pypa/gh-action-pypi-publish@v1.8.14 - with: - user: __token__ - password: ${{ secrets.pypi_password }} + - name: Publish distribution 📦 to PyPI + if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' + uses: pypa/gh-action-pypi-publish@v1.8.14 + with: + user: __token__ + password: ${{ secrets.pypi_password }} diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index cfcf1409eb..cf55ca1c2a 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -1,7 +1,7 @@ name: PyPI Release # https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: # Trigger the workflow on push or pull request, but only for the main branch +on: # Trigger the workflow on push or pull request, but only for the main branch push: branches: [main] release: @@ -13,28 +13,28 @@ jobs: releasing-pypi: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: "3.10" + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.10" - - name: Install dependencies - run: python -m pip install --user --upgrade setuptools wheel - - name: Build - run: python setup.py sdist bdist_wheel + - name: Install dependencies + run: python -m pip install --user --upgrade setuptools wheel + - name: Build + run: python setup.py sdist bdist_wheel - # We do this, since failures on test.pypi aren't that bad - - name: Publish to Test PyPI - if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@v1.8.14 - with: - user: __token__ - password: ${{ secrets.test_pypi_password }} - repository_url: https://test.pypi.org/legacy/ + # We do this, since failures on test.pypi aren't that bad + - name: Publish to Test PyPI + if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + uses: pypa/gh-action-pypi-publish@v1.8.14 + with: + user: __token__ + password: ${{ secrets.test_pypi_password }} + repository_url: https://test.pypi.org/legacy/ - - name: Publish distribution 📦 to PyPI - if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' - uses: pypa/gh-action-pypi-publish@v1.8.14 - with: - user: __token__ - password: ${{ secrets.pypi_password }} + - name: Publish distribution 📦 to PyPI + if: startsWith(github.event.ref, 'refs/tags') || github.event_name == 'release' + uses: pypa/gh-action-pypi-publish@v1.8.14 + with: + user: __token__ + password: ${{ secrets.pypi_password }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d942b1b341..215e5f3660 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ default_language_version: ci: autofix_prs: true - autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' + autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions" # submodules: true repos: @@ -64,11 +64,11 @@ repos: hooks: - id: yesqa -# - repo: https://github.com/charliermarsh/ruff-pre-commit -# rev: v0.0.270 -# hooks: -# - id: ruff -# args: ["--fix"] + # - repo: https://github.com/charliermarsh/ruff-pre-commit + # rev: v0.0.270 + # hooks: + # - id: ruff + # args: ["--fix"] - repo: https://github.com/pre-commit/mirrors-prettier rev: v3.1.0 From 64befab4d08f9abef3d71925229341ec6d51d719 Mon Sep 17 00:00:00 2001 From: Jirka Date: Thu, 11 Apr 2024 23:06:01 +0200 Subject: [PATCH 3/3] apply --- .azure/docker-build.yml | 18 ++++++++++------ .azure/gpu-tests.yml | 44 ++++++++++++++++++++-------------------- .azure/notebook-runs.yml | 12 +++++------ 3 files changed, 40 insertions(+), 34 deletions(-) diff --git a/.azure/docker-build.yml b/.azure/docker-build.yml index eed8e3e8d9..6bb18e928f 100644 --- a/.azure/docker-build.yml +++ b/.azure/docker-build.yml @@ -40,12 +40,18 @@ jobs: #maxParallel: "3" matrix: # CUDA 12.1 - 'cuda 12.1 | torch 2.2 | cudnn FE v1.3': - {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.3.0"} - 'cuda 12.1 | torch 2.3 /test | cudnn FE v1.3': - {CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.3.0', TRITON_VERSION: '2.3.0', TORCH_INSTALL: 'test', CUDNN_FRONTEND: "1.3.0"} - 'cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.3': - {CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.3.0"} + "cuda 12.1 | torch 2.2 | cudnn FE v1.3": + { CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.2.1", TRITON_VERSION: "2.2.0", CUDNN_FRONTEND: "1.3.0" } + "cuda 12.1 | torch 2.3 /test | cudnn FE v1.3": + { + CUDA_VERSION: "12.1.1", + TORCH_VERSION: "2.3.0", + TRITON_VERSION: "2.3.0", + TORCH_INSTALL: "test", + CUDNN_FRONTEND: "1.3.0", + } + "cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.3": + { CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND: "1.3.0" } #'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 1e0bc36dc8..69bd9c8621 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -16,27 +16,27 @@ jobs: strategy: matrix: # CUDA 12.1 - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.2.1-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.2.1-apex' - CUDA_VERSION_MM: '121' - testing: 'distributed' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.3.0-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.3.0-apex' - CUDA_VERSION_MM: '121' - testing: 'distributed' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_main-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_main-apex' - CUDA_VERSION_MM: '121' - testing: 'distributed' + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.2.1-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.2.1-apex" + CUDA_VERSION_MM: "121" + testing: "distributed" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.3.0-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_2.3.0-apex" + CUDA_VERSION_MM: "121" + testing: "distributed" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_main-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_main-apex" + CUDA_VERSION_MM: "121" + testing: "distributed" # how much time to give 'run always even if cancelled tasks' before stopping them cancelTimeoutInMinutes: "2" pool: "lit-rtx-3090" @@ -146,7 +146,7 @@ jobs: # --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure condition: eq(variables['testing'], 'distributed') timeoutInMinutes: "25" - displayName: 'Testing: distributed' + displayName: "Testing: distributed" # todo (mruberry): decide whether this should be here or in another workflow #- bash: | diff --git a/.azure/notebook-runs.yml b/.azure/notebook-runs.yml index f8dd71bec3..a82f0673e7 100644 --- a/.azure/notebook-runs.yml +++ b/.azure/notebook-runs.yml @@ -15,12 +15,12 @@ jobs: - job: jupyter strategy: matrix: - 'ubuntu22.04 | cuda 12.1 | torch 2.2': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex' - CUDA_VERSION_MM: '121' - 'ubuntu22.04 | cuda 12.1 | torch-nightly': - docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_main-apex' - CUDA_VERSION_MM: '121' + "ubuntu22.04 | cuda 12.1 | torch 2.2": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.1.0-py3.10-pt_2.2.1-apex" + CUDA_VERSION_MM: "121" + "ubuntu22.04 | cuda 12.1 | torch-nightly": + docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.3.0-py3.10-pt_main-apex" + CUDA_VERSION_MM: "121" # how long to run the job before automatically cancelling timeoutInMinutes: "45" # how much time to give 'run always even if cancelled tasks' before stopping them