[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Apr 6, 2024 (1 parent 95572e5, commit 22dfb63)
Showing 9 changed files with 346 additions and 345 deletions; only .azure/docker-build.yml and .azure/gpu-tests.yml are reproduced below.
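The repository's own pre-commit configuration is not included in this diff, so the snippet below is only a minimal sketch of the kind of .pre-commit-config.yaml that produces auto-fixes like these, assuming Prettier is the YAML formatter (its preference for double quotes and its re-wrapping of long flow mappings match the changes shown below). The mirror URL, revision pin, and file filter are illustrative assumptions, not the project's actual settings.

# Hypothetical .pre-commit-config.yaml excerpt (illustration only, not taken from this repository)
repos:
  - repo: https://github.com/pre-commit/mirrors-prettier # community mirror exposing Prettier as a pre-commit hook
    rev: v3.1.0 # assumed pin; the real config may use a different revision
    hooks:
      - id: prettier
        types_or: [yaml, json] # limit formatting to YAML and JSON files

With a configuration along these lines, pre-commit.ci runs the hooks on every pull request and pushes the resulting fixes as an automated commit such as this one; the same changes can be reproduced locally by installing pre-commit and running "pre-commit run --all-files" from the repository root.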
50 changes: 28 additions & 22 deletions .azure/docker-build.yml
@@ -1,6 +1,6 @@
trigger:
tags:
include: ['*']
include: ["*"]
branches:
include: ["main"]
paths:
@@ -16,7 +16,7 @@ trigger:

pr:
branches:
include: ['*']
include: ["*"]
paths:
include:
- ".azure/docker-build.yml"
@@ -29,35 +29,41 @@ pr:
- "**/*.md"

schedules:
- cron: '0 */2 * * *'
displayName: rebuild dockers for CI every 2 hours
branches:
include: ["main"]
- cron: "0 */2 * * *"
displayName: rebuild dockers for CI every 2 hours
branches:
include: ["main"]

jobs:
- job: build_push
strategy:
#maxParallel: "3"
matrix:
# CUDA 12.1
'cuda 12.1 | torch 2.2 | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.2.1', TRITON_VERSION: '2.2.0', CUDNN_FRONTEND: "1.2.1"}
'cuda 12.1 | torch 2.3 /test | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: '2.3.0', TRITON_VERSION: '2.2.0', TORCH_INSTALL: 'test', CUDNN_FRONTEND: "1.2.1"}
'cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2':
{CUDA_VERSION: '12.1.1', TORCH_VERSION: 'main', TORCH_INSTALL: 'source', CUDNN_FRONTEND: "1.2.1"}
"cuda 12.1 | torch 2.2 | cudnn FE v1.2":
{ CUDA_VERSION: "12.1.1", TORCH_VERSION: "2.2.1", TRITON_VERSION: "2.2.0", CUDNN_FRONTEND: "1.2.1" }
"cuda 12.1 | torch 2.3 /test | cudnn FE v1.2":
{
CUDA_VERSION: "12.1.1",
TORCH_VERSION: "2.3.0",
TRITON_VERSION: "2.2.0",
TORCH_INSTALL: "test",
CUDNN_FRONTEND: "1.2.1",
}
"cuda 12.1 | torch 2.4 /nightly | cudnn FE v1.2":
{ CUDA_VERSION: "12.1.1", TORCH_VERSION: "main", TORCH_INSTALL: "source", CUDNN_FRONTEND: "1.2.1" }
#'cuda 12.1': # this version - '8.9.5.29-1+cuda12.1' for 'libcudnn8' was not found
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
timeoutInMinutes: "95"
variables:
UBUNTU_VERSION: '22.04'
PYTHON_VERSION: '3.10'
APEX_CHECKOUT: 'master'
imageRepository: 'pytorchlightning/lightning-thunder'
dockerfilePath: 'dockers/ubuntu-cuda/Dockerfile'
imageTag: 'ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}-apex'
pool: 'lit-rtx-3090'
UBUNTU_VERSION: "22.04"
PYTHON_VERSION: "3.10"
APEX_CHECKOUT: "master"
imageRepository: "pytorchlightning/lightning-thunder"
dockerfilePath: "dockers/ubuntu-cuda/Dockerfile"
imageTag: "ubuntu$(UBUNTU_VERSION)-cuda$(CUDA_VERSION)-cudnn-fe$(CUDNN_FRONTEND)-py$(PYTHON_VERSION)-pt_${TORCH_VERSION/v/}-apex"
pool: "lit-rtx-3090"
workspace:
clean: all
steps:
@@ -84,7 +90,7 @@ jobs:
--build-arg APEX_CHECKOUT="$(APEX_CHECKOUT)" \
. --no-cache
timeoutInMinutes: "95"
displayName: 'Build base image'
displayName: "Build base image"
- bash: |
docker image ls | grep $(imageRepository)
@@ -95,7 +101,7 @@ jobs:
bash -c "cd /workspace && ls -lh . && \
pip install -q . && \
bash azure/sanity-check.sh"
displayName: 'Sanity check'
displayName: "Sanity check"
- bash: |
set -e
@@ -104,7 +110,7 @@
docker push $(imageRepository):$(imageTag)
condition: ne(variables['Build.Reason'], 'PullRequest')
timeoutInMinutes: "35"
displayName: 'Push base image'
displayName: "Push base image"
#- task: Docker@1
# inputs:
219 changes: 109 additions & 110 deletions .azure/gpu-tests.yml
@@ -1,6 +1,6 @@
trigger:
tags:
include: ['*']
include: ["*"]
branches:
include:
- "main"
@@ -9,34 +9,34 @@ trigger:

pr:
branches:
include: ['*']
include: ["*"]

jobs:
- job: testing
strategy:
matrix:
# CUDA 12.1
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
'ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed':
docker-image: 'ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex'
CUDA_VERSION_MM: '121'
testing: 'distributed'
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | regular":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex"
CUDA_VERSION_MM: "121"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.2 | distributed":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.2.1-apex"
CUDA_VERSION_MM: "121"
testing: "distributed"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | regular":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex"
CUDA_VERSION_MM: "121"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch 2.3 | distributed":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_2.3.0-apex"
CUDA_VERSION_MM: "121"
testing: "distributed"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | regular":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex"
CUDA_VERSION_MM: "121"
"ubuntu22.04 | cuda 12.1 | python 3.10 | torch-nightly | distributed":
docker-image: "ubuntu22.04-cuda12.1.1-cudnn-fe1.2.1-py3.10-pt_main-apex"
CUDA_VERSION_MM: "121"
testing: "distributed"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
pool: "lit-rtx-3090"
@@ -52,102 +52,101 @@ jobs:
workspace:
clean: all
steps:
- bash: |
echo $(DEVICES)
echo "CUDA_VERSION_MM=$CUDA_VERSION_MM"
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
which python && which pip
python --version
pip --version
pip list
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
displayName: "Image info & NVIDIA"
- bash: |
echo $(DEVICES)
echo "CUDA_VERSION_MM=$CUDA_VERSION_MM"
lspci | egrep 'VGA|3D'
whereis nvidia
nvidia-smi
which python && which pip
python --version
pip --version
pip list
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
displayName: 'Image info & NVIDIA'
- bash: |
# drop pt from requirements so not to interfere with the existing one
bash .azure/remove-torch-lines.sh requirements/base.txt
cat requirements/base.txt
- bash: |
# drop pt from requirements so not to interfere with the existing one
bash .azure/remove-torch-lines.sh requirements/base.txt
cat requirements/base.txt
# double check on test requirements
pip install -r requirements/test.txt
# double check on test requirements
pip install -r requirements/test.txt
# https://docs.codecov.com/docs/codecov-uploader
curl -Os https://uploader.codecov.io/latest/linux/codecov
chmod +x codecov
# https://docs.codecov.com/docs/codecov-uploader
curl -Os https://uploader.codecov.io/latest/linux/codecov
chmod +x codecov
# install this package
python setup.py develop
displayName: "Install package & ..."
# install this package
python setup.py develop
displayName: 'Install package & ...'
- bash: bash .azure/sanity-check.sh
displayName: "Sanity check / details"

- bash: bash .azure/sanity-check.sh
displayName: 'Sanity check / details'
- bash: |
set -ex
coverage run --source thunder -m \
pytest thunder/tests/ \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
--numprocesses=9 \
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "30"
displayName: "Testing: regular"
- bash: |
set -ex
coverage run --source thunder -m \
pytest thunder/tests/ \
-m "not standalone" \
-v --datefmt="%Y%m%d-%H:%M:%S.%f" \
--timeout=240 \
--random-order-seed=42 \
--durations=250 \
--timeout=240 \
--numprocesses=9 \
--ignore=thunder/tests/distributed --ignore=thunder/tests/test_networks.py
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,regular --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "30"
displayName: 'Testing: regular'
- bash: |
set -ex
# these test need to run in single thread as they occurs with CUDA OOM
coverage run --source thunder -m \
pytest \
thunder/tests/test_networks.py \
-m "not standalone" \
-v --durations=0 \
--random-order-seed=42 \
--numprocesses=3
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "15"
displayName: "Testing: networks"
- bash: |
set -ex
# these test need to run in single thread as they occurs with CUDA OOM
coverage run --source thunder -m \
pytest \
thunder/tests/test_networks.py \
-m "not standalone" \
-v --durations=0 \
--random-order-seed=42 \
--numprocesses=3
# compile coverage results
python -m coverage report
python -m coverage xml
# upload to codecov
./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,networks --name="GPU-coverage" --env=linux,azure
condition: ne(variables['testing'], 'distributed')
timeoutInMinutes: "15"
displayName: 'Testing: networks'
#- bash: |
# bash .azure/run_standalone_tests.sh \
# "thunder/tests" \
# "-m standalone --ignore=thunder/tests/distributed"
# condition: ne(variables['testing'], 'distributed')
# displayName: 'Testing: standalone'

#- bash: |
# bash .azure/run_standalone_tests.sh \
# "thunder/tests" \
# "-m standalone --ignore=thunder/tests/distributed"
# condition: ne(variables['testing'], 'distributed')
# displayName: 'Testing: standalone'

- bash: |
set -ex
# run all found tests in given past as standalone
bash scripts/run_standalone_tests.sh "thunder/tests/distributed"
# compile coverage results
# TODO: collect and merge reports
# python -m coverage report
# python -m coverage xml
# # upload to codecov
# ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
# --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'distributed')
timeoutInMinutes: "20"
displayName: 'Testing: distributed'
- bash: |
set -ex
# run all found tests in given past as standalone
bash scripts/run_standalone_tests.sh "thunder/tests/distributed"
# compile coverage results
# TODO: collect and merge reports
# python -m coverage report
# python -m coverage xml
# # upload to codecov
# ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
# --flags=gpu,pytest,distributed --name="GPU-coverage" --env=linux,azure
condition: eq(variables['testing'], 'distributed')
timeoutInMinutes: "20"
displayName: "Testing: distributed"
# todo (mruberry): decide whether this should be here or in another workflow
#- bash: |