From ad3087f3e6466d5f61f3f6278f003243fa60a0f1 Mon Sep 17 00:00:00 2001 From: bhimrazy Date: Wed, 15 Oct 2025 14:47:17 +0545 Subject: [PATCH 1/5] fix: add unzip to dependencies in pytorch workflow --- .lightning/workflows/pytorch.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index a1177e6521f06..56f6c85b5c1a6 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -50,7 +50,8 @@ run: | openmpi-bin \ ninja-build \ libnccl2 \ - libnccl-dev + libnccl-dev \ + unzip echo "Install Python ${python_version} and UV" apt-get install -y python${python_version} python${python_version}-venv python${python_version}-dev From 9441488f139b5ad7a6f4d7270b4ea359bd40a62b Mon Sep 17 00:00:00 2001 From: Bhimraj Yadav Date: Sun, 2 Nov 2025 09:43:17 +0000 Subject: [PATCH 2/5] fix: update variable interpolation for PACKAGE_NAME in YAML workflows --- .azure/gpu-tests-fabric.yml | 4 ++-- .azure/gpu-tests-pytorch.yml | 4 ++-- .lightning/workflows/pytorch.yml | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index b2f8ab0447a20..819a225939021 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -76,7 +76,7 @@ jobs: cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver" echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" - scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') + scope=$(python -c 'n = "${PACKAGE_NAME}" ; print(dict(fabric="lightning_fabric").get(n, n))') echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope" displayName: "set env. vars" - bash: | @@ -135,7 +135,7 @@ jobs: - bash: | set -e - extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") + extra=$(python -c "print({'lightning': 'fabric-'}.get('${PACKAGE_NAME}', ''))") pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}" displayName: "Install package & dependencies" diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 888272d80012a..c407c3f78abca 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -75,7 +75,7 @@ jobs: cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver" echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html" - scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))') + scope=$(python -c 'n = "${PACKAGE_NAME}" ; print(dict(pytorch="pytorch_lightning").get(n, n))') echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope" displayName: "set env. vars" - bash: | @@ -134,7 +134,7 @@ jobs: - bash: | set -e - extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") + extra=$(python -c "print({'lightning': 'pytorch-'}.get('${PACKAGE_NAME}', ''))") pip install -e ".[${extra}dev]" -U --upgrade-strategy=eager --extra-index-url="${TORCH_URL}" displayName: "Install package & dependencies" diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index 56f6c85b5c1a6..a0b6043be0aa9 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -86,7 +86,7 @@ run: | CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6" CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126" export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM} - COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="pytorch_lightning").get(n, n))') + COVERAGE_SOURCE=$(python -c 'n = "${PACKAGE_NAME}" ; print(dict(fabric="pytorch_lightning").get(n, n))') echo "collecting coverage for: ${COVERAGE_SOURCE}" uv pip install -q fire wget packaging "lightning-utilities[cli]" @@ -121,7 +121,7 @@ run: | fi echo "Install package" - extra=$(python -c "print({'lightning': 'pytorch-'}.get('$(PACKAGE_NAME)', ''))") + extra=$(python -c "print({'lightning': 'pytorch-'}.get('${PACKAGE_NAME}', ''))") uv pip install -e ".[${extra}dev]" --upgrade echo "Ensure only a single package is installed" From 3d363c0a8b2048a26f5fd71645a2e56c9e91d264 Mon Sep 17 00:00:00 2001 From: Bhimraj Yadav Date: Sun, 2 Nov 2025 09:54:27 +0000 Subject: [PATCH 3/5] fix: update variable interpolation for PACKAGE_NAME in fabric workflow --- .lightning/workflows/fabric.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.lightning/workflows/fabric.yml b/.lightning/workflows/fabric.yml index a62776aa9ffab..e6db648741b35 100644 --- a/.lightning/workflows/fabric.yml +++ b/.lightning/workflows/fabric.yml @@ -85,7 +85,7 @@ run: | CUDA_VERSION_M_M="${CUDA_VERSION%.*}" # "12.6" CUDA_VERSION_MM="${CUDA_VERSION_M_M//./}" # "126" export UV_TORCH_BACKEND=cu${CUDA_VERSION_MM} - COVERAGE_SOURCE=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))') + COVERAGE_SOURCE=$(python -c 'n = "${PACKAGE_NAME}" ; print(dict(fabric="lightning_fabric").get(n, n))') echo "collecting coverage for: ${COVERAGE_SOURCE}" uv pip install fire wget packaging "lightning-utilities[cli]" @@ -120,7 +120,7 @@ run: | fi echo "Install package with [${PACKAGE_NAME}] extras" - extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))") + extra=$(python -c "print({'lightning': 'fabric-'}.get('${PACKAGE_NAME}', ''))") uv pip install ".[${extra}dev]" --upgrade python requirements/collect_env_details.py From 0d521d74aeb1d70fb59710c16eb02ad81fb733de Mon Sep 17 00:00:00 2001 From: Bhimraj Yadav Date: Sun, 2 Nov 2025 09:59:25 +0000 Subject: [PATCH 4/5] fix: disable CUDA_LAUNCH_BLOCKING to speed up tests --- .lightning/workflows/pytorch.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index a0b6043be0aa9..78cbcc776da00 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -27,7 +27,7 @@ env: DEBIAN_FRONTEND: "noninteractive" CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" MKL_THREADING_LAYER: "GNU" - CUDA_LAUNCH_BLOCKING: "1" + # CUDA_LAUNCH_BLOCKING: "1" # disabling to speed up tests NCCL_DEBUG: "INFO" TORCHDYNAMO_VERBOSE: "1" FREEZE_REQUIREMENTS: "1" From adb43387c7ce5cede264ddb545fbb437ae1325f2 Mon Sep 17 00:00:00 2001 From: Bhimraj Yadav Date: Tue, 4 Nov 2025 18:58:54 +0545 Subject: [PATCH 5/5] apply suggestion --- .lightning/workflows/pytorch.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.lightning/workflows/pytorch.yml b/.lightning/workflows/pytorch.yml index dcc211d29bd7a..15dfc4a1f9064 100644 --- a/.lightning/workflows/pytorch.yml +++ b/.lightning/workflows/pytorch.yml @@ -27,7 +27,6 @@ env: DEBIAN_FRONTEND: "noninteractive" CUDA_TOOLKIT_ROOT_DIR: "/usr/local/cuda" MKL_THREADING_LAYER: "GNU" - # CUDA_LAUNCH_BLOCKING: "1" # disabling to speed up tests NCCL_DEBUG: "INFO" TORCHDYNAMO_VERBOSE: "1" FREEZE_REQUIREMENTS: "1"