diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 6530796fb7f..934fe6e1057 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -46,19 +46,38 @@ jobs:
           - "2.6.0"
           - "2.7.1"
           - "2.8.0"
+        testing: ["common", "distributed"]
       include:
         # cover additional python and PT combinations
-        - { os: "ubuntu-22.04", python-version: "3.9", pytorch-version: "2.0.1", requires: "oldest" }
-        - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.7.1" }
-        - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.8.0" }
+        - {
+            os: "ubuntu-22.04",
+            python-version: "3.9",
+            pytorch-version: "2.0.1",
+            requires: "oldest",
+            testing: "common",
+          }
+        - {
+            os: "ubuntu-22.04",
+            python-version: "3.9",
+            pytorch-version: "2.0.1",
+            requires: "oldest",
+            testing: "distributed",
+          }
+        - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.8.0", testing: "common" }
+        - { os: "ubuntu-22.04", python-version: "3.12", pytorch-version: "2.8.0", testing: "distributed" }
         # standard mac machine, not the M1
-        - { os: "macOS-13", python-version: "3.10", pytorch-version: "2.0.1" }
+        - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "common" }
+        - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "distributed" }
         # using the ARM based M1 machine
-        - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1" }
-        - { os: "macOS-14", python-version: "3.12", pytorch-version: "2.8.0" }
+        - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "common" }
+        - { os: "macOS-14", python-version: "3.10", pytorch-version: "2.0.1", testing: "distributed" }
+        - { os: "macOS-14", python-version: "3.12", pytorch-version: "2.8.0", testing: "common" }
+        - { os: "macOS-14", python-version: "3.12", pytorch-version: "2.8.0", testing: "distributed" }
         # some windows
-        - { os: "windows-2022", python-version: "3.10", pytorch-version: "2.0.1" }
-        - { os: "windows-2022", python-version: "3.12", pytorch-version: "2.8.0" }
+        - { os: "windows-2022", python-version: "3.10", pytorch-version: "2.0.1", testing: "common" }
+        - { os: "windows-2022", python-version: "3.10", pytorch-version: "2.0.1", testing: "distributed" }
+        - { os: "windows-2022", python-version: "3.12", pytorch-version: "2.8.0", testing: "common" }
+        - { os: "windows-2022", python-version: "3.12", pytorch-version: "2.8.0", testing: "distributed" }
         # Future released version
         #- { os: "ubuntu-22.04", python-version: "3.11", pytorch-version: "2.8.0" }
         #- { os: "macOS-14", python-version: "3.11", pytorch-version: "2.8.0" }
@@ -73,7 +92,7 @@ jobs:
 
     # Timeout: https://stackoverflow.com/a/59076067/4521646
    # seems that macOS jobs take much more than orger OS
-    timeout-minutes: 120
+    timeout-minutes: 70
 
    steps:
      - uses: actions/checkout@v5
@@ -182,15 +201,13 @@ jobs:
 
      - name: Unittests common
        # skip for PR if there is nothing to test, note that outside PR there is default 'unittests'
-        if: ${{ env.TEST_DIRS != '' }}
+        if: ${{ env.TEST_DIRS != '' && matrix.testing == 'common' }}
        working-directory: ./tests
        run: |
-          python -m pytest \
+          pytest \
            $TEST_DIRS \
            --cov=torchmetrics \
            --durations=50 \
-            --reruns 3 \
-            --reruns-delay 1 \
            -m "not DDP" \
            -n auto \
            --dist=load \
@@ -198,18 +215,16 @@ jobs:
 
      - name: Unittests DDP
        # skip for PR if there is nothing to test, note that outside PR there is default 'unittests'
-        if: ${{ env.TEST_DIRS != '' }}
+        if: ${{ env.TEST_DIRS != '' && matrix.testing == 'distributed' }}
        working-directory: ./tests
        env:
          USE_PYTEST_POOL: "1"
        run: |
-          python -m pytest -v \
+          pytest -v \
            $TEST_DIRS \
            --cov=torchmetrics \
            --durations=50 \
            -m DDP \
-            --reruns 3 \
-            --reruns-delay 1 \
            ${{ env.UNITTEST_TIMEOUT }}
 
      - name: Statistics
diff --git a/README.md b/README.md
index fd00bcaa6bd..c77320656fd 100644
--- a/README.md
+++ b/README.md
@@ -39,13 +39,15 @@ ______________________________________________________________________
 
 # Looking for GPUs?
-Over 340,000 developers use [Lightning Cloud](https://lightning.ai/?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) - purpose-built for PyTorch and PyTorch Lightning.
-- [GPUs](https://lightning.ai/pricing?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) from $0.19.
-- [Clusters](https://lightning.ai/clusters?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): frontier-grade training/inference clusters.
+
+Over 340,000 developers use [Lightning Cloud](https://lightning.ai/?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) - purpose-built for PyTorch and PyTorch Lightning.
+
+- [GPUs](https://lightning.ai/pricing?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme) from $0.19.
+- [Clusters](https://lightning.ai/clusters?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): frontier-grade training/inference clusters.
 - [AI Studio (vibe train)](https://lightning.ai/studios?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): workspaces where AI helps you debug, tune and vibe train.
-- [AI Studio (vibe deploy)](https://lightning.ai/studios?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): workspaces where AI helps you optimize, and deploy models.
+- [AI Studio (vibe deploy)](https://lightning.ai/studios?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): workspaces where AI helps you optimize, and deploy models.
 - [Notebooks](https://lightning.ai/notebooks?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): Persistent GPU workspaces where AI helps you code and analyze.
-- [Inference](https://lightning.ai/deploy?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): Deploy models as inference APIs.
+- [Inference](https://lightning.ai/deploy?utm_source=tm_readme&utm_medium=referral&utm_campaign=tm_readme): Deploy models as inference APIs.
 
 # Installation
diff --git a/tests/unittests/bases/test_ddp.py b/tests/unittests/bases/test_ddp.py
index de92bf381ab..b42f151be61 100644
--- a/tests/unittests/bases/test_ddp.py
+++ b/tests/unittests/bases/test_ddp.py
@@ -26,7 +26,7 @@
 from unittests import NUM_PROCESSES, USE_PYTEST_POOL
 from unittests._helpers import _IS_WINDOWS, seed_all
 from unittests._helpers.testers import DummyListMetric, DummyMetric, DummyMetricSum
-from unittests.conftest import setup_ddp
+from unittests.conftest import get_free_port, setup_ddp
 
 seed_all(42)
 
@@ -105,9 +105,9 @@ def test_ddp(process):
     pytest.pool.map(process, range(NUM_PROCESSES))
 
 
-def _test_ddp_gather_all_autograd_same_shape(rank: int, worldsize: int = NUM_PROCESSES) -> None:
+def _test_ddp_gather_all_autograd_same_shape(rank: int, worldsize: int, port: int) -> None:
     """Test that ddp gather preserves local rank's autograd graph for same-shaped tensors across ranks."""
-    setup_ddp(rank, worldsize)
+    setup_ddp(rank, worldsize, port)
     x = (rank + 1) * torch.ones(10, requires_grad=True)
 
     # random linear transformation, it should really not matter what we do here
@@ -120,9 +120,9 @@ def _test_ddp_gather_all_autograd_same_shape(rank: int, worldsize: int = NUM_PRO
     assert torch.allclose(grad, a * torch.ones_like(x))
 
 
-def _test_ddp_gather_all_autograd_different_shape(rank: int, worldsize: int = NUM_PROCESSES) -> None:
+def _test_ddp_gather_all_autograd_different_shape(rank: int, worldsize: int, port: int) -> None:
     """Test that ddp gather preserves local rank's autograd graph for differently-shaped tensors across ranks."""
-    setup_ddp(rank, worldsize)
+    setup_ddp(rank, worldsize, port)
     x = (rank + 1) * torch.ones(rank + 1, 2 - rank, requires_grad=True)
 
     # random linear transformation, it should really not matter what we do here
@@ -143,7 +143,8 @@ def _test_ddp_gather_all_autograd_different_shape(rank: int, worldsize: int = NU
 )
 def test_ddp_autograd(process):
     """Test ddp functions for autograd compatibility."""
-    pytest.pool.map(process, range(NUM_PROCESSES))
+    port = get_free_port()
+    pytest.pool.starmap(process, [(rank, NUM_PROCESSES, port) for rank in range(NUM_PROCESSES)])
 
 
 def _test_non_contiguous_tensors(rank):
diff --git a/tests/unittests/conftest.py b/tests/unittests/conftest.py
index 65c086cb9ba..dd1ac85ee66 100644
--- a/tests/unittests/conftest.py
+++ b/tests/unittests/conftest.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import contextlib
 import os
+import socket
 import sys
 
 import pytest
@@ -30,9 +31,6 @@
 EXTRA_DIM = 3
 THRESHOLD = 0.5
 
-MAX_PORT = 8100
-START_PORT = 8088
-CURRENT_PORT = START_PORT
 USE_PYTEST_POOL = os.getenv("USE_PYTEST_POOL", "0") == "1"
 
 
@@ -44,7 +42,16 @@ def use_deterministic_algorithms():
     torch.use_deterministic_algorithms(False)
 
 
-def setup_ddp(rank, world_size):
+def get_free_port() -> int:
+    """Find an available free port on localhost and keep it reserved."""
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("localhost", 0))  # Bind to a free port provided by the OS
+        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+        port = s.getsockname()[1]
+        return int(port)
+
+
+def setup_ddp(rank: int, world_size: int, port: int) -> None:
     """Initialize ddp environment.
 
     If a particular test relies on the order of the processes in the pool to be [0, 1, 2, ...], then this function
@@ -54,16 +61,11 @@ def setup_ddp(rank, world_size):
     Args:
         rank: the rank of the process
         world_size: the number of processes
+        port: the port to use for communication
 
     """
-    global CURRENT_PORT
-
     os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(CURRENT_PORT)
-
-    CURRENT_PORT += 1
-    if CURRENT_PORT > MAX_PORT:
-        CURRENT_PORT = START_PORT
+    os.environ["MASTER_PORT"] = str(port)
 
     if torch.distributed.group.WORLD is not None:  # if already initialized, destroy the process group
         torch.distributed.destroy_process_group()
@@ -72,12 +74,19 @@ def setup_ddp(rank, world_size):
     torch.distributed.init_process_group("gloo", rank=rank, world_size=world_size)
 
 
+def cleanup_ddp():
+    """Clean up the DDP process group if initialized."""
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
+
 def pytest_sessionstart():
     """Global initialization of multiprocessing pool; runs before any test."""
     if not USE_PYTEST_POOL:
         return
+    port = get_free_port()
     pool = Pool(processes=NUM_PROCESSES)
-    pool.starmap(setup_ddp, [(rank, NUM_PROCESSES) for rank in range(NUM_PROCESSES)])
+    pool.starmap(setup_ddp, [(rank, NUM_PROCESSES, port) for rank in range(NUM_PROCESSES)])
     pytest.pool = pool
 
 
diff --git a/tests/unittests/image/__init__.py b/tests/unittests/image/__init__.py
index 8eea7d284b8..13ab22f7140 100644
--- a/tests/unittests/image/__init__.py
+++ b/tests/unittests/image/__init__.py
@@ -13,23 +13,6 @@
 # limitations under the License.
 import os
 
-import torch
-import torch.distributed as dist
-
 from unittests import _PATH_ALL_TESTS
 
 _SAMPLE_IMAGE = os.path.join(_PATH_ALL_TESTS, "_data", "image", "i01_01_5.bmp")
-
-
-def setup_ddp(rank: int, world_size: int, free_port: int):
-    """Set up DDP with a free port and assign CUDA device to the given rank."""
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(free_port)
-    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
-    torch.cuda.set_device(rank)
-
-
-def cleanup_ddp():
-    """Clean up the DDP process group if initialized."""
-    if dist.is_initialized():
-        dist.destroy_process_group()
diff --git a/tests/unittests/image/test_ms_ssim.py b/tests/unittests/image/test_ms_ssim.py
index 89b9b5778bd..3b3cb85fc24 100644
--- a/tests/unittests/image/test_ms_ssim.py
+++ b/tests/unittests/image/test_ms_ssim.py
@@ -23,8 +23,7 @@
 from unittests import NUM_BATCHES, _Input
 from unittests._helpers import _IS_WINDOWS, seed_all
 from unittests._helpers.testers import MetricTester
-from unittests.image import cleanup_ddp, setup_ddp
-from unittests.utilities.test_utilities import find_free_port
+from unittests.conftest import cleanup_ddp, get_free_port, setup_ddp
 
 seed_all(42)
 
@@ -136,7 +135,7 @@ def test_ms_ssim_reduction_none_ddp():
 
     """
     world_size = 2
-    free_port = find_free_port()
+    free_port = get_free_port()
     if free_port == -1:
         pytest.skip("No free port available for DDP test.")
     mp.spawn(_run_ms_ssim_ddp, args=(world_size, free_port), nprocs=world_size, join=True)
diff --git a/tests/unittests/image/test_ssim.py b/tests/unittests/image/test_ssim.py
index e327e7d7f70..9ece21f1dfa 100644
--- a/tests/unittests/image/test_ssim.py
+++ b/tests/unittests/image/test_ssim.py
@@ -26,8 +26,7 @@
 from unittests import NUM_BATCHES, _Input
 from unittests._helpers import _IS_WINDOWS, seed_all
 from unittests._helpers.testers import MetricTester
-from unittests.image import cleanup_ddp, setup_ddp
-from unittests.utilities.test_utilities import find_free_port
+from unittests.conftest import cleanup_ddp, get_free_port, setup_ddp
 
 seed_all(42)
 
@@ -391,7 +390,7 @@ def test_ssim_reduction_none_ddp():
 
     """
     world_size = 2
-    free_port = find_free_port()
+    free_port = get_free_port()
     if free_port == -1:
         pytest.skip("No free port available for DDP test.")
     mp.spawn(_run_ssim_ddp, args=(world_size, free_port), nprocs=world_size, join=True)
diff --git a/tests/unittests/utilities/test_utilities.py b/tests/unittests/utilities/test_utilities.py
index 226edf1b724..00ffa53b3a1 100644
--- a/tests/unittests/utilities/test_utilities.py
+++ b/tests/unittests/utilities/test_utilities.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import socket
 import sys
 
 import numpy as np
@@ -20,7 +19,6 @@
 from lightning_utilities.test.warning import no_warning_call
 from torch import tensor
 
 from unittests._helpers import _IS_WINDOWS
-from unittests.conftest import MAX_PORT, START_PORT
 
 from torchmetrics.regression import MeanSquaredError, PearsonCorrCoef
 from torchmetrics.utilities import check_forward_full_state_property, rank_zero_debug, rank_zero_info, rank_zero_warn
@@ -240,15 +238,3 @@ def test_half_precision_top_k_cpu_raises_error():
     x = torch.randn(100, 10, dtype=torch.half)
     with pytest.raises(RuntimeError, match="\"topk_cpu\" not implemented for 'Half'"):
         torch.topk(x, k=3, dim=1)
-
-
-def find_free_port(start=START_PORT, end=MAX_PORT):
-    """Returns an available localhost port in the given range or returns -1 if no port available."""
-    for port in range(start, end + 1):
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-            try:
-                s.bind(("localhost", port))
-                return port
-            except OSError:
-                continue
-    return -1