ci/gpu: debug skipped cache (#2709)
---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Borda and pre-commit-ci[bot] authored Sep 3, 2024
1 parent c8ebda5 commit 3e890d7
Showing 11 changed files with 105 additions and 31 deletions.
56 changes: 56 additions & 0 deletions .azure/gpu-nuke-cache.yml
@@ -0,0 +1,56 @@
trigger:
tags:
include:
- "*"
# run every month to sanitize the dev environment
schedules:
- cron: "0 0 1 * *"
displayName: Monthly nuke caches
branches:
include:
- master
# run on PR changing only this file
pr:
branches:
include:
- master
paths:
include:
- .azure/gpu-nuke-cache.yml

jobs:
- job: nuke_caches
# how long to run the job before automatically cancelling
timeoutInMinutes: "10"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"

pool: "lit-rtx-3090"

variables:
# these caches assume the jobs run repeatedly on the same set of machines
# see: https://github.com/microsoft/azure-pipelines-agent/issues/4113#issuecomment-1439241481
TORCH_HOME: "/var/tmp/torch"
TRANSFORMERS_CACHE: "/var/tmp/hf/transformers"
HF_HOME: "/var/tmp/hf/home"
HF_HUB_CACHE: "/var/tmp/hf/hub"
PIP_CACHE_DIR: "/var/tmp/pip"
CACHED_REFERENCES: "/var/tmp/cached-references.zip"

container:
image: "ubuntu:22.04"
options: "-v /var/tmp:/var/tmp"

steps:
- bash: |
set -ex
rm -rf $(TORCH_HOME)
rm -rf $(TRANSFORMERS_CACHE)
rm -rf $(HF_HOME)
rm -rf $(HF_HUB_CACHE)
rm -rf $(PIP_CACHE_DIR)
rm -rf $(CACHED_REFERENCES)
displayName: "delete all caches"
- bash: |
ls -lh /var/tmp
displayName: "show tmp/ folder"
47 changes: 31 additions & 16 deletions .azure/gpu-unittests.yml
@@ -9,6 +9,13 @@ trigger:
- master
- release/*
- refs/tags/*
# run every month to populate caches
schedules:
- cron: "0 1 1 * *"
displayName: Monthly re-build caches
branches:
include:
- master
pr:
- master
- release/*
@@ -67,6 +74,11 @@ jobs:
CUDA_version_mm="${CUDA_version//'.'/''}"
echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${CUDA_version_mm}/torch_stable.html"
mkdir -p $(TORCH_HOME)
mkdir -p $(TRANSFORMERS_CACHE)
mkdir -p $(HF_HOME)
mkdir -p $(HF_HUB_CACHE)
mkdir -p $(PIP_CACHE_DIR)
displayName: "set Env. vars"
- bash: |
echo "##vso[task.setvariable variable=ALLOW_SKIP_IF_OUT_OF_MEMORY]1"
@@ -111,7 +123,7 @@ jobs:
- bash: |
python .github/assistant.py set-oldest-versions
condition: eq(variables['torch-ver'], '1.10.2')
condition: eq(variables['torch-ver'], '1.10')
displayName: "Setting oldest versions"
- bash: |
@@ -132,6 +144,21 @@ jobs:
displayName: "Show caches"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"
- bash: |
df -h .
ls -lh $(CACHED_REFERENCES)
ls -lh tests/
# Check if the cached references file exists
if [ -f $(CACHED_REFERENCES) ]; then
# Create a directory if it doesn't already exist
@@ -142,25 +169,12 @@ jobs:
else
echo "The file '$(CACHED_REFERENCES)' does not exist."
fi
du -h --max-depth=1 tests/
timeoutInMinutes: "5"
# if pull request, copy the cache to the tests folder to be used in the next steps
condition: eq(variables['Build.Reason'], 'PullRequest')
continueOnError: "true"
displayName: "Copy/Unzip cached refs"
- bash: |
python -m pytest torchmetrics --cov=torchmetrics \
--timeout=240 --durations=50 \
--reruns 2 --reruns-delay 1
# --numprocesses=5 --dist=loadfile
env:
DOCTEST_DOWNLOAD_TIMEOUT: "180"
SKIP_SLOW_DOCTEST: "1"
workingDirectory: "src/"
timeoutInMinutes: "40"
displayName: "DocTesting"
- bash: |
wget https://pl-public-data.s3.amazonaws.com/metrics/data.zip
unzip -o data.zip
@@ -169,6 +183,7 @@ jobs:
displayName: "Pull testing data from S3"
- bash: |
du -h --max-depth=1 .
python -m pytest $(TEST_DIRS) \
-m "not DDP" --numprocesses=5 --dist=loadfile \
--cov=torchmetrics --timeout=240 --durations=100 \
@@ -192,9 +207,10 @@ jobs:
displayName: "UnitTesting DDP"
- bash: |
du -h --max-depth=1 tests/
# archive the potentially updated cache to the machine filesystem so it can be reused by subsequent jobs
zip -q -r $(CACHED_REFERENCES) tests/_cache-references
du -h --max-depth=1 tests/
ls -lh $(CACHED_REFERENCES)
# run as an extra step so a failed or crashed job does not pollute the general cache,
# i.e. do this update only for successful jobs on master
condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
@@ -209,7 +225,6 @@ jobs:
python -m coverage xml
python -m codecov --token=$(CODECOV_TOKEN) --name="GPU-coverage" \
--commit=$(Build.SourceVersion) --flags=gpu,unittest --env=linux,azure
ls -l
workingDirectory: "tests/"
# skip for PRs if there is nothing to test; note that outside PRs the default is 'unittests'
condition: and(succeeded(), ne(variables['TEST_DIRS'], ''))
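For local debugging of the cache-restore step above, the same check can be sketched outside Azure. The paths mirror the pipeline variables; the extraction target is an assumption based on how the archive is created later with `zip -q -r $(CACHED_REFERENCES) tests/_cache-references`:

import zipfile
from pathlib import Path

CACHED_REFERENCES = Path("/var/tmp/cached-references.zip")  # mirrors $(CACHED_REFERENCES)

if CACHED_REFERENCES.is_file():
    with zipfile.ZipFile(CACHED_REFERENCES) as zf:
        # archive members carry the "tests/" prefix, so extract at the repo root
        zf.extractall(".")
        print(f"restored {len(zf.namelist())} cached reference files")
else:
    print(f"The file '{CACHED_REFERENCES}' does not exist.")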
2 changes: 1 addition & 1 deletion requirements/_docs.txt
@@ -29,4 +29,4 @@ pydantic > 1.0.0, < 3.0.0
# todo: until this has resolution - https://github.com/sphinx-gallery/sphinx-gallery/issues/1290
# Image
scikit-image ~=0.22; python_version < "3.10"
scikit-image ~=0.24; python_version >= "3.10"
scikit-image ~=0.24; python_version > "3.9" # we do not use `>=` because of the oldest-versions replacement
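The marker rewrite does not change which interpreters are selected: `python_version > "3.9"` and the previous `python_version >= "3.10"` match the same Pythons, because markers compare `python_version` as a version rather than as a string. A quick check with the packaging library (the library pip uses for marker evaluation, shown here only as an illustration):

from packaging.markers import Marker

# evaluate both spellings against explicit environments instead of the running interpreter
print(Marker('python_version > "3.9"').evaluate({"python_version": "3.10"}))    # True
print(Marker('python_version >= "3.10"').evaluate({"python_version": "3.10"}))  # True
print(Marker('python_version > "3.9"').evaluate({"python_version": "3.9"}))     # False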
11 changes: 6 additions & 5 deletions requirements/_tests.txt
@@ -2,6 +2,7 @@
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

coverage ==7.6.*
codecov ==2.1.13
pytest ==8.3.*
pytest-cov ==5.0.0
pytest-doctestplus ==1.2.1
@@ -10,11 +11,11 @@ pytest-timeout ==2.3.1
pytest-xdist ==3.6.1
phmdoctest ==1.4.0

psutil <6.1.0
pyGithub ==2.4.0
fire <=0.6.0
psutil ==6.*
pyGithub >2.0.0, <2.5.0
fire ==0.6.*

cloudpickle >1.3, <=3.0.0
scikit-learn >=1.1.1, <1.3.0; python_version < "3.9"
scikit-learn >=1.4.0, <1.6.0; python_version >= "3.9"
scikit-learn ==1.2.*; python_version < "3.9"
scikit-learn ==1.5.*; python_version > "3.8" # we do not use `>=` because of the oldest-versions replacement
cachier ==3.0.1
4 changes: 2 additions & 2 deletions requirements/text.txt
@@ -1,8 +1,8 @@
# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment

nltk >=3.8.2, <=3.9.1
tqdm >=4.41.0, <4.67.0
nltk >3.8.1, <=3.9.1
tqdm <4.67.0
regex >=2021.9.24, <=2024.7.24
transformers >4.4.0, <4.45.0
mecab-python3 >=1.0.6, <1.1.0
2 changes: 1 addition & 1 deletion src/torchmetrics/audio/dnsmos.py
@@ -79,7 +79,7 @@ class DeepNoiseSuppressionMeanOpinionScore(Metric):
>>> preds = randn(8000)
>>> dnsmos = DeepNoiseSuppressionMeanOpinionScore(8000, False)
>>> dnsmos(preds)
tensor([2.2687, 2.0766, 1.1375, 1.2722], dtype=torch.float64)
tensor([2.2..., 2.0..., 1.1..., 1.2...], dtype=torch.float64)
"""

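The audio doctest outputs here (and in the stoi/dnsmos changes below) are relaxed from exact tensor values to ellipsis patterns so the examples tolerate small numeric drift across hardware and library versions. The pattern only matches when doctest's ELLIPSIS option is enabled, which the project's doctest tooling presumably does; a standalone illustration using the old exact values from above:

import doctest

# the expected output uses "..." as a wildcard, exactly like the relaxed docstrings above
sample = """
>>> values = [2.2687, 2.0766, 1.1375, 1.2722]
>>> values
[2.2..., 2.0..., 1.1..., 1.2...]
"""
test = doctest.DocTestParser().get_doctest(sample, {}, "ellipsis_demo", None, 0)
runner = doctest.DocTestRunner(optionflags=doctest.ELLIPSIS)
print(runner.run(test))  # TestResults(failed=0, attempted=2)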
2 changes: 1 addition & 1 deletion src/torchmetrics/audio/stoi.py
@@ -69,7 +69,7 @@ class ShortTimeObjectiveIntelligibility(Metric):
>>> target = randn(8000)
>>> stoi = ShortTimeObjectiveIntelligibility(8000, False)
>>> stoi(preds, target)
tensor(-0.0842)
tensor(-0.084...)
"""

2 changes: 1 addition & 1 deletion src/torchmetrics/functional/audio/dnsmos.py
@@ -218,7 +218,7 @@ def deep_noise_suppression_mean_opinion_score(
>>> from torchmetrics.functional.audio.dnsmos import deep_noise_suppression_mean_opinion_score
>>> preds = randn(8000)
>>> deep_noise_suppression_mean_opinion_score(preds, 8000, False)
tensor([2.2687, 2.0766, 1.1375, 1.2722], dtype=torch.float64)
tensor([2.2..., 2.0..., 1.1..., 1.2...], dtype=torch.float64)
"""
if not _LIBROSA_AVAILABLE or not _ONNXRUNTIME_AVAILABLE or not _REQUESTS_AVAILABLE:
2 changes: 1 addition & 1 deletion src/torchmetrics/functional/audio/stoi.py
@@ -64,7 +64,7 @@ def short_time_objective_intelligibility(
>>> preds = randn(8000)
>>> target = randn(8000)
>>> short_time_objective_intelligibility(preds, target, 8000).float()
tensor(-0.0842)
tensor(-0.084...)
"""
if not _PYSTOI_AVAILABLE:
2 changes: 2 additions & 0 deletions tests/unittests/audio/test_stoi.py
@@ -20,6 +20,7 @@
from torch import Tensor
from torchmetrics.audio import ShortTimeObjectiveIntelligibility
from torchmetrics.functional.audio import short_time_objective_intelligibility
from torchmetrics.utilities.imports import _TORCH_GREATER_EQUAL_2_0

from unittests import _Input
from unittests._helpers import seed_all
@@ -120,6 +121,7 @@ def test_error_on_different_shape(metric_class=ShortTimeObjectiveIntelligibility
metric(torch.randn(100), torch.randn(50))


@pytest.mark.skipif(not _TORCH_GREATER_EQUAL_2_0, reason="precision issue with older torch")
def test_on_real_audio():
"""Test that metric works on real audio signal."""
rate, ref = wavfile.read(_SAMPLE_AUDIO_SPEECH)
6 changes: 3 additions & 3 deletions tests/unittests/segmentation/test_generalized_dice_score.py
@@ -66,11 +66,11 @@ def _reference_generalized_dice(
],
)
@pytest.mark.parametrize("include_background", [True, False])
class TestMeanDiceScore(MetricTester):
class TestGeneralizedDiceScore(MetricTester):
"""Test class for `MeanIoU` metric."""

@pytest.mark.parametrize("ddp", [pytest.param(True, marks=pytest.mark.DDP), False])
def test_mean_iou_class(self, preds, target, input_format, include_background, ddp):
def test_generalized_dice_class(self, preds, target, input_format, include_background, ddp):
"""Test class implementation of metric."""
self.run_class_metric_test(
ddp=ddp,
@@ -90,7 +90,7 @@ def test_mean_iou_class(self, preds, target, input_format, include_background, d
},
)

def test_mean_iou_functional(self, preds, target, input_format, include_background):
def test_generalized_dice_functional(self, preds, target, input_format, include_background):
"""Test functional implementation of metric."""
self.run_functional_metric_test(
preds=preds,
