-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une…
… période X plus bas qu’un threshold X (#133) * [SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une période X plus bas qu’un threshold X * Use file_regression for tests. * pylint: disable too-many-positional-arguments --------- Co-authored-by: Bruno Carrez <[email protected]>
- Loading branch information
1 parent
ee50c58
commit 8a49625
Showing
8 changed files
with
141 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import logging | ||
from datetime import datetime, timedelta | ||
from typing import Optional | ||
|
||
from sarc.config import MTL | ||
from sarc.jobs.series import compute_cost_and_waste, load_job_series | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def check_gpu_util_per_user(
    threshold: timedelta,
    time_interval: Optional[timedelta] = timedelta(days=7),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
):
    """
    Warn about users whose average GPU-util is below a threshold.

    Jobs are loaded with `load_job_series`, filtered, passed through
    `compute_cost_and_waste`, and then a per-job GPU-util is computed as
        gpu_utilization * gpu_equivalent_cost
    (gpu_equivalent_cost comes from `compute_cost_and_waste`; it is
    described as elapsed_time * allocated.gres_gpu).
    The per-job values are averaged per user, and one warning is logged
    for every user whose average is strictly lower than the threshold.

    Parameters
    ----------
    threshold: timedelta
        Minimum average GPU-util expected per user.
        GPU-util is assumed to be expressed in GPU-seconds,
        so the threshold is given as a timedelta and compared
        through its total number of seconds.
    time_interval
        If given, only jobs which ran in [now - time_interval, now]
        are used, and job times are clipped to that interval.
        Default is the last 7 days.
        If None, all jobs are used and no clipping is requested.
    minimum_runtime
        If given, only jobs whose elapsed time is at least this long are used.
        Default is 5 minutes.
        If None, treated as 0.
    """
    # Resolve the time window. Clipping is requested only when a window exists,
    # so that minimum_runtime is compared to the running time inside the window.
    if time_interval is None:
        window_start, window_end, clip = None, None, False
    else:
        window_end = datetime.now(tz=MTL)
        window_start = window_end - time_interval
        clip = True

    jobs = load_job_series(start=window_start, end=window_end, clip_time=clip)

    # Keep only jobs that ran long enough and had at least one GPU allocated.
    min_seconds = (
        timedelta(seconds=0) if minimum_runtime is None else minimum_runtime
    ).total_seconds()
    ran_long_enough = jobs["elapsed_time"] >= min_seconds
    has_gpu = jobs["allocated.gres_gpu"] > 0
    jobs = jobs[ran_long_enough & has_gpu]

    # Add cost columns, then derive the per-job GPU-util.
    jobs = compute_cost_and_waste(jobs)
    jobs["gpu_util"] = jobs["gpu_utilization"] * jobs["gpu_equivalent_cost"]

    # Average GPU-util per user.
    mean_util_per_user = jobs.groupby(["user"])[["gpu_util"]].mean()

    # Report every user below the threshold. The comparison is kept as a
    # strict "<" so that a NaN average never triggers a warning.
    required_seconds = threshold.total_seconds()
    for user, gpu_util in mean_util_per_user["gpu_util"].items():
        if gpu_util < required_seconds:
            logger.warning(
                f"[{user}] insufficient average gpu_util: {gpu_util} GPU-seconds; "
                f"minimum required: {threshold} ({threshold.total_seconds()} GPU-seconds)"
            )
51 changes: 51 additions & 0 deletions
51
tests/functional/usage_alerts/test_alert_gpu_util_per_user.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import functools | ||
import re | ||
from datetime import timedelta | ||
|
||
import pytest | ||
|
||
from sarc.alerts.usage_alerts.gpu_util_per_user import check_gpu_util_per_user | ||
from sarc.client import get_jobs | ||
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME | ||
|
||
from ..jobs.test_func_job_statistics import generate_fake_timeseries | ||
|
||
|
||
@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    "params",
    [
        # Default params. In the last 7 days from now (mock time: 2023-11-22),
        # there are only 2 jobs, both with no gpu_utilization, so no warnings.
        {"threshold": timedelta()},
        # No time_interval and a threshold of 7 hours
        {"threshold": timedelta(hours=7), "time_interval": None},
        # No time_interval and a threshold of 6 hours
        {"threshold": timedelta(hours=6), "time_interval": None},
        # A valid time_interval
        {"threshold": timedelta(hours=8), "time_interval": timedelta(days=276)},
        # All params, including minimum_runtime
        {
            "threshold": timedelta(hours=8),
            "time_interval": timedelta(days=276),
            "minimum_runtime": timedelta(seconds=39000),
        },
    ],
)
def test_alert_gpu_util_per_user(params, caplog, monkeypatch, file_regression):
    """
    Run check_gpu_util_per_user with each parameter set and compare the
    captured warnings against the stored regression file.
    """
    # Replace the time series source used by sarc.jobs.series with fake data.
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )

    # Compute and save statistics for every job before running the check.
    for job in get_jobs():
        job.statistics(save=True)

    check_gpu_util_per_user(**params)

    # Strip the "WARNING <logger>:<file>:<line>" prefix so that the regression
    # files do not depend on source line numbers.
    log_prefix = r"WARNING +sarc\.alerts\.usage_alerts\.gpu_util_per_user:gpu_util_per_user.py:[0-9]+ +"
    captured_messages = re.sub(log_prefix, "", caplog.text)
    file_regression.check(captured_messages)
Empty file.
4 changes: 4 additions & 0 deletions
4
...ional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params1_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) | ||
[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) | ||
[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) | ||
[petitbonhomme] insufficient average gpu_util: 22784.166666666668 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) |
3 changes: 3 additions & 0 deletions
3
...ional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params2_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds) | ||
[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds) | ||
[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds) |
3 changes: 3 additions & 0 deletions
3
...ional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params3_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) | ||
[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) | ||
[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) |
2 changes: 2 additions & 0 deletions
2
...ional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params4_.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) | ||
[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) |