Skip to content

Commit

Permalink
[SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une…
Browse files Browse the repository at this point in the history
… période X plus bas qu’un threshold X (#133)

* [SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une période X plus bas qu’un threshold X

* Use file_regression for tests.

* pylint: disable too-many-positional-arguments

---------

Co-authored-by: Bruno Carrez <[email protected]>
  • Loading branch information
notoraptor and nurbal authored Oct 6, 2024
1 parent ee50c58 commit 8a49625
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 0 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ disable = [
"line-too-long", # Black takes care of line length.
"logging-fstring-interpolation",
"duplicate-code",
"too-many-positional-arguments",
]
extension-pkg-whitelist = "pydantic"

Expand Down
77 changes: 77 additions & 0 deletions sarc/alerts/usage_alerts/gpu_util_per_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import logging
from datetime import datetime, timedelta
from typing import Optional

from sarc.config import MTL
from sarc.jobs.series import compute_cost_and_waste, load_job_series

logger = logging.getLogger(__name__)


def check_gpu_util_per_user(
    threshold: timedelta,
    time_interval: Optional[timedelta] = timedelta(days=7),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
):
    """
    Warn about users whose average GPU-util is below a threshold.

    Jobs are loaded with ``load_job_series``, filtered, passed through
    ``compute_cost_and_waste``, and each remaining job gets a GPU-util value
    computed as ``gpu_utilization * gpu_equivalent_cost``
    (the original documentation describes gpu_equivalent_cost
    as elapsed_time * allocated.gres_gpu).
    The per-job values are averaged per user, and one warning is logged
    for every user whose average is strictly below ``threshold``.

    Parameters
    ----------
    threshold: timedelta
        Minimum average GPU-util expected per user.
        GPU-util is treated as a number of GPU-seconds, so the threshold
        is given as a timedelta and compared through ``total_seconds()``.
    time_interval
        If given, jobs are loaded for the window [now - time_interval, now],
        with ``clip_time=True`` passed to ``load_job_series``.
        Default is the last 7 days.
        If None, no start/end is passed and clipping is disabled.
    minimum_runtime
        If given, only jobs whose ``elapsed_time`` is at least this long
        are kept. Default is 5 minutes.
        If None, treated as 0 seconds.
    """
    # Resolve the time window. Clipping is only requested when a window
    # exists, so that minimum_runtime is compared against the time a job
    # spent running inside that window.
    if time_interval is None:
        start, end, clip_time = None, None, False
    else:
        end = datetime.now(tz=MTL)
        start = end - time_interval
        clip_time = True

    jobs = load_job_series(start=start, end=end, clip_time=clip_time)

    # Keep only GPU jobs that ran for at least the minimum runtime.
    if minimum_runtime is None:
        minimum_runtime = timedelta(seconds=0)
    ran_long_enough = jobs["elapsed_time"] >= minimum_runtime.total_seconds()
    uses_gpu = jobs["allocated.gres_gpu"] > 0
    jobs = jobs[ran_long_enough & uses_gpu]

    # compute_cost_and_waste provides the gpu_equivalent_cost column used below.
    jobs = compute_cost_and_waste(jobs)

    # Per-job GPU-util, then its mean for each user.
    jobs["gpu_util"] = jobs["gpu_utilization"] * jobs["gpu_equivalent_cost"]
    mean_util_per_user = jobs.groupby(["user"])[["gpu_util"]].mean()

    # Report every user whose average is under the threshold.
    # A NaN average compares False here and therefore never triggers a warning.
    threshold_seconds = threshold.total_seconds()
    for user, gpu_util in mean_util_per_user["gpu_util"].items():
        if gpu_util < threshold_seconds:
            logger.warning(
                f"[{user}] insufficient average gpu_util: {gpu_util} GPU-seconds; "
                f"minimum required: {threshold} ({threshold.total_seconds()} GPU-seconds)"
            )
51 changes: 51 additions & 0 deletions tests/functional/usage_alerts/test_alert_gpu_util_per_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import functools
import re
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.gpu_util_per_user import check_gpu_util_per_user
from sarc.client import get_jobs
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from ..jobs.test_func_job_statistics import generate_fake_timeseries


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    "params",
    [
        # Defaults only: 7-day window ending at the frozen MOCK_TIME.
        # NOTE(review): the original comment states that only 2 jobs fall in
        # this window, both without gpu_utilization, hence no warnings;
        # confirm against the fixture data.
        {"threshold": timedelta()},
        # No time window, threshold of 7 hours (25200 GPU-seconds).
        {"threshold": timedelta(hours=7), "time_interval": None},
        # No time window, threshold of 6 hours (21600 GPU-seconds).
        {"threshold": timedelta(hours=6), "time_interval": None},
        # Explicit time window of 276 days, threshold of 8 hours.
        {"threshold": timedelta(hours=8), "time_interval": timedelta(days=276)},
        # Every parameter set, including minimum_runtime.
        {
            "threshold": timedelta(hours=8),
            "time_interval": timedelta(days=276),
            "minimum_runtime": timedelta(seconds=39000),
        },
    ],
)
def test_alert_gpu_util_per_user(params, caplog, monkeypatch, file_regression):
    """Run the alert with each parameter set and compare the logged warnings
    against the stored regression file."""
    # Replace the time-series loader with the fake generator used by the
    # job-statistics tests.
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )

    # Compute and save statistics for every job before running the check
    # (presumably so the loaded job series carries gpu_utilization — verify).
    for job in get_jobs():
        job.statistics(save=True)

    check_gpu_util_per_user(**params)

    # Strip the "WARNING <logger>:<file>:<line>" prefix so the regression
    # output does not depend on source line numbers.
    log_prefix = r"WARNING +sarc\.alerts\.usage_alerts\.gpu_util_per_user:gpu_util_per_user.py:[0-9]+ +"
    captured = re.sub(log_prefix, "", caplog.text)
    file_regression.check(captured)
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
[petitbonhomme] insufficient average gpu_util: 22784.166666666668 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)
[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)
[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)

0 comments on commit 8a49625

Please sign in to comment.