[SARC-327] Implement alerts: proportion of jobs with a given GPU type on a given node lower than a threshold X #127

Merged (3 commits), Sep 9, 2024
99 changes: 99 additions & 0 deletions sarc/alerts/usage_alerts/gpu_usage.py
@@ -0,0 +1,99 @@
import logging
from datetime import datetime, timedelta
from typing import Optional, Sequence

from sarc.config import MTL
from sarc.jobs.series import load_job_series

logger = logging.getLogger(__name__)


def check_gpu_type_usage_per_node(
gpu_type: str,
time_interval: Optional[timedelta] = timedelta(hours=24),
minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
threshold=1.0,
min_tasks=0,
ignore_min_tasks_for_clusters: Optional[Sequence[str]] = ("mila",),
):
"""
Check whether a GPU type is sufficiently used on each node.
Log a warning for each node where the ratio of jobs using the GPU type is lower than the given threshold.

Parameters
----------
gpu_type: str
GPU type to check.
time_interval: timedelta
If given, only jobs that ran in [now - time_interval, now] will be used for checking.
Default is last 24 hours.
If None, all jobs are used.
minimum_runtime: timedelta
If given, only jobs that ran for at least this minimum runtime will be used for checking.
Default is 5 minutes.
If None, set to 0.
threshold: float
A value between 0 and 1 representing the minimum expected ratio of jobs that use the given GPU type
w.r.t. running jobs on each node. A warning is logged if the computed ratio is lower than this threshold.
min_tasks: int
Minimum number of jobs required on a cluster node for the check to be performed.
A node is checked only if it ran at least `min_tasks` jobs,
or if its cluster is listed in `ignore_min_tasks_for_clusters`.
ignore_min_tasks_for_clusters: Sequence
Clusters to check even if nodes from those clusters don't have `min_tasks` jobs.
"""
# Parse time_interval
start, end, clip_time = None, None, False
if time_interval is not None:
end = datetime.now(tz=MTL)
start = end - time_interval
clip_time = True

# Parse minimum_runtime
if minimum_runtime is None:
minimum_runtime = timedelta(seconds=0)

# Get data frame. We clip time if start and end are available,
# so that minimum_runtime is compared to job running time in given interval.
df = load_job_series(start=start, end=end, clip_time=clip_time)

# Add a boolean column `gpu_task_`: True (counted as 1) for each job running on the given GPU type.
df.loc[:, "gpu_task_"] = df["allocated.gpu_type"] == gpu_type
# Add a column `task_` with value 1 for each job. Used later to count jobs in a groupby().
df.loc[:, "task_"] = 1

# Group jobs.
ff = (
# Select only jobs where elapsed time >= minimum runtime and gres_gpu > 0
df[
(df["elapsed_time"] >= minimum_runtime.total_seconds())
& (df["allocated.gres_gpu"] > 0)
]
# `nodes` is a list of nodes. We explode this column to count each job for each node where it is running
.explode("nodes")
# Then we group by cluster name and nodes,
.groupby(["cluster_name", "nodes"])[["gpu_task_", "task_"]]
# and we sum on gpu_task_ and task_
.sum()
)
# Finally, we compute GPU usage.
ff["gpu_usage_"] = ff["gpu_task_"] / ff["task_"]

# We can now check GPU usage.
ignore_min_tasks_for_clusters = set(ignore_min_tasks_for_clusters or ())
for row in ff.itertuples():
cluster_name, node = row.Index
nb_gpu_tasks = row.gpu_task_
nb_tasks = row.task_
gpu_usage = row.gpu_usage_
if gpu_usage < threshold and (
cluster_name in ignore_min_tasks_for_clusters or nb_tasks >= min_tasks
):
# We warn if gpu usage < threshold and if
# either we are on a cluster listed in `ignore_min_tasks_for_clusters`,
# or there are enough jobs on the node.
logger.warning(
f"[{cluster_name}][{node}] insufficient usage for GPU {gpu_type}: "
f"{round(gpu_usage * 100, 2)} % ({nb_gpu_tasks}/{nb_tasks}), "
f"minimum required: {round(threshold * 100, 2)} %"
)
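
For reference, a minimal usage sketch of the new check (the parameter values below are illustrative, not taken from this PR):

from datetime import timedelta

from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node

# Warn about every node where less than 20 % of the GPU jobs from the last week
# used an A100; nodes with fewer than 5 such jobs are skipped, except on the "mila" cluster.
check_gpu_type_usage_per_node(
    gpu_type="A100",
    time_interval=timedelta(days=7),
    threshold=0.20,
    min_tasks=5,
    ignore_min_tasks_for_clusters=("mila",),
)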
4 changes: 3 additions & 1 deletion sarc/jobs/series.py
@@ -321,7 +321,9 @@ def load_job_series(
"gpu_utilization", "cpu_utilization", "gpu_memory", "gpu_power", "system_memory"
- Optional job series fields, added if clip_time is True:
"unclipped_start" and "unclipped_end"
- - Optional user info fields if job users found. See `_user_to_series` for user fields.
+ - Optional user info fields if job users found.
+   Fields from `User.dict()` in format `user.<flattened dot-separated field>`,
+   special field `user.primary_email` containing either `user.mila.email` or fallback `job.user`.
"""

# If fields is a list, convert it to a renaming dict with same old and new names.
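
The `user.primary_email` fallback documented above can be pictured with a small hypothetical sketch (not the actual `load_job_series` implementation; the column names below are assumptions about the flattened frame):

import pandas as pd

# Assume a flattened job frame where `user` holds job.user and
# `user.mila.email` holds the matched user's Mila email (possibly missing).
df = pd.DataFrame(
    {
        "user": ["alice@cluster", "bob@cluster"],
        "user.mila.email": ["alice@mila.example", None],
    }
)
# `user.primary_email` falls back to the raw job user when no Mila email is known.
df["user.primary_email"] = df["user.mila.email"].fillna(df["user"])
print(df["user.primary_email"].tolist())  # ['alice@mila.example', 'bob@cluster']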
11 changes: 11 additions & 0 deletions tests/functional/usage_alerts/common.py
@@ -0,0 +1,11 @@
def _get_warnings(text: str, module: str) -> list:
"""Parse warning messages from given text (typically caplog.text)"""
warnings = []
for line in text.split("\n"):
line = line.strip()
if line.startswith("WARNING "):
line_content = line[len("WARNING") :].lstrip()
line_ref, warning_msg = line_content.split(" ", maxsplit=1)
assert line_ref.startswith(f"{module}:"), line_ref
warnings.append(warning_msg.strip())
return warnings
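
As a quick sanity check of the helper above, here is a minimal sketch assuming pytest's default caplog.text format (the log line and line number below are illustrative):

from tests.functional.usage_alerts.common import _get_warnings

# One hypothetical warning line as it would appear in caplog.text.
text = (
    "WARNING  sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py:133 "
    "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %\n"
)
warnings = _get_warnings(text, module="sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py")
assert warnings == [
    "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %"
]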
170 changes: 170 additions & 0 deletions tests/functional/usage_alerts/test_alert_gpu_usage.py
@@ -0,0 +1,170 @@
"""
Initial jobs in read_only_db (for reference):

| | job_id | cluster_name | nodes | allocated.gres_gpu | allocated.gpu_type | start_time | end_time | elapsed_time |
|---:|----------:|:---------------|:----------------------------------|---------------------:|:---------------------|:--------------------------|:--------------------------|---------------:|
| 0 | 1 | raisin | ['cn-c021'] | 1 | | 2023-02-14 00:01:00-05:00 | 2023-02-14 12:01:00-05:00 | 43200 |
| 1 | 2 | raisin | ['cn-c021'] | 1 | | 2023-02-14 06:01:00-05:00 | 2023-02-14 18:01:00-05:00 | 43200 |
| 2 | 3 | raisin | ['cn-c021'] | 1 | | 2023-02-14 12:01:00-05:00 | 2023-02-15 00:01:00-05:00 | 43200 |
| 3 | 4 | raisin | ['cn-c021'] | 1 | | 2023-02-14 18:01:00-05:00 | 2023-02-15 06:01:00-05:00 | 43200 |
| 4 | 5 | raisin | ['cn-c021'] | 1 | | 2023-02-15 00:01:00-05:00 | 2023-02-15 12:01:00-05:00 | 43200 |
| 5 | 6 | raisin | ['cn-c021'] | 1 | | 2023-02-15 06:01:00-05:00 | 2023-02-15 18:01:00-05:00 | 43200 |
| 6 | 7 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 |
| 7 | 8 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 |
| 8 | 9 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 9 | 10 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 10 | 11 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 11 | 12 | raisin | ['bart'] | 1 | | 2023-02-16 18:01:00-05:00 | 2023-02-17 06:01:00-05:00 | 43200 |
| 12 | 13 | raisin | ['cn-c021', 'cn-c022', 'cn-d001'] | 1 | | 2023-02-17 00:01:00-05:00 | 2023-02-17 12:01:00-05:00 | 43200 |
| 13 | 14 | raisin | ['cn-c021'] | 1 | | 2023-02-17 06:01:00-05:00 | 2023-02-17 18:01:00-05:00 | 43200 |
| 14 | 15 | fromage | ['cn-c021'] | 1 | | 2023-02-17 12:01:00-05:00 | 2023-02-18 00:01:00-05:00 | 43200 |
| 15 | 16 | patate | ['cn-c021'] | 1 | | 2023-02-17 18:01:00-05:00 | 2023-02-18 06:01:00-05:00 | 43200 |
| 16 | 17 | raisin | ['cn-c021'] | 1 | | 2023-02-18 00:01:00-05:00 | 2023-02-18 12:01:00-05:00 | 43200 |
| 17 | 18 | raisin | ['cn-c021'] | 1 | | 2023-02-18 06:01:00-05:00 | 2023-02-18 18:01:00-05:00 | 43200 |
| 18 | 19 | mila | ['cn-c021'] | 1 | | 2023-02-18 12:01:00-05:00 | 2023-02-19 00:01:00-05:00 | 43200 |
| 19 | 20 | raisin | ['cn-c021'] | 1 | | 2023-02-18 18:01:00-05:00 | 2023-02-19 06:01:00-05:00 | 43200 |
| 20 | 1000000 | raisin | ['cn-c017'] | 1 | | 2023-02-19 00:01:00-05:00 | 2023-02-19 12:01:00-05:00 | 43200 |
| 21 | 1000000 | raisin | ['cn-b099'] | 1 | | 2023-02-19 06:01:00-05:00 | 2023-02-19 18:01:00-05:00 | 43200 |
| 22 | 23 | raisin | ['cn-c021'] | 2 | A100 | 2023-02-19 12:01:00-05:00 | 2023-02-20 00:01:00-05:00 | 43200 |
| 23 | 999999999 | mila | ['cn-c021'] | 0 | | 2023-02-19 18:01:00-05:00 | 2023-02-20 12:01:00-05:00 | 64800 |
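
For example, with no time filter, cluster `raisin` has 17 GPU jobs on node `cn-c021`
(job IDs 1-11, 13, 14, 17, 18, 20 and 23), of which only job 23 uses an A100, hence the
5.88 % (1/17) ratio expected in several test cases below.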
"""

import functools
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from .common import _get_warnings

get_warnings = functools.partial(
_get_warnings, module="sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py"
)


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
"params,expected",
[
(
# Check GPU A100 with no interval (i.e. all jobs)
dict(gpu_type="A100", time_interval=None, minimum_runtime=None),
[
"[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %",
"[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
],
),
(
# Check GPU A100 with no interval (i.e. all jobs) and minimum runtime
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=timedelta(seconds=43200),
),
[
"[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %",
"[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
],
),
(
# Check GPU A100 with no interval (i.e. all jobs) and minimum runtime too high
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=timedelta(seconds=43200 + 1),
),
[],
),
(
# Check GPU A100 for all jobs with a lower threshold (5 %).
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=None,
threshold=5 / 100,
),
[
"[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
# "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 5.0 %",
"[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
],
),
(
# Check GPU A100 for all jobs with threshold zero.
dict(
gpu_type="A100", time_interval=None, minimum_runtime=None, threshold=0
),
[],
),
(
# Check GPU A100 for all jobs, with a 10 % threshold and the minimum number of jobs per DRAC node set to 2.
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=None,
threshold=10 / 100,
min_tasks=2,
),
[
# "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
"[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 10.0 %",
# "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
],
),
(
# Check GPU A100 with the default time interval (last 24 hours).
# Only 2 jobs (rows 6 and 7 above, i.e. job IDs 7 and 8) match the current frozen mock time.
dict(gpu_type="A100"),
[
"[raisin][cn-c021] insufficient usage for GPU A100: 0.0 % (0/2), minimum required: 100.0 %",
],
),
(
# Check unknown GPU.
dict(gpu_type="unknown", time_interval=None),
[
"[fromage][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[mila][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[patate][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][bart] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-b099] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c017] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/17), minimum required: 100.0 %",
"[raisin][cn-c022] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-d001] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
],
),
],
)
def test_check_gpu_type_usage_per_node(params, expected, caplog):
check_gpu_type_usage_per_node(**params)
assert get_warnings(caplog.text) == expected