[SARC-330] Implement alerts: proportion of GPU jobs with GPU-specific Prometheus stats on a given node lower than a threshold X #135

Merged (7 commits) on Nov 19, 2024
94 changes: 80 additions & 14 deletions sarc/alerts/usage_alerts/prometheus_stats_occurrences.py
@@ -20,14 +20,17 @@ def __init__(self, name):
self.threshold = None


# pylint: disable=too-many-branches
def check_prometheus_stats_occurrences(
time_interval: Optional[timedelta] = timedelta(days=7),
time_unit=timedelta(days=1),
minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
cluster_names: Optional[List[str]] = None,
group_by_node: Optional[Sequence[str]] = ("mila",),
group_by_node: Union[bool, Sequence[str]] = ("mila",),
min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None,
nb_stddev=2,
with_gres_gpu=False,
Collaborator comment: This new parameter is not tested in test_check_prometheus_scraping_stats, so we only test the case of CPU jobs.

prometheus_stats=("cpu_utilization", "system_memory"),
):
"""
Check whether we have scraped Prometheus stats for enough jobs per node per cluster per time unit.
@@ -56,8 +59,10 @@ def check_prometheus_stats_occurrences(
If a cluster in this list does not appear in jobs, a warning will be logged.

If empty (or not specified), use all clusters available among jobs retrieved with time_interval.
group_by_node: Sequence
Optional sequence of clusters to group by node.
group_by_node: Sequence | bool
Either a sequence of clusters to group by node,
or False to group no cluster by node (equivalent to an empty sequence),
or True to group all clusters by node.
For clusters in this list, we check each node separately (i.e. a "group" is a cluster node).
By default, we check the entire cluster (i.e. the "group" is the cluster itself).
min_jobs_per_group: int | dict
@@ -71,6 +76,11 @@
Amount of standard deviation to remove from average statistics to compute checking threshold.
Threshold is computed as:
max(0, average - nb_stddev * stddev)
with_gres_gpu: bool
If True, check only jobs which have allocated.gres_gpu > 0 (GPU jobs).
If False (default), check only jobs which have allocated.gres_gpu == 0 (CPU jobs).
prometheus_stats: Sequence[str]
Prometheus stats to check. Default: "cpu_utilization", "system_memory"
"""

# Parse time_interval and get data frame
@@ -81,24 +91,41 @@
clip_time = True
df = load_job_series(start=start, end=end, clip_time=clip_time)

# Parse minimum_runtime, and select only jobs where
# elapsed time >= minimum runtime and allocated.gres_gpu == 0
@nurbal (Collaborator), Nov 18, 2024: ... besides, it looks like we were ignoring GPU jobs before ^^

# Parse minimum_runtime
if minimum_runtime is None:
minimum_runtime = timedelta(seconds=0)
df = df[
(df["elapsed_time"] >= minimum_runtime.total_seconds())
& (df["allocated.gres_gpu"] == 0)
]
# Select only jobs where elapsed time >= minimum runtime and
# which are GPU or CPU jobs, depending on `with_gres_gpu`
selection_elapsed_time = df["elapsed_time"] >= minimum_runtime.total_seconds()
selection_gres_gpu = (
(df["allocated.gres_gpu"] > 0)
if with_gres_gpu
else (df["allocated.gres_gpu"] == 0)
)
df = df[selection_elapsed_time & selection_gres_gpu]

# List clusters
cluster_names = cluster_names or sorted(df["cluster_name"].unique())

# If df is empty, warn for each cluster that we can't check Prometheus stats.
if df.empty:
for cluster_name in cluster_names:
logger.warning(
f"[{cluster_name}] no Prometheus data available: no job found"
)
# As there's nothing to check, we return immediately.
return

# Split data frame into time frames using `time_unit`
df = compute_time_frames(df, frame_size=time_unit)

# Duplicate rows per node to count each job for each node where it runs
df = df.explode("nodes")

# Parse group_by_node
if isinstance(group_by_node, bool):
group_by_node = list(df["cluster_name"].unique()) if group_by_node else ()

# If cluster not in group_by_node,
# then we must count jobs for the entire cluster, not per node.
# To simplify the code, let's just define 1 common node for all cluster jobs
@@ -109,14 +136,13 @@
df.loc[:, "task_"] = 1

# Generate Prometheus context for each Prometheus stat we want to check.
prom_contexts = [
PrometheusStatInfo(name=prom_col)
for prom_col in ["cpu_utilization", "system_memory"]
]
prom_contexts = [PrometheusStatInfo(name=prom_col) for prom_col in prometheus_stats]

# Add columns to check if job has prometheus stats
for prom in prom_contexts:
df.loc[:, prom.col_has] = ~df[prom.name].isnull()
# NB: Use DataFrame.reindex() to add column with NaN values if missing:
# (2024/09/26) https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html
df.loc[:, prom.col_has] = ~(df.reindex(columns=[prom.name])[prom.name].isnull())
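# Illustration of the reindex() trick above, on a hypothetical frame:
# pd.DataFrame({"a": [1]}).reindex(columns=["b"])["b"] is a Series of NaN,
# so col_has becomes False for every row when the stat column is absent,
# instead of df["b"] raising a KeyError.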

# Group per timestamp per cluster per node, and count jobs and prometheus stats.
# If "cluster_names" are given, use only jobs in these clusters.
@@ -175,3 +201,43 @@
logger.warning(
f"[{cluster_name}] no Prometheus data available: no job found"
)


def check_prometheus_stats_for_gpu_jobs(
Collaborator comment: ... my mistake, I was not fully awake. test_check_prometheus_stats_for_gpu_jobs does test the GPU case through the call to check_prometheus_stats_for_gpu_jobs. Not really a unit test, but that's fine with me :-)

time_interval: Optional[timedelta] = timedelta(days=7),
time_unit=timedelta(days=1),
minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
cluster_names: Optional[List[str]] = None,
# For GPU jobs, default behaviour is to group each cluster by nodes for checking.
group_by_node: Union[bool, Sequence[str]] = True,
min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None,
nb_stddev=2,
):
"""
Check whether we have scraped Prometheus stats for enough GPU jobs per node per cluster per time unit.
Log a warning for each node / cluster where the ratio of GPU jobs with Prometheus stats is lower than
a threshold computed using mean and standard deviation statistics from all clusters.

To get more info about parameters, see documentation for `check_prometheus_stats_occurrences`.
"""
return check_prometheus_stats_occurrences(
time_interval=time_interval,
time_unit=time_unit,
minimum_runtime=minimum_runtime,
cluster_names=cluster_names,
group_by_node=group_by_node,
min_jobs_per_group=min_jobs_per_group,
nb_stddev=nb_stddev,
# We are looking for GPU jobs
with_gres_gpu=True,
# We are looking for GPU-related Prometheus stats
prometheus_stats=(
"gpu_utilization",
"gpu_utilization_fp16",
"gpu_utilization_fp32",
"gpu_utilization_fp64",
"gpu_sm_occupancy",
"gpu_memory",
"gpu_power",
),
)
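
For reference, a minimal sketch of the thresholding rule documented above, max(0, average - nb_stddev * stddev), applied to per-group ratios of jobs that have a given Prometheus stat. The function name, column layout, and cluster node names below are illustrative, not the module's actual internals:

import pandas as pd

def flag_low_ratio_groups(ratios: pd.Series, nb_stddev: float = 2.0) -> pd.Series:
    # ratios: per-group ratio of jobs having the stat,
    # e.g. indexed by (timestamp, cluster, node).
    threshold = max(0.0, ratios.mean() - nb_stddev * ratios.std())
    # Groups strictly below the threshold are the ones that would be warned about.
    return ratios[ratios < threshold]

# A node with no stats at all stands out against the others:
ratios = pd.Series({"cn-a001": 1.0, "cn-b002": 0.9, "cn-c021": 0.0})
print(flag_low_ratio_groups(ratios, nb_stddev=0.25))  # -> cn-c021    0.0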
@@ -0,0 +1,87 @@
import functools
import re

import pytest

from sarc.alerts.usage_alerts.prometheus_stats_occurrences import (
check_prometheus_stats_for_gpu_jobs,
)
from sarc.client import get_jobs
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from ..jobs.test_func_job_statistics import generate_fake_timeseries

PARAMS = {
# Check with default params. In the last 7 days from now (mock time: 2023-11-22),
# there are only 2 jobs from 1 cluster in 1 timestamp, both with no GPU stats.
# So threshold will be 0 everywhere, and no warning will be printed.
"default": dict(),
# Check with no time_interval.
"no_time_interval": dict(time_interval=None),
# Check with no time_interval and low amount of stddev (0.25).
"std_025": dict(time_interval=None, nb_stddev=0.25),
# Check with no time_interval, 0.25 stddev, and 1 extra cluster.
# Expected: 1 more warning, no other changes.
"std_025_clusters_extra": dict(
time_interval=None,
nb_stddev=0.25,
cluster_names=[
"raisin",
"patate",
"fromage",
"mila",
"invisible-cluster",
],
),
# Check with no time_interval, 0.25 stddev, with only 2 clusters. Thresholds will change.
"std_025_clusters_2": dict(
time_interval=None, nb_stddev=0.25, cluster_names=["raisin", "mila"]
),
# Check with no time_interval, 0.25 stddev, and no group_by_node.
"std_025_group_none": dict(time_interval=None, nb_stddev=0.25, group_by_node=()),
# Check with no time_interval, 0.25 stddev, and group_by_node for all clusters.
# Same as if group_by_node were not specified, since only `raisin` triggers warnings.
"std_025_group_full": dict(
time_interval=None,
nb_stddev=0.25,
group_by_node=["raisin", "patate", "fromage", "mila"],
),
# Check with no time_interval, 0.25 stddev, group_by_node for all clusters, and min jobs to 2.
"std_025_group_full_min_jobs_2": dict(
time_interval=None,
nb_stddev=0.25,
group_by_node=["raisin", "patate", "fromage", "mila"],
min_jobs_per_group=2,
),
# Check with no time_interval, 0.25 stddev, group_by_node for all clusters,
# and min jobs set to 2 for only `raisin`.
# No warning, since timestamp when `raisin` triggers warnings has only 2 jobs on this cluster.
"std_025_group_full_min_jobs_raisin": dict(
time_interval=None,
nb_stddev=0.25,
group_by_node=["raisin", "patate", "fromage", "mila"],
min_jobs_per_group={"raisin": 3},
),
}


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize("params", PARAMS.values(), ids=PARAMS.keys())
def test_check_prometheus_stats_for_gpu_jobs(
params, monkeypatch, caplog, file_regression
):
monkeypatch.setattr(
"sarc.jobs.series.get_job_time_series", generate_fake_timeseries
)

for job in get_jobs():
job.statistics(save=True)
check_prometheus_stats_for_gpu_jobs(**params)
file_regression.check(
re.sub(
r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_stats_occurrences:prometheus_stats_occurrences.py:[0-9]+ +",
"",
caplog.text,
)
)
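
Outside the test harness, invoking the new checker is a one-liner; a minimal sketch (the arguments below just restate the defaults shown in the diff):

from datetime import timedelta

from sarc.alerts.usage_alerts.prometheus_stats_occurrences import (
    check_prometheus_stats_for_gpu_jobs,
)

# Warnings are emitted through the module logger for each (cluster, node,
# time unit) whose ratio of GPU jobs with Prometheus stats falls below the
# computed threshold.
check_prometheus_stats_for_gpu_jobs(
    time_interval=timedelta(days=7),
    group_by_node=True,  # GPU default: check each cluster node separately
)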
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
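
To read these regression lines against the formula above: the minimum required ratio is mean - nb_stddev * stddev, here 0.9411764705882353 - 2 * 0.242535625036333 = 0.4561052205155693, and cn-c021's observed ratio of 0.0 falls below it, hence the warnings.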
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
@@ -0,0 +1,8 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[invisible-cluster] no Prometheus data available: no job found
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00