[SARC-330] Implement alerts: proportion of GPU jobs with GPU-specific Prometheus stats on a given node lower than a threshold X #135
@@ -20,14 +20,17 @@ def __init__(self, name):
        self.threshold = None


# pylint: disable=too-many-branches
def check_prometheus_stats_occurrences(
    time_interval: Optional[timedelta] = timedelta(days=7),
    time_unit=timedelta(days=1),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
    cluster_names: Optional[List[str]] = None,
-   group_by_node: Optional[Sequence[str]] = ("mila",),
+   group_by_node: Union[bool, Sequence[str]] = ("mila",),
+   min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None,
    nb_stddev=2,
+   with_gres_gpu=False,
+   prometheus_stats=("cpu_utilization", "system_memory"),
):
    """
    Check if we have scraped Prometheus stats for enough jobs per node per cluster per time unit.
@@ -56,8 +59,10 @@ def check_prometheus_stats_occurrences(
        If a cluster in this list does not appear in jobs, a warning will be logged.

        If empty (or not specified), use all clusters available among jobs retrieved with time_interval.
-   group_by_node: Sequence
-       Optional sequence of clusters to group by node.
+   group_by_node: Sequence | bool
+       Either a sequence of clusters to group by node,
+       or False to indicate no cluster to group by node (equivalent to an empty sequence),
+       or True to indicate that all clusters must be grouped by node.
        For clusters in this list, we will check each node separately (i.e. a "group" is a cluster node).
        By default, we check the entire cluster (i.e. the "group" is the cluster itself).
    min_jobs_per_group: int | dict
@@ -71,6 +76,11 @@ def check_prometheus_stats_occurrences(
        Amount of standard deviation to remove from average statistics to compute checking threshold.
        Threshold is computed as:
        max(0, average - nb_stddev * stddev)
+   with_gres_gpu: bool
+       If True, check only jobs which have allocated.gres_gpu > 0 (GPU jobs).
+       If False (default), check only jobs which have allocated.gres_gpu == 0 (CPU jobs).
+   prometheus_stats: Sequence[str]
+       Prometheus stats to check. Default: "cpu_utilization", "system_memory"
    """

    # Parse time_interval and get data frame
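As an illustration of the two new parameters, here is a minimal usage sketch. It is not part of the diff; the import path is assumed to be the same module the tests below import from, and the cluster names are the fake ones used in the tests.

```python
from datetime import timedelta

# Assumed import path (same module as check_prometheus_stats_for_gpu_jobs in the tests below).
from sarc.alerts.usage_alerts.prometheus_stats_occurrences import (
    check_prometheus_stats_occurrences,
)

# Check GPU jobs only, with GPU-oriented stats, grouping only "raisin" by node.
check_prometheus_stats_occurrences(
    time_interval=timedelta(days=7),
    cluster_names=["raisin", "mila"],   # illustrative cluster names from the test fixtures
    group_by_node=["raisin"],           # True would group every cluster by node
    with_gres_gpu=True,                 # GPU jobs only (allocated.gres_gpu > 0)
    prometheus_stats=("gpu_utilization", "gpu_memory"),
)
```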
@@ -81,24 +91,41 @@ def check_prometheus_stats_occurrences(
    clip_time = True
    df = load_job_series(start=start, end=end, clip_time=clip_time)

-   # Parse minimum_runtime, and select only jobs where
-   # elapsed time >= minimum runtime and allocated.gres_gpu == 0
[review comment] ... by the way, it looks like GPU jobs were ignored before ^^
+   # Parse minimum_runtime
    if minimum_runtime is None:
        minimum_runtime = timedelta(seconds=0)
-   df = df[
-       (df["elapsed_time"] >= minimum_runtime.total_seconds())
-       & (df["allocated.gres_gpu"] == 0)
-   ]
+   # Select only jobs where elapsed time >= minimum runtime and
+   # jobs are GPU or CPU jobs, depending on `with_gres_gpu`
+   selection_elapsed_time = df["elapsed_time"] >= minimum_runtime.total_seconds()
+   selection_gres_gpu = (
+       (df["allocated.gres_gpu"] > 0)
+       if with_gres_gpu
+       else (df["allocated.gres_gpu"] == 0)
+   )
+   df = df[selection_elapsed_time & selection_gres_gpu]

+   # List clusters
+   cluster_names = cluster_names or sorted(df["cluster_name"].unique())

+   # If df is empty, warn for each cluster that we can't check Prometheus stats.
+   if df.empty:
+       for cluster_name in cluster_names:
+           logger.warning(
+               f"[{cluster_name}] no Prometheus data available: no job found"
+           )
+       # As there's nothing to check, we return immediately.
+       return

    # Split data frame into time frames using `time_unit`
    df = compute_time_frames(df, frame_size=time_unit)

    # Duplicates lines per node to count each job for each node where it runs
    df = df.explode("nodes")

+   # Parse group_by_node
+   if isinstance(group_by_node, bool):
+       group_by_node = list(df["cluster_name"].unique()) if group_by_node else ()

    # If cluster not in group_by_node,
    # then we must count jobs for the entire cluster, not per node.
    # To simplify the code, let's just define 1 common node for all cluster jobs
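For readers unfamiliar with the pandas call above, here is a standalone sketch (not from the PR; node names are illustrative) of what `df.explode("nodes")` does: each job row is repeated once per node it ran on, so per-node counting becomes a simple groupby.

```python
import pandas as pd

# One row per job, each with a list of nodes it ran on.
df = pd.DataFrame({"job_id": [1, 2], "nodes": [["cn-c021"], ["cn-c021", "cn-c022"]]})
print(df.explode("nodes"))
# Output (roughly):
#    job_id    nodes
# 0       1  cn-c021
# 1       2  cn-c021
# 1       2  cn-c022
```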
@@ -109,14 +136,13 @@ def check_prometheus_stats_occurrences(
    df.loc[:, "task_"] = 1

    # Generate Prometheus context for each Prometheus stat we want to check.
-   prom_contexts = [
-       PrometheusStatInfo(name=prom_col)
-       for prom_col in ["cpu_utilization", "system_memory"]
-   ]
+   prom_contexts = [PrometheusStatInfo(name=prom_col) for prom_col in prometheus_stats]

    # Add columns to check if job has prometheus stats
    for prom in prom_contexts:
-       df.loc[:, prom.col_has] = ~df[prom.name].isnull()
+       # NB: Use DataFrame.reindex() to add column with NaN values if missing:
+       # (2024/09/26) https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html
+       df.loc[:, prom.col_has] = ~(df.reindex(columns=[prom.name])[prom.name].isnull())

    # Group per timestamp per cluster per node, and count jobs and prometheus stats.
    # If "cluster_names" are given, use only jobs in these clusters.
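A small standalone sketch (not from the PR; column names are illustrative) of why `reindex(columns=...)` is used here: if a requested stat column is absent from the data frame, plain indexing would raise a KeyError, while reindex yields the column filled with NaN, which the `isnull()` check then treats as "no stats".

```python
import pandas as pd

df = pd.DataFrame({"gpu_utilization": [0.5, None]})
# "gpu_power" is absent; reindex returns it as an all-NaN column instead of raising KeyError.
print(df.reindex(columns=["gpu_power"])["gpu_power"].isnull())
# 0    True
# 1    True
# Name: gpu_power, dtype: bool
```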
@@ -175,3 +201,43 @@ def check_prometheus_stats_occurrences(
            logger.warning(
                f"[{cluster_name}] no Prometheus data available: no job found"
            )


+def check_prometheus_stats_for_gpu_jobs(
[review comment] ... my mistake, I wasn't fully awake.
+   time_interval: Optional[timedelta] = timedelta(days=7),
+   time_unit=timedelta(days=1),
+   minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
+   cluster_names: Optional[List[str]] = None,
+   # For GPU jobs, default behaviour is to group each cluster by nodes for checking.
+   group_by_node: Union[bool, Sequence[str]] = True,
+   min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None,
+   nb_stddev=2,
+):
+   """
+   Check if we have scraped Prometheus stats for enough GPU jobs per node per cluster per time unit.
+   Log a warning for each node / cluster where the ratio of GPU jobs with Prometheus stats is lower than
+   a threshold computed using mean and standard deviation statistics from all clusters.

+   For more info about parameters, see documentation for `check_prometheus_stats_occurrences`.
+   """
+   return check_prometheus_stats_occurrences(
+       time_interval=time_interval,
+       time_unit=time_unit,
+       minimum_runtime=minimum_runtime,
+       cluster_names=cluster_names,
+       group_by_node=group_by_node,
+       min_jobs_per_group=min_jobs_per_group,
+       nb_stddev=nb_stddev,
+       # We are looking for GPU jobs
+       with_gres_gpu=True,
+       # We are looking for GPU-related Prometheus stats
+       prometheus_stats=(
+           "gpu_utilization",
+           "gpu_utilization_fp16",
+           "gpu_utilization_fp32",
+           "gpu_utilization_fp64",
+           "gpu_sm_occupancy",
+           "gpu_memory",
+           "gpu_power",
+       ),
+   )
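A minimal invocation sketch for the new wrapper (not part of the diff; the import path is taken from the test file below):

```python
from sarc.alerts.usage_alerts.prometheus_stats_occurrences import (
    check_prometheus_stats_for_gpu_jobs,
)

# With defaults: last 7 days, 1-day time unit, every cluster grouped by node,
# and a warning logged for each group whose ratio of GPU jobs with GPU Prometheus
# stats falls below max(0, average - 2 * stddev).
check_prometheus_stats_for_gpu_jobs()
```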
@@ -0,0 +1,87 @@
import functools
import re

import pytest

from sarc.alerts.usage_alerts.prometheus_stats_occurrences import (
    check_prometheus_stats_for_gpu_jobs,
)
from sarc.client import get_jobs
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from ..jobs.test_func_job_statistics import generate_fake_timeseries

PARAMS = {
    # Check with default params. In the last 7 days from now (mock time: 2023-11-22),
    # there are only 2 jobs from 1 cluster in 1 timestamp, both with no GPU stats.
    # So the threshold will be 0 everywhere, and no warning will be printed.
    "default": dict(),
    # Check with no time_interval.
    "no_time_interval": dict(time_interval=None),
    # Check with no time_interval and a low amount of stddev (0.25).
    "std_025": dict(time_interval=None, nb_stddev=0.25),
    # Check with no time_interval, 0.25 stddev, and 1 extra cluster.
    # Expected 1 more warning, no other changes.
    "std_025_clusters_extra": dict(
        time_interval=None,
        nb_stddev=0.25,
        cluster_names=[
            "raisin",
            "patate",
            "fromage",
            "mila",
            "invisible-cluster",
        ],
    ),
    # Check with no time_interval, 0.25 stddev, with only 2 clusters. Thresholds will change.
    "std_025_clusters_2": dict(
        time_interval=None, nb_stddev=0.25, cluster_names=["raisin", "mila"]
    ),
    # Check with no time_interval, 0.25 stddev, and no group_by_node.
    "std_025_group_none": dict(time_interval=None, nb_stddev=0.25, group_by_node=()),
    # Check with no time_interval, 0.25 stddev, and group_by_node for all clusters.
    # Same as if group_by_node is not specified, as only `raisin` triggers some warnings.
    "std_025_group_full": dict(
        time_interval=None,
        nb_stddev=0.25,
        group_by_node=["raisin", "patate", "fromage", "mila"],
    ),
    # Check with no time_interval, 0.25 stddev, group_by_node for all clusters, and min jobs per group set to 2.
    "std_025_group_full_min_jobs_2": dict(
        time_interval=None,
        nb_stddev=0.25,
        group_by_node=["raisin", "patate", "fromage", "mila"],
        min_jobs_per_group=2,
    ),
    # Check with no time_interval, 0.25 stddev, group_by_node for all clusters,
    # and min jobs set to 3 for only `raisin`.
    # No warning, since the timestamp when `raisin` triggers warnings has only 2 jobs on this cluster.
    "std_025_group_full_min_jobs_raisin": dict(
        time_interval=None,
        nb_stddev=0.25,
        group_by_node=["raisin", "patate", "fromage", "mila"],
        min_jobs_per_group={"raisin": 3},
    ),
}


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize("params", PARAMS.values(), ids=PARAMS.keys())
def test_check_prometheus_stats_for_gpu_jobs(
    params, monkeypatch, caplog, file_regression
):
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )

    for job in get_jobs():
        job.statistics(save=True)
    check_prometheus_stats_for_gpu_jobs(**params)
    file_regression.check(
        re.sub(
            r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_stats_occurrences:prometheus_stats_occurrences.py:[0-9]+ +",
            "",
            caplog.text,
        )
    )
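As a usage note (assuming a standard pytest setup), a single parametrized case can be run by its id, e.g. `pytest <path-to-this-test-file> -k std_025_group_none`.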
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00
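A quick arithmetic check (not from the PR) that the reported minima follow the documented formula max(0, average - nb_stddev * stddev):

```python
average, stddev = 0.9411764705882353, 0.242535625036333

print(max(0.0, average - 2 * stddev))     # ≈ 0.4561052205155693 (default nb_stddev=2, above)
print(max(0.0, average - 0.25 * stddev))  # ≈ 0.880542564329152  (nb_stddev=0.25, used below)
```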
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00
@@ -0,0 +1,8 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[invisible-cluster] no Prometheus data available: no job found
@@ -0,0 +1,7 @@
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00
[review comment] This new parameter is not tested in test_check_prometheus_scraping_stats, so only the CPU jobs case is tested there.
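A hypothetical follow-up (not part of this PR, and assuming test_check_prometheus_scraping_stats is parametrized over keyword arguments forwarded to check_prometheus_stats_occurrences, like the GPU test above): covering the new flag could be as simple as one extra parametrization entry.

```python
# Hypothetical addition to the PARAMS dict of test_check_prometheus_scraping_stats,
# exercising the GPU branch of the job selection (allocated.gres_gpu > 0):
PARAMS["gpu_jobs"] = dict(with_gres_gpu=True)
```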