diff --git a/.gitignore b/.gitignore index 33a061af..f9754547 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ sarc_mongo *-checkpoint.ipynb sarc-test-cache dbconfig.txt +token.json +.vscode \ No newline at end of file diff --git a/examples/milatools_usage_report.py b/examples/milatools_usage_report.py new file mode 100644 index 00000000..31fc5d51 --- /dev/null +++ b/examples/milatools_usage_report.py @@ -0,0 +1,472 @@ +"""Analyze the milatools usage based on the number of jobs called 'mila-{command}'.""" + +# /// script +# requires-python = ">=3.9" +# dependencies = [ +# "matplotlib", +# "simple-parsing", +# "pandas", +# "sarc>=0.1.0", +# "pydantic", +# "tzlocal", +# ] +# /// +from __future__ import annotations + +import dataclasses +import logging +import os +import pickle +import pprint +import tempfile +from dataclasses import dataclass +from datetime import datetime, timedelta +from logging import getLogger as get_logger +from pathlib import Path +from typing import Any, Iterable, TypedDict, TypeVar, Union + +import matplotlib +import matplotlib.axes +import matplotlib.figure +import matplotlib.pyplot as plt +import pandas as pd +import pymongo +import pymongo.collection +import simple_parsing +from pandas.core.indexes.datetimes import DatetimeIndex +from typing_extensions import TypeGuard + +from sarc.config import MTL +from sarc.jobs.job import jobs_collection + +logger = get_logger(__name__) + +# Remember to set up the port forwarding if you want +# to access data from SARC. +# ssh -L 27017:localhost:27017 sarc +# (or using the LocalForward option in your ~/.ssh/config file) + +# Change this to the path to your config file. + +if "SARC_CONFIG" not in os.environ: + # TODO: Probably need to remove this, but idk how to make it work without it.. 
+ sarc_config_file = Path(__file__).parent / "milatools-sarc-client.json" + if sarc_config_file.exists(): + os.environ["SARC_CONFIG"] = str(sarc_config_file) + + +@dataclass(frozen=True, unsafe_hash=True) +class Args: + start_date: datetime | str = datetime.today().replace( + hour=0, minute=0, second=0, microsecond=0 + ).astimezone(tz=MTL) - timedelta(days=30) + + end_date: datetime = ( + datetime.today() + .replace(hour=0, minute=0, second=0, microsecond=0) + .astimezone(tz=MTL) + ) + + verbose: int = simple_parsing.field( + alias="-v", action="count", default=0, hash=False + ) + + +@dataclass(frozen=True, unsafe_hash=True) +class Period: + start_date: datetime + end_date: datetime + + +def main(): + parser = simple_parsing.ArgumentParser(description="Analyze the milatools usage.") + parser.add_arguments(Args, dest="args") + args: Args = parser.parse_args().args + start_date = args.start_date + if isinstance(start_date, str): + start_date = start_date = datetime.fromisoformat(start_date).astimezone(tz=MTL) + end_date = args.end_date + if isinstance(end_date, str): + end_date = datetime.fromisoformat(end_date).astimezone(tz=MTL) + + print("Args:") + pprint.pprint(dataclasses.asdict(args)) + + _setup_logging(args.verbose) + + period = Period(start_date, end_date) + + all_clusters = _get_all_clusters(period.start_date, period.end_date) + logger.info(f"All clusters: {all_clusters}") + + figures: list[Path] = [] + + sampling_interval = timedelta(weeks=1) + figures += make_milatools_usage_plots( + period, cluster="mila", fig_suffix="mila", sampling_interval=sampling_interval + ) + figures += make_milatools_usage_plots( + period, + cluster=sorted(set(all_clusters) - {"mila"}), + fig_suffix="drac", + sampling_interval=sampling_interval, + ) + figures += make_milatools_usage_plots( + period, cluster=None, fig_suffix="all", sampling_interval=sampling_interval + ) + # figures = make_usage_plots(args, job_name="mila-code") + # figures += make_usage_plots(args, 
job_name="mila-cpu") + + # upload_figures_to_google_drive(figures) + + +def _get_cache_dir(): + return Path(os.environ.get("SCRATCH", tempfile.gettempdir())) + + +def _get_all_clusters(start_date: datetime, end_date: datetime): + cache_dir = _get_cache_dir() + job_db: pymongo.collection.Collection = jobs_collection().get_collection() + + if ( + all_clusters_file := cache_dir / f"all_clusters_{start_date}_{end_date}.pkl" + ).exists(): + with all_clusters_file.open("rb") as f: + all_clusters = pickle.load(f) + assert _is_iterable_of(all_clusters, str) and isinstance(all_clusters, list) + else: + _period_filter = _get_filter( + start_date=start_date, end_date=end_date, cluster_name=None, name=None + ) + all_clusters: list[str] = list( + job_db.distinct("cluster_name", filter=_period_filter) + ) + with all_clusters_file.open("wb") as f: + pickle.dump(all_clusters, f) + return sorted(all_clusters) + + +def make_milatools_usage_plots( + period: Period, + cluster: str | list[str] | None, + fig_suffix: str, + sampling_interval: timedelta = timedelta(days=7), +) -> list[Path]: + if cluster is None: + cluster_suffix = " on all slurm clusters" + elif isinstance(cluster, str): + cluster_suffix = f" on the {cluster} cluster" + else: + cluster_suffix = f" on the {cluster} clusters" + + df = _get_milatools_usage_data( + period, cluster=cluster, sampling_interval=sampling_interval + ) + df["using_milatools"] = df["milatools_users"] / df["cluster_users"] + df["used_milatools_before"] = ( + df["users_this_period_that_used_milatools_before"] / df["cluster_users"] + ) + # print(df) + + # daily_counts = df.resample(rule="D").size() + axes: list[matplotlib.axes.Axes] + fig, axes = plt.subplots(sharex=True, sharey=False, ncols=2, nrows=1) + fig.suptitle(f"Statistics on the use of Milatools{cluster_suffix}") + (ax1, ax2) = axes + ax1.set_title("Adoption") + ax1.set_ylim(0, 1) + ax2.set_title(f"Users {cluster_suffix}") + df["not using milatools"] = 1 - df["using_milatools"] + df["never 
used milatools"] = 1 - df["used_milatools_before"] + df[ + [ + "using_milatools", + "used_milatools_before", + # "not using milatools", + # "never used milatools", + ] + ].plot( + kind="line", + ax=ax1, + legend=True, + xlabel="Date", + ylabel="Percentage of users using milatools", + linewidth=2.5, + color=["green", "blue"], # "lightgray", "gray"], + ) + _annotate_start_and_end(df, ax1, "using_milatools", percentage=True) + _annotate_start_and_end(df, ax1, "used_milatools_before", percentage=True) + + # In a stacked area plot, the second column is stacked on top of the first + df["cluster users"] = df["cluster_users"] - df["milatools_users"] + df[["milatools_users", "cluster users"]].plot( + kind="area", + ax=ax2, + stacked=True, + legend=True, + linewidth=2.5, + color=["green", "silver"], + ) + _annotate_start_and_end(df, ax2, "milatools_users", percentage=False) + # need to annotate using 'cluster_users' (before the subtraction) + _annotate_start_and_end(df, ax2, "cluster_users", percentage=False) + + ax1.set_yticklabels([f"{x:.0%}" for x in ax1.get_yticks()]) + + # Make all labels gray, then select the added ones and make them darker. 
+ ax1.set_yticklabels(ax1.get_yticklabels(), color="dimgray") + ax1.get_yticklabels()[-2].set_color("black") + ax1.get_yticklabels()[-1].set_color("black") + + ax2.set_yticklabels(ax2.get_yticklabels(), color="dimgray") + ax2.get_yticklabels()[-2].set_color("black") + ax2.get_yticklabels()[-1].set_color("black") + + # Set x-ticks and labels + assert isinstance(df.index, DatetimeIndex) + ax1.set_xticks(df.index) # Set all possible x-tick positions + ax2.set_xticks(df.index) # Set all possible x-tick positions + + # Apply all labels with rotation + label_every_week = df.index.strftime("%Y-%m-%d") + if sampling_interval == timedelta(days=7): + # one label every 4 weeks (roughly one per month) + ticks = [ + label_every_week[i] if i % 4 == 0 else "" + for i in range(len(label_every_week)) + ] + else: + # one label for every sample + ticks = label_every_week + ax1.set_xticklabels(ticks, rotation=45) + ax2.set_xticklabels(ticks, rotation=45) + + fig.tight_layout() + # fig.layout + fig_path = Path( + f"milatools_usage_{period.start_date.date()}_{period.end_date.date()}_{fig_suffix}.png" + ) + # plt.show() + fig.set_size_inches(12, 6) + fig.savefig(fig_path) + print(f"Figure saved at {fig_path}") + return [fig_path] + + +# Annotate the start and end values for the plots +def _annotate_start_and_end( + df: pd.DataFrame, ax: matplotlib.axes.Axes, row: str, percentage: bool +): + def _format(v): + return f"{v:.0%}" if percentage else f"{v}" + + ax.set_yticks(list(ax.get_yticks()) + [df[row].iloc[0]]) + + # ax.annotate( + # text=_format(df[row].iloc[0]), + # xy=(df.index[0], df[row].iloc[0]), + # xycoords="data", + # xytext=(-45, -15), + # textcoords="offset points", + # # add color maybe?
+ # # color="blue", + # # arrowprops=dict(arrowstyle="->", color="black"), + # fontsize="12", + # ) + ax.annotate( + _format(df[row].iloc[-1]), + (df.index[-1], df[row].iloc[-1]), + fontsize="12", + ) + + +def _get_milatools_usage_data( + args: Period, cluster: str | list[str] | None, sampling_interval: timedelta +): + cluster_suffix = f" on the {cluster} cluster" if cluster else "" + logger.info( + f"Getting milatools usage data from {args.start_date} to {args.end_date}{cluster_suffix}" + ) + + milatools_users_so_far: set[str] = set() + cluster_users_so_far: set[str] = set() + + num_milatools_users_each_period: list[int] = [] + num_cluster_users_each_period: list[int] = [] + + num_milatools_users_so_far: list[int] = [] + num_cluster_users_so_far: list[int] = [] + + num_users_this_period_that_have_used_milatools_before: list[int] = [] + + date_range = pd.date_range( + args.start_date, args.end_date, freq=sampling_interval, inclusive="both" + ) + for interval_start, interval_end in zip( + date_range.to_list()[:-1], date_range.to_list()[1:] + ): + milatools_users_that_period, cluster_users_that_period = _get_unique_users( + interval_start, interval_end, cluster=cluster + ) + if not cluster_users_that_period: + raise RuntimeError( + f"No users of the {cluster=} cluster in the period from {interval_start} to {interval_end}??" 
+ ) + + cluster_users_so_far.update(cluster_users_that_period) + milatools_users_so_far.update(milatools_users_that_period) + + users_this_period_that_have_used_milatools_before: set[str] = set( + user for user in cluster_users_that_period if user in milatools_users_so_far + ) + + # adoption_pct_overall = len(milatools_users_so_far) / len(cluster_users_so_far) + # logger.info(f"Adoption percentage so far: {adoption_pct_overall:.2%}") + + num_milatools_users_each_period.append(len(milatools_users_that_period)) + num_cluster_users_each_period.append(len(cluster_users_that_period)) + num_milatools_users_so_far.append(len(milatools_users_so_far)) + num_cluster_users_so_far.append(len(cluster_users_so_far)) + num_users_this_period_that_have_used_milatools_before.append( + len(users_this_period_that_have_used_milatools_before) + ) + + assert ( + len(date_range) - 1 + == len(num_milatools_users_each_period) + == len(num_cluster_users_each_period) + == len(num_milatools_users_so_far) + == len(num_cluster_users_so_far) + == len(num_users_this_period_that_have_used_milatools_before) + ), (len(date_range), len(num_milatools_users_each_period)) + + return pd.DataFrame( + { + "milatools_users": num_milatools_users_each_period, + "cluster_users": num_cluster_users_each_period, + "milatools_users_so_far": num_milatools_users_so_far, + "cluster_users_so_far": num_cluster_users_so_far, + "users_this_period_that_used_milatools_before": num_users_this_period_that_have_used_milatools_before, + }, + index=date_range[:-1], + ) + + +def _setup_logging(verbose: int): + import rich.logging + + logging.basicConfig( + handlers=[rich.logging.RichHandler()], + format="%(message)s", + level=logging.ERROR, + ) + + match verbose: + case 0: + logger.setLevel("WARNING") + case 1: + logger.setLevel("INFO") + case _: + logger.setLevel("DEBUG") + + +InQuery = TypedDict("InQuery", {"$in": list[Any]}) +OrQuery = TypedDict("OrQuery", {"$or": list[Any]}) +RegexQuery = TypedDict("RegexQuery", {"$regex": 
list[str]}) +Query = Union[InQuery, OrQuery, RegexQuery] + + +def _get_filter( + start_date: datetime, + end_date: datetime, + cluster_name: str | Query | None, + name: str | Query | None, +): + query: dict = { + "submit_time": {"$gte": start_date, "$lt": end_date}, + } + if cluster_name is not None: + query["cluster_name"] = cluster_name + + if name is not None: + query["name"] = name + + # _filtre = { + # "$and": [ + # {**_filtre, "submit_time": {"$gte": start}}, + # {**_filtre, "submit_time": {"$lt": end}}, + # ] + # } + return query + + +def _get_unique_users( + start_date: datetime, + end_date: datetime, + cluster: str | list[str] | Query | None = None, +) -> tuple[set[str], set[str]]: + milatools_job_names: list[str] = ["mila-code", "mila-cpu"] + cluster_suffix = f" on the {cluster} cluster" if cluster else "" + + if isinstance(cluster, list): + cluster = {"$in": cluster} + + cache_dir = Path(os.environ.get("SCRATCH", tempfile.gettempdir())) + # _hash = hashlib.md5(f"{start_date}-{end_date}-{cluster}".encode()).hexdigest() + cached_results_path = ( + Path(cache_dir) + / f"milatools-unique_users-{cluster}-{start_date}-{end_date}.pkl" + ) + + if cached_results_path.exists(): + logger.debug(f"Reading data from {cached_results_path}") + with cached_results_path.open("rb") as f: + milatools_users, all_users = pickle.load(f) + assert _is_iterable_of(milatools_users, str) and isinstance( + milatools_users, set + ) + assert _is_iterable_of(all_users, str) and isinstance(all_users, set) + return milatools_users, all_users + + job_structured_db = jobs_collection() + job_db: pymongo.collection.Collection = job_structured_db.get_collection() + + _period_filter = _get_filter(start_date, end_date, cluster, name=None) + _milatools_filter = _get_filter( + start_date, end_date, cluster, name={"$in": milatools_job_names} + ) + + all_users: set[str] = set(job_db.distinct("user", filter=_period_filter)) + milatools_users: set[str] = set(job_db.distinct("user", 
filter=_milatools_filter)) + logger.debug(f"All users:\n{all_users}") + logger.debug(f"Milatools users:\n{milatools_users}") + + n_milatools = len(milatools_users) + n_total = len(all_users) + logger.info( + f"{n_milatools} out of {n_total} ({n_milatools / n_total:.2%}) of users used milatools between " + f"{start_date.date()} and {end_date.date()}{cluster_suffix}." + ) + + assert milatools_users <= all_users + + cached_results_path.parent.mkdir(exist_ok=True) + with cached_results_path.open("wb") as f: + logger.debug(f"Saving data at {cached_results_path}") + pickle.dump((milatools_users, all_users), f) + + return milatools_users, all_users + + +T = TypeVar("T") + + +def _is_iterable_of(v: Any, t: type[T]) -> TypeGuard[Iterable[T]]: + try: + return all(isinstance(v_i, t) for v_i in v) + except TypeError: + return False + + +if __name__ == "__main__": + main() diff --git a/poetry.lock b/poetry.lock index 9b13754b..d1b32783 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. 
[[package]] name = "alabaster" @@ -2554,13 +2554,13 @@ files = [ [[package]] name = "opentelemetry-api" -version = "1.28.1" +version = "1.28.2" description = "OpenTelemetry Python API" optional = false python-versions = ">=3.8" files = [ - {file = "opentelemetry_api-1.28.1-py3-none-any.whl", hash = "sha256:bfe86c95576cf19a914497f439fd79c9553a38de0adbdc26f7cfc46b0c00b16c"}, - {file = "opentelemetry_api-1.28.1.tar.gz", hash = "sha256:6fa7295a12c707f5aebef82da3d9ec5afe6992f3e42bfe7bec0339a44b3518e7"}, + {file = "opentelemetry_api-1.28.2-py3-none-any.whl", hash = "sha256:6fcec89e265beb258fe6b1acaaa3c8c705a934bd977b9f534a2b7c0d2d4275a6"}, + {file = "opentelemetry_api-1.28.2.tar.gz", hash = "sha256:ecdc70c7139f17f9b0cf3742d57d7020e3e8315d6cffcdf1a12a905d45b19cc0"}, ] [package.dependencies] @@ -2569,34 +2569,34 @@ importlib-metadata = ">=6.0,<=8.5.0" [[package]] name = "opentelemetry-sdk" -version = "1.28.1" +version = "1.28.2" description = "OpenTelemetry Python SDK" optional = false python-versions = ">=3.8" files = [ - {file = "opentelemetry_sdk-1.28.1-py3-none-any.whl", hash = "sha256:72aad7f5fcbe37113c4ab4899f6cdeb6ac77ed3e62f25a85e3627b12583dad0f"}, - {file = "opentelemetry_sdk-1.28.1.tar.gz", hash = "sha256:100fa371b2046ffba6a340c18f0b2a0463acad7461e5177e126693b613a6ca57"}, + {file = "opentelemetry_sdk-1.28.2-py3-none-any.whl", hash = "sha256:93336c129556f1e3ccd21442b94d3521759541521861b2214c499571b85cb71b"}, + {file = "opentelemetry_sdk-1.28.2.tar.gz", hash = "sha256:5fed24c5497e10df30282456fe2910f83377797511de07d14cec0d3e0a1a3110"}, ] [package.dependencies] -opentelemetry-api = "1.28.1" -opentelemetry-semantic-conventions = "0.49b1" +opentelemetry-api = "1.28.2" +opentelemetry-semantic-conventions = "0.49b2" typing-extensions = ">=3.7.4" [[package]] name = "opentelemetry-semantic-conventions" -version = "0.49b1" +version = "0.49b2" description = "OpenTelemetry Semantic Conventions" optional = false python-versions = ">=3.8" files = [ - {file = 
"opentelemetry_semantic_conventions-0.49b1-py3-none-any.whl", hash = "sha256:dd6f3ac8169d2198c752e1a63f827e5f5e110ae9b0ce33f2aad9a3baf0739743"}, - {file = "opentelemetry_semantic_conventions-0.49b1.tar.gz", hash = "sha256:91817883b159ffb94c2ca9548509c4fe0aafce7c24f437aa6ac3fc613aa9a758"}, + {file = "opentelemetry_semantic_conventions-0.49b2-py3-none-any.whl", hash = "sha256:51e7e1d0daa958782b6c2a8ed05e5f0e7dd0716fc327ac058777b8659649ee54"}, + {file = "opentelemetry_semantic_conventions-0.49b2.tar.gz", hash = "sha256:44e32ce6a5bb8d7c0c617f84b9dc1c8deda1045a07dc16a688cc7cbeab679997"}, ] [package.dependencies] deprecated = ">=1.2.6" -opentelemetry-api = "1.28.1" +opentelemetry-api = "1.28.2" [[package]] name = "overrides" diff --git a/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py b/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py index 5ebf4c41..7acdcbb1 100644 --- a/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py +++ b/sarc/alerts/usage_alerts/prometheus_stats_occurrences.py @@ -20,14 +20,17 @@ def __init__(self, name): self.threshold = None +# pylint: disable=too-many-branches def check_prometheus_stats_occurrences( time_interval: Optional[timedelta] = timedelta(days=7), time_unit=timedelta(days=1), minimum_runtime: Optional[timedelta] = timedelta(minutes=5), cluster_names: Optional[List[str]] = None, - group_by_node: Optional[Sequence[str]] = ("mila",), + group_by_node: Union[bool, Sequence[str]] = ("mila",), min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None, nb_stddev=2, + with_gres_gpu=False, + prometheus_stats=("cpu_utilization", "system_memory"), ): """ Check if we have scrapped Prometheus stats for enough jobs per node per cluster per time unit. @@ -56,8 +59,10 @@ def check_prometheus_stats_occurrences( If a cluster in this list does not appear in jobs, a warning will be logged. If empty (or not specified), use all clusters available among jobs retrieved with time_interval. 
- group_by_node: Sequence - Optional sequence of clusters to group by node. + group_by_node: Sequence | bool + Either a sequence of clusters to group by node, + or False to indicate no cluster to group by node (equivalent to empty sequence), + or True to indicate that all clusters must be grouped by node. For clusters in this list, we will check each node separately (ie. a "group" is a cluster node). By default, we check the entire cluster (i.e. the "group" is the cluster itself). min_jobs_per_group: int | dict @@ -71,6 +76,11 @@ def check_prometheus_stats_occurrences( Amount of standard deviation to remove from average statistics to compute checking threshold. Threshold is computed as: max(0, average - nb_stddev * stddev) + with_gres_gpu: bool + If True, check only jobs which have allocated.gres_gpu > 0 (GPU jobs) + If False (default), check only jobs which have allocated.gres_gpu == 0 (CPU jobs). + prometheus_stats: Sequence[str] + Prometheus stats to check. Default: "cpu_utilization", "system_memory" """ # Parse time_interval and get data frame @@ -81,24 +91,41 @@ def check_prometheus_stats_occurrences( clip_time = True df = load_job_series(start=start, end=end, clip_time=clip_time) - # Parse minimum_runtime, and select only jobs where - # elapsed time >= minimum runtime and allocated.gres_gpu == 0 + # Parse minimum_runtime if minimum_runtime is None: minimum_runtime = timedelta(seconds=0) - df = df[ - (df["elapsed_time"] >= minimum_runtime.total_seconds()) - & (df["allocated.gres_gpu"] == 0) - ] + # Select only jobs where elapsed time >= minimum runtime and + # jobs are GPU or CPU jobs, depending on `with_gres_gpu` + selection_elapsed_time = df["elapsed_time"] >= minimum_runtime.total_seconds() + selection_gres_gpu = ( + (df["allocated.gres_gpu"] > 0) + if with_gres_gpu + else (df["allocated.gres_gpu"] == 0) + ) + df = df[selection_elapsed_time & selection_gres_gpu] # List clusters cluster_names = cluster_names or sorted(df["cluster_name"].unique()) + # If df 
is empty, warn for each cluster that we can't check Prometheus stats. + if df.empty: + for cluster_name in cluster_names: + logger.warning( + f"[{cluster_name}] no Prometheus data available: no job found" + ) + # As there's nothing to check, we return immediately. + return + # Split data frame into time frames using `time_unit` df = compute_time_frames(df, frame_size=time_unit) # Duplicates lines per node to count each job for each node where it runs df = df.explode("nodes") + # parse group_by_node + if isinstance(group_by_node, bool): + group_by_node = list(df["cluster_name"].unique()) if group_by_node else () + # If cluster not in group_by_node, # then we must count jobs for the entire cluster, not per node. # To simplify the code, let's just define 1 common node for all cluster jobs @@ -109,14 +136,13 @@ def check_prometheus_stats_occurrences( df.loc[:, "task_"] = 1 # Generate Prometheus context for each Prometheus stat we want to check. - prom_contexts = [ - PrometheusStatInfo(name=prom_col) - for prom_col in ["cpu_utilization", "system_memory"] - ] + prom_contexts = [PrometheusStatInfo(name=prom_col) for prom_col in prometheus_stats] # Add columns to check if job has prometheus stats for prom in prom_contexts: - df.loc[:, prom.col_has] = ~df[prom.name].isnull() + # NB: Use DataFrame.reindex() to add column with NaN values if missing: + # (2024/09/26) https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html + df.loc[:, prom.col_has] = ~(df.reindex(columns=[prom.name])[prom.name].isnull()) # Group per timestamp per cluster per node, and count jobs and prometheus stats. # If "cluster_names" are given, use only jobs in these clusters. 
@@ -175,3 +201,43 @@ def check_prometheus_stats_occurrences( logger.warning( f"[{cluster_name}] no Prometheus data available: no job found" ) + + +def check_prometheus_stats_for_gpu_jobs( + time_interval: Optional[timedelta] = timedelta(days=7), + time_unit=timedelta(days=1), + minimum_runtime: Optional[timedelta] = timedelta(minutes=5), + cluster_names: Optional[List[str]] = None, + # For GPU jobs, default behaviour is to group each cluster by nodes for checking. + group_by_node: Union[bool, Sequence[str]] = True, + min_jobs_per_group: Optional[Union[int, Dict[str, int]]] = None, + nb_stddev=2, +): + """ + Check if we have scraped Prometheus stats for enough GPU jobs per node per cluster per time unit. + Log a warning for each node / cluster where ratio of GPU jobs with Prometheus stats is lower than + a threshold computed using mean and standard deviation statistics from all clusters. + + To get more info about parameters, see documentation for `check_prometheus_stats_occurrences`. + """ + return check_prometheus_stats_occurrences( + time_interval=time_interval, + time_unit=time_unit, + minimum_runtime=minimum_runtime, + cluster_names=cluster_names, + group_by_node=group_by_node, + min_jobs_per_group=min_jobs_per_group, + nb_stddev=nb_stddev, + # We are looking for GPU jobs + with_gres_gpu=True, + # We are looking for GPU-related Prometheus stats + prometheus_stats=( + "gpu_utilization", + "gpu_utilization_fp16", + "gpu_utilization_fp32", + "gpu_utilization_fp64", + "gpu_sm_occupancy", + "gpu_memory", + "gpu_power", + ), + ) diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py new file mode 100644 index 00000000..3f3d0e1f --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs.py @@ -0,0 +1,87 @@ +import functools +import re + +import pytest + +from sarc.alerts.usage_alerts.prometheus_stats_occurrences import ( +
check_prometheus_stats_for_gpu_jobs, +) +from sarc.client import get_jobs +from tests.functional.jobs.test_func_load_job_series import MOCK_TIME + +from ..jobs.test_func_job_statistics import generate_fake_timeseries + +PARAMS = { + # Check with default params. In last 7 days from now (mock time: 2023-11-22), + # there are only 2 jobs from 1 cluster in 1 timestamp, both with no GPU stats. + # So threshold will be 0 everywhere, and no warning will be printed. + "default": dict(), + # Check with no time_interval. + "no_time_interval": dict(time_interval=None), + # Check with no time_interval and low amount of stddev (0.25). + "std_025": dict(time_interval=None, nb_stddev=0.25), + # Check with no time_interval, 0.25 stddev, and 1 extra cluster. + # Expected 1 more warning, no other changes. + "std_025_clusters_extra": dict( + time_interval=None, + nb_stddev=0.25, + cluster_names=[ + "raisin", + "patate", + "fromage", + "mila", + "invisible-cluster", + ], + ), + # Check with no time_interval, 0.25 stddev, with only 2 clusters. Thresholds will change. + "std_025_clusters_2": dict( + time_interval=None, nb_stddev=0.25, cluster_names=["raisin", "mila"] + ), + # Check with no time_interval, 0.25 stddev, and no group_by_node. + "std_025_group_none": dict(time_interval=None, nb_stddev=0.25, group_by_node=()), + # Check with no time_interval, 0.25 stddev, and group_by_node for all clusters. + # Same as if group_by_node is not specified, as only `raisin` triggers some warnings. + "std_025_group_full": dict( + time_interval=None, + nb_stddev=0.25, + group_by_node=["raisin", "patate", "fromage", "mila"], + ), + # Check with no time_interval, 0.25 stddev, group_by_node for all clusters, and min jobs to 2.
+ "std_025_group_full_min_jobs_2": dict( + time_interval=None, + nb_stddev=0.25, + group_by_node=["raisin", "patate", "fromage", "mila"], + min_jobs_per_group=2, + ), + # Check with no time_interval, 0.25 stddev, group_by_node for all clusters, + # and min jobs set to 3 for only `raisin`. + # No warning, since timestamp when `raisin` triggers warnings has only 2 jobs on this cluster. + "std_025_group_full_min_jobs_raisin": dict( + time_interval=None, + nb_stddev=0.25, + group_by_node=["raisin", "patate", "fromage", "mila"], + min_jobs_per_group={"raisin": 3}, + ), +} + + +@pytest.mark.freeze_time(MOCK_TIME) +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +@pytest.mark.parametrize("params", PARAMS.values(), ids=PARAMS.keys()) +def test_check_prometheus_stats_for_gpu_jobs( + params, monkeypatch, caplog, file_regression +): + monkeypatch.setattr( + "sarc.jobs.series.get_job_time_series", generate_fake_timeseries + ) + + for job in get_jobs(): + job.statistics(save=True) + check_prometheus_stats_for_gpu_jobs(**params) + file_regression.check( + re.sub( + r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_stats_occurrences:prometheus_stats_occurrences.py:[0-9]+ +", + "", + caplog.text, + ) + ) diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_default_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_default_.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt new file mode 100644 index 00000000..bc2623a9 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_no_time_interval_.txt @@ -0,0 +1,7 @@
+[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.4561052205155693 (0.9411764705882353 - 2 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt 
b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt new file mode 100644 index 00000000..c89b5913 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time 
unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt new file mode 100644 index 00000000..e498ec5f --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_2_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 
% of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.8617561180933225 (0.9285714285714286 - 0.25 * 0.2672612419124244); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt new file mode 100644 index 00000000..f8dd1246 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_clusters_extra_.txt @@ -0,0 +1,8 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] 
insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[invisible-cluster] no Prometheus data available: no job found diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt new file mode 100644 index 00000000..c89b5913 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 
0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_2_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_2_.txt new file mode 100644 index 00000000..c89b5913 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_2_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / node / cluster / time unit; minimum 
required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin][cn-c021] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / node / cluster / time unit; minimum required: 0.880542564329152 (0.9411764705882353 - 0.25 * 0.242535625036333); time unit: 1 day, 0:00:00 diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_raisin_.txt b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_full_min_jobs_raisin_.txt new file mode 100644 index 00000000..e69de29b diff --git a/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_none_.txt 
b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_none_.txt new file mode 100644 index 00000000..dae24a82 --- /dev/null +++ b/tests/functional/usage_alerts/test_prometheus_stats_for_gpu_jobs/test_check_prometheus_stats_for_gpu_jobs_std_025_group_none_.txt @@ -0,0 +1,7 @@ +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization_fp16: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization_fp32: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_utilization_fp64: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_sm_occupancy: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_memory: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 0.30151134457776363); time unit: 1 day, 0:00:00 +[2023-11-21 00:01:00-05:00][raisin] insufficient Prometheus data for gpu_power: 0.0 % of CPU jobs / cluster / time unit; minimum required: 0.8337130729464681 (0.9090909090909091 - 0.25 * 
0.30151134457776363); time unit: 1 day, 0:00:00