[SARC-327] Implement alerts: proportion of jobs with a given GPU type on a given node lower than a threshold X #127

Merged (3 commits), Sep 9, 2024
99 changes: 99 additions & 0 deletions sarc/alerts/usage_alerts/gpu_usage.py
@@ -0,0 +1,99 @@
import logging
from datetime import datetime, timedelta
from typing import Optional, Sequence

from sarc.config import MTL
from sarc.jobs.series import load_job_series

logger = logging.getLogger(__name__)


def check_gpu_type_usage_per_node(
gpu_type: str,
time_interval: Optional[timedelta] = timedelta(hours=24),
minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
threshold=1.0,
min_tasks=0,
ignore_min_tasks_for_clusters: Optional[Sequence[str]] = ("mila",),
):
"""
Check whether a GPU type is sufficiently used on each node.
Log a warning for each node where the ratio of jobs using the GPU type is lower than the given threshold.

Parameters
----------
gpu_type: str
GPU type to check.
time_interval: timedelta
If given, only jobs that ran in [now - time_interval, now] will be used for checking.
Default is last 24 hours.
If None, all jobs are used.
minimum_runtime: timedelta
If given, only jobs that ran for at least this minimum runtime will be used for checking.
Default is 5 minutes.
If None, set to 0.
threshold: float
A value between 0 and 1 representing the minimum expected ratio of jobs that use the given GPU type
w.r.t. running jobs on each node. A warning is logged if the computed ratio is lower than this threshold.
min_tasks: int
Minimum number of jobs required on a cluster node for the check to be performed.
A node is checked only if it ran at least `min_tasks` jobs,
or if its cluster is listed in `ignore_min_tasks_for_clusters`.
ignore_min_tasks_for_clusters: Sequence
Clusters to check even if nodes from those clusters don't have `min_tasks` jobs.
"""
# Parse time_interval
start, end, clip_time = None, None, False
if time_interval is not None:
end = datetime.now(tz=MTL)
start = end - time_interval
clip_time = True

# Parse minimum_runtime
if minimum_runtime is None:
minimum_runtime = timedelta(seconds=0)

# Get data frame. We clip time if start and end are available,
# so that minimum_runtime is compared to job running time in given interval.
df = load_job_series(start=start, end=end, clip_time=clip_time)

# Add a boolean column `gpu_task_`: True (counted as 1) for each job running on the given GPU type.
df.loc[:, "gpu_task_"] = df["allocated.gpu_type"] == gpu_type
# Add a column `task_` with value 1 for each job. Used later to count jobs in a groupby().
df.loc[:, "task_"] = 1

# Group jobs.
ff = (
# Select only jobs where elapsed time >= minimum runtime and gres_gpu > 0
df[
(df["elapsed_time"] >= minimum_runtime.total_seconds())
& (df["allocated.gres_gpu"] > 0)
]
# `nodes` is a list of nodes. We explode this column to count each job for each node where it is running
.explode("nodes")
# Then we group by cluster name and nodes,
.groupby(["cluster_name", "nodes"])[["gpu_task_", "task_"]]
# and we sum on gpu_task_ and task_
.sum()
)
# Finally, we compute GPU usage.
ff["gpu_usage_"] = ff["gpu_task_"] / ff["task_"]

# We can now check GPU usage.
ignore_min_tasks_for_clusters = set(ignore_min_tasks_for_clusters or ())
for row in ff.itertuples():
cluster_name, node = row.Index
nb_gpu_tasks = row.gpu_task_
nb_tasks = row.task_
gpu_usage = row.gpu_usage_
if gpu_usage < threshold and (
cluster_name in ignore_min_tasks_for_clusters or nb_tasks >= min_tasks
):
# We warn if gpu usage < threshold and if
# either we are on a cluster listed in `ignore_min_tasks_for_clusters`,
# or there are enough jobs on the node.
logger.warning(
f"[{cluster_name}][{node}] insufficient usage for GPU {gpu_type}: "
f"{round(gpu_usage * 100, 2)} % ({nb_gpu_tasks}/{nb_tasks}), "
f"minimum required: {round(threshold * 100, 2)} %"
)
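
For reference, a minimal usage sketch of the new check (the parameter values below are illustrative, not taken from this PR):

from datetime import timedelta

from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node

# Warn about every node where less than 20 % of the GPU jobs from the last week
# used an A100; nodes with fewer than 5 such jobs are skipped, except on the "mila" cluster.
check_gpu_type_usage_per_node(
    gpu_type="A100",
    time_interval=timedelta(days=7),
    threshold=0.20,
    min_tasks=5,
    ignore_min_tasks_for_clusters=("mila",),
)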
4 changes: 3 additions & 1 deletion sarc/jobs/series.py
@@ -321,7 +321,9 @@ def load_job_series(
"gpu_utilization", "cpu_utilization", "gpu_memory", "gpu_power", "system_memory"
- Optional job series fields, added if clip_time is True:
"unclipped_start" and "unclipped_end"
- - Optional user info fields if job users found. See `_user_to_series` for user fields.
+ - Optional user info fields if job users found.
+   Fields from `User.dict()` in format `user.<flattened dot-separated field>`,
+   special field `user.primary_email` containing either `user.mila.email` or fallback `job.user`.
"""

# If fields is a list, convert it to a renaming dict with same old and new names.
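
The `user.primary_email` fallback documented above can be pictured with a small hypothetical sketch (not the actual `load_job_series` implementation; the column names below are assumptions about the flattened frame):

import pandas as pd

# Assume a flattened job frame where `user` holds job.user and
# `user.mila.email` holds the matched user's Mila email (possibly missing).
df = pd.DataFrame(
    {
        "user": ["alice@cluster", "bob@cluster"],
        "user.mila.email": ["alice@mila.example", None],
    }
)
# `user.primary_email` falls back to the raw job user when no Mila email is known.
df["user.primary_email"] = df["user.mila.email"].fillna(df["user"])
print(df["user.primary_email"].tolist())  # ['alice@mila.example', 'bob@cluster']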
11 changes: 11 additions & 0 deletions tests/functional/usage_alerts/common.py
@@ -0,0 +1,11 @@
def _get_warnings(text: str, module: str) -> list:
"""Parse warning messages from given text (typically caplog.text)"""
warnings = []
for line in text.split("\n"):
line = line.strip()
if line.startswith("WARNING "):
line_content = line[len("WARNING") :].lstrip()
line_ref, warning_msg = line_content.split(" ", maxsplit=1)
assert line_ref.startswith(f"{module}:"), line_ref
warnings.append(warning_msg.strip())
return warnings
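
As a quick sanity check of the helper above, here is a minimal sketch assuming pytest's default caplog.text format (the log line and line number below are illustrative):

from tests.functional.usage_alerts.common import _get_warnings

# One hypothetical warning line as it would appear in caplog.text.
text = (
    "WARNING  sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py:133 "
    "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %\n"
)
warnings = _get_warnings(text, module="sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py")
assert warnings == [
    "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %"
]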
170 changes: 170 additions & 0 deletions tests/functional/usage_alerts/test_alert_gpu_usage.py
@@ -0,0 +1,170 @@
"""
Initial jobs in read_only_db (for reference):

| | job_id | cluster_name | nodes | allocated.gres_gpu | allocated.gpu_type | start_time | end_time | elapsed_time |
|---:|----------:|:---------------|:----------------------------------|---------------------:|:---------------------|:--------------------------|:--------------------------|---------------:|
| 0 | 1 | raisin | ['cn-c021'] | 1 | | 2023-02-14 00:01:00-05:00 | 2023-02-14 12:01:00-05:00 | 43200 |
| 1 | 2 | raisin | ['cn-c021'] | 1 | | 2023-02-14 06:01:00-05:00 | 2023-02-14 18:01:00-05:00 | 43200 |
| 2 | 3 | raisin | ['cn-c021'] | 1 | | 2023-02-14 12:01:00-05:00 | 2023-02-15 00:01:00-05:00 | 43200 |
| 3 | 4 | raisin | ['cn-c021'] | 1 | | 2023-02-14 18:01:00-05:00 | 2023-02-15 06:01:00-05:00 | 43200 |
| 4 | 5 | raisin | ['cn-c021'] | 1 | | 2023-02-15 00:01:00-05:00 | 2023-02-15 12:01:00-05:00 | 43200 |
| 5 | 6 | raisin | ['cn-c021'] | 1 | | 2023-02-15 06:01:00-05:00 | 2023-02-15 18:01:00-05:00 | 43200 |
| 6 | 7 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 |
| 7 | 8 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 |
| 8 | 9 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 9 | 10 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 10 | 11 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 11 | 12 | raisin | ['bart'] | 1 | | 2023-02-16 18:01:00-05:00 | 2023-02-17 06:01:00-05:00 | 43200 |
| 12 | 13 | raisin | ['cn-c021', 'cn-c022', 'cn-d001'] | 1 | | 2023-02-17 00:01:00-05:00 | 2023-02-17 12:01:00-05:00 | 43200 |
| 13 | 14 | raisin | ['cn-c021'] | 1 | | 2023-02-17 06:01:00-05:00 | 2023-02-17 18:01:00-05:00 | 43200 |
| 14 | 15 | fromage | ['cn-c021'] | 1 | | 2023-02-17 12:01:00-05:00 | 2023-02-18 00:01:00-05:00 | 43200 |
| 15 | 16 | patate | ['cn-c021'] | 1 | | 2023-02-17 18:01:00-05:00 | 2023-02-18 06:01:00-05:00 | 43200 |
| 16 | 17 | raisin | ['cn-c021'] | 1 | | 2023-02-18 00:01:00-05:00 | 2023-02-18 12:01:00-05:00 | 43200 |
| 17 | 18 | raisin | ['cn-c021'] | 1 | | 2023-02-18 06:01:00-05:00 | 2023-02-18 18:01:00-05:00 | 43200 |
| 18 | 19 | mila | ['cn-c021'] | 1 | | 2023-02-18 12:01:00-05:00 | 2023-02-19 00:01:00-05:00 | 43200 |
| 19 | 20 | raisin | ['cn-c021'] | 1 | | 2023-02-18 18:01:00-05:00 | 2023-02-19 06:01:00-05:00 | 43200 |
| 20 | 1000000 | raisin | ['cn-c017'] | 1 | | 2023-02-19 00:01:00-05:00 | 2023-02-19 12:01:00-05:00 | 43200 |
| 21 | 1000000 | raisin | ['cn-b099'] | 1 | | 2023-02-19 06:01:00-05:00 | 2023-02-19 18:01:00-05:00 | 43200 |
| 22 | 23 | raisin | ['cn-c021'] | 2 | A100 | 2023-02-19 12:01:00-05:00 | 2023-02-20 00:01:00-05:00 | 43200 |
| 23 | 999999999 | mila | ['cn-c021'] | 0 | | 2023-02-19 18:01:00-05:00 | 2023-02-20 12:01:00-05:00 | 64800 |
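
For example, with no time filter, cluster `raisin` has 17 GPU jobs on node `cn-c021`
(job IDs 1-11, 13, 14, 17, 18, 20 and 23), of which only job 23 uses an A100, hence the
5.88 % (1/17) ratio expected in several test cases below.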
"""

import functools
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from .common import _get_warnings

get_warnings = functools.partial(
_get_warnings, module="sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py"
)


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
"params,expected",
[
(
# Check GPU A100 with no interval (i.e. all jobs)
dict(gpu_type="A100", time_interval=None, minimum_runtime=None),
[
"[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %",
"[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
],
),
(
# Check GPU A100 with no interval (i.e. all jobs) and minimum runtime
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=timedelta(seconds=43200),
),
[
"[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %",
"[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
],
),
(
# Check GPU A100 with no interval (i.e. all jobs) and minimum runtime too high
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=timedelta(seconds=43200 + 1),
),
[],
),
(
# Check GPU A100 for all jobs with a lower threshold (5 %).
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=None,
threshold=5 / 100,
),
[
"[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
# "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 5.0 %",
"[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
"[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
],
),
(
# Check GPU A100 for all jobs with threshold zero.
dict(
gpu_type="A100", time_interval=None, minimum_runtime=None, threshold=0
),
[],
),
(
# Check GPU A100 for all jobs, with a 10 % threshold and the minimum number of jobs per DRAC node set to 2.
dict(
gpu_type="A100",
time_interval=None,
minimum_runtime=None,
threshold=10 / 100,
min_tasks=2,
),
[
# "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
"[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
"[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 10.0 %",
# "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
# "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
],
),
(
# Check GPU A100 with the default time interval (last 24 hours).
# Only 2 jobs (rows 6 and 7 above, i.e. job IDs 7 and 8) match the current frozen mock time.
dict(gpu_type="A100"),
[
"[raisin][cn-c021] insufficient usage for GPU A100: 0.0 % (0/2), minimum required: 100.0 %",
],
),
(
# Check unknown GPU.
dict(gpu_type="unknown", time_interval=None),
[
"[fromage][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[mila][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[patate][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][bart] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-b099] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c017] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/17), minimum required: 100.0 %",
"[raisin][cn-c022] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
"[raisin][cn-d001] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
],
),
],
)
def test_check_gpu_type_usage_per_node(params, expected, caplog):
check_gpu_type_usage_per_node(**params)
assert get_warnings(caplog.text) == expected