Skip to content

Commit

Permalink
Select sub-dataframe with given cluster names to compute stats, then use full dataframe to check warnings
Browse files Browse the repository at this point in the history

Add supplementary tests
  • Loading branch information
notoraptor committed Sep 10, 2024
1 parent 0144f20 commit 7b6ff2f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 23 deletions.
34 changes: 11 additions & 23 deletions sarc/alerts/usage_alerts/cluster_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from datetime import datetime, timedelta
from typing import List, Optional

import pandas

from sarc.config import MTL
from sarc.jobs.series import compute_time_frames, load_job_series

Expand Down Expand Up @@ -46,35 +44,24 @@ def check_nb_jobs_per_cluster_per_time(
# Get data frame
df = load_job_series(start=start, end=end, clip_time=clip_time)

# List clusters
if not cluster_names:
cluster_names = sorted(df["cluster_name"].unique())

# Split data frame into time frames using `time_unit`
tf = compute_time_frames(df, frame_size=time_unit)

# List timestamps
timestamps = sorted(tf["timestamp"].unique())
# List clusters
if not cluster_names:
cluster_names = sorted(df["cluster_name"].unique())

# Generate a dataframe associating each timestamp to number of all clusters.
f_nb_clusters_per_timestamp = pandas.DataFrame(
{
"timestamp": timestamps,
"nb_all_clusters": [len(cluster_names)] * len(timestamps),
}
)
# Generate a dataframe associating each timestamp to number of jobs which run at this timestamp.
f_nb_jobs_per_timestamp = (
# Generate a dataframe for stats.
f_stats = (
# Use only lines associated with given clusters
tf[tf["cluster_name"].isin(cluster_names)]
# Group by timestamp
.groupby(["timestamp"])[["job_id"]]
# And count jobs by counting column `job_id`
.count()
)
# Generate a dataframe associating each timestamp to number of clusters and number of jobs
f_stats = f_nb_clusters_per_timestamp.merge(
f_nb_jobs_per_timestamp, on="timestamp", how="left"
)
# Compute cluster usage: number of jobs per cluster per timestamp
f_stats["jobs_per_cluster"] = f_stats["job_id"] / f_stats["nb_all_clusters"]
f_stats["jobs_per_cluster"] = f_stats["job_id"] / len(cluster_names)
# Compute average cluster usage
avg = f_stats["jobs_per_cluster"].mean()
# Compute standard deviation for cluster usage
Expand Down Expand Up @@ -103,7 +90,8 @@ def check_nb_jobs_per_cluster_per_time(
# NB: For these cases, number of jobs is always 0
for cluster_name in cluster_names:
if cluster_name not in exclude:
for timestamp in timestamps:
# Iter for each timestamp available in data frame
for timestamp in sorted(tf["timestamp"].unique()):
key = (cluster_name, timestamp)
nb_jobs = 0
if key not in founds and nb_jobs < threshold:
Expand Down
26 changes: 26 additions & 0 deletions tests/functional/usage_alerts/test_alert_cluster_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,32 @@
"[another_cluster][2023-11-21 00:01:00-05:00] insufficient cluster usage: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
],
),
# Check and use only "raisin" cluster to compute stats
(
dict(time_interval=None, cluster_names=["raisin"]),
[
"[fromage][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-19 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-20 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[patate][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[patate][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster usage: 0 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
],
),
# Check above case with 1 ignored cluster
(
dict(time_interval=None, cluster_names=["raisin"], exclude=["patate"]),
[
"[fromage][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-19 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-20 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
# "[patate][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
# "[patate][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster usage: 0 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
],
),
],
)
def test_check_nb_jobs_per_cluster_per_time(params, expected, caplog):
Expand Down

0 comments on commit 7b6ff2f

Please sign in to comment.