Skip to content

Commit

Permalink
Select sub-dataframe with given cluster names to compute stats, then use full dataframe to check warnings
Browse files Browse the repository at this point in the history

Add supplementary tests
  • Loading branch information
notoraptor committed Sep 10, 2024
1 parent 0144f20 commit 7b6ff2f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 23 deletions.
34 changes: 11 additions & 23 deletions sarc/alerts/usage_alerts/cluster_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from datetime import datetime, timedelta
from typing import List, Optional

import pandas

from sarc.config import MTL
from sarc.jobs.series import compute_time_frames, load_job_series

Expand Down Expand Up @@ -46,35 +44,24 @@ def check_nb_jobs_per_cluster_per_time(
# Get data frame
df = load_job_series(start=start, end=end, clip_time=clip_time)

# List clusters
if not cluster_names:
cluster_names = sorted(df["cluster_name"].unique())

# Split data frame into time frames using `time_unit`
tf = compute_time_frames(df, frame_size=time_unit)

# List timestamps
timestamps = sorted(tf["timestamp"].unique())
# List clusters
if not cluster_names:
cluster_names = sorted(df["cluster_name"].unique())

# Generate a dataframe associating each timestamp to number of all clusters.
f_nb_clusters_per_timestamp = pandas.DataFrame(
{
"timestamp": timestamps,
"nb_all_clusters": [len(cluster_names)] * len(timestamps),
}
)
# Generate a dataframe associating each timestamp to number of jobs which run at this timestamp.
f_nb_jobs_per_timestamp = (
# Generate a dataframe for stats.
f_stats = (
# Use only lines associated with given clusters
tf[tf["cluster_name"].isin(cluster_names)]
# Group by timestamp
.groupby(["timestamp"])[["job_id"]]
# And count jobs by counting column `job_id`
.count()
)
# Generate a dataframe associating each timestamp to number of clusters and number of jobs
f_stats = f_nb_clusters_per_timestamp.merge(
f_nb_jobs_per_timestamp, on="timestamp", how="left"
)
# Compute cluster usage: number of jobs per cluster per timestamp
f_stats["jobs_per_cluster"] = f_stats["job_id"] / f_stats["nb_all_clusters"]
f_stats["jobs_per_cluster"] = f_stats["job_id"] / len(cluster_names)
# Compute average cluster usage
avg = f_stats["jobs_per_cluster"].mean()
# Compute standard deviation for cluster usage
Expand Down Expand Up @@ -103,7 +90,8 @@ def check_nb_jobs_per_cluster_per_time(
# NB: For these cases, number of jobs is always 0
for cluster_name in cluster_names:
if cluster_name not in exclude:
for timestamp in timestamps:
# Iter for each timestamp available in data frame
for timestamp in sorted(tf["timestamp"].unique()):
key = (cluster_name, timestamp)
nb_jobs = 0
if key not in founds and nb_jobs < threshold:
Expand Down
26 changes: 26 additions & 0 deletions tests/functional/usage_alerts/test_alert_cluster_usage.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,32 @@
"[another_cluster][2023-11-21 00:01:00-05:00] insufficient cluster usage: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
],
),
# Check and use only "raisin" cluster to compute stats
(
dict(time_interval=None, cluster_names=["raisin"]),
[
"[fromage][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-19 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-20 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[patate][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[patate][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster usage: 0 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
],
),
# Check above case with 1 ignored cluster
(
dict(time_interval=None, cluster_names=["raisin"], exclude=["patate"]),
[
"[fromage][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-19 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[mila][2023-02-20 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
# "[patate][2023-02-17 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
# "[patate][2023-02-18 00:01:00-05:00] insufficient cluster usage: 1 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster usage: 0 jobs / cluster / time unit; minimum required: 1.7738563936773766 (3.2857142857142856 - 2 * 0.7559289460184545); time unit: 1 day, 0:00:00",
],
),
],
)
def test_check_nb_jobs_per_cluster_per_time(params, expected, caplog):
Expand Down

0 comments on commit 7b6ff2f

Please sign in to comment.