accommodate asymmetric runs and make table prettier

interTwin-eu · Oct 16, 2024 · 93f2d62 · 93f2d62
1 parent 381f72e
commit 93f2d62
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 35 deletions.
diff --git a/scaling-test/generate_comm_plot.py b/scaling-test/generate_comm_plot.py
@@ -1,4 +1,5 @@
 import matplotlib
+from torch._dynamo.skipfiles import comptime
 
 # Doing this because otherwise I get an error about X11 Forwarding which I believe
 # is due to the server trying to pass the image to the client computer
@@ -32,7 +33,7 @@ def create_combined_comm_overhead_df(logs_dir: Path, pattern: str) -> pd.DataFra
         if not match:
             continue
 
-        # Getting the captured regex groups, i.e. the contents of "(\d+)"
+        # Getting the captured regex groups, e.g. the contents of "(\d+)"
         strategy, num_gpus, global_rank = match.groups()
         df = pd.read_csv(entry)
         df["num_gpus"] = num_gpus
@@ -49,32 +50,40 @@ def get_comp_fraction_full_array(df: pd.DataFrame) -> np.ndarray:
     """Creates a MxN NumPy array where M is the number of strategies
     and N is the number of GPU configurations. The strategies are sorted
     alphabetically and the GPU configurations are sorted in ascending number
-    of GPUs."""
+    of GPUs.
+    """
     unique_num_gpus = sorted(df["num_gpus"].unique(), key=lambda x: int(x))
     unique_strategies = sorted(df["strategy"].unique())
     values = []
 
+    print(f"{'-'*50}")
+    print(f"{'Strategy':>12} | {'Num. GPUs':>10} | {'Comp.':>9} | {'Comm.':>8}")
+    print(f"{'-'*50}")
     for strategy in unique_strategies:
         strategy_values = []
         for num_gpus in unique_num_gpus:
             filtered_df = df[
                 (df["strategy"] == strategy) & (df["num_gpus"] == num_gpus)
             ]
+            log_string = f"{strategy:>12} | {num_gpus:>10}"
+
+            # Not every configuration needs to be tested, but still print a row for it
+            if len(filtered_df) == 0: 
+                comp_time, comm_time = np.NaN, np.NaN
+                strategy_values.append(np.NaN)
+
+                log_string += f" | {'(NO DATA)':>15}"
+            else: 
+                comp_time, comm_time = calculate_comp_and_comm_time(df=filtered_df)
+                comp_fraction = comp_time / (comp_time + comm_time)
+                strategy_values.append(comp_fraction)
 
-            # For now we assume that we test all strategies for all sizes, but this might
-            # be useful to change later
-            assert len(filtered_df) > 0
-            comp_time, comm_time = calculate_comp_and_comm_time(df=filtered_df)
-            comp_fraction = comp_time / (comp_time + comm_time)
-            strategy_values.append(comp_fraction)
-
-            print(
-                f"Strategy: {strategy:>10}, "
-                f"Num. GPUs: {num_gpus}, "
-                f"Comp. time: {comp_time:>5.2f}s, "
-                f"Comm. time: {comm_time:>5.2f}s"
-            )
+                log_string += f" | {comp_time:>8.2f}s"
+                log_string += f" | {comm_time:>8.2f}s"
+
+            print(log_string)
         values.append(strategy_values)
+    print(f"{'-'*50}")
 
     return np.array(values)
 
@@ -91,6 +100,7 @@ def main():
 
     logs_dir = Path("profiling_logs")
     logs_dir.mkdir(parents=True, exist_ok=True)
+
     pattern = r"profile_(\w+)_(\d+)_(\d+)\.csv$"
     df = create_combined_comm_overhead_df(logs_dir=logs_dir, pattern=pattern)
     values = get_comp_fraction_full_array(df)

diff --git a/scaling-test/plot.py b/scaling-test/plot.py
@@ -19,9 +19,9 @@ def create_stacked_plot(
             correspond to the GPU numbers in 'gpu_numbers' sorted numerically
             in ascending order.
     """
-    assert values.shape[0] == len(strategy_labels)
-    assert values.shape[1] == len(gpu_numbers)
-    assert (values >= 0).all() and (values <= 1).all()
+    # assert values.shape[0] == len(strategy_labels)
+    # assert values.shape[1] == len(gpu_numbers)
+    # assert (values >= 0).all() and (values <= 1).all()
 
     strategy_labels = sorted(strategy_labels)
     gpu_numbers = sorted(gpu_numbers, key=lambda x: int(x))
@@ -37,57 +37,59 @@ def create_stacked_plot(
 
     # Creating an offset to "center" around zero
     static_offset = len(strategy_labels) / 2 - 0.5
-    for idx in range(len(strategy_labels)):
-        dynamic_bar_offset = idx - static_offset
+    for strategy_idx in range(len(strategy_labels)):
+        dynamic_bar_offset = strategy_idx - static_offset
+
 
         # Drawing the stacked bars
         ax.bar(
             x=x + dynamic_bar_offset * width,
-            height=values[idx],
+            height=values[strategy_idx],
             width=width,
             color=comp_color,
         )
         ax.bar(
             x=x + dynamic_bar_offset * width,
-            height=complements[idx],
+            height=complements[strategy_idx],
             width=width,
-            bottom=values[idx],
+            bottom=values[strategy_idx],
             color=comm_color,
         )
 
-        for i in range(len(gpu_numbers)):
+        for gpu_idx in range(len(gpu_numbers)):
             # Positioning the labels under the stacks
-            dynamic_label_offset = idx - static_offset
+            if np.isnan(values[strategy_idx, gpu_idx]): 
+                continue
+            dynamic_label_offset = strategy_idx - static_offset
             ax.text(
-                x=x[i] + dynamic_label_offset * width,
+                x=x[gpu_idx] + dynamic_label_offset * width,
                 y=-0.1,
-                s=strategy_labels[idx],
+                s=strategy_labels[strategy_idx],
                 ha="center",
                 va="top",
                 fontsize=10,
                 rotation=60,
             )
 
-    # Adjust the bottom of the plot to accommodate the new labels
 
     ax.set_ylabel("Computation fraction")
     ax.set_title("Computation vs Communication Time by Method")
     ax.set_xticks(x)
     ax.set_xticklabels(gpu_numbers)
-    ax.set_ylim(0, 1)  # Ensure y-axis goes from 0 to 1
+    ax.set_ylim(0, 1)
 
+    # Building the legend entries manually so that their colors match the bars
     legend_elements = [
         Patch(facecolor=comm_color, label="Communication"),
         Patch(facecolor=comp_color, label="Computation"),
     ]
-    # ax.legend(handles=legend_elements, loc="upper right")
-    # ax.legend(handles=legend_elements, loc="upper right", bbox_to_anchor=(1.2, 1))
 
+    # Positioning the legend outside of the plot so that it does not obstruct the bars
     ax.legend(
         handles=legend_elements,
-        loc="upper left",  # Anchor point of the legend
-        bbox_to_anchor=(0.80, 1.22),  # Position outside the plot
-        borderaxespad=0.0,  # No padding between legend and axes
+        loc="upper left",  
+        bbox_to_anchor=(0.80, 1.22),  
+        borderaxespad=0.0,  
     )
     fig.subplots_adjust(bottom=0.25)
     fig.subplots_adjust(top=0.85)

diff --git a/use-cases/eurac/plots/comm_plot.png b/use-cases/eurac/plots/comm_plot.png
diff --git a/use-cases/eurac/slurm.sh b/use-cases/eurac/slurm.sh
@@ -100,7 +100,7 @@ if [ "$DIST_MODE" == "horovod" ] ; then
 	srun --cpu-bind=none \
 	--ntasks-per-node=$SLURM_GPUS_PER_NODE \
 	--cpus-per-task=$SLURM_CPUS_PER_GPU \
-	--ntasks=$SLURM_GPUS_PER_NODE \
+	--ntasks=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES)) \
 	$TRAINING_CMD
 else # E.g. for 'deepspeed' or 'ddp'
   srun --cpu-bind=none --ntasks-per-node=1 \