diff --git a/scaling-test/generate_comm_plot.py b/scaling-test/generate_comm_plot.py index f4bd4416..f2b059ed 100644 --- a/scaling-test/generate_comm_plot.py +++ b/scaling-test/generate_comm_plot.py @@ -1,4 +1,5 @@ import matplotlib +from torch._dynamo.skipfiles import comptime # Doing this because otherwise I get an error about X11 Forwarding which I believe # is due to the server trying to pass the image to the client computer @@ -32,7 +33,7 @@ def create_combined_comm_overhead_df(logs_dir: Path, pattern: str) -> pd.DataFra if not match: continue - # Getting the captured regex groups, i.e. the contents of "(\d+)" + # Getting the captured regex groups, e.g. the contents of "(\d+)" strategy, num_gpus, global_rank = match.groups() df = pd.read_csv(entry) df["num_gpus"] = num_gpus @@ -49,32 +50,40 @@ def get_comp_fraction_full_array(df: pd.DataFrame) -> np.ndarray: """Creates a MxN NumPy array where M is the number of strategies and N is the number of GPU configurations. The strategies are sorted alphabetically and the GPU configurations are sorted in ascending number - of GPUs.""" + of GPUs. + """ unique_num_gpus = sorted(df["num_gpus"].unique(), key=lambda x: int(x)) unique_strategies = sorted(df["strategy"].unique()) values = [] + print(f"{'-'*50}") + print(f"{'Strategy':>12} | {'Num. GPUs':>10} | {'Comp.':>9} | {'Comm.':>8}") + print(f"{'-'*50}") for strategy in unique_strategies: strategy_values = [] for num_gpus in unique_num_gpus: filtered_df = df[ (df["strategy"] == strategy) & (df["num_gpus"] == num_gpus) ] + log_string = f"{strategy:>12} | {num_gpus:>10}" + + # Don't need to test all configurations, still want to print it + if len(filtered_df) == 0: + comp_time, comm_time = np.NaN, np.NaN + strategy_values.append(np.NaN) + + log_string += f" | {'(NO DATA)':>15}" + else: + comp_time, comm_time = calculate_comp_and_comm_time(df=filtered_df) + comp_fraction = comp_time / (comp_time + comm_time) + strategy_values.append(comp_fraction) - # For now we assume that we test all strategies for all sizes, but this might - # be useful to change later - assert len(filtered_df) > 0 - comp_time, comm_time = calculate_comp_and_comm_time(df=filtered_df) - comp_fraction = comp_time / (comp_time + comm_time) - strategy_values.append(comp_fraction) - - print( - f"Strategy: {strategy:>10}, " - f"Num. GPUs: {num_gpus}, " - f"Comp. time: {comp_time:>5.2f}s, " - f"Comm. time: {comm_time:>5.2f}s" - ) + log_string += f" | {comp_time:>8.2f}s" + log_string += f" | {comm_time:>8.2f}s" + + print(log_string) values.append(strategy_values) + print(f"{'-'*50}") return np.array(values) @@ -91,6 +100,7 @@ def main(): logs_dir = Path("profiling_logs") logs_dir.mkdir(parents=True, exist_ok=True) + pattern = r"profile_(\w+)_(\d+)_(\d+)\.csv$" df = create_combined_comm_overhead_df(logs_dir=logs_dir, pattern=pattern) values = get_comp_fraction_full_array(df) diff --git a/scaling-test/plot.py b/scaling-test/plot.py index 91859d09..07f4aebf 100644 --- a/scaling-test/plot.py +++ b/scaling-test/plot.py @@ -19,9 +19,9 @@ def create_stacked_plot( correspond to the GPU numbers in 'gpu_numbers' sorted numerically in ascending order. """ - assert values.shape[0] == len(strategy_labels) - assert values.shape[1] == len(gpu_numbers) - assert (values >= 0).all() and (values <= 1).all() + # assert values.shape[0] == len(strategy_labels) + # assert values.shape[1] == len(gpu_numbers) + # assert (values >= 0).all() and (values <= 1).all() strategy_labels = sorted(strategy_labels) gpu_numbers = sorted(gpu_numbers, key=lambda x: int(x)) @@ -37,57 +37,59 @@ def create_stacked_plot( # Creating an offset to "center" around zero static_offset = len(strategy_labels) / 2 - 0.5 - for idx in range(len(strategy_labels)): - dynamic_bar_offset = idx - static_offset + for strategy_idx in range(len(strategy_labels)): + dynamic_bar_offset = strategy_idx - static_offset + # Drawing the stacked bars ax.bar( x=x + dynamic_bar_offset * width, - height=values[idx], + height=values[strategy_idx], width=width, color=comp_color, ) ax.bar( x=x + dynamic_bar_offset * width, - height=complements[idx], + height=complements[strategy_idx], width=width, - bottom=values[idx], + bottom=values[strategy_idx], color=comm_color, ) - for i in range(len(gpu_numbers)): + for gpu_idx in range(len(gpu_numbers)): # Positioning the labels under the stacks - dynamic_label_offset = idx - static_offset + if np.isnan(values[strategy_idx, gpu_idx]): + continue + dynamic_label_offset = strategy_idx - static_offset ax.text( - x=x[i] + dynamic_label_offset * width, + x=x[gpu_idx] + dynamic_label_offset * width, y=-0.1, - s=strategy_labels[idx], + s=strategy_labels[strategy_idx], ha="center", va="top", fontsize=10, rotation=60, ) - # Adjust the bottom of the plot to accommodate the new labels ax.set_ylabel("Computation fraction") ax.set_title("Computation vs Communication Time by Method") ax.set_xticks(x) ax.set_xticklabels(gpu_numbers) - ax.set_ylim(0, 1) # Ensure y-axis goes from 0 to 1 + ax.set_ylim(0, 1) + # Setting the appropriate colors since the legend is manual legend_elements = [ Patch(facecolor=comm_color, label="Communication"), Patch(facecolor=comp_color, label="Computation"), ] - # ax.legend(handles=legend_elements, loc="upper right") - # ax.legend(handles=legend_elements, loc="upper right", bbox_to_anchor=(1.2, 1)) + # Positioning the legend outside of the plot to not obstruct ax.legend( handles=legend_elements, - loc="upper left", # Anchor point of the legend - bbox_to_anchor=(0.80, 1.22), # Position outside the plot - borderaxespad=0.0, # No padding between legend and axes + loc="upper left", + bbox_to_anchor=(0.80, 1.22), + borderaxespad=0.0, ) fig.subplots_adjust(bottom=0.25) fig.subplots_adjust(top=0.85) diff --git a/use-cases/eurac/plots/comm_plot.png b/use-cases/eurac/plots/comm_plot.png index 771b96d8..2a8425cd 100644 Binary files a/use-cases/eurac/plots/comm_plot.png and b/use-cases/eurac/plots/comm_plot.png differ diff --git a/use-cases/eurac/slurm.sh b/use-cases/eurac/slurm.sh index e1ec58b1..e907e54c 100644 --- a/use-cases/eurac/slurm.sh +++ b/use-cases/eurac/slurm.sh @@ -100,7 +100,7 @@ if [ "$DIST_MODE" == "horovod" ] ; then srun --cpu-bind=none \ --ntasks-per-node=$SLURM_GPUS_PER_NODE \ --cpus-per-task=$SLURM_CPUS_PER_GPU \ - --ntasks=$SLURM_GPUS_PER_NODE \ + --ntasks=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES)) \ $TRAINING_CMD else # E.g. for 'deepspeed' or 'ddp' srun --cpu-bind=none --ntasks-per-node=1 \