Skip to content

Commit

Permalink
accommodate asymmetric runs and make table prettier
Browse files Browse the repository at this point in the history
  • Loading branch information
jarlsondre authored and matbun committed Oct 16, 2024
1 parent 381f72e commit 93f2d62
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 35 deletions.
40 changes: 25 additions & 15 deletions scaling-test/generate_comm_plot.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import matplotlib
from torch._dynamo.skipfiles import comptime

# Doing this because otherwise I get an error about X11 Forwarding which I believe
# is due to the server trying to pass the image to the client computer
Expand Down Expand Up @@ -32,7 +33,7 @@ def create_combined_comm_overhead_df(logs_dir: Path, pattern: str) -> pd.DataFra
if not match:
continue

# Getting the captured regex groups, i.e. the contents of "(\d+)"
# Getting the captured regex groups, e.g. the contents of "(\d+)"
strategy, num_gpus, global_rank = match.groups()
df = pd.read_csv(entry)
df["num_gpus"] = num_gpus
Expand All @@ -49,32 +50,40 @@ def get_comp_fraction_full_array(df: pd.DataFrame) -> np.ndarray:
"""Creates a MxN NumPy array where M is the number of strategies
and N is the number of GPU configurations. The strategies are sorted
alphabetically and the GPU configurations are sorted in ascending number
of GPUs."""
of GPUs.
"""
unique_num_gpus = sorted(df["num_gpus"].unique(), key=lambda x: int(x))
unique_strategies = sorted(df["strategy"].unique())
values = []

print(f"{'-'*50}")
print(f"{'Strategy':>12} | {'Num. GPUs':>10} | {'Comp.':>9} | {'Comm.':>8}")
print(f"{'-'*50}")
for strategy in unique_strategies:
strategy_values = []
for num_gpus in unique_num_gpus:
filtered_df = df[
(df["strategy"] == strategy) & (df["num_gpus"] == num_gpus)
]
log_string = f"{strategy:>12} | {num_gpus:>10}"

# Not every configuration has to be tested, but a row is still printed for it
if len(filtered_df) == 0:
comp_time, comm_time = np.NaN, np.NaN
strategy_values.append(np.NaN)

log_string += f" | {'(NO DATA)':>15}"
else:
comp_time, comm_time = calculate_comp_and_comm_time(df=filtered_df)
comp_fraction = comp_time / (comp_time + comm_time)
strategy_values.append(comp_fraction)

# For now we assume that we test all strategies for all sizes, but this might
# be useful to change later
assert len(filtered_df) > 0
comp_time, comm_time = calculate_comp_and_comm_time(df=filtered_df)
comp_fraction = comp_time / (comp_time + comm_time)
strategy_values.append(comp_fraction)

print(
f"Strategy: {strategy:>10}, "
f"Num. GPUs: {num_gpus}, "
f"Comp. time: {comp_time:>5.2f}s, "
f"Comm. time: {comm_time:>5.2f}s"
)
log_string += f" | {comp_time:>8.2f}s"
log_string += f" | {comm_time:>8.2f}s"

print(log_string)
values.append(strategy_values)
print(f"{'-'*50}")

return np.array(values)

Expand All @@ -91,6 +100,7 @@ def main():

logs_dir = Path("profiling_logs")
logs_dir.mkdir(parents=True, exist_ok=True)

pattern = r"profile_(\w+)_(\d+)_(\d+)\.csv$"
df = create_combined_comm_overhead_df(logs_dir=logs_dir, pattern=pattern)
values = get_comp_fraction_full_array(df)
Expand Down
40 changes: 21 additions & 19 deletions scaling-test/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ def create_stacked_plot(
correspond to the GPU numbers in 'gpu_numbers' sorted numerically
in ascending order.
"""
assert values.shape[0] == len(strategy_labels)
assert values.shape[1] == len(gpu_numbers)
assert (values >= 0).all() and (values <= 1).all()
# assert values.shape[0] == len(strategy_labels)
# assert values.shape[1] == len(gpu_numbers)
# assert (values >= 0).all() and (values <= 1).all()

strategy_labels = sorted(strategy_labels)
gpu_numbers = sorted(gpu_numbers, key=lambda x: int(x))
Expand All @@ -37,57 +37,59 @@ def create_stacked_plot(

# Creating an offset to "center" around zero
static_offset = len(strategy_labels) / 2 - 0.5
for idx in range(len(strategy_labels)):
dynamic_bar_offset = idx - static_offset
for strategy_idx in range(len(strategy_labels)):
dynamic_bar_offset = strategy_idx - static_offset


# Drawing the stacked bars
ax.bar(
x=x + dynamic_bar_offset * width,
height=values[idx],
height=values[strategy_idx],
width=width,
color=comp_color,
)
ax.bar(
x=x + dynamic_bar_offset * width,
height=complements[idx],
height=complements[strategy_idx],
width=width,
bottom=values[idx],
bottom=values[strategy_idx],
color=comm_color,
)

for i in range(len(gpu_numbers)):
for gpu_idx in range(len(gpu_numbers)):
# Positioning the labels under the stacks
dynamic_label_offset = idx - static_offset
if np.isnan(values[strategy_idx, gpu_idx]):
continue
dynamic_label_offset = strategy_idx - static_offset
ax.text(
x=x[i] + dynamic_label_offset * width,
x=x[gpu_idx] + dynamic_label_offset * width,
y=-0.1,
s=strategy_labels[idx],
s=strategy_labels[strategy_idx],
ha="center",
va="top",
fontsize=10,
rotation=60,
)

# Adjust the bottom of the plot to accommodate the new labels

ax.set_ylabel("Computation fraction")
ax.set_title("Computation vs Communication Time by Method")
ax.set_xticks(x)
ax.set_xticklabels(gpu_numbers)
ax.set_ylim(0, 1) # Ensure y-axis goes from 0 to 1
ax.set_ylim(0, 1)

# Setting the appropriate colors since the legend is manual
legend_elements = [
Patch(facecolor=comm_color, label="Communication"),
Patch(facecolor=comp_color, label="Computation"),
]
# ax.legend(handles=legend_elements, loc="upper right")
# ax.legend(handles=legend_elements, loc="upper right", bbox_to_anchor=(1.2, 1))

# Positioning the legend outside of the plot so that it does not obstruct the bars
ax.legend(
handles=legend_elements,
loc="upper left", # Anchor point of the legend
bbox_to_anchor=(0.80, 1.22), # Position outside the plot
borderaxespad=0.0, # No padding between legend and axes
loc="upper left",
bbox_to_anchor=(0.80, 1.22),
borderaxespad=0.0,
)
fig.subplots_adjust(bottom=0.25)
fig.subplots_adjust(top=0.85)
Expand Down
Binary file modified use-cases/eurac/plots/comm_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion use-cases/eurac/slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ if [ "$DIST_MODE" == "horovod" ] ; then
srun --cpu-bind=none \
--ntasks-per-node=$SLURM_GPUS_PER_NODE \
--cpus-per-task=$SLURM_CPUS_PER_GPU \
--ntasks=$SLURM_GPUS_PER_NODE \
--ntasks=$(($SLURM_GPUS_PER_NODE * $SLURM_NNODES)) \
$TRAINING_CMD
else # E.g. for 'deepspeed' or 'ddp'
srun --cpu-bind=none --ntasks-per-node=1 \
Expand Down

0 comments on commit 93f2d62

Please sign in to comment.