Skip to content

Commit

Permalink
Gpu monitoring (#237)
Browse files Browse the repository at this point in the history
* add gpu utilization decorator and begin work on plots

* add decorator for gpu energy utilization

* Added config option to hpo script, styling (#235)

* Update README.md

* Update README.md

* Update createEnvVega.sh

* remove unused dist file

* run black and isort to fix linting errors

* remove redundant variable

* remove trailing whitespace

* fix issues from PR

* fix import in eurac trainer

* fix linting errors

* update logging directory and pattern

* update default pattern for gpu energy plots

* fix isort linting

* add support for none pattern and general cleanup

* fix linting errors with black and isort

* add configurable and dynamic wait and warmup times for the profiler

* remove old plot

* move horovod import

* fix linting errors

---------

Co-authored-by: Anna Lappe <[email protected]>
Co-authored-by: Matteo Bunino <[email protected]>
  • Loading branch information
3 people committed Nov 15, 2024
1 parent f7443db commit d538510
Show file tree
Hide file tree
Showing 12 changed files with 645 additions and 164 deletions.
66 changes: 57 additions & 9 deletions src/itwinai/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,63 @@
app = typer.Typer(pretty_exceptions_enable=False)


@app.command()
def generate_gpu_energy_plot(
    log_dir: str = "scalability_metrics/gpu_energy_data",
    pattern: str = r"gpu_energy_data.*\.csv$",
    output_file: str = "plots/gpu_energy_plot.png",
) -> None:
    """Generate a GPU energy plot showing the expenditure for each combination of
    strategy and number of GPUs in Watt hours.

    Args:
        log_dir: The directory where the csv logs are stored. Defaults to
            ``scalability_metrics/gpu_energy_data``.
        pattern: A regex pattern to recognize the file names in the 'log_dir' folder.
            Defaults to ``gpu_energy_data.*\\.csv$``. Set it to 'None' (compared
            case-insensitively) to pass None to the reader instead; this is
            intended to match all files in the given folder.
        output_file: The path to where the resulting plot should be saved. Defaults to
            ``plots/gpu_energy_plot.png``. Missing parent directories are created.

    Raises:
        ValueError: If ``log_dir`` does not exist.
    """
    # Heavy plotting dependencies are imported lazily so that other CLI commands
    # do not pay for them.
    import matplotlib.pyplot as plt

    from itwinai.torch.monitoring.plotting import gpu_energy_plot, read_energy_df

    log_dir_path = Path(log_dir)
    if not log_dir_path.exists():
        raise ValueError(
            f"The provided log_dir, '{log_dir_path.resolve()}', does not exist."
        )

    # The CLI can only deliver strings, so the literal text "none" is the escape
    # hatch for disabling the file name filter. A separate local is used so the
    # ``str``-annotated parameter is never rebound to None.
    file_pattern = None if pattern.lower() == "none" else pattern

    gpu_utilization_df = read_energy_df(pattern=file_pattern, log_dir=log_dir_path)
    gpu_energy_plot(gpu_utilization_df=gpu_utilization_df)

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Saves the current matplotlib figure; presumably the one that
    # gpu_energy_plot has just drawn (confirm against the plotting module).
    plt.savefig(output_path)
    print(f"\nSaved GPU energy plot at '{output_path.resolve()}'.")


@app.command()
def generate_communication_plot(
log_dir: str = "profiling_logs",
pattern: str = r"profile_(\w+)_(\d+)_(\d+)\.csv$",
output_file: str = "plots/comm_plot.png",
log_dir: str = "scalability_metrics/communication_data",
pattern: str = r"(.+)_(\d+)_(\d+)\.csv$",
output_file: str = "plots/communication_plot.png",
) -> None:
"""Generate stacked plot showing computation vs. communication fraction. Stores it
to output_file.
Args:
log_dir: The directory where the csv logs are stored. Defauls to
log_dir: The directory where the csv logs are stored. Defaults to
``profiling_logs``.
pattern: A regex pattern to recognize the file names in the 'log_dir' folder.
Defaults to ``profile_(\\w+)_(\\d+)_(\\d+)\\.csv$``.
Defaults to ``(.+)_(\\d+)_(\\d+)\\.csv$``. Set it to 'None' to
make it None. In this case, it will match all files in the given folder.
output_file: The path to where the resulting plot should be saved. Defaults to
``plots/comm_plot.png``.
"""
Expand All @@ -45,13 +89,17 @@ def generate_communication_plot(

log_dir_path = Path(log_dir)
if not log_dir_path.exists():
raise IOError(
raise ValueError(
f"The directory '{log_dir_path.resolve()}' does not exist, so could not"
f"extract profiling logs. Make sure you are running this command in the "
f"same directory as the logging dir."
f"same directory as the logging dir or are passing a sufficient relative"
f"path."
)

df = create_combined_comm_overhead_df(logs_dir=log_dir_path, pattern=pattern)
if pattern.lower() == "none":
pattern = None

df = create_combined_comm_overhead_df(log_dir=log_dir_path, pattern=pattern)
values = get_comp_fraction_full_array(df, print_table=True)

strategies = sorted(df["strategy"].unique())
Expand All @@ -67,7 +115,7 @@ def generate_communication_plot(
output_path.parent.mkdir(parents=True, exist_ok=True)

plt.savefig(output_path)
print(f"\nSaved computation vs. communication plot at '{output_path.resolve()}'")
print(f"\nSaved computation vs. communication plot at '{output_path.resolve()}'.")


@app.command()
Expand Down
Loading

0 comments on commit d538510

Please sign in to comment.