From 3bd19914dfbfb654032828613cc54114a111ace1 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Wed, 11 Dec 2024 14:15:15 -0600 Subject: [PATCH 01/32] Added unitrace profiling for Aurora, updated mkdocs nav. --- docs/aurora/data-science/profiling_dl.md | 101 +++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 102 insertions(+) create mode 100644 docs/aurora/data-science/profiling_dl.md diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md new file mode 100644 index 000000000..540b03b0e --- /dev/null +++ b/docs/aurora/data-science/profiling_dl.md @@ -0,0 +1,101 @@ +# Profiling Deep Learning Applications + +On Aurora we can use the `unitrace` profiler from Intel to profile Deep +Learning applications. Refer to the +[unitrace documentation page](https://github.com/intel/pti-gpu/tree/master/tools/unitrace) +for details. + +## Example Usage + +We can use `unitrace` to trace an application running on multiple ranks and +multiple nodes. A simple example, where we use a wrapper script to trace the +rank 0 on each node of a 4 node job running a PyTorch application is below: + +### A `unitrace` wrapper +``` +#!/bin/bash +## This wrapper should be used with unitrace to trace in any number of nodes. +## The script for this example is set up to trace rank 0 of first 4 Nodes in the case of +## profiling a job running on larger than 4 nodes. +FNAME_EXT=$(basename "$2") +FNAME="${FNAME_EXT%%.*}" + +NNODES=`wc -l < $PBS_NODEFILE` + +WORK_DIR=/path/to/the/Python/program +UNITRACE_DIR=/opt/aurora/24.180.1/support/tools/pti-gpu/063214e +UNITRACE_LIB=${UNITRACE_DIR}/lib64 +UNITRACE_BIN=${UNITRACE_DIR}/bin +UNITRACE_EXE=${UNITRACE_BIN}/unitrace +DTAG=$(date +%F_%H%M%S) +UNITRACE_OUTDIR=${WORK_DIR}/logs/unitrace_profiles/name_of_choice_json_n${NNODES}_${DTAG}/${FNAME}_n${NNODES}_${DTAG} +mkdir -p ${UNITRACE_OUTDIR} +UNITRACE_OPTS=" --ccl-summary-report --chrome-mpi-logging --chrome-sycl-logging \ +--chrome-device-logging \ +--chrome-ccl-logging --chrome-call-logging --chrome-dnn-logging --device-timing --host-timing \ +--output-dir-path ${UNITRACE_OUTDIR} --output ${UNITRACE_OUTDIR}/UNITRACE_${FNAME}_n${NNODES}_${DTAG}.txt " + + +export LD_LIBRARY_PATH=${UNITRACE_LIB}:${UNITRACE_BIN}:$LD_LIBRARY_PATH + +# Use $PMIX_RANK for MPICH and $SLURM_PROCID with srun. +PROFRANK=0 +RANKCUTOFF=48 + +if [[ $PALS_LOCAL_RANKID -eq $PROFRANK ]] && [[ $PMIX_RANK -lt $RANKCUTOFF ]]; then + echo "On rank $PMIX_RANK, collecting traces " + $UNITRACE_EXE $UNITRACE_OPTS "$@" +else + "$@" +fi + +``` +There are a few important things to notice in the wrapper. + +- `UNITRACE_DIR`: This is the main `unitrace` directory, which may change after +an update to the programming environment. + +- `UNITRACE_OPTS`: These are the options that `unitrace` uses to trace data at +different levels. Based on the number of options, the sizes of the output +profiles will vary. Usually enabling more options lead to a larger profile +(in terms of storage in MB). + +- `PROFRANK`: As implemented, this variable is set by the user to trace the rank +of choice. For example, this wrapper will trace the rank 0 on each node. + +- `RANKCUTOFF`: This variable is Aurora specific. As we can run as many as 12 +ranks per node (without using CCS), the first 4 nodes of a job will have 48 +ranks running. This provides the upper cutoff of the label (in number) of ranks, +beyond which `unitrace` will not trace any rank. 
An user can change the number +according to the number of maximum ranks running per node to set up how many +ranks to be +traced. `unitrace` will produce a profile (`json` file, by default) per traced +rank. + +### Deployment + +The wrapper above can be deployed using a PBS job script the following way + +``` +#!/bin/bash -x +#PBS -l select=4 +#PBS -l place=scatter +#PBS -l walltime=00:10:00 +#PBS -q workq +#PBS -A Aurora_deployment + +WORK_DIR=/path/to/the/Python/program +UNITRACE_WRAPPER=${WORK_DIR}/unitrace_wrapper.sh + +# MPI and OpenMP settings +NNODES=`wc -l < $PBS_NODEFILE` +NRANKS_PER_NODE=12 + +let NRANKS=${NNODES}*${NRANKS_PER_NODE} + +module load frameworks/2024.2.1_u1 + +mpiexec --pmi=pmix -n ${NRANKS} -ppn ${NRANKS_PER_NODE} -l --line-buffer \ +${UNITRACE_WRAPPER} python ${WORK_DIR}/application.py +``` + diff --git a/mkdocs.yml b/mkdocs.yml index 13e897f8e..18f475e62 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -203,6 +203,7 @@ nav: #- Applications: #- gpt-neox: aurora/data-science/applications/gpt-neox.md - Containers: aurora/data-science/containers/containers.md + - Profiling: aurora/data-science/profiling_dl.md - Frameworks: #- DeepSpeed: aurora/data-science/frameworks/deepspeed.md #- JAX: aurora/data-science/frameworks/jax.md From c26f6b421e34d2370bbb8938c237912c8e5a9f2d Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Wed, 11 Dec 2024 16:54:57 -0600 Subject: [PATCH 02/32] Added profiling with nsys on Polaris, updated mkdocs.yml --- docs/polaris/data-science/profiling_dl.md | 250 ++++++++++++++++++++++ mkdocs.yml | 1 + 2 files changed, 251 insertions(+) create mode 100644 docs/polaris/data-science/profiling_dl.md diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md new file mode 100644 index 000000000..91b4c37b3 --- /dev/null +++ b/docs/polaris/data-science/profiling_dl.md @@ -0,0 +1,250 @@ +# Profiling Deep Learning Applications + +We can use both framework (for example, PyTorch) native profiler and vendor specific +[Nsys profiler](https://developer.nvidia.com/nsight-systems/get-started) to get +high level profiling information and timeline of execution for an application. +For kernel level information, we may use +[Nsight compute profiler](https://developer.nvidia.com/tools-overview/nsight-compute/get-started). +Refer to the respective documentation for more details: + +[Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) + +[Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) + +[Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) + +[PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) + +## Example Usage + +At the high level, the usage of the `nsys` or the `ncu` profiler can be +summarized through the following command: + +``` +nsys profile -o profile python application.py +``` +If we want to launch with `MPI` then + +``` +mpiexec ... nsys profile ... python application.py ... +``` +These two commands show the basic command-line structure of deploying the +profilers. Below we discuss important use cases that are relevant in +large scale distributed profiling. + +We can use `nsys` to trace an application running on multiple ranks and +multiple nodes. A simple example, where we use a wrapper script to trace the +rank 0 on each node of a 2 node job running a PyTorch application is below: + +### An `nsys` wrapper + +We can use `nsys` to trace an application running on multiple ranks and +multiple nodes. 
+A simple example, where we use a wrapper script to trace the
+rank 0 on each node of a 4 node job running a PyTorch application is below:
+
+```
+#!/bin/bash
+## This wrapper should be used with nsys profiler to trace in any number of nodes
+## The script is set up to trace rank 0 of first 2 Nodes in the case of
+## profiling a job running on larger than 2 nodes.
+FNAME_EXT=$(basename "$2")
+FNAME="${FNAME_EXT%%.*}"
+
+NNODES=`wc -l < $PBS_NODEFILE`
+
+WORK_DIR=/path/to/the/Python/application
+DTAG=$(date +%F_%H%M%S)
+PROFILER_OUTDIR=${WORK_DIR}/profiles/choice_of_name_nsys_n${NNODES}_${DTAG}/${FNAME}_n${NNODES}_${DTAG}
+RUN_ID=choice_of_name_nsys_n${NNODES}_${DTAG}
+
+mkdir -p ${PROFILER_OUTDIR}
+NSYS_OPTS=" -o ${PROFILER_OUTDIR}/${RUN_ID}_%q{PMI_RANK} --stats=true --show-output=true "
+
+PROFRANK=0
+RANKCUTOFF=8
+
+if [[ $PALS_LOCAL_RANKID -eq $PROFRANK ]] && [[ $PMI_RANK -lt $RANKCUTOFF ]]; then
+  echo "On rank ${PMI_RANK}, collecting traces "
+  nsys profile $NSYS_OPTS "$@"
+else
+  "$@"
+fi
+```
+There are a few important things to notice in the wrapper.
+
+- `NSYS_OPTS`: These are the options that `nsys` uses to trace data at
+different levels. An exhaustive list of options can be found in the
+[nsys user guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html).
+Note that, `%q{PMI_RANK}` is essential to get a per rank profile.
+
+
+- `PROFRANK`: As implemented, this variable is set by the user to trace the rank
+of choice. For example, this wrapper will trace the rank 0 on each node.
+
+- `RANKCUTOFF`: This variable is Polaris specific. As we can run as many as 4
+ranks per node (without using MPS), the first 2 nodes of a job will have 8
+ranks running. This provides the upper cutoff of the label (in number) of ranks,
+beyond which `nsys` will not trace any rank. A user can change the number
+according to the number of maximum ranks running per node to set up how many
+ranks to be
+traced. `nsys` will produce a profile (`nsys-rep` file, by default) per traced
+rank.
+
+To view the produced trace files, we need to use NVIDIA's Nsight Systems on the
+local machine
+
+[Getting Started, Download Nsys](https://developer.nvidia.com/nsight-systems/get-started)
+
+#### Deployment
+
+The wrapper above can be deployed using a PBS job script the following way
+
+```
+#!/bin/bash -l
+#PBS -l select=2:system=polaris
+#PBS -l place=scatter
+#PBS -l walltime=0:05:00
+#PBS -q debug-scaling
+#PBS -l filesystems=home:eagle
+#PBS -A YOUR ALLOCATION
+
+
+# What's the benchmark work directory?
+WORK_DIR=/path/to/the/Python/program
+TEMPORARY_DIR=/path/to/a/temporary/directory/for/`nsys`/to/use
+NSYS_WRAPPER=${WORK_DIR}/nsys_wrapper.sh
+
+# MPI and OpenMP settings
+NNODES=`wc -l < $PBS_NODEFILE`
+NRANKS_PER_NODE=4
+
+let NRANKS=${NNODES}*${NRANKS_PER_NODE}
+
+module use /soft/modulefiles/
+module load conda/2024-04-29
+conda activate
+
+mpiexec -n ${NRANKS} -ppn ${NRANKS_PER_NODE} --env TMPDIR=${TEMPORARY_DIR} -l --line-buffer \
+${NSYS_WRAPPER} python ${WORK_DIR}/application.py
+```
+
+Note that, `--env TMPDIR=${TEMPORARY_DIR}` is critical for the `nsys`
+functioning.
+
+### An `ncu` wrapper
+
+We can get kernel level information (for example roofline, tensorcore usage)
+using NVIDIA's Nsight Compute profiler. Below is a simple wrapper script to
+show the usage.
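+The wrapper builds on a basic `ncu` command line. As a rough sketch of the two
+underlying steps (the kernel name, output path, and the `nsys stats` report
+name below are placeholders and may differ between Nsight versions):
+
+```
+# Find candidate kernel names from an existing nsys report (report names vary by version).
+nsys stats --report cuda_gpu_kern_sum /path/to/report.nsys-rep
+
+# Collect roofline data for one kernel of interest.
+ncu --set roofline -k <kernel_name> -o /path/to/output python application.py
+```
+
+The wrapper below simply automates the `ncu` step for one selected rank per node.
+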
+```
+#!/bin/bash
+FNAME_EXT=$(basename "$2")
+FNAME="${FNAME_EXT%%.*}"
+
+NNODES=`wc -l < $PBS_NODEFILE`
+
+WORK_DIR=/path/to/the/Python/program
+DTAG=$(date +%F_%H%M%S)
+PROFILER_OUTDIR=${WORK_DIR}/profiles/choice_of_name_ncu_n${NNODES}_${DTAG}/${FNAME}_n${NNODES}_${DTAG}
+RUN_ID=choice_of_name_ncu_n${NNODES}_${DTAG}
+
+mkdir -p ${PROFILER_OUTDIR}
+#KERNEL_NAME=ampere_sgemm_128x128_tn
+KERNEL_NAME=ampere_bf16_s16816gemm_bf16_128x256_ldg8_f2f_stages_64x3_tn
+#NCU_OPTS_DETAILED=" --set detailed -k ${KERNEL_NAME} -o ${PROFILER_OUTDIR}/${RUN_ID}_%q{PMI_RANK} "
+NCU_OPTS_ROOFLINE=" --set roofline -k ${KERNEL_NAME} -o ${PROFILER_OUTDIR}/${RUN_ID}_%q{PMI_RANK} "
+#NCU_OPTS_FULL=" --set full -k ${KERNEL_NAME} -o ${PROFILER_OUTDIR}/${RUN_ID}_%q{PMI_RANK} "
+
+PROFRANK=0
+RANKCUTOFF=8
+
+if [[ $PALS_LOCAL_RANKID -eq $PROFRANK ]] && [[ $PMI_RANK -lt $RANKCUTOFF ]]; then
+  echo "On rank ${PMI_RANK}, collecting traces "
+  ncu $NCU_OPTS_ROOFLINE "$@"
+else
+  "$@"
+fi
+```
+
+This wrapper can be deployed as the `nsys` example above. In the `ncu` wrapper
+we explicitly set the name of the kernel that we want to analyze
+(a gemm kernel in this case).
+The exhaustive list of options to set the amount
+of data collection can be found in the
+[command line section](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options)
+of the documentation. Here we only show standard options; any of the three
+could be chosen. Note that invoking each option will lead to varying amounts
+of time the profiler needs to run. This will be important in setting the
+requested wall-time for your batch job.
+
+`ncu` will generate `ncu-rep` files for each traced rank, and we will need
+NVIDIA's Nsight Compute system on the local machine.
+
+[Download Nsight Compute](https://developer.nvidia.com/tools-overview/nsight-compute/get-started)
+
+The next step is to load the `nsys-rep` files in the Nsight Systems GUI, and
+the `ncu-rep` files to the Nsight Compute GUI.
+
+### For a single rank run
+
+#### `nsys` profiles
+In the single rank case, we go to the top left, go `file` --> `open` and select
+the file that we want to look at. For this particular example, we have focused
+on the GPU activities. This activity is shown on the second column from the
+left, named `CUDA HW ...`. If we expand the `CUDA HW ...` tab, we find an
+`NCCL` tab. This tab shows the communication library calls.
+
+#### `ncu` profiles
+The primary qualitative distinction between the `nsys-rep` files and the
+`ncu-rep` files is that the `nsys-rep` file presents data for the overall
+execution of the application, whereas the `ncu-rep` file presents data for the
+execution of one particular kernel. Our setup here traces only one kernel;
+multiple kernels could be traced at a time, but that can become a time consuming
+process.
+
+We use the `--stats=true --show-output=true` (see `nsys_wrapper.sh`)
+options while collecting the
+`nsys` data. As a result, we get a system-wide summary in our `.OU` files
+(if run with a job submission script, otherwise on the terminal), and find the
+names of the kernels that have been called/used for compute and communication.
+Often we would start by investigating the kernels that have been called the
+most times or the ones where we spent the most time executing them. In this
+particular instance we chose to analyze the `gemm` kernels, which are related
+to the matrix multiplication.
+The full name of this kernel is passed to the
+`ncu` profiler with the option `-k` (see `ncu_wrapper.sh`).
+
+Loading the `ncu-rep` files works similarly to the `nsys-rep` files. Here, the
+important tab is the `Details` tab. We find that at the 3rd row from the top.
+Under that tab we have the `GPU Speed of Light Throughput` section. In this
+section we can find plots showing GPU compute and memory usage. On the right
+hand side of the tab, there is a menu bar which gives us the option to select
+which plot to display, either the roofline plot or the compute-memory
+throughput chart.
+
+### For a multi-rank run
+
+#### `nsys` profiles
+In the case where we have traced multiple ranks, whether from a single node or
+multiple nodes, the `nsys` GUI allows us to view the reports in a combined fashion on
+a single timeline (same time-axis for both reports). This is done through the
+"multi-report view": `file` --> `New multi-report view` or `file` --> `Open`.
+After selecting however many reports we would like to see in a combined timeline,
+`nsys` prompts the user to allow for a "multi-report view". The reports can also be
+viewed separately.
+
+### Profiler Options
+In both cases, `nsys` and `ncu` we have used the standard option sets to
+generate the profiles. The exhaustive list could be found in the respective
+documentation pages:
+
+[Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html)
+
+[Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/)
+
+[Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html)
+
+Much more information is provided through these reports. Here we have
+discussed how to view the high-level information.
+
+
diff --git a/mkdocs.yml b/mkdocs.yml
index 18f475e62..9166964f1 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -74,6 +74,7 @@ nav:
     - Data Science:
       - Julia: polaris/data-science/julia.md
       - Python: polaris/data-science/python.md
+      - Profiling: polaris/data-science/profiling_dl.md
     - Frameworks:
       - TensorFlow: polaris/data-science/frameworks/tensorflow.md
       - PyTorch: polaris/data-science/frameworks/pytorch.md

From 2cea3527d9f3c47dabf33aa0ed11e443b84f941c Mon Sep 17 00:00:00 2001
From: Khalid Hossain
Date: Thu, 16 Jan 2025 09:30:29 -0600
Subject: [PATCH 03/32] Update docs/aurora/data-science/profiling_dl.md

Co-authored-by: Kyle Gerard Felker
---
 docs/aurora/data-science/profiling_dl.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md
index 540b03b0e..a34ba53d9 100644
--- a/docs/aurora/data-science/profiling_dl.md
+++ b/docs/aurora/data-science/profiling_dl.md
@@ -74,7 +74,7 @@ rank.
### Deployment -The wrapper above can be deployed using a PBS job script the following way +The wrapper above can be deployed using the following PBS job script: ``` #!/bin/bash -x From 82162308a515f67250f3e918c96c275389c58b19 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:30:38 -0600 Subject: [PATCH 04/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 91b4c37b3..beee6bed6 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -1,6 +1,6 @@ # Profiling Deep Learning Applications -We can use both framework (for example, PyTorch) native profiler and vendor specific +We can use both a framework-specific (for example, PyTorch-specific) native profiler and the vendor-specific NVIDIA [Nsys profiler](https://developer.nvidia.com/nsight-systems/get-started) to get high level profiling information and timeline of execution for an application. For kernel level information, we may use From 3d9f724184c6e270a0708a37574df7e514311e6f Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:31:13 -0600 Subject: [PATCH 05/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index beee6bed6..1ccea3235 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -128,7 +128,7 @@ mpiexec -n ${NRANKS} -ppn ${NRANKS_PER_NODE} --env TMPDIR=${TEMPORARY_DIR} -l -- ${NSYS_WRAPPER} python ${WORK_DIR}/application.py ``` -Note that, `--env TMPDIR=${TEMPORARY_DIR}` is critical for the `nsys` +Note that `--env TMPDIR=${TEMPORARY_DIR}` is essential for `nsys` to function correctly. functioning. ### An `ncu` wrapper From 85f6b092b6444f1ca767e3c5e2bff09d4b525a26 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:31:31 -0600 Subject: [PATCH 06/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 1ccea3235..86bca4c02 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -129,7 +129,6 @@ ${NSYS_WRAPPER} python ${WORK_DIR}/application.py ``` Note that `--env TMPDIR=${TEMPORARY_DIR}` is essential for `nsys` to function correctly. -functioning. 
### An `ncu` wrapper From 9498ce0eecc8e8784bcd1282db0a652c10fb0551 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:31:52 -0600 Subject: [PATCH 07/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 86bca4c02..28e670fb3 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -109,7 +109,7 @@ The wrapper above can be deployed using a PBS job script the following way #PBS -A YOUR ALLOCATION -# What's the benchmark work directory? +# What's the working directory for the benchmark? WORK_DIR=/path/to/the/Python/program TEMPORARY_DIR=/path/to/a/temporary/directory/for/`nsys`/to/use NSYS_WRAPPER=${WORK_DIR}/nsys_wrapper.sh From a9868e7339100e753fd5e69ec5f44183cf2835d1 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:32:01 -0600 Subject: [PATCH 08/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 28e670fb3..9120155be 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -185,7 +185,7 @@ NVIDIA's Nsight Compute system on the local machine. The next step is to load the `nsys-rep` files in the Nsight Systems GUI, and the `ncu-rep` files to the Nsight Compute GUI. -### For a single rank run +### Single rank run #### `nsys` profiles In the single rank case, we go to the top left, go `file` --> `open` and select From 291b4fabb60ea823a20316a2e505ee1ada320940 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:32:14 -0600 Subject: [PATCH 09/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 9120155be..2a791612c 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -175,7 +175,7 @@ of data collection can be found in the of the documentation. Here we only show standard options, either of the three could be chosen. Note that, invoking each option will lead to varying amounts of time the profiler need to run. This will be important in setting the -requested wall-time for your batch job. +requested walltime for your batch job. `ncu` will generate `ncu-rep` files for each traced ranks, and we will need NVIDIA's Nsight Compute system on the local machine. 
From a2862df5ed472b4002f5fd60bf856236871b352e Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:32:32 -0600 Subject: [PATCH 10/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 2a791612c..a0cd5a6c3 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -97,7 +97,7 @@ local machine #### Deployment -The wrapper above can be deployed using a PBS job script the following way +The wrapper above can be deployed using the following PBS job script: ``` #!/bin/bash -l From b3ddb458f9ebeda80304be8967ac8efa8313910c Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:32:52 -0600 Subject: [PATCH 11/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index a0cd5a6c3..be3a12348 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -132,7 +132,7 @@ Note that `--env TMPDIR=${TEMPORARY_DIR}` is essential for `nsys` to function co ### An `ncu` wrapper -We can get kernel level information (for example roofline, tensorcore usage) +We can get kernel level information (for example roofline, Tensor Core usage) using NVIDIA's Nsight Compute profiler. Below is a simple wrapper script to show the usage. From 36e726b598262d69ab004243b3c7850a5be22ed7 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:33:02 -0600 Subject: [PATCH 12/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index be3a12348..ae3722f80 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -168,7 +168,7 @@ fi This wrapper can be deployed as the `nsys` example above. In the `ncu` wrapper we explicitly set the name of the kernel that we want to analyze -(a gemm kernel in this case). +(a GEMM kernel in this case). The exhaustive list of option to set the amount of data collection can be found in the [command line section](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options) From 4f5b75076b7be989c06d0b312490b3dd8850141a Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 09:33:29 -0600 Subject: [PATCH 13/32] Update docs/aurora/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/aurora/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md index a34ba53d9..896342f88 100644 --- a/docs/aurora/data-science/profiling_dl.md +++ b/docs/aurora/data-science/profiling_dl.md @@ -50,7 +50,7 @@ else fi ``` -There are a few important things to notice in the wrapper. 
+There are several important shell variables in the wrapper, which may require modification: - `UNITRACE_DIR`: This is the main `unitrace` directory, which may change after an update to the programming environment. From 3e3b4de0be1089492ec72b290bb0537ce3496a65 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 10:06:08 -0600 Subject: [PATCH 14/32] Updated the code block title for the unitrace wrapper. --- docs/aurora/data-science/profiling_dl.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md index 896342f88..d746d200b 100644 --- a/docs/aurora/data-science/profiling_dl.md +++ b/docs/aurora/data-science/profiling_dl.md @@ -11,8 +11,7 @@ We can use `unitrace` to trace an application running on multiple ranks and multiple nodes. A simple example, where we use a wrapper script to trace the rank 0 on each node of a 4 node job running a PyTorch application is below: -### A `unitrace` wrapper -``` +```bash title="unitrace_wrapper.sh" #!/bin/bash ## This wrapper should be used with unitrace to trace in any number of nodes. ## The script for this example is set up to trace rank 0 of first 4 Nodes in the case of @@ -70,7 +69,7 @@ beyond which `unitrace` will not trace any rank. An user can change the number according to the number of maximum ranks running per node to set up how many ranks to be traced. `unitrace` will produce a profile (`json` file, by default) per traced -rank. +rank. This profile can be viewed using the [perfetto trace viewer](https://ui.perfetto.dev/) ### Deployment From 1a6a6baf6c49881317f0c3fcfafe2a58aef6bf49 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 10:19:23 -0600 Subject: [PATCH 15/32] Added stylistic suggestions for the Polaris profiling page. --- docs/polaris/data-science/profiling_dl.md | 24 +++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index ae3722f80..94bcabd88 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -7,13 +7,13 @@ For kernel level information, we may use [Nsight compute profiler](https://developer.nvidia.com/tools-overview/nsight-compute/get-started). Refer to the respective documentation for more details: -[Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) +- [Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) -[Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) +- [Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) -[Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) +- [Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) -[PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) +- [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) ## Example Usage @@ -36,13 +36,11 @@ We can use `nsys` to trace an application running on multiple ranks and multiple nodes. A simple example, where we use a wrapper script to trace the rank 0 on each node of a 2 node job running a PyTorch application is below: -### An `nsys` wrapper - We can use `nsys` to trace an application running on multiple ranks and multiple nodes. 
A simple example, where we use a wrapper script to trace the rank 0 on each node of a 4 node job running a PyTorch application is below: -``` +```bash title="nsys_wrapper.sh" #!/bin/bash ## This wrapper should be used with nsys profiler to trace in any number of nodes ## The script is set up to trace rank 0 of first 2 Nodes in the case of @@ -99,7 +97,7 @@ local machine The wrapper above can be deployed using the following PBS job script: -``` +```bash title="pbs_jobscript_nsys.sh" #!/bin/bash -l #PBS -l select=2:system=polaris #PBS -l place=scatter @@ -130,13 +128,13 @@ ${NSYS_WRAPPER} python ${WORK_DIR}/application.py Note that `--env TMPDIR=${TEMPORARY_DIR}` is essential for `nsys` to function correctly. -### An `ncu` wrapper +### A `ncu` wrapper We can get kernel level information (for example roofline, Tensor Core usage) using NVIDIA's Nsight Compute profiler. Below is a simple wrapper script to show the usage. -``` +```bash title="ncu_wrapper.sh" #!/bin/bash FNAME_EXT=$(basename "$2") FNAME="${FNAME_EXT%%.*}" @@ -237,11 +235,11 @@ In both cases, `nsys` and `ncu` we have used the standard option sets to generate the profiles. The exhaustive list could be found in the respective documentation pages: -[Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) +- [Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) -[Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) +- [Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) -[Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) +- [Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) There are many other information provided through these reports. Here we have discussed the way to view the high level information. From 9484e255e984513bae6c45265e9b3c3dcc6a1219 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 10:52:49 -0600 Subject: [PATCH 16/32] Added pytorch profiler to the Polaris page --- docs/polaris/data-science/profiling_dl.md | 33 +++++++++++++++++++++++ mkdocs.yml | 3 --- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 94bcabd88..a3e971401 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -245,3 +245,36 @@ There are many other information provided through these reports. Here we have discussed the way to view the high level information. +### PyTorch Profiler + +Using the PyTorch profiler requires changes in the application source code. A +simple example is the following: + +```py title="pytorch_profiler_example.py" +from torch.profiler import profile, record_function, ProfilerActivity + +# A tracer decorator for a function to be traced +def trace_func(func): + def wrapper(*args, **kwargs): + try: + function_name = func.__func__.__qualname__ + except: + function_name = func.__qualname__ + with record_function(function_name): + return func(*args, **kwargs) + return wrapper + +@trace_func +def trace_this_function(a, b, c): + ... + ... 
+ return x, y, z + +activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA] + +with profile(activities=activities, record_shapes=True) as prof: + result = trace_this_function(a, b, c) +prof.export_chrome_trace(f"{/path/to/the/trace/dir}/{name/of/the/trace}-{rank}-of-{world_size}.json") +``` +This procedure described above works for both single and multi-rank deployments. + diff --git a/mkdocs.yml b/mkdocs.yml index 9edc1e5ae..237300706 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -252,7 +252,6 @@ nav: - SambaNova Documentation: ai-testbed/sambanova/documentation.md # - Performance Tools: ai-testbed/sambanova/performance-tools.md - Data Management: ai-testbed/data-management/data-management-overview.md -<<<<<<< HEAD - Aurora: - Getting Started: aurora/getting-started-on-aurora.md - Aurora PE: aurora/aurora-pe.md @@ -338,8 +337,6 @@ nav: #- Visualization: #- Visualization on Crux: crux/visualization/visualization.md #- ParaView (Launch from Client): crux/visualization/paraview.md -======= ->>>>>>> main - Facility Policies: - Overview of Policies: policies/facility-policies.md - ALCF Acknowledgement Policy: policies/alcf-acknowledgement-policy.md From 20833b0a2699d9cca45de44edf8aff8e59fe4354 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 11:02:39 -0600 Subject: [PATCH 17/32] Added a new line fix in the Polaris page. --- docs/polaris/data-science/profiling_dl.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index a3e971401..2a6336861 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -75,7 +75,6 @@ different levels. An exhaustive list of options can be found in the [nsys user guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html). Note that, `%q{PMI_RANK}` is essential to get a per rank profile. - - `PROFRANK`: As implemented, this variable is set by the user to trace the rank of choice. For example, this wrapper will trace the rank 0 on each node. From 22f49e4fbc29a499727071dea24c66114ca2a1ca Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 11:10:11 -0600 Subject: [PATCH 18/32] Title fix for the ncu wrapper --- docs/polaris/data-science/profiling_dl.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 2a6336861..792d99c1b 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -127,8 +127,6 @@ ${NSYS_WRAPPER} python ${WORK_DIR}/application.py Note that `--env TMPDIR=${TEMPORARY_DIR}` is essential for `nsys` to function correctly. -### A `ncu` wrapper - We can get kernel level information (for example roofline, Tensor Core usage) using NVIDIA's Nsight Compute profiler. Below is a simple wrapper script to show the usage. 
From 99d208c3a2845a1b8031c9e203653ee4023f71ca Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:17:32 -0600 Subject: [PATCH 19/32] Update docs/aurora/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/aurora/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md index d746d200b..09efea0b6 100644 --- a/docs/aurora/data-science/profiling_dl.md +++ b/docs/aurora/data-science/profiling_dl.md @@ -11,7 +11,7 @@ We can use `unitrace` to trace an application running on multiple ranks and multiple nodes. A simple example, where we use a wrapper script to trace the rank 0 on each node of a 4 node job running a PyTorch application is below: -```bash title="unitrace_wrapper.sh" +```bash linenums="1" title="unitrace_wrapper.sh" #!/bin/bash ## This wrapper should be used with unitrace to trace in any number of nodes. ## The script for this example is set up to trace rank 0 of first 4 Nodes in the case of From 594ffebf9ceed388bd72e7b1d8a56ca24da42291 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:17:51 -0600 Subject: [PATCH 20/32] Update docs/aurora/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/aurora/data-science/profiling_dl.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md index 09efea0b6..6c91d2469 100644 --- a/docs/aurora/data-science/profiling_dl.md +++ b/docs/aurora/data-science/profiling_dl.md @@ -1,8 +1,6 @@ # Profiling Deep Learning Applications -On Aurora we can use the `unitrace` profiler from Intel to profile Deep -Learning applications. Refer to the -[unitrace documentation page](https://github.com/intel/pti-gpu/tree/master/tools/unitrace) +On Aurora we can use the `unitrace` profiler from Intel to profile deep learning applications. Refer to the [`unitrace` documentation page](https://github.com/intel/pti-gpu/tree/master/tools/unitrace) for details. ## Example Usage From 1e0e1ef2d85c7c3ac84934176c699ad5e80e4a46 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:18:00 -0600 Subject: [PATCH 21/32] Update docs/aurora/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/aurora/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md index 6c91d2469..f67b6a4d5 100644 --- a/docs/aurora/data-science/profiling_dl.md +++ b/docs/aurora/data-science/profiling_dl.md @@ -67,7 +67,7 @@ beyond which `unitrace` will not trace any rank. An user can change the number according to the number of maximum ranks running per node to set up how many ranks to be traced. `unitrace` will produce a profile (`json` file, by default) per traced -rank. This profile can be viewed using the [perfetto trace viewer](https://ui.perfetto.dev/) +rank. 
This profile can be viewed using the [Perfetto trace viewer](https://ui.perfetto.dev/) ### Deployment From c44636a8b0f8706632f81992b0ba1400266719a2 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:18:07 -0600 Subject: [PATCH 22/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 792d99c1b..75573df95 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -6,13 +6,9 @@ high level profiling information and timeline of execution for an application. For kernel level information, we may use [Nsight compute profiler](https://developer.nvidia.com/tools-overview/nsight-compute/get-started). Refer to the respective documentation for more details: - - [Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) - - [Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) - - [Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) - - [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) ## Example Usage From 88a70eab61d19e04a521009e01660fb79c9727db Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:18:15 -0600 Subject: [PATCH 23/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 75573df95..b090e70ac 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -92,7 +92,7 @@ local machine The wrapper above can be deployed using the following PBS job script: -```bash title="pbs_jobscript_nsys.sh" +```bash linenums="1" title="pbs_jobscript_nsys.sh" #!/bin/bash -l #PBS -l select=2:system=polaris #PBS -l place=scatter From fa894ef01de145271a490491eb47f579b79b83cb Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:34:16 -0600 Subject: [PATCH 24/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index b090e70ac..b6b4300b3 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -13,8 +13,7 @@ Refer to the respective documentation for more details: ## Example Usage -At the high level, the usage of the `nsys` or the `ncu` profiler can be -summarized through the following command: +Both the `nsys` and `ncu` profiler commands take the following generic structure: ``` nsys profile -o profile python application.py From ffcfcb1d8a8be75cbf190203d87919be1ca30bbc Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:34:37 -0600 Subject: [PATCH 25/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index b6b4300b3..4b16e0b7a 100644 --- 
a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -18,8 +18,7 @@ Both the `nsys` and `ncu` profiler commands take the following generic structure ``` nsys profile -o profile python application.py ``` -If we want to launch with `MPI` then - +If we want to launch the profiled application with MPI, then `mpiexec` must be used: ``` mpiexec ... nsys profile ... python application.py ... ``` From bf280608c2deba278458489702305f14460941eb Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:34:47 -0600 Subject: [PATCH 26/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 4b16e0b7a..f8607988e 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -125,7 +125,7 @@ We can get kernel level information (for example roofline, Tensor Core usage) using NVIDIA's Nsight Compute profiler. Below is a simple wrapper script to show the usage. -```bash title="ncu_wrapper.sh" +```bash linenums="1" title="ncu_wrapper.sh" #!/bin/bash FNAME_EXT=$(basename "$2") FNAME="${FNAME_EXT%%.*}" From 434461f88945c6d105ccb420787791e1f65bfdcc Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:35:05 -0600 Subject: [PATCH 27/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index f8607988e..0cdea1e42 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -225,11 +225,8 @@ viewed separately. In both cases, `nsys` and `ncu` we have used the standard option sets to generate the profiles. The exhaustive list could be found in the respective documentation pages: - - [Nsight System User Guide](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) - - [Nsight Compute Documentation](https://docs.nvidia.com/nsight-compute/) - - [Nsight Compute CLI](https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html) There are many other information provided through these reports. Here we have From 0c1d59e6f3a4ef5733470c11598ff872dacf9ed1 Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:35:25 -0600 Subject: [PATCH 28/32] Update docs/polaris/data-science/profiling_dl.md Co-authored-by: Kyle Gerard Felker --- docs/polaris/data-science/profiling_dl.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 0cdea1e42..33996e3a3 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -238,7 +238,7 @@ discussed the way to view the high level information. Using the PyTorch profiler requires changes in the application source code. 
A simple example is the following: -```py title="pytorch_profiler_example.py" +```python linenums="1" title="pytorch_profiler_example.py" from torch.profiler import profile, record_function, ProfilerActivity # A tracer decorator for a function to be traced From a7e4965eddc6ed32bf6d8d80ea64b17c8b5fad0d Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 12:45:50 -0600 Subject: [PATCH 29/32] Tried to fix the mkdocs issue by checking out the file from main --- mkdocs.yml | 86 ------------------------------------------------------ 1 file changed, 86 deletions(-) diff --git a/mkdocs.yml b/mkdocs.yml index 11e4110e0..1113326e9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -153,7 +153,6 @@ nav: - Data Science: - Julia: polaris/data-science/julia.md - Python: polaris/data-science/python.md - - Profiling: polaris/data-science/profiling_dl.md - Frameworks: - TensorFlow: polaris/data-science/frameworks/tensorflow.md - PyTorch: polaris/data-science/frameworks/pytorch.md @@ -255,91 +254,6 @@ nav: - SambaNova Documentation: ai-testbed/sambanova/documentation.md # - Performance Tools: ai-testbed/sambanova/performance-tools.md - Data Management: ai-testbed/data-management/data-management-overview.md - - Aurora: - - Getting Started: aurora/getting-started-on-aurora.md - - Aurora PE: aurora/aurora-pe.md - - Sunspot to Aurora: aurora/sunspot-to-aurora.md - - Known Issues: aurora/known-issues.md - - Hardware Overview: aurora/hardware-overview/machine-overview.md - - Node Performance Overview: aurora/node-performance-overview/node-performance-overview.md - - Compiling and Linking: - - Compiling and Linking Overview: aurora/compiling-and-linking/compiling-and-linking-overview.md - - Programming Models: aurora/compiling-and-linking/aurora-programming-models.md - - Example Program and Makefile: aurora/compiling-and-linking/aurora-example-program-makefile.md - # - LLVM Compilers: aurora/compiling-and-linking/llvm-compilers-aurora.md - # - GNU Compilers: aurora/compiling-and-linking/gnu-compilers-aurora.md - # - CCE Compilers: aurora/compiling-and-linking/cce-compilers-aurora.md - # - Continuous Integration: aurora/compiling-and-linking/continuous-integration-aurora.md - - Build Tools: - - CMake: aurora/build-tools/cmake-aurora.md - - Running Jobs: aurora/running-jobs-aurora.md - - Data Management: - - Copper: aurora/data-management/copper/copper.md - - DAOS: aurora/data-management/daos/daos-overview.md - - Lustre (Flare): aurora/data-management/lustre/flare.md - - Moving data to Aurora: - - DAOS Data Mover: aurora/data-management/moving_data_to_aurora/daos_datamover.md - - Globus: aurora/data-management/moving_data_to_aurora/globus.md - - SCP: aurora/data-management/moving_data_to_aurora/scp.md - - Applications and Libraries: - - Libraries: - - Cabana: aurora/applications-and-libraries/libraries/cabana-aurora.md - #- Math Libraries: aurora/applications-and-libraries/libraries/math-libraries.md - #- MKL: aurora/applications-and-libraries/libraries/mkl.md - #- MPI: aurora/applications-and-libraries/libraries/mpi.md - #- oneDAL: aurora/applications-and-libraries/libraries/onedal.md - - Spack PE: aurora/applications-and-libraries/libraries/spack-pe.md - - Data Science: - #- Julia: aurora/data-science/julia.md - - Python: aurora/data-science/python.md - #- Applications: - #- gpt-neox: aurora/data-science/applications/gpt-neox.md - - Containers: aurora/data-science/containers/containers.md - - Profiling: aurora/data-science/profiling_dl.md - - Frameworks: - #- DeepSpeed: 
aurora/data-science/frameworks/deepspeed.md - #- JAX: aurora/data-science/frameworks/jax.md - - PyTorch: aurora/data-science/frameworks/pytorch.md - - TensorFlow: aurora/data-science/frameworks/tensorflow.md - - LibTorch: aurora/data-science/frameworks/libtorch.md - - OpenVINO: aurora/data-science/frameworks/openvino.md - - oneCCL: aurora/data-science/frameworks/oneCCL.md - - Programming Models: - - Kokkos: aurora/programming-models/kokkos-aurora.md - - Level Zero: aurora/programming-models/level-0.md - - OpenCL: aurora/programming-models/opencl-aurora.md - - OpenMP: aurora/programming-models/openmp-aurora.md - #- RAJA: aurora/programming-models/raja-aurora.md - - SYCL: aurora/programming-models/sycl-aurora.md - - Debugging Tools: - - Overview: aurora/debugging/debugging-overview.md - #- gdb-oneapi: aurora/debugging/gdb-oneapi-aurora.md - - Performance Tools: - # - Overview: aurora/performance-tools/performance-overview.md - - Advisor: aurora/performance-tools/advisor.md - - VTune: aurora/performance-tools/vtune.md - # - Visualization: - # - ParaView: aurora/visualization/paraview.md - - Services: - - GitLab: aurora/services/gitlab-ci.md - #- JupyterHub: aurora/services/jupyterhub.md - - Workflows: - # - Balsam: aurora/workflows/balsam.md - # - DeepHyper: aurora/workflows/deephyper.md - # - libEnsemble: aurora/workflows/libensemble.md - # - Parsl: aurora/workflows/parsl.md - - SmartSim: aurora/workflows/smartsim.md - - Crux: - - Machine Overview: crux/hardware-overview/machine-overview.md - - Getting Started: crux/getting-started.md - - Running Jobs: crux/queueing-and-running-jobs/running-jobs.md - - Compiling and Linking: crux/compiling-and-linking/compiling-and-linking-overview.md - - Containers: crux/containers/containers.md - - Data Science: - - Python: crux/data-science/python.md - #- Visualization: - #- Visualization on Crux: crux/visualization/visualization.md - #- ParaView (Launch from Client): crux/visualization/paraview.md - Facility Policies: - Overview of Policies: policies/facility-policies.md - ALCF Acknowledgement Policy: policies/alcf-acknowledgement-policy.md From a82f598376ee8419fd1161c0b98868701598fd0e Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 13:05:21 -0600 Subject: [PATCH 30/32] Added latest suggestions. Replaced mkdocs from main, no nav bar. --- docs/polaris/data-science/profiling_dl.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/polaris/data-science/profiling_dl.md b/docs/polaris/data-science/profiling_dl.md index 33996e3a3..15d97b9c8 100644 --- a/docs/polaris/data-science/profiling_dl.md +++ b/docs/polaris/data-science/profiling_dl.md @@ -30,11 +30,7 @@ We can use `nsys` to trace an application running on multiple ranks and multiple nodes. A simple example, where we use a wrapper script to trace the rank 0 on each node of a 2 node job running a PyTorch application is below: -We can use `nsys` to trace an application running on multiple ranks and -multiple nodes. 
A simple example, where we use a wrapper script to trace the -rank 0 on each node of a 4 node job running a PyTorch application is below: - -```bash title="nsys_wrapper.sh" +```bash linenums="1" title="nsys_wrapper.sh" #!/bin/bash ## This wrapper should be used with nsys profiler to trace in any number of nodes ## The script is set up to trace rank 0 of first 2 Nodes in the case of From f2d7de0719864bebcfbbdcdcc81b10c0351f4bdc Mon Sep 17 00:00:00 2001 From: Khalid Hossain Date: Thu, 16 Jan 2025 13:17:16 -0600 Subject: [PATCH 31/32] Added the DL Profiling page to the nav bar --- mkdocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index eb58cfb1d..299c64e02 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,6 +77,7 @@ nav: #- Applications: #- gpt-neox: aurora/data-science/applications/gpt-neox.md - Containers: aurora/data-science/containers/containers.md + - Profiling: aurora/data-science/profiling_dl.md - Frameworks: - DeepSpeed: aurora/data-science/frameworks/deepspeed.md #- JAX: aurora/data-science/frameworks/jax.md @@ -153,6 +154,7 @@ nav: - Data Science: - Julia: polaris/data-science/julia.md - Python: polaris/data-science/python.md + - Profiling: polaris/data-science/profiling_dl.md - Frameworks: - TensorFlow: polaris/data-science/frameworks/tensorflow.md - PyTorch: polaris/data-science/frameworks/pytorch.md From 526a3643d9fd7f7707466c1828dd73df8027fa1a Mon Sep 17 00:00:00 2001 From: Kyle Gerard Felker Date: Thu, 16 Jan 2025 13:33:03 -0600 Subject: [PATCH 32/32] Use code annotations --- docs/aurora/data-science/profiling_dl.md | 30 ++++++++++-------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/docs/aurora/data-science/profiling_dl.md b/docs/aurora/data-science/profiling_dl.md index f67b6a4d5..a4a6e38a2 100644 --- a/docs/aurora/data-science/profiling_dl.md +++ b/docs/aurora/data-science/profiling_dl.md @@ -7,8 +7,9 @@ for details. We can use `unitrace` to trace an application running on multiple ranks and multiple nodes. A simple example, where we use a wrapper script to trace the -rank 0 on each node of a 4 node job running a PyTorch application is below: +rank 0 on each node of a 4 node job running a PyTorch application is below. +There are several important shell variables in the wrapper, which may require modification: ```bash linenums="1" title="unitrace_wrapper.sh" #!/bin/bash ## This wrapper should be used with unitrace to trace in any number of nodes. @@ -20,7 +21,7 @@ FNAME="${FNAME_EXT%%.*}" NNODES=`wc -l < $PBS_NODEFILE` WORK_DIR=/path/to/the/Python/program -UNITRACE_DIR=/opt/aurora/24.180.1/support/tools/pti-gpu/063214e +UNITRACE_DIR=/opt/aurora/24.180.1/support/tools/pti-gpu/063214e # (1)! UNITRACE_LIB=${UNITRACE_DIR}/lib64 UNITRACE_BIN=${UNITRACE_DIR}/bin UNITRACE_EXE=${UNITRACE_BIN}/unitrace @@ -30,14 +31,14 @@ mkdir -p ${UNITRACE_OUTDIR} UNITRACE_OPTS=" --ccl-summary-report --chrome-mpi-logging --chrome-sycl-logging \ --chrome-device-logging \ --chrome-ccl-logging --chrome-call-logging --chrome-dnn-logging --device-timing --host-timing \ ---output-dir-path ${UNITRACE_OUTDIR} --output ${UNITRACE_OUTDIR}/UNITRACE_${FNAME}_n${NNODES}_${DTAG}.txt " +--output-dir-path ${UNITRACE_OUTDIR} --output ${UNITRACE_OUTDIR}/UNITRACE_${FNAME}_n${NNODES}_${DTAG}.txt " # (2)! export LD_LIBRARY_PATH=${UNITRACE_LIB}:${UNITRACE_BIN}:$LD_LIBRARY_PATH # Use $PMIX_RANK for MPICH and $SLURM_PROCID with srun. -PROFRANK=0 -RANKCUTOFF=48 +PROFRANK=0 # (3)! +RANKCUTOFF=48 # (4)! 
if [[ $PALS_LOCAL_RANKID -eq $PROFRANK ]] && [[ $PMIX_RANK -lt $RANKCUTOFF ]]; then echo "On rank $PMIX_RANK, collecting traces " @@ -45,35 +46,29 @@ if [[ $PALS_LOCAL_RANKID -eq $PROFRANK ]] && [[ $PMIX_RANK -lt $RANKCUTOFF ]]; t else "$@" fi - ``` -There are several important shell variables in the wrapper, which may require modification: -- `UNITRACE_DIR`: This is the main `unitrace` directory, which may change after +1. `UNITRACE_DIR`: This is the main `unitrace` directory, which may change after an update to the programming environment. - -- `UNITRACE_OPTS`: These are the options that `unitrace` uses to trace data at +2. `UNITRACE_OPTS`: These are the options that `unitrace` uses to trace data at different levels. Based on the number of options, the sizes of the output profiles will vary. Usually enabling more options lead to a larger profile (in terms of storage in MB). - -- `PROFRANK`: As implemented, this variable is set by the user to trace the rank +3. `PROFRANK`: As implemented, this variable is set by the user to trace the rank of choice. For example, this wrapper will trace the rank 0 on each node. - -- `RANKCUTOFF`: This variable is Aurora specific. As we can run as many as 12 +4. `RANKCUTOFF`: This variable is Aurora specific. As we can run as many as 12 ranks per node (without using CCS), the first 4 nodes of a job will have 48 ranks running. This provides the upper cutoff of the label (in number) of ranks, beyond which `unitrace` will not trace any rank. An user can change the number according to the number of maximum ranks running per node to set up how many -ranks to be -traced. `unitrace` will produce a profile (`json` file, by default) per traced +ranks to be traced. `unitrace` will produce a profile (`json` file, by default) per traced rank. This profile can be viewed using the [Perfetto trace viewer](https://ui.perfetto.dev/) ### Deployment The wrapper above can be deployed using the following PBS job script: -``` +```bash linenums="1" title="job_script.sh" #!/bin/bash -x #PBS -l select=4 #PBS -l place=scatter @@ -95,4 +90,3 @@ module load frameworks/2024.2.1_u1 mpiexec --pmi=pmix -n ${NRANKS} -ppn ${NRANKS_PER_NODE} -l --line-buffer \ ${UNITRACE_WRAPPER} python ${WORK_DIR}/application.py ``` -