From 80a6fab606b0e698563818a1ec87bfeab7c7664c Mon Sep 17 00:00:00 2001 From: Uros Lipovsek Date: Thu, 27 Apr 2023 09:44:56 +0100 Subject: [PATCH 1/4] added work from another branch --- content/03-Cluster/05-run-efa-nccl-tests.md | 106 ---------- content/03-Cluster/06-ami.md | 65 ++++++ content/04-Verify cluster/01-preflight.md | 10 + .../02-run-efa-nccl-tests.md | 177 +++++++++++++++++ .../01-distirbuted-training.md | 21 ++ .../02-train-GPT3.md | 186 ++++++++++++++++++ .../03-huggignface.md | 10 + .../04-torchx.md | 8 + 8 files changed, 477 insertions(+), 106 deletions(-) delete mode 100644 content/03-Cluster/05-run-efa-nccl-tests.md create mode 100644 content/03-Cluster/06-ami.md create mode 100644 content/04-Verify cluster/01-preflight.md create mode 100644 content/04-Verify cluster/02-run-efa-nccl-tests.md create mode 100644 content/05-run-machine-learning-workloads/01-distirbuted-training.md create mode 100644 content/05-run-machine-learning-workloads/02-train-GPT3.md create mode 100644 content/05-run-machine-learning-workloads/03-huggignface.md create mode 100644 content/05-run-machine-learning-workloads/04-torchx.md diff --git a/content/03-Cluster/05-run-efa-nccl-tests.md b/content/03-Cluster/05-run-efa-nccl-tests.md deleted file mode 100644 index 3d0bc732..00000000 --- a/content/03-Cluster/05-run-efa-nccl-tests.md +++ /dev/null @@ -1,106 +0,0 @@ ---- -title: "e. EFA NCCL Test" -weight: 35 -tags: ["tutorial", "ParallelCluster", "nccl", "efa"] ---- - -We're going to run the [nccl-tests](https://github.com/NVIDIA/nccl-tests) (this was installed during AMI creation in `/tmp/nccl-tests`) and check to make sure NCCL, and EFA are setup. This also serves as an explanation of how to submit jobs to Slurm. - -1. Create a file `nccl-efa-tests.sh` with the following: - - ```bash - #!/bin/bash - - #SBATCH --job-name=nccl-tests - #SBATCH --nodes=2 - #SBATCH --tasks-per-node=8 - #SBATCH --cpus-per-task=12 - #SBATCH --output=%x_%j.out - - # Load libraries - export - LD_LIBRARY_PATH=/opt/nccl/build/lib:/usr/local/cuda/lib64:/opt/amazon/efa/lib64:/opt/amazon/openmpi/lib64:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH - - # EFA configurations - export FI_PROVIDER=efa - export FI_EFA_USE_DEVICE_RDMA=1 - - # NCCL configurations - export NCCL_DEBUG=info - export NCCL_PROTO=simple - export NCCL_BUFFSIZE=33554432 - export NCCL_ALGO=ring - - # Run nccl-tests all reduce perf benchmark - module load openmpi - mpirun -n $SLURM_NTASKS -N $SLURM_JOB_NUM_NODES -x NCCL_BUFFSIZE=33554432 --map-by ppr:8:node --rank-by slot \ - --mca pml ^cm --mca btl tcp,self --mca btl_tcp_if_exclude lo,docker0 --bind-to none \ - /tmp/nccl-tests/build/all_reduce_perf -b 8 -e 2G -f 2 -g 1 -c 1 -n 100 - ``` - - | Slurm Flag | Description | - |----------------------|--------------------------------------------------------------| - | `--nodes=2` | Run on two nodes | - | `--tasks-per-node=8` | Run on 8 processes per node | - | `--cpus-per-task=12` | Run on 12 cpus per process, for a total of 8 * 12 = 96 vcpus | - -2. Submit the job - - ```bash - sbatch nccl-efa-tests.sh - watch squeue # wait for job to go into 'R' running - ``` - -3. After the job has completed, take a look at the output file: - - ```sbatch - cat nccl-tests_2.out - ``` - -At the bottom of the file, you'll see the results of the NCCL traffic tests. 
- -``` -compute-dy-g4dn-1:15351:15351 [0] NCCL INFO Launch mode Parallel - 8 2 float sum -1 55.02 0.00 0.00 0 55.46 0.00 0.00 0 - 16 4 float sum -1 54.06 0.00 0.00 0 53.81 0.00 0.00 0 - 32 8 float sum -1 54.44 0.00 0.00 0 53.67 0.00 0.00 0 - 64 16 float sum -1 54.58 0.00 0.00 0 53.77 0.00 0.00 0 - 128 32 float sum -1 55.10 0.00 0.00 0 54.45 0.00 0.00 0 - 256 64 float sum -1 55.32 0.00 0.01 0 54.72 0.00 0.01 0 - 512 128 float sum -1 55.82 0.01 0.02 0 55.17 0.01 0.02 0 - 1024 256 float sum -1 56.55 0.02 0.03 0 55.37 0.02 0.03 0 - 2048 512 float sum -1 59.01 0.03 0.06 0 58.37 0.04 0.06 0 - 4096 1024 float sum -1 60.91 0.07 0.12 0 60.66 0.07 0.12 0 - 8192 2048 float sum -1 63.56 0.13 0.23 0 63.43 0.13 0.23 0 - 16384 4096 float sum -1 68.21 0.24 0.42 0 68.12 0.24 0.42 0 - 32768 8192 float sum -1 77.48 0.42 0.74 0 77.56 0.42 0.74 0 - 65536 16384 float sum -1 87.36 0.75 1.31 0 87.43 0.75 1.31 0 - 131072 32768 float sum -1 116.8 1.12 1.96 0 116.7 1.12 1.96 0 - 262144 65536 float sum -1 164.1 1.60 2.80 0 163.7 1.60 2.80 0 - 524288 131072 float sum -1 248.3 2.11 3.70 0 247.9 2.11 3.70 0 - 1048576 262144 float sum -1 422.1 2.48 4.35 0 421.9 2.49 4.35 0 - 2097152 524288 float sum -1 770.2 2.72 4.77 0 767.2 2.73 4.78 0 - 4194304 1048576 float sum -1 1460.6 2.87 5.03 0 1460.9 2.87 5.02 0 - 8388608 2097152 float sum -1 2802.7 2.99 5.24 0 2803.6 2.99 5.24 0 - 16777216 4194304 float sum -1 5555.3 3.02 5.29 0 5553.1 3.02 5.29 0 - 33554432 8388608 float sum -1 11061 3.03 5.31 0 11062 3.03 5.31 0 - 67108864 16777216 float sum -1 22090 3.04 5.32 0 22084 3.04 5.32 0 - 134217728 33554432 float sum -1 44123 3.04 5.32 0 44129 3.04 5.32 0 - 268435456 67108864 float sum -1 88195 3.04 5.33 0 88183 3.04 5.33 0 - 536870912 134217728 float sum -1 176380 3.04 5.33 0 176388 3.04 5.33 0 - 1073741824 268435456 float sum -1 352670 3.04 5.33 0 352679 3.04 5.33 0 - 2147483648 536870912 float sum -1 705156 3.05 5.33 0 705133 3.05 5.33 0 -compute-dy-g4dn-1:15358:15358 [7] NCCL INFO comm 0x7f06f4000f60 rank 7 nranks 8 cudaDev 7 busId f5000 - Destroy COMPLETE -compute-dy-g4dn-1:15351:15351 [0] NCCL INFO comm 0x7f8958000f60 rank 0 nranks 8 cudaDev 0 busId 18000 - Destroy COMPLETE -# Out of bounds values : 0 OK -# Avg bus bandwidth : 2.52874 -# -compute-dy-g4dn-1:15355:15355 [4] NCCL INFO comm 0x7f86bc000f60 rank 4 nranks 8 cudaDev 4 busId e7000 - Destroy COMPLETE -compute-dy-g4dn-1:15353:15353 [2] NCCL INFO comm 0x7f5184000f60 rank 2 nranks 8 cudaDev 2 busId 35000 - Destroy COMPLETE -compute-dy-g4dn-1:15356:15356 [5] NCCL INFO comm 0x7fbd28000f60 rank 5 nranks 8 cudaDev 5 busId e8000 - Destroy COMPLETE -compute-dy-g4dn-1:15357:15357 [6] NCCL INFO comm 0x7fef90000f60 rank 6 nranks 8 cudaDev 6 busId f4000 - Destroy COMPLETE -compute-dy-g4dn-1:15354:15354 [3] NCCL INFO comm 0x7f2b7c000f60 rank 3 nranks 8 cudaDev 3 busId 36000 - Destroy COMPLETE -compute-dy-g4dn-1:15352:15352 [1] NCCL INFO comm 0x7f05e8000f60 rank 1 nranks 8 cudaDev 1 busId 19000 - Destroy COMPLETE -``` - -You'll see the NCCL \ No newline at end of file diff --git a/content/03-Cluster/06-ami.md b/content/03-Cluster/06-ami.md new file mode 100644 index 00000000..b6e4c226 --- /dev/null +++ b/content/03-Cluster/06-ami.md @@ -0,0 +1,65 @@ +--- +title: "e. AMI" +date: 2020-05-12T13:27:03Z +weight : 40 +tags : ["tutorial", "EFA", "ec2", "NCCL", "MPI", "Benchmark", "compile"] +--- + +Pcluster provides default AMI, but we can use custom AMI to make experience better for our use case. Lets look at 2 ways we can build custom AMI. 
# DLAMI
[DLAMI](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) contains the GPU dependencies and ML frameworks (for example PyTorch) that are seamlessly integrated with other AWS services like EFA.
Let's create `ami.yml` (see the [DLAMI release notes](https://docs.aws.amazon.com/dlami/latest/devguide/appendix-ami-release-notes.html) to get the AMI ID to use as `ParentImage` in the config below):
```
Build:
  SecurityGroupIds: []
  SubnetId: subnet-123
  InstanceType: g5.2xlarge # you can choose a different instance type
  ParentImage: ami-123
```

Now run the [pcluster build-image command](https://docs.aws.amazon.com/parallelcluster/latest/ug/pcluster.build-image-v3.html), which adds all the ParallelCluster dependencies to your DLAMI of choice:
```
pcluster build-image -c ami.yml -i NEW_AMI_ID -r REGION
```

# Fully custom AMI
We created a Packer configuration for the AMI that allows you to customize different aspects of the deep learning toolchain (for example, using a specific CUDA version).

#### 1 - Assets Required to Build the Image

First let's fetch the assets required to build the image:

```bash
wget https://ml.hpcworkshops.com/scripts/packer/packer.tar.gz
tar -xzf packer.tar.gz
```

This consists of:
* `nvidia-efa-ml-al2-enroot_pyxis.json`: your main image template. It consists of several sections that define the resources (instance, base AMI, security groups...) you will use to build your image. The base AMI is a ParallelCluster Amazon Linux 2 base AMI. The provisioners section consists of inline scripts that are executed serially to install the desired software stack onto your image.
* `variables.json`: contains some key variables. Packer refers to them in the image template through user variable calls.
* `enroot.com`: located in the enroot directory, it contains the [Enroot](https://github.com/NVIDIA/enroot) configuration that will be copied to your AMI.

#### 2 - Installing Packer

You can install Packer using [Homebrew](https://brew.sh/) on macOS or Linux as follows:

```bash
brew install packer
```

Alternatively, you can download the Packer binary from the [tool website](https://www.packer.io/). Ensure your `PATH` is set to use the binary, or use its absolute path. Once Packer is installed, proceed to the next stage.

#### 3 - Build Your Image

From the assets directory, run the command below:

```bash
packer build -color=true -var-file variables.json nvidia-efa-ml-al2-enroot_pyxis.json | tee build_AL2.log
```

Packer will start by creating the instance and associated resources (EC2 key, security group...), run through the installation scripts, shut down the instance, image it, and then terminate the instance.

The process is automated and the output will be displayed on your terminal. If Packer encounters an error during the installation, it will stop the process and terminate all the resources. You will have to go through its log to identify where the error occurred and correct it.

Once the image is built, feel free to use it to create new clusters. The image can be retrieved from the Amazon EC2 Console under "Images -> AMIs".
diff --git a/content/04-Verify cluster/01-preflight.md b/content/04-Verify cluster/01-preflight.md
new file mode 100644
index 00000000..0370b46f
--- /dev/null
+++ b/content/04-Verify cluster/01-preflight.md
@@ -0,0 +1,10 @@
---
title: "a. Preflight"
date: 2020-05-12T13:27:03Z
weight : 40
tags : ["tutorial", "EFA", "ec2", "NCCL", "MPI", "Benchmark", "compile"]
---

# Preflight

We created an automated cluster validator to make sure all your dependencies are set up correctly. Follow the instructions in the [preflight](https://github.com/aws-samples/parallelcluster-efa-gpu-preflight-ami/tree/main/preflight) repository.
\ No newline at end of file
diff --git a/content/04-Verify cluster/02-run-efa-nccl-tests.md b/content/04-Verify cluster/02-run-efa-nccl-tests.md
new file mode 100644
index 00000000..e7c1d0ba
--- /dev/null
+++ b/content/04-Verify cluster/02-run-efa-nccl-tests.md
@@ -0,0 +1,177 @@
---
title: "b. Download, compile and run the NCCL tests"
date: 2020-05-12T13:27:03Z
weight : 40
tags : ["tutorial", "EFA", "ec2", "NCCL", "MPI", "Benchmark", "compile"]
---

In this section, you will download, compile, and run on two nodes a common GPU-to-GPU communication benchmark from NVIDIA that is used by ML frameworks such as PyTorch.


#### Download and Compile the NCCL tests

You can run the script below on the head node of your ParallelCluster, from the home directory, to download and compile NCCL, the AWS OFI NCCL plugin, and the NCCL tests:

```bash
cd ~

cat > compile_nccl.sh << EOF
#!/bin/bash

module load intelmpi

git clone -b v2.17.1-1 https://github.com/NVIDIA/nccl.git
cd nccl
make -j src.build CUDA_HOME=/usr/local/cuda NVCC_GENCODE='-gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80'
cd ..

git clone -b aws https://github.com/aws/aws-ofi-nccl.git
cd aws-ofi-nccl
./autogen.sh
./configure --prefix=${HOME}/aws-ofi-nccl/install --with-mpi=/opt/amazon/openmpi --with-libfabric=/opt/amazon/efa --with-cuda=/usr/local/cuda
make
make install
cd ..

git clone -b v2.13.6 https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
make MPI=1 CUDA_HOME=/usr/local/cuda MPI_HOME=/opt/amazon/openmpi NCCL_HOME=${HOME}/nccl/build

echo "Installation done, run a quick test!"
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HOME}/nccl/build/lib:${HOME}/aws-ofi-nccl/install/lib
/opt/amazon/openmpi/bin/mpirun -np $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 1

EOF

sh ./compile_nccl.sh
```


#### Submit NCCL benchmark

Create your job submission script for the NCCL `all_reduce_perf` benchmark and use **sbatch** to submit your job:

```bash
cat > nccl_test.sbatch << \EOF
#!/bin/bash
#SBATCH -n 192
#SBATCH -N 2
#SBATCH --gres=gpu:8
#SBATCH --output=nccl.out

NCCL_TEST_PATH=${HOME}/nccl-tests/build
MPI_PATH=/opt/amazon/openmpi

export LD_LIBRARY_PATH=${HOME}/nccl/build/lib:${HOME}/aws-ofi-nccl/install/lib

export NCCL_PROTO=simple
export FI_EFA_USE_DEVICE_RDMA=1 # use for P4
export FI_EFA_FORK_SAFE=1
export FI_PROVIDER=efa
export FI_EFA_ENABLE_SHM_TRANSFER=0

export NCCL_DEBUG=INFO
export FI_LOG_LEVEL=1

${MPI_PATH}/bin/mpirun --map-by ppr:8:node --rank-by slot \
    --mca pml ^cm --mca btl tcp,self \
    --mca btl_tcp_if_exclude lo,docker0 --bind-to none \
    ${NCCL_TEST_PATH}/all_reduce_perf -b 8 -e 9G -f 2 -g 1 -c 1 -n 100
EOF

sbatch nccl_test.sbatch
watch squeue
```

You have to wait a couple of minutes for your compute instances to come up. Once you see the job go from the **PD** (pending) to the **R** (running) state, you know the instances are up. Type **Ctrl-C** to exit `squeue` at any point.

After the job has completed, view the output with `cat ~/nccl.out`.
You will see something like: + +```bash +$ head -n 19 ~/nccl.out +# nThread 1 nGpus 1 minBytes 8 maxBytes 9663676416 step: 2(factor) warmup iters: 5 iters: 100 agg iters: 1 validation: 1 graph: 0 +# +# Using devices +# Rank 0 Group 0 Pid 77208 on new-st-gpu-1 device 0 [0x10] NVIDIA A100-SXM4-80GB +# Rank 1 Group 0 Pid 77209 on new-st-gpu-1 device 1 [0x10] NVIDIA A100-SXM4-80GB +# Rank 2 Group 0 Pid 77211 on new-st-gpu-1 device 2 [0x20] NVIDIA A100-SXM4-80GB +# Rank 3 Group 0 Pid 77212 on new-st-gpu-1 device 3 [0x20] NVIDIA A100-SXM4-80GB +# Rank 4 Group 0 Pid 77213 on new-st-gpu-1 device 4 [0x90] NVIDIA A100-SXM4-80GB +# Rank 5 Group 0 Pid 77214 on new-st-gpu-1 device 5 [0x90] NVIDIA A100-SXM4-80GB +# Rank 6 Group 0 Pid 77215 on new-st-gpu-1 device 6 [0xa0] NVIDIA A100-SXM4-80GB +# Rank 7 Group 0 Pid 77216 on new-st-gpu-1 device 7 [0xa0] NVIDIA A100-SXM4-80GB +# Rank 8 Group 0 Pid 95401 on new-st-gpu-2 device 0 [0x10] NVIDIA A100-SXM4-80GB +# Rank 9 Group 0 Pid 95402 on new-st-gpu-2 device 1 [0x10] NVIDIA A100-SXM4-80GB +# Rank 10 Group 0 Pid 95403 on new-st-gpu-2 device 2 [0x20] NVIDIA A100-SXM4-80GB +# Rank 11 Group 0 Pid 95404 on new-st-gpu-2 device 3 [0x20] NVIDIA A100-SXM4-80GB +# Rank 12 Group 0 Pid 95405 on new-st-gpu-2 device 4 [0x90] NVIDIA A100-SXM4-80GB +# Rank 13 Group 0 Pid 95406 on new-st-gpu-2 device 5 [0x90] NVIDIA A100-SXM4-80GB +# Rank 14 Group 0 Pid 95407 on new-st-gpu-2 device 6 [0xa0] NVIDIA A100-SXM4-80GB +# Rank 15 Group 0 Pid 95408 on new-st-gpu-2 device 7 [0xa0] NVIDIA A100-SXM4-80GB +``` +This tells us we have 16 GPUs (one rank is one GPU) and 2 nodes in the job. + + +We can check if EFA is indeed used. You should see these parts of text: +```bash +cat nccl.out + +.... + +new-st-gpu-2:8802:8863 [5] NCCL INFO NET/OFI Using aws-ofi-nccl 1.5.0aws +new-st-gpu-2:8802:8863 [5] NCCL INFO NET/OFI Configuring AWS-specific options + +... + +new-st-gpu-2:8804:8862 [6] NCCL INFO NET/OFI Selected Provider is efa (found 4 nics) +new-st-gpu-2:8804:8862 [6] NCCL INFO Using network AWS Libfabric +.... + +``` + +Alternatively we could check counter: +```bash +cat /sys/class/infiniband/rdmap*/ports/1/hw_counters/rx_bytes +``` +for read bytes (this counter should be bigger after test) - see full [workshop](https://catalog.us-east-1.prod.workshops.aws/workshops/5563d004-a892-4c83-8d82-d8fa6baa0517/en-US/monitor) for more details. +`rdmap*` is for P4 instances, you will see other names (`efa_0` for example) in other instance types. 
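To make that counter check concrete, here is a minimal, hedged sketch that sums a compute node's `rx_bytes` counters before and after the benchmark and prints the delta. The node name (taken from the sample output above), the `rdmap*` glob, and the use of `srun`/`sbatch --wait` are assumptions about your setup; adapt them to your cluster and instance type.

```bash
#!/bin/bash
# Rough EFA traffic check, run from the head node. We read the counters on a
# compute node via srun, submit the benchmark with sbatch --wait (which blocks
# until the job finishes), then read them again.
# TARGET_NODE and the rdmap* glob are assumptions: substitute a node from
# sinfo/squeue, and use efa_* (or similar) on non-P4 instance types.
TARGET_NODE=new-st-gpu-1

read_rx() {
  srun -N1 --nodelist="${TARGET_NODE}" bash -c \
    'cat /sys/class/infiniband/rdmap*/ports/1/hw_counters/rx_bytes | paste -sd+ - | bc'
}

before=$(read_rx)
sbatch --wait nccl_test.sbatch
after=$(read_rx)

echo "EFA rx_bytes on ${TARGET_NODE} grew by $(( (after - before) / 1024 / 1024 )) MiB during the test"
```

If the delta stays near zero while the benchmark reports healthy bandwidth, traffic is likely not going over EFA and the NCCL log lines above are the place to look.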
+ + +We can check end of the result file (`~/nccl.out` set in `~nccl_test.sbatch` as `#SBATCH --output=nccl.out` - you can also see it as `StdOut` in `scontrol show job ${YOUR_JOB_ID}`) like: +```bash +tail -n 60 nccl.out +# +# out-of-place in-place +# size count type redop root time algbw busbw #wrong time algbw busbw #wrong +# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 174.2 0.00 0.00 0 171.7 0.00 0.00 0 + 16 4 float sum -1 171.1 0.00 0.00 0 167.0 0.00 0.00 0 + 32 8 float sum -1 162.4 0.00 0.00 0 158.1 0.00 0.00 0 + 64 16 float sum -1 157.9 0.00 0.00 0 157.9 0.00 0.00 0 + 128 32 float sum -1 158.7 0.00 0.00 0 158.3 0.00 0.00 0 + 256 64 float sum -1 158.7 0.00 0.00 0 158.7 0.00 0.00 0 + 512 128 float sum -1 158.7 0.00 0.01 0 159.1 0.00 0.01 0 + 1024 256 float sum -1 161.3 0.01 0.01 0 161.4 0.01 0.01 0 + 2048 512 float sum -1 176.5 0.01 0.02 0 175.7 0.01 0.02 0 + 4096 1024 float sum -1 165.7 0.02 0.05 0 165.7 0.02 0.05 0 + 8192 2048 float sum -1 172.1 0.05 0.09 0 171.5 0.05 0.09 0 + 16384 4096 float sum -1 189.9 0.09 0.16 0 189.0 0.09 0.16 0 + 32768 8192 float sum -1 220.4 0.15 0.28 0 218.2 0.15 0.28 0 + 65536 16384 float sum -1 224.0 0.29 0.55 0 221.0 0.30 0.56 0 + 131072 32768 float sum -1 227.3 0.58 1.08 0 223.3 0.59 1.10 0 + 262144 65536 float sum -1 234.2 1.12 2.10 0 233.2 1.12 2.11 0 + 524288 131072 float sum -1 257.4 2.04 3.82 0 257.3 2.04 3.82 0 + 1048576 262144 float sum -1 307.4 3.41 6.40 0 306.7 3.42 6.41 0 + 2097152 524288 float sum -1 388.3 5.40 10.13 0 388.7 5.40 10.12 0 + 4194304 1048576 float sum -1 522.7 8.02 15.04 0 521.6 8.04 15.08 0 + 8388608 2097152 float sum -1 761.2 11.02 20.66 0 757.8 11.07 20.75 0 + 16777216 4194304 float sum -1 1200.2 13.98 26.21 0 1195.9 14.03 26.30 0 + 33554432 8388608 float sum -1 1565.5 21.43 40.19 0 1559.6 21.52 40.34 0 + 67108864 16777216 float sum -1 2724.5 24.63 46.18 0 2727.9 24.60 46.13 0 + 134217728 33554432 float sum -1 4071.8 32.96 61.80 0 4070.3 32.98 61.83 0 + 268435456 67108864 float sum -1 7390.4 36.32 68.10 0 7387.7 36.34 68.13 0 + 536870912 134217728 float sum -1 13605 39.46 73.99 0 13594 39.49 74.05 0 + 1073741824 268435456 float sum -1 25940 41.39 77.61 0 25985 41.32 77.48 0 +``` diff --git a/content/05-run-machine-learning-workloads/01-distirbuted-training.md b/content/05-run-machine-learning-workloads/01-distirbuted-training.md new file mode 100644 index 00000000..7d77a1ae --- /dev/null +++ b/content/05-run-machine-learning-workloads/01-distirbuted-training.md @@ -0,0 +1,21 @@ +--- +title : "a. Run PyTorch Data Parallel training on ParallelCluster with synthetic data" +date: 2020-09-04T15:58:58Z +weight : 30 +tags : ["training", "data parallel", "ML", "sbatch", "slurm", "multi node", "multi gpu"] +--- + +In this step you will use the **PyTorch** _DistributedDataParallel API_ to start a simple training job on 2 nodes with sythetic data and simple deep learning model. +It assumes you have python and pytorch installed on the nodes. Users can use this as a test before running their workloads. + +#### Create distributed training script and run the job +```bash +wget https://raw.githubusercontent.com/aws-samples/parallelcluster-efa-gpu-preflight-ami/main/preflight/run.py +wget https://raw.githubusercontent.com/aws-samples/parallelcluster-efa-gpu-preflight-ami/main/preflight/multi_node_ddp.sbatch +sbatch multi_node_ddp.sbatch +``` + +The last command will return job ID, tail the logs with `tail -f slurm-${JOB_ID}`. You should see the following in job log: +1. 
You should see `NCCL INFO NET/OFI Selected Provider is efa` in the logs, which confirms that EFA was chosen.
2. We use [torchrun](https://pytorch.org/docs/stable/elastic/run.html) to [launch](https://github.com/aws-samples/parallelcluster-efa-gpu-preflight-ami/blob/main/preflight/multi_node_ddp.sbatch#L14) the training job ([TorchX](https://pytorch.org/torchx/latest/schedulers/slurm.html) is an alternative). It defines [environment variables](https://pytorch.org/docs/stable/elastic/run.html#environment-variables) such as `RANK` that `torch.distributed` then uses to form a cluster of workers; some of them are printed in the logs and can be used to verify that the cluster formed. We run 2 nodes with 8 GPUs each and `--nproc_per_node` set to 1, so each GPU hosts one worker and we should see 16 (2x8) ranks, numbered 0 to 15.
3. You should see log lines like `rank: 4, step: 800`, which means that rank 4 just finished its 800th training step.
diff --git a/content/05-run-machine-learning-workloads/02-train-GPT3.md b/content/05-run-machine-learning-workloads/02-train-GPT3.md
new file mode 100644
index 00000000..c309bb44
--- /dev/null
+++ b/content/05-run-machine-learning-workloads/02-train-GPT3.md
@@ -0,0 +1,186 @@
---
title : "b. Run GPT3 on ParallelCluster"
date: 2020-09-04T15:58:58Z
weight : 30
tags : ["training", "data parallel", "ML", "sbatch", "slurm", "multi node", "multi gpu", "GPT3"]
---

# NeMo

The base container `bignlp-training` is still in beta (as of this writing), so you need to register an NVIDIA account to pull this Docker image. You'll extend the `bignlp-training` image with AWS-specific additions such as EFA support (TODO: the EFA install in the [Dockerfile](nemo4aws/Dockerfile) can still be improved to make it work with the latest release).

Relevant scripts can be found in the folder [nemo4aws](nemo4aws).

The general philosophy is straightforward: under normal circumstances, `bignlp-training` contains a script. When run, the container (i) generates a submit script (which calls either `srun` or `bcp`), then (ii) immediately runs that submit script. It seems that NVIDIA NGC/GCP inject or mount `bcp` or `srun` into the running container, because those commands are not built into the image.

On ParallelCluster (PC), what we've done is to make `bignlp-training` perform only step (i); we then invoke an `srun` command (at the AMI/instance level) to run an enroot container using the command generated by step (i) -- see [nemo-megatron-v2.sbatch](nemo4aws/nemo-megatron-v2.sbatch). Note that we also experimented with bypassing step (i) by directly starting the actual NeMo script using `torchrun` ([run.sh](nemo4aws/run.sh) and [nemo-megatron.sbatch](nemo4aws/nemo-megatron.sbatch)) -- this seems to work, but we probably need to restore some optimized NVIDIA settings or environment variables that the `bignlp-training` script generates. It should also be noted that, for some reason, `bignlp-training` only lets you generate a `torchrun`-based submit script for a single node, yet in our experiments we observed that `torchrun` can actually run NeMo across multiple nodes.

You can submit jobs to PC using `sbatch nemo-megatron-v2.sbatch` (the `srun` version, which is as close as possible to the original `bignlp-training`) or `sbatch nemo-megatron.sbatch` (the `torchrun` version). Each `.sbatch` script has default hyperparameters, which you can override by setting environment variables, as sketched below.
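As an illustration of that override pattern (not taken from the repo itself), here is a hedged sketch. The variable names come from the `egrep` hint in the appendix further down (`MODEL_SIZE`, `NODE`, `MAX_STEP`); whether your `.sbatch` script reads them exactly this way depends on its contents, so treat this as a pattern rather than a recipe, and prefer the repo's own submit scripts referenced next.

```bash
# Hedged example: export overrides, then submit. sbatch forwards the calling
# environment to the job by default, so the .sbatch script can pick these up
# if it reads them (for example via ${MODEL_SIZE:-default}-style fallbacks).
# MODEL_SIZE/NODE/MAX_STEP are names seen in the repo's submit-*.sh scripts;
# the values below are placeholders for a quick smoke test.
export MODEL_SIZE=04b8    # smaller GPT-3 config, matching submit-gpt3-04b8-*.sh
export NODE=2             # number of nodes to train on
export MAX_STEP=100       # stop early instead of running a full training

sbatch --nodes="${NODE}" nemo-megatron.sbatch
```
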
See +[submit-gpt3-175b16-nccl_stock.sh](nemo4aws/submit-gpt3-175b16-nccl_stock.sh) for an example of +setting hyperparameters for the `torchrun`-based `nemo-megatron.sbatch`. + +## 1. Quickstart + +All these steps are to be done on the head node. + +### 1.1. Prepare the Pile dataset + +A copy of dataset in the Nemo-Megatron format (i.e., `.{bin,idx}` files) are stored on +`s3://frameworks-shared-bucket/data/the_pile_gpt3/`. Download these to the FSx Lustre as follows: + +```bash +s5cmd sync 's3://frameworks-shared-bucket/data/bpe/*' /fsx/data/bpe/ + +# Below timing is on head node m5.8xlarge. Intra-region copy us-east-1. + +/usr/bin/time s5cmd sync 's3://frameworks-shared-bucket/data/the_pile_gpt3/*.idx' /fsx/data/the_pile_gpt3/ +# 4.00user 13.34system 0:04.78elapsed 362%CPU (0avgtext+0avgdata 37108maxresident)k +# 2855376inputs+7954112outputs (0major+7599minor)pagefaults 0swaps + +/usr/bin/time s5cmd sync 's3://frameworks-shared-bucket/data/the_pile_gpt3/*.bin' /fsx/data/the_pile_gpt3/ +# 605.89user 3384.71system 19:09.46elapsed 347%CPU (0avgtext+0avgdata 69656maxresident)k +# 25209576inputs+1462271680outputs (0major+60785minor)pagefaults 0swaps +# +# See also the observed throughput captured in references/s3-to-fsx.png +``` + +Should you choose to download and pre-process the Pile dataset from scratch, please refer to +`01-prep-pile-data.sh`. The download size is 426 GB (i.e., compressed `.jsonl.zst` files), and +extacted to 850 GB of `.jsonl` files. After processed into `.{bin,idx}` files, the final size is 702 +GB. It took ~10.5h on 2x `c5.24xlarge` to download, extract, and process the dataset. Majority of +the time (~8h) were incurred by processing step. See `01-prep-pile-data.sh` for the runtime +statistics (as comments). + +### 1.2. Training + +Build enroot container on the PC headnode: + +```bash +cd docker/ + +docker login nvcr.io +# ... Enter your username and API key to NGC. + +docker pull nvcr.io/ea-bignlp/bignlp-training:22.09-py3 + +# Build Docker image +docker build . \ + -t 111122223333.dkr.ecr.us-east-1.amazonaws.com/bignlp-training:22.09-py3-bcp-nsys-2022.5.1-v2-efa + +# Convert Docker image to enroot image. Timing on head node m5.8xlarge. +/usr/bin/time enroot import \ + -o /apps/bignlp-training_22.09-py3-bcp-nsys-2022.5.1-v2-efa.sqsh \ + dockerd://111122223333.dkr.ecr.us-east-1.amazonaws.com/bignlp-training:22.09-py3-bcp-nsys-2022.5.1-v2-efa +# 13.39user 53.30system 2:51.89elapsed 38%CPU (0avgtext+0avgdata 688284maxresident)k +# 0inputs+67630496outputs (0major+200814minor)pagefaults 0swaps +``` + +Next, ensure that the `p4de` partition is up: + +```console +$ pcluster describe-compute-fleet --cluster-name benchmarking-megatron --region us-east-1 +{ + "status": "RUNNING", + "lastStatusUpdatedTime": "2023-01-16T08:23:13.093Z" +} +``` + +Otherwise, on your local environment with parallel cluster installed (i.e., **not** the head node), +start the partition using the `pcluster update-compute-fleet`, and once-in-a-while check that its +status is `RUNNING` using the `pcluster describe-compute-fleet` command. 
+ +```console +$ pcluster update-compute-fleet --cluster-name benchmarking-megatron --status START_REQUESTED --region us-east-1 +{ + "status": "START_REQUESTED", + "lastStatusUpdatedTime": "2023-01-16T08:22:25.337Z" +} +``` + +Once the enroot image and `p4de` partition are ready, submit a job either directly using `sbatch`: + +```bash +sbatch --nodes=2 nemo-megatron.sbatch +``` + +or simply run one of the git-versioned submit script such as `submit-gpt3-04b8-nccl_stock.sh`. You +can add new submit scripts, then git-version them to promote readability and reproducability. + +Lastly, be a good citizen by stopping the partition so other users in the AWS account can use the +`p4de` instances by running the following command on your parallel-cluster client machine: + +```console +$ pcluster update-compute-fleet --cluster-name benchmarking-megatron --status STOP_REQUESTED --region us-east-1 +{ + "status": "STOP_REQUESTED", + "lastStatusUpdatedTime": "2023-01-16T08:55:01.745Z" +} +``` + +## 2. Storage Layout + +The PC (Parallel Cluster) provides these shared volumes: + +```yaml +SharedStorage: + - MountDir: /fsx + Name: fsx + StorageType: FsxLustre + FsxLustreSettings: + StorageCapacity: 4800 + DeploymentType: "SCRATCH_2" + - Name: SharedEBS + StorageType: Ebs + MountDir: /apps + EbsSettings: + VolumeType: gp3 + Size: 200 + Throughput: 300 + Iops: 6000 +``` + +## 3. Appendix + +### 3.1. Frequently Used Commands + +Assume `pwd` is `GITROOT`. + +```bash + +sinfo -o "%20N %10c %10m %25f %10G" + +squeue --format="%.18i %.9P %.30j %.8u %.8t %.10M %.9l %.6D %R" + +scancel + +egrep 'export (MODEL_NAME|MODEL_SIZE|NODE|MAX_STEP|IMAGE)' submit*sh + + +find /fsx/results/ -name joblog.log | xargs grep -m1 JOB_ID | sort +``` + +### 3.2. Docker Images + +- `159553542841.dkr.ecr.us-east-1.amazonaws.com/bignlp-training:22.09-py3-bcp-nsys-2022.5.1-v2-efa` + uses built-in nccl which is `NCCL_VERSION=2.12.12`. The `Dockerfile` can be checked-out from + git-tag `dockerfile-22.09-py3-bcp-nsys-2022.5.1-v2-efa`. + + +# OPT \ No newline at end of file diff --git a/content/05-run-machine-learning-workloads/03-huggignface.md b/content/05-run-machine-learning-workloads/03-huggignface.md new file mode 100644 index 00000000..6341f610 --- /dev/null +++ b/content/05-run-machine-learning-workloads/03-huggignface.md @@ -0,0 +1,10 @@ +--- +title : "c. Run Huggingface models on ParallelCluster" +date: 2020-09-04T15:58:58Z +weight : 20 +tags : ["Huggingface", "data", "ML", "srun", "slurm"] +--- + +In this section, you will learn how to run script from Huggingface examples with PyTorch FSDP and DDP. + + diff --git a/content/05-run-machine-learning-workloads/04-torchx.md b/content/05-run-machine-learning-workloads/04-torchx.md new file mode 100644 index 00000000..ad71c502 --- /dev/null +++ b/content/05-run-machine-learning-workloads/04-torchx.md @@ -0,0 +1,8 @@ +--- +title : "d. TorchX: multi platform deployment tool for distributed training" +date: 2020-09-04T15:58:58Z +weight : 30 +tags : ["training", "data parallel", "ML", "sbatch", "slurm", "multi node", "multi gpu"] +--- + +TorchX is a CLI to schedule pytorch distributed training workloads and supports [slurm scheduler](https://pytorch.org/torchx/latest/schedulers/slurm.html). We can use it to easily deploy PyTorch distributed workloads accross different schedulers. 
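To make this concrete, below is a hedged sketch of what launching the preflight `run.py` from section a. through TorchX's Slurm scheduler could look like. The exact component flags (`-j`, `--script`) can differ between TorchX releases, so check the linked scheduler documentation for your version before relying on it.

```bash
# Hypothetical TorchX launch of the DDP preflight script from the earlier
# section. dist.ddp's -j flag takes <num_nodes>x<procs_per_node>; adjust the
# values (and the path to run.py) to match your cluster.
pip install torchx

torchx run --scheduler slurm dist.ddp \
  -j 2x8 \
  --script run.py
```

TorchX then submits the job to Slurm for you, so the same command line can later target a different scheduler by changing only the `--scheduler` argument.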
\ No newline at end of file From 61fd42bcf8e2ea47d54b48139ab4ecee367ba7b3 Mon Sep 17 00:00:00 2001 From: lipovsek-aws <117850701+lipovsek-aws@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:29:03 +0100 Subject: [PATCH 2/4] Update content/03-Cluster/06-ami.md Co-authored-by: mhuguesaws <71357145+mhuguesaws@users.noreply.github.com> --- content/03-Cluster/06-ami.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/03-Cluster/06-ami.md b/content/03-Cluster/06-ami.md index b6e4c226..5cac433d 100644 --- a/content/03-Cluster/06-ami.md +++ b/content/03-Cluster/06-ami.md @@ -1,5 +1,5 @@ --- -title: "e. AMI" +title: "e. Create a Machine Image date: 2020-05-12T13:27:03Z weight : 40 tags : ["tutorial", "EFA", "ec2", "NCCL", "MPI", "Benchmark", "compile"] From 5af8cd5015f7f76d895804dc7a5f26b296dfa6f7 Mon Sep 17 00:00:00 2001 From: lipovsek-aws <117850701+lipovsek-aws@users.noreply.github.com> Date: Thu, 27 Apr 2023 14:32:05 +0100 Subject: [PATCH 3/4] Update 06-ami.md --- content/03-Cluster/06-ami.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/content/03-Cluster/06-ami.md b/content/03-Cluster/06-ami.md index 5cac433d..7461d8d1 100644 --- a/content/03-Cluster/06-ami.md +++ b/content/03-Cluster/06-ami.md @@ -1,11 +1,11 @@ --- -title: "e. Create a Machine Image +title: "e. Create a Machine Image" date: 2020-05-12T13:27:03Z weight : 40 tags : ["tutorial", "EFA", "ec2", "NCCL", "MPI", "Benchmark", "compile"] --- -Pcluster provides default AMI, but we can use custom AMI to make experience better for our use case. Lets look at 2 ways we can build custom AMI. +Pcluster provides default [AMI](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AMIs.html), but we can use custom AMI to make experience better for our use case. Lets look at 2 ways we can build custom AMI. # DLAMI [DLAMI](https://docs.aws.amazon.com/dlami/latest/devguide/what-is-dlami.html) contains GPU dependencies and ML frameworks (for example pytorch) which are seamlesly integrated with other AWS services like EFA. From 1b6d0ee5bf2c84b49de4e6987bc8d10257bd8be7 Mon Sep 17 00:00:00 2001 From: lipovsek-aws <117850701+lipovsek-aws@users.noreply.github.com> Date: Thu, 27 Apr 2023 16:53:45 +0100 Subject: [PATCH 4/4] Update content/05-run-machine-learning-workloads/02-train-GPT3.md Co-authored-by: mhuguesaws <71357145+mhuguesaws@users.noreply.github.com> --- content/05-run-machine-learning-workloads/02-train-GPT3.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/05-run-machine-learning-workloads/02-train-GPT3.md b/content/05-run-machine-learning-workloads/02-train-GPT3.md index c309bb44..b9f8a534 100644 --- a/content/05-run-machine-learning-workloads/02-train-GPT3.md +++ b/content/05-run-machine-learning-workloads/02-train-GPT3.md @@ -1,5 +1,5 @@ --- -title : "b. Run GPT3 on ParallelCluster" +title : "b. Run GPT3 on AWS ParallelCluster" date: 2020-09-04T15:58:58Z weight : 30 tags : ["training", "data parallel", "ML", "sbatch", "slurm", "multi node", "multi gpu", "GPT3"]