Commit: acceptance gpt3 code
henrique committed Jul 16, 2024
1 parent 1845acf commit 232c258
Showing 6 changed files with 178 additions and 146 deletions.
59 changes: 37 additions & 22 deletions large_language_model/nemo-v40-NVIDIA/Dockerfile
@@ -18,10 +18,23 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3
FROM ${FROM_IMAGE_NAME}

RUN apt-get update \
&& apt-get install -yqq --no-install-recommends openssh-server \
&& rm -rf /var/lib/apt/lists/*
RUN mkdir -p /run/sshd

RUN pip install -vvv \
git+https://github.com/mlcommons/[email protected] \
git+https://github.com/NVIDIA/mlperf-common.git@68cf1d0d5e3de3351e66abb696d0e2d011aabf47 \
zarr

# NeMo dependencies
RUN pip install -vvv tensorstore==0.1.45
RUN pip install -vvv causal-conv1d==1.2.0.post2 opencc-python-reimplemented==0.1.7

# Document build setup
ARG FROM_IMAGE_NAME
ENV CUSTOM_FROM_IMAGE_NAME ${FROM_IMAGE_NAME}
@@ -43,15 +56,15 @@ RUN if [ "${APEX_REVISION}" != SKIP ]; then \
MAX_JOBS=${APEX_MAX_JOBS} NVCC_APPEND_FLAGS="--threads 8" pip install -v --no-build-isolation --no-cache-dir --disable-pip-version-check --config-settings "--build-option=--cpp_ext --cuda_ext --bnp --xentropy --deprecated_fused_adam --deprecated_fused_lamb --fast_multihead_attn --distributed_lamb --fast_layer_norm --transducer --distributed_adam --fmha --fast_bottleneck --nccl_p2p --peer_memory --permutation_search --focal_loss --fused_conv_bias_relu --index_mul_2d --cudnn_gbn --group_norm" . \
; fi

## 2. Transformer Engine
ARG TE_REVISION=v1.6rc1
## 2. Transformer Engine +fix for arm64
ARG TE_REVISION=3b4d9e8766b829d50ac78bb26f770fb8d9825ae7 # v1.6rc1
ENV CUSTOM_TE_REVISION ${TE_REVISION}

RUN if [ "${TE_REVISION}" != SKIP ]; then \
NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install --force-reinstall --no-deps git+https://github.com/NVIDIA/TransformerEngine.git@${TE_REVISION} \
; fi

## 3. NeMo
## 3. NeMo v2.0.0.rc0.beta doesn't install on aarch64, but v2.0.0rc0 does
ARG NEMO_REVISION=v2.0.0.rc0.beta
ENV CUSTOM_NEMO_REVISION ${NEMO_REVISION}
ARG NEMO_BASE_VERSION=r2.0.0
@@ -69,13 +82,16 @@ RUN if [ "${NEMO_REVISION}" == SKIP ]; then \
git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
git config user.email "[email protected]" && \
git config user.name "name name" && \
git config user.name "name name" && \
git checkout v2.0.0rc0 && \
echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
pip uninstall -y nemo-toolkit && \
pip install pkgconfig py-cpuinfo "cython<3.0.0" && \
pip install --no-build-isolation -e ".[nlp]" && \
echo NEMO_REVISION=${NEMO_REVISION} && \
git checkout ${NEMO_REVISION} && \
echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
pip uninstall -y nemo-toolkit && \
pip install "cython<3.0.0" && \
pip install --no-build-isolation -e ".[nlp]" \
pip install --no-build-isolation --no-deps -e ".[nlp]" \
; fi

### Make (has to be called after all changes to repo)
@@ -103,26 +119,25 @@ RUN if [ "${MEGATRON_REVISION}" != SKIP ]; then \
ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"

# Pin PL version
# RUN pip install --force-reinstall --no-deps pytorch-lightning==2.0.7
RUN pip install --force-reinstall --no-deps pytorch-lightning==2.2.4

## 5. Benchmark dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# # 5. Benchmark dependencies
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt

## 6. Use nccl-rdma-sharp-plugins from master to pick a fix after HPCX2.18 release
RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
cd nccl-rdma-sharp-plugins/ && \
./autogen.sh && \
./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
make -j install && \
cd ../ && \
rm -rf nccl-rdma-sharp-plugins/
#RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
# ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
# git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
# cd nccl-rdma-sharp-plugins/ && \
# ./autogen.sh && \
# ./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
# make -j install && \
# cd ../ && \
# rm -rf nccl-rdma-sharp-plugins/

# Benchmark code
WORKDIR /workspace/llm

COPY . .
ENV PYTHONPATH "/workspace/llm:/workspace/NeMo:${PYTHONPATH}"
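
Every component above is pinned through a build ARG (APEX_REVISION, TE_REVISION, NEMO_REVISION, MEGATRON_REVISION), and each can be set to SKIP to keep whatever the base image already ships. A minimal sketch of building the image by hand with those args, assuming plain docker is used rather than the MLPerf launch scripts (the tag name and the NEMO_REVISION override are illustrative only):

# Hypothetical standalone build; run from large_language_model/nemo-v40-NVIDIA.
# The MLPerf harness normally drives this step; the tag is an example only.
docker build \
  --build-arg FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3 \
  --build-arg TE_REVISION=3b4d9e8766b829d50ac78bb26f770fb8d9825ae7 \
  --build-arg NEMO_REVISION=v2.0.0rc0 \
  -t local/mlperf-gpt3:dev .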

2 changes: 1 addition & 1 deletion large_language_model/nemo-v40-NVIDIA/conf/custom.yaml
@@ -7,7 +7,7 @@ proxy_gbs: ${oc.decode:${oc.env:PROXY_GBS,${model.global_batch_size}}}
is_proxy_run: ${neq:${proxy_gbs},${model.global_batch_size}}

trainer:
devices: ${oc.decode:${oc.env:DGXNGPU,8}}
devices: ${oc.decode:${oc.env:DGXNGPU,4}}
num_nodes: ${oc.decode:${oc.env:DGXNNODES,1}}
precision: bf16
max_steps: ${oc.decode:${oc.env:MAX_STEPS,${if:${is_proxy_run},${oc.decode:${oc.env:LIMIT_TRAIN_BATCHES,500}},${ceil_div:20000000,${proxy_gbs}}}}}
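
The trainer settings above are resolved from the launch environment: ${oc.env:DGXNGPU,4} reads the DGXNGPU variable with a default of 4, and ${oc.decode:...} turns the resulting string into a number. A short sketch of driving these values from the shell instead of editing the YAML (the node and step counts are example values, not part of this commit):

# Example launch environment consumed by the oc.env resolvers in conf/custom.yaml.
export DGXNGPU=4        # trainer.devices (GPUs per node)
export DGXNNODES=16     # trainer.num_nodes
export MAX_STEPS=500    # optional cap on trainer.max_steps
export PROXY_GBS=2048   # optional proxy global batch size for short runs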
20 changes: 10 additions & 10 deletions large_language_model/nemo-v40-NVIDIA/config_common.sh
@@ -1,6 +1,6 @@
## System run parms

export CUDA_VISIBLE_DEVICES=0,4,2,6,1,5,3,7
export CUDA_VISIBLE_DEVICES=0,1,2,3

export TRAIN_ONLY=0

@@ -15,7 +15,7 @@ export NCCL_MIN_NCHANNELS=4

export CUDA_DEVICE_MAX_CONNECTIONS=1

: "${CHECKPOINT_NAME:=""}"
: "${CHECKPOINT_NAME:="ckpt4000-consumed_samples=0"}"
export LOAD_CHECKPOINT="/load_checkpoints/"$CHECKPOINT_NAME

export MICRO_BATCH_SIZE=2
@@ -34,14 +34,14 @@ export NCCL_CUMEM_ENABLE=0
# This is needed to save memory. nvbug 4264087 tracks fix.
export NCCL_NVLS_ENABLE=0

# TP overlap: use FP8/MC strided atomic RS and pipelined AG
export NVTE_UB_SPLIT_RS=0
export NVTE_UB_ATOMIC_GEMM_RS=1
export NVTE_RS_STRIDED_ATOMIC=1
#export NVTE_UB_FP8_RS=1
unset UB_SKIPMC
export MC_TP_OVERLAP_AG=True
export MC_TP_OVERLAP_RS=True
# # TP overlap: use FP8/MC strided atomic RS and pipelined AG
# export NVTE_UB_SPLIT_RS=0
# export NVTE_UB_ATOMIC_GEMM_RS=1
# export NVTE_RS_STRIDED_ATOMIC=1
# #export NVTE_UB_FP8_RS=1
# unset UB_SKIPMC
# export MC_TP_OVERLAP_AG=True
# export MC_TP_OVERLAP_RS=True

# FA: Disable FAv2 from cuDNN and optimizations that consume memory (expected < 200MB) as they cause IMAs
#export NVTE_FUSED_ATTN=0 # Disable cuDNN fused attention
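
The CHECKPOINT_NAME line earlier in this file relies on the shell's default-assignment expansion: the no-op ":" command forces the expansion to run, and ${VAR:="default"} assigns the default only when VAR is unset or empty, so a value exported before sourcing the script still wins. A small sketch of the behaviour using the same variables:

# Default-assignment idiom used by config_common.sh for the checkpoint name.
: "${CHECKPOINT_NAME:="ckpt4000-consumed_samples=0"}"
export LOAD_CHECKPOINT="/load_checkpoints/"$CHECKPOINT_NAME
echo "$LOAD_CHECKPOINT"
# unset beforehand                      -> /load_checkpoints/ckpt4000-consumed_samples=0
# export CHECKPOINT_NAME=my_ckpt first  -> /load_checkpoints/my_ckpt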
17 changes: 17 additions & 0 deletions large_language_model/nemo-v40-NVIDIA/nemo.toml
@@ -0,0 +1,17 @@
image = "/iopsstor/scratch/cscs/dealmeih/images/mlperf-training-v4.0-nvidia-gpt3-24.04.pl224.sqsh"
# image = "/iopsstor/scratch/cscs/dealmeih/images/mlperf-training-v4.0-nvidia-gpt3-24.06.sqsh"

mounts = ["/capstor", "/iopsstor", "/users"]

writable = true

[annotations]
com.hooks.aws_ofi_nccl.enabled = "true"
com.hooks.aws_ofi_nccl.variant = "cuda12"
com.hooks.dcgm.enabled = "true"

[env]
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"
NCCL_DEBUG = "WARN"
# NCCL_DEBUG_SUBSYS = "INIT,TUNING"
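
nemo.toml is an environment definition file (EDF) for the container runtime on this system: it points at the squashfs image, declares the bind mounts, enables the AWS OFI NCCL and DCGM hooks, and sets libfabric/NCCL variables for the Slingshot network. A sketch of how such an EDF is typically consumed from Slurm, assuming the CSCS Container Engine's --environment flag and that the file is visible to it by name (the smoke-test command is illustrative, not part of this commit):

# Hypothetical smoke test of the container environment defined above.
srun --environment=nemo -N 1 --ntasks-per-node=1 \
     bash -c 'nvidia-smi && python -c "import nemo; print(nemo.__version__)"'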
(The two remaining changed files are not shown here.)