Commit: acceptance gpt3 code
henrique committed Jul 16, 2024
1 parent 1845acf commit 232c258
Showing 6 changed files with 178 additions and 146 deletions.
59 changes: 37 additions & 22 deletions large_language_model/nemo-v40-NVIDIA/Dockerfile
@@ -18,10 +18,23 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.


ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3
FROM ${FROM_IMAGE_NAME}

RUN apt-get update \
&& apt-get install -yqq --no-install-recommends openssh-server \
&& rm -rf /var/lib/apt/lists/*
RUN mkdir -p /run/sshd

RUN pip install -vvv \
git+https://github.com/mlcommons/[email protected] \
git+https://github.com/NVIDIA/mlperf-common.git@68cf1d0d5e3de3351e66abb696d0e2d011aabf47 \
zarr

# NeMo dependencies
RUN pip install -vvv tensorstore==0.1.45
RUN pip install -vvv causal-conv1d==1.2.0.post2 opencc-python-reimplemented==0.1.7

# Document build setup
ARG FROM_IMAGE_NAME
ENV CUSTOM_FROM_IMAGE_NAME ${FROM_IMAGE_NAME}
@@ -43,15 +56,15 @@ RUN if [ "${APEX_REVISION}" != SKIP ]; then \
MAX_JOBS=${APEX_MAX_JOBS} NVCC_APPEND_FLAGS="--threads 8" pip install -v --no-build-isolation --no-cache-dir --disable-pip-version-check --config-settings "--build-option=--cpp_ext --cuda_ext --bnp --xentropy --deprecated_fused_adam --deprecated_fused_lamb --fast_multihead_attn --distributed_lamb --fast_layer_norm --transducer --distributed_adam --fmha --fast_bottleneck --nccl_p2p --peer_memory --permutation_search --focal_loss --fused_conv_bias_relu --index_mul_2d --cudnn_gbn --group_norm" . \
; fi

## 2. Transformer Engine
ARG TE_REVISION=v1.6rc1
## 2. Transformer Engine +fix for arm64
ARG TE_REVISION=3b4d9e8766b829d50ac78bb26f770fb8d9825ae7 # v1.6rc1
ENV CUSTOM_TE_REVISION ${TE_REVISION}

RUN if [ "${TE_REVISION}" != SKIP ]; then \
NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install --force-reinstall --no-deps git+https://github.com/NVIDIA/TransformerEngine.git@${TE_REVISION} \
; fi

## 3. NeMo
## 3. NeMo v2.0.0.rc0.beta doesn't install on aarch64, but v2.0.0rc0 does
ARG NEMO_REVISION=v2.0.0.rc0.beta
ENV CUSTOM_NEMO_REVISION ${NEMO_REVISION}
ARG NEMO_BASE_VERSION=r2.0.0
@@ -69,13 +82,16 @@ RUN if [ "${NEMO_REVISION}" == SKIP ]; then \
git clone https://github.com/NVIDIA/NeMo.git && \
cd NeMo && \
git config user.email "[email protected]" && \
git config user.name "name name" && \
git config user.name "name name" && \
git checkout v2.0.0rc0 && \
echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
pip uninstall -y nemo-toolkit && \
pip install pkgconfig py-cpuinfo "cython<3.0.0" && \
pip install --no-build-isolation -e ".[nlp]" && \
echo NEMO_REVISION=${NEMO_REVISION} && \
git checkout ${NEMO_REVISION} && \
echo NEMO_COMMIT_HASH=$(git rev-parse HEAD) && \
pip uninstall -y nemo-toolkit && \
pip install "cython<3.0.0" && \
pip install --no-build-isolation -e ".[nlp]" \
pip install --no-build-isolation --no-deps -e ".[nlp]" \
; fi

### Make (has to be called after all changes to repo)
@@ -103,26 +119,25 @@ RUN if [ "${MEGATRON_REVISION}" != SKIP ]; then \
ENV PYTHONPATH "${PYTHONPATH}:/workspace/Megatron-LM"

# Pin PL version
# RUN pip install --force-reinstall --no-deps pytorch-lightning==2.0.7
RUN pip install --force-reinstall --no-deps pytorch-lightning==2.2.4

## 5. Benchmark dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# # 5. Benchmark dependencies
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt

## 6. Use nccl-rdma-sharp-plugins from master to pick a fix after HPCX2.18 release
RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
cd nccl-rdma-sharp-plugins/ && \
./autogen.sh && \
./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
make -j install && \
cd ../ && \
rm -rf nccl-rdma-sharp-plugins/
#RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
# ln -s /usr/lib/x86_64-linux-gnu/libibverbs.so.1 /usr/lib/x86_64-linux-gnu/libibverbs.so && \
# git clone https://github.com/Mellanox/nccl-rdma-sharp-plugins && \
# cd nccl-rdma-sharp-plugins/ && \
# ./autogen.sh && \
# ./configure --prefix=/opt/hpcx/nccl_rdma_sharp_plugin --with-cuda=/usr/local/cuda --with-sharp=/opt/hpcx/sharp/ && \
# make -j install && \
# cd ../ && \
# rm -rf nccl-rdma-sharp-plugins/

# Benchmark code
WORKDIR /workspace/llm

COPY . .
ENV PYTHONPATH "/workspace/llm:/workspace/NeMo:${PYTHONPATH}"
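
Every component above is pinned through a build ARG (APEX_REVISION, TE_REVISION, NEMO_REVISION, MEGATRON_REVISION), and each can be set to SKIP to keep whatever the base image already ships. A minimal sketch of building the image by hand with those args, assuming plain docker is used rather than the MLPerf launch scripts (the tag name and the NEMO_REVISION override are illustrative only):

# Hypothetical standalone build; run from large_language_model/nemo-v40-NVIDIA.
# The MLPerf harness normally drives this step; the tag is an example only.
docker build \
  --build-arg FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3 \
  --build-arg TE_REVISION=3b4d9e8766b829d50ac78bb26f770fb8d9825ae7 \
  --build-arg NEMO_REVISION=v2.0.0rc0 \
  -t local/mlperf-gpt3:dev .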

2 changes: 1 addition & 1 deletion large_language_model/nemo-v40-NVIDIA/conf/custom.yaml
@@ -7,7 +7,7 @@ proxy_gbs: ${oc.decode:${oc.env:PROXY_GBS,${model.global_batch_size}}}
is_proxy_run: ${neq:${proxy_gbs},${model.global_batch_size}}

trainer:
devices: ${oc.decode:${oc.env:DGXNGPU,8}}
devices: ${oc.decode:${oc.env:DGXNGPU,4}}
num_nodes: ${oc.decode:${oc.env:DGXNNODES,1}}
precision: bf16
max_steps: ${oc.decode:${oc.env:MAX_STEPS,${if:${is_proxy_run},${oc.decode:${oc.env:LIMIT_TRAIN_BATCHES,500}},${ceil_div:20000000,${proxy_gbs}}}}}
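
The trainer settings above are resolved from the launch environment: ${oc.env:DGXNGPU,4} reads the DGXNGPU variable with a default of 4, and ${oc.decode:...} turns the resulting string into a number. A short sketch of driving these values from the shell instead of editing the YAML (the node and step counts are example values, not part of this commit):

# Example launch environment consumed by the oc.env resolvers in conf/custom.yaml.
export DGXNGPU=4        # trainer.devices (GPUs per node)
export DGXNNODES=16     # trainer.num_nodes
export MAX_STEPS=500    # optional cap on trainer.max_steps
export PROXY_GBS=2048   # optional proxy global batch size for short runs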
20 changes: 10 additions & 10 deletions large_language_model/nemo-v40-NVIDIA/config_common.sh
@@ -1,6 +1,6 @@
## System run parms

export CUDA_VISIBLE_DEVICES=0,4,2,6,1,5,3,7
export CUDA_VISIBLE_DEVICES=0,1,2,3

export TRAIN_ONLY=0

@@ -15,7 +15,7 @@ export NCCL_MIN_NCHANNELS=4

export CUDA_DEVICE_MAX_CONNECTIONS=1

: "${CHECKPOINT_NAME:=""}"
: "${CHECKPOINT_NAME:="ckpt4000-consumed_samples=0"}"
export LOAD_CHECKPOINT="/load_checkpoints/"$CHECKPOINT_NAME

export MICRO_BATCH_SIZE=2
@@ -34,14 +34,14 @@ export NCCL_CUMEM_ENABLE=0
# This is needed to save memory. nvbug 4264087 tracks fix.
export NCCL_NVLS_ENABLE=0

# TP overlap: use FP8/MC strided atomic RS and pipelined AG
export NVTE_UB_SPLIT_RS=0
export NVTE_UB_ATOMIC_GEMM_RS=1
export NVTE_RS_STRIDED_ATOMIC=1
#export NVTE_UB_FP8_RS=1
unset UB_SKIPMC
export MC_TP_OVERLAP_AG=True
export MC_TP_OVERLAP_RS=True
# # TP overlap: use FP8/MC strided atomic RS and pipelined AG
# export NVTE_UB_SPLIT_RS=0
# export NVTE_UB_ATOMIC_GEMM_RS=1
# export NVTE_RS_STRIDED_ATOMIC=1
# #export NVTE_UB_FP8_RS=1
# unset UB_SKIPMC
# export MC_TP_OVERLAP_AG=True
# export MC_TP_OVERLAP_RS=True

# FA: Disable FAv2 from cuDNN and optimizations that consume memory (expected < 200MB) as they cause IMAs
#export NVTE_FUSED_ATTN=0 # Disable cuDNN fused attention
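
The CHECKPOINT_NAME line earlier in this file relies on the shell's default-assignment expansion: the no-op ":" command forces the expansion to run, and ${VAR:="default"} assigns the default only when VAR is unset or empty, so a value exported before sourcing the script still wins. A small sketch of the behaviour using the same variables:

# Default-assignment idiom used by config_common.sh for the checkpoint name.
: "${CHECKPOINT_NAME:="ckpt4000-consumed_samples=0"}"
export LOAD_CHECKPOINT="/load_checkpoints/"$CHECKPOINT_NAME
echo "$LOAD_CHECKPOINT"
# unset beforehand                      -> /load_checkpoints/ckpt4000-consumed_samples=0
# export CHECKPOINT_NAME=my_ckpt first  -> /load_checkpoints/my_ckpt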
17 changes: 17 additions & 0 deletions large_language_model/nemo-v40-NVIDIA/nemo.toml
@@ -0,0 +1,17 @@
image = "/iopsstor/scratch/cscs/dealmeih/images/mlperf-training-v4.0-nvidia-gpt3-24.04.pl224.sqsh"
# image = "/iopsstor/scratch/cscs/dealmeih/images/mlperf-training-v4.0-nvidia-gpt3-24.06.sqsh"

mounts = ["/capstor", "/iopsstor", "/users"]

writable = true

[annotations]
com.hooks.aws_ofi_nccl.enabled = "true"
com.hooks.aws_ofi_nccl.variant = "cuda12"
com.hooks.dcgm.enabled = "true"

[env]
FI_CXI_DISABLE_HOST_REGISTER = "1"
FI_MR_CACHE_MONITOR = "userfaultfd"
NCCL_DEBUG = "WARN"
# NCCL_DEBUG_SUBSYS = "INIT,TUNING"
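
nemo.toml is an environment definition file (EDF) for the container runtime on this system: it points at the squashfs image, declares the bind mounts, enables the AWS OFI NCCL and DCGM hooks, and sets libfabric/NCCL variables for the Slingshot network. A sketch of how such an EDF is typically consumed from Slurm, assuming the CSCS Container Engine's --environment flag and that the file is visible to it by name (the smoke-test command is illustrative, not part of this commit):

# Hypothetical smoke test of the container environment defined above.
srun --environment=nemo -N 1 --ntasks-per-node=1 \
     bash -c 'nvidia-smi && python -c "import nemo; print(nemo.__version__)"'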
(The two remaining changed files are not shown here.)