diff --git a/Makefile b/Makefile index 741dfe8..191a3b8 100644 --- a/Makefile +++ b/Makefile @@ -78,7 +78,6 @@ build_torch: $(TORCH_ARCHIVE) .PHONY: clean_torch clean_torch: - rm -rf $(TORCH_BUILD_DIR) $(TORCH_ARCHIVE) $(TORCH_INSTALL_DIR) cd pytorch && git clean -fdx && git restore . cd pytorch/third_party/kineto && git restore . @@ -94,7 +93,8 @@ clean_tensorflow: rm -rf $(TF_INSTALL_DIR) cd tensorflow && \ bazel clean --expunge_async && \ - git restore . + git restore . && \ + git reset --hard .PHONY: clean_onnxruntime clean_onnxruntime: diff --git a/architectures/linux-rocm-6.1.2.mk b/architectures/linux-rocm-6.1.2.mk new file mode 100644 index 0000000..ba699f6 --- /dev/null +++ b/architectures/linux-rocm-6.1.2.mk @@ -0,0 +1,94 @@ + +# BSD 2-Clause License +# +# Copyright (c) 2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+OS = linux
+ARCHITECTURE = x64
+ROCM_VERSION = 6.1.2
+STACK=rocm-$(ROCM_VERSION)
+
+# pyTorch options
+PYTORCH_VERSION = 2.4.0
+TORCH_CMAKE_OPTIONS = -DBUILD_PYTHON=OFF
+TORCH_CMAKE_OPTIONS += -DUSE_ROCM=ON -DUSE_RCCL=ON -DROCM_SOURCE_DIR=${ROCM_PATH}
+TORCH_CMAKE_OPTIONS += -DUSE_NCCL=OFF -DUSE_CUDA=OFF -DUSE_STATIC_MKL=ON
+PYTORCH_PREBUILD_TARGETS = pytorch_rocm_checkout pytorch_rocm_prebuild
+
+
+# Tensorflow options
+TF_VERSION = 2.15
+TF_TAG = r$(TF_VERSION)-rocm-enhanced
+TF_REMOTE = https://github.com/ROCm/tensorflow-upstream.git
+TF_PREBUILD_TARGETS = tf_rocm_checkout tf_rocm_prebuild
+TF_BAZEL_OPTS = --config=opt --verbose_failures
+
+ONNXRT_VERSION = 1.17.3
+ONNXRT_OPTIONS = --use_rocm --rocm_home $(ROCM_PATH)
+ONNXRT_PREBUILD_TARGETS = onnxrt_checkout
+# No prebuild steps for ONNX
+
+# From PyTorch for ROCm instructions
+# https://github.com/pytorch/pytorch/blob/v2.3.1/README.md?plain=1#L241-L245
+# For ROCm 5.5.0 and later, also need to patch one of the ATen files
+pytorch_rocm_checkout:
+	cd pytorch && \
+	    git checkout v${PYTORCH_VERSION} && \
+	    git submodule update --init --recursive && \
+	    git reset --hard
+
+pytorch_rocm_prebuild:
+	cd pytorch && python tools/amd_build/build_amd.py
+	sed -i 's/attr.memoryType/attr.type/g' pytorch/aten/src/ATen/hip/detail/HIPHooks.cpp
+	sed -i 's,/opt/rocm,${ROCM_PATH},g' pytorch/third_party/kineto/libkineto/CMakeLists.txt
+	sed -i 's,\.,\\.,g' pytorch/cmake/public/LoadHIP.cmake
+
+# (1) 
Patch .bazelrc to avoid hard-coded paths to Clang
+# (2) Run the bazel configure script
+tf_rocm_prebuild:
+	cd tensorflow && \
+	    git restore .bazelrc
+	# git apply ../patches/tensorflow/bazelrc.rocm.patch
+	cd tensorflow && \
+	    USE_DEFAULT_PYTHON_LIB_PATH=1 \
+	    PYTHON_BIN_PATH=$$(which python) \
+	    TF_NEED_CLANG=0 \
+	    TF_NEED_ROCM=1 \
+	    TF_NEED_CUDA=0 \
+	    CC_OPT_FLAGS="-Wno-sign-compare -B/usr/bin" \
+	    TF_SET_ANDROID_WORKSPACE=0 \
+	    python configure.py
+
+tf_rocm_checkout:
+	cd tensorflow && \
+	    git fetch $(TF_REMOTE) $(TF_TAG) && \
+	    git checkout FETCH_HEAD
+
+onnxrt_checkout:
+	cd onnxruntime && \
+	    git checkout v$(ONNXRT_VERSION) && \
+	    git reset --hard && \
+	    git clean -xdf && \
+	    git submodule update --init --recursive
diff --git a/environments/onnxruntime/pinoak-rocm-6.1.2 b/environments/onnxruntime/pinoak-rocm-6.1.2
new file mode 100644
index 0000000..cd3d61a
--- /dev/null
+++ b/environments/onnxruntime/pinoak-rocm-6.1.2
@@ -0,0 +1,10 @@
+module purge
+
+export ROCM_VERSION=6.1.2
+
+module load PrgEnv-gnu rocm/$ROCM_VERSION libffi libsqlite3 cmake cray-python
+
+# Following come from hipconfig, not all variables set by ROCm module
+export ROCM_PATH=/global/opt/rocm-$ROCM_VERSION
+
+
diff --git a/environments/pytorch/pinoak-cuda-11.8.0 b/environments/pytorch/pinoak-cuda-11.8.0
index 7e353a5..5fa46c0 100644
--- a/environments/pytorch/pinoak-cuda-11.8.0
+++ b/environments/pytorch/pinoak-cuda-11.8.0
@@ -1,4 +1,4 @@
 module purge
 
-module load PrgEnv-gnu cudatoolkit/11.8.0 cudnn/8.9.7.29 gcc/11.2.0 ninja libffi libsqlite3 cray-python
-export CC=gcc CXX=g++ FC=gfortran
\ No newline at end of file
+module load PrgEnv-gnu cudatoolkit/11.8.0 cudnn/8.9.7.29 gcc/11.2.0 ninja cray-python
+export CC=gcc CXX=g++ FC=gfortran
diff --git a/environments/pytorch/pinoak-cuda-12.5.0 b/environments/pytorch/pinoak-cuda-12.5.0
index 05b0a09..b6de49f 100644
--- a/environments/pytorch/pinoak-cuda-12.5.0
+++ b/environments/pytorch/pinoak-cuda-12.5.0
@@ -1,5 +1,5 @@
 module purge
 
-module load 
PrgEnv-gnu cudatoolkit/12.5.0 cudnn/cuda-12/9.3.0.75 gcc/11.2.0 ninja libffi libsqlite3 cray-python +module load PrgEnv-gnu cudatoolkit/12.5.0 cudnn/cuda-12/9.3.0.75 gcc/11.2.0 ninja cray-python export TORCH_CUDA_ARCH_LIST="5.0 5.1 5.3 6.0 6.1 6.2 7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0" -export CC=gcc CXX=g++ FC=gfortran \ No newline at end of file +export CC=gcc CXX=g++ FC=gfortran diff --git a/environments/pytorch/pinoak-rocm-5.7.0 b/environments/pytorch/pinoak-rocm-5.7.0 index d094c72..0d20bb8 100644 --- a/environments/pytorch/pinoak-rocm-5.7.0 +++ b/environments/pytorch/pinoak-rocm-5.7.0 @@ -1,7 +1,7 @@ module purge export ROCM_VERSION=5.7.0 -module load PrgEnv-gnu rocm/5.7.0 ninja libffi libsqlite3 cray-python +module load PrgEnv-gnu rocm/$ROCM_VERSION ninja cray-python export CC=gcc CXX=g++ FC=gfortran # Following come from hipconfig, not all variables set by ROCm module diff --git a/environments/pytorch/pinoak-rocm-6.1.2 b/environments/pytorch/pinoak-rocm-6.1.2 new file mode 100644 index 0000000..b9eb3b5 --- /dev/null +++ b/environments/pytorch/pinoak-rocm-6.1.2 @@ -0,0 +1,14 @@ +module purge + +export ROCM_VERSION=6.1.2 +module load PrgEnv-gnu rocm/$ROCM_VERSION ninja cray-python +export CC=gcc CXX=g++ FC=gfortran + +# Following come from hipconfig, not all variables set by ROCm module +export ROCM_PATH=$(hipconfig --rocmpath) +export ROCM_SOURCE_PATH=$ROCM_PATH +export HIP_PATH=$(hipconfig --path) +export HIP_LIB_PATH=$ROCM_PATH/lib + +# Build only for the gfx90a ROCm architecture +export PYTORCH_ROCM_ARCH="gfx90a"