Skip to content

Commit

Permalink
add hsa CI
Browse files Browse the repository at this point in the history
  • Loading branch information
makslevental committed Sep 3, 2024
1 parent 9041a72 commit e29775b
Show file tree
Hide file tree
Showing 3 changed files with 302 additions and 10 deletions.
299 changes: 299 additions & 0 deletions .github/workflows/ci-hsa-linux.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
# Workflow header: display name, trigger events, and run de-duplication.
name: CI Linux HSA

# Events that start this workflow. The events with no value use their
# default configuration; only `push` is filtered (to the main branch).
on:
  workflow_call:
  workflow_dispatch:
  pull_request:
  merge_group:
  push:
    branches: [main]

concurrency:
  # Runs are grouped by `github.event.number` when it is set and by the
  # commit SHA otherwise; a newer run in the same group cancels the
  # in-progress one.
  group: "ci-build-test-cpp-linux-hsa-${{ github.event.number || github.sha }}"
  cancel-in-progress: true

jobs:
# Builds the ROCR runtime from the third_party/ROCR-Runtime submodule and
# publishes the install tree as a tarball artifact for downstream jobs.
build_hsa:
  name: Build HSA (linux)
  runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
  steps:
    - name: Configure local git mirrors
      run: |
        /gitmirror/scripts/trigger_update_mirrors.sh
        /gitmirror/scripts/git_config.sh

    # Manual shallow checkout (instead of actions/checkout). Submodules are
    # initialised recursively except for the three whose update mode is
    # forced to `none` below.
    - name: "Checking out repository"
      env:
        BRANCH_NAME: ${{ github.ref }}
        REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }}
      run: |
        git init
        git remote add origin "$REPO_ADDRESS"
        git -c protocol.version=2 fetch --depth 1 origin "$BRANCH_NAME"
        git reset --hard FETCH_HEAD
        git \
          -c submodule."third_party/torch-mlir".update=none \
          -c submodule."third_party/stablehlo".update=none \
          -c submodule."src/runtime_src/core/common/aiebu".update=none \
          submodule update --init --recursive --depth 1 --single-branch -j 10

    - name: Install deps
      run: |
        dnf install -y almalinux-release-devel
        yum install -y elfutils-libelf-devel p7zip p7zip-plugins \
          sudo ncurses-compat-libs openssh vim-common

    # libnuma is built from the upstream release tarball and installed with
    # `make install`.
    - name: Build and install libnuma
      working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
      run: |
        # --fail makes an HTTP error abort this step here instead of saving
        # an error page that `tar` would then reject with a confusing message.
        curl --fail --silent --show-error -L \
          https://github.com/numactl/numactl/releases/download/v2.0.18/numactl-2.0.18.tar.gz \
          -o numactl-2.0.18.tar.gz
        tar -xf numactl-2.0.18.tar.gz
        pushd numactl-2.0.18
        ./configure
        # i have no idea why this is necessary
        # but without it you get something about "can't cd into dir"
        # (the edit targets line 7563 of the generated libtool script, so it
        # is tied to this exact numactl release)
        sed -i '7563s/`cd "$dir" && pwd`/$dir/g' libtool
        make install
        popd

    # Appends `-nogpulib` after every `amdgcn-amd-amdhsa` occurrence in the
    # CMake files listed below.
    - name: Hack ROCR
      working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
      run: |
        for cmake_lists in \
          runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt \
          runtime/hsa-runtime/core/runtime/trap_handler/CMakeLists.txt \
          runtime/hsa-runtime/image/blit_src/CMakeLists.txt
        do
          sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' "$cmake_lists"
        done

    # Downloads a pinned `mlir` wheel and unpacks it in place; the build step
    # below reads its CMake packages from ./mlir/lib/cmake.
    - name: Get compatible Clang
      working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
      run: |
        pip download mlir==20.0.0.2024090301+amdgpu.b6597f52 -f https://makslevental.github.io/wheels
        unzip -q mlir-*.whl

    - name: Build ROCR distro
      working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
      run: |
        rocr_dir="$PWD"
        build_rocr_dir="$PWD/rocr-build"
        mkdir -p "$build_rocr_dir"
        build_rocr_dir="$(cd "$build_rocr_dir" && pwd)"
        rocr_install_dir="$PWD/rocr-install"
        # NOTE(review): LLVM_DIR points at the `mlir` CMake directory rather
        # than an `llvm` one - confirm this is intentional.
        cmake -GNinja \
          -DCMAKE_BUILD_TYPE=Release \
          -DCMAKE_INSTALL_PREFIX="$rocr_install_dir" \
          -DClang_DIR="$PWD/mlir/lib/cmake/clang" \
          -DLLVM_DIR="$PWD/mlir/lib/cmake/mlir" \
          -DIMAGE_SUPPORT=OFF \
          -S "$rocr_dir" -B "$build_rocr_dir"
        cmake --build "$build_rocr_dir" --target install
        # The tarball name carries the first 8 characters of the commit SHA.
        tar -cf "rocr-${GITHUB_SHA::8}.tar" rocr-install

    # Runs even when an earlier step failed (but not when the run was
    # cancelled); a missing tarball is reported as an error.
    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      if: ${{ !cancelled() }}
      with:
        name: linux_hsa_x86_64_release_packages
        path: ${{ github.workspace }}/third_party/ROCR-Runtime/rocr-*.tar
        if-no-files-found: error

# Builds and tests the project against the ROCR install tree produced by
# `build_hsa`, then publishes the resulting install directory as an artifact.
build_and_ctest:
  name: Build and Test with HSA (linux, ASSERTIONS)
  needs: [build_hsa]
  runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
  strategy:
    fail-fast: true
  env:
    CACHE_DIR: ${{ github.workspace }}/.container-cache
    # The key ends in `<ref name>-<run number>`, so it differs on every run;
    # the restore step below therefore depends on its `restore-keys` prefix.
    CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}
  steps:
    - name: Set unified TZ
      # NOTE(review): this reference was recovered from a page where e-mail
      # obfuscation had replaced it with "szenius/[email protected]". The action
      # name follows from the timezone* inputs below; the version tag is a
      # best guess - confirm against the repository before merging.
      uses: szenius/set-timezone@v2.0
      with:
        # this is an arbitrary choice
        timezoneLinux: "Asia/Singapore"
        timezoneMacos: "Asia/Singapore"
        timezoneWindows: "Singapore Standard Time"

    - name: Configure local git mirrors
      run: |
        /gitmirror/scripts/trigger_update_mirrors.sh
        /gitmirror/scripts/git_config.sh

    # Manual shallow checkout (instead of actions/checkout). Submodules are
    # initialised recursively except for the three whose update mode is
    # forced to `none` below.
    - name: "Checking out repository"
      env:
        BRANCH_NAME: ${{ github.ref }}
        REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }}
      run: |
        git init
        git remote add origin "$REPO_ADDRESS"
        git -c protocol.version=2 fetch --depth 1 origin "$BRANCH_NAME"
        git reset --hard FETCH_HEAD
        git \
          -c submodule."third_party/torch-mlir".update=none \
          -c submodule."third_party/stablehlo".update=none \
          -c submodule."src/runtime_src/core/common/aiebu".update=none \
          submodule update --init --recursive --depth 1 --single-branch -j 10

    - name: Install deps
      run: |
        dnf install -y almalinux-release-devel epel-release
        yum remove -y openssl-devel zlib-devel || true
        yum install -y protobuf-devel protobuf-compiler tmate

    - name: Python deps
      run: |
        pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind

    - name: Enable cache
      uses: actions/cache/restore@v3
      with:
        path: ${{ env.CACHE_DIR }}
        key: ${{ env.CACHE_KEY }}
        restore-keys: linux-build-test-cpp-

    # Fetches the ROCR tarball uploaded by the `build_hsa` job.
    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        name: linux_hsa_x86_64_release_packages

    # Unpacks the ROCR install tree and exports two variables to all later
    # steps: its location and the HAL driver selection.
    - name: Extract artifact
      run: |
        tar -xvf rocr-*.tar
        echo "hsa-runtime64_ROOT=$PWD/rocr-install" >> "$GITHUB_ENV"
        echo IREE_EXTERNAL_HAL_DRIVER=hsa >> "$GITHUB_ENV"

    - name: Build packages
      run: |
        export cache_dir="${{ env.CACHE_DIR }}"
        export CCACHE_COMPILERCHECK="string:$(clang --version)"
        bash build_tools/ci/build_test_cpp.sh

    - name: Create artifacts
      if: ${{ !cancelled() }}
      run: |
        tar cf iree-dist-linux.tar iree-install

    - name: Upload artifacts
      uses: actions/upload-artifact@v4
      if: ${{ !cancelled() }}
      with:
        name: linux_x86_64_release_packages
        path: iree-dist-linux.tar
        if-no-files-found: warn

    # The cache is only written by push events on the main branch.
    - name: Save cache
      uses: actions/cache/save@v3
      if: ${{ !cancelled() && github.event_name == 'push' && github.ref_name == 'main' }}
      with:
        path: ${{ env.CACHE_DIR }}
        key: ${{ env.CACHE_KEY }}

# Runs the end-to-end test scripts on the `linux-phoenix` runner against the
# install tree uploaded by `build_and_ctest`.
test_linux:
  name: E2E Test linux with HSA
  needs: build_and_ctest
  strategy:
    fail-fast: false
    matrix:
      runs-on: [linux-phoenix]
  runs-on: ${{ matrix.runs-on }}
  env:
    XILINXD_LICENSE_FILE: /opt/xilinx/Xilinx.lic
  steps:
    - name: "Checking out repository" # for test scripts
      uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
      with:
        submodules: false # not required for testbench

    - name: Download artifacts
      uses: actions/download-artifact@v4
      with:
        name: linux_x86_64_release_packages

    - name: Extract artifact
      run: |
        tar -xvf iree-dist-linux.tar
        bash build_tools/download_peano.sh

    - name: Create venv and install dependencies
      run: |
        python3 -m venv .venv
        source .venv/bin/activate
        pip install -r tests/matmul/requirements.txt

    - name: E2E correctness matmul test
      run: |
        source .venv/bin/activate
        # Without this additional line an error like
        #
        # [XRT] ERROR: Failed to allocate host memory buffer (mmap(len=10616832, prot=3, flags=8193, offset=4294967296)
        # failed (err=11): Resource temporarily unavailable), make sure host bank is enabled (see xbutil configure --host-mem)
        # iree-amd-aie/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc:179: RESOURCE_EXHAUSTED; could not allocate
        # memory for buffer; while invoking C++ function matmul_test.generate_random_matrix; while calling import;
        #
        # might be observed when too much memory is allocated. For example this
        # error was seen when running a bf16->f32 matmul with m=n=k=2304.
        #
        # This line was suggested at https://github.com/Xilinx/mlir-air/issues/566
        #
        # Note that this is only half of the fix. It is also necessary that
        # the machine that CI is running on has permission to run this line.
        #
        # This permission can be added by adding the line
        # ```
        # %github ALL=(ALL) NOPASSWD: /usr/bin/prlimit *
        # ```
        #
        # to the file /etc/sudoers.d/github, which can be done by running
        # ```
        # sudo visudo -f /etc/sudoers.d/github
        # ```
        # on the github CI machine.
        sudo prlimit -lunlimited --pid $$
        source /opt/xilinx/xrt/setup.sh
        bash build_tools/ci/run_matmul_test.sh \
          test_matmuls \
          iree-install \
          "$PWD/llvm-aie" \
          /opt/xilinx/xrt \
          /opt/Xilinx/Vitis/2024.2

    - name: Smoke E2E comparison flag test
      run: |
        source .venv/bin/activate
        source /opt/xilinx/xrt/setup.sh
        python3 build_tools/ci/cpu_comparison/run_test.py \
          test_aie_vs_cpu \
          iree-install \
          "$PWD/llvm-aie" \
          --xrt-dir /opt/xilinx/xrt \
          --test-set='Smoke' \
          --do-not-run-aie
        # Assert that output.log is empty (because verbose=0)
        if [ -s output.log ]; then
          echo "output.log is not empty:"
          cat output.log
          exit 1
        else
          echo "output.log is empty"
        fi

    - name: E2E comparison of AIE to llvm-cpu
      run: |
        source .venv/bin/activate
        source /opt/xilinx/xrt/setup.sh
        python3 build_tools/ci/cpu_comparison/run_test.py \
          test_aie_vs_cpu \
          "$PWD/iree-install" \
          "$PWD/llvm-aie" \
          --xrt-dir /opt/xilinx/xrt \
          --vitis-dir /opt/Xilinx/Vitis/2024.2 \
          --reset-npu-between-runs -v

    - name: Printing IR from aie2xclbin
      run: |
        source .venv/bin/activate
        source /opt/xilinx/xrt/setup.sh
        bash build_tools/ci/print_ir_aie2xclbin/print_ir_aie2xclbin.sh \
          iree-install \
          print_ir_aie2xclbin_results \
          "$PWD/llvm-aie"
8 changes: 1 addition & 7 deletions .github/workflows/ci-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
run: |
dnf install -y almalinux-release-devel epel-release
yum remove -y openssl-devel zlib-devel || true
yum install -y protobuf-devel protobuf-compiler libnuma-devel tmate
yum install -y protobuf-devel protobuf-compiler tmate
- name: Python deps
run: |
Expand All @@ -68,12 +68,6 @@ jobs:
key: ${{ env.CACHE_KEY }}
restore-keys: linux-build-test-cpp-

- name: Build ROCT/ROCR
run: |
export cache_dir="${{ env.CACHE_DIR }}"
bash build_tools/ci/build_roct_rocr.sh
echo "hsa-runtime64_ROOT=$PWD/rocr-install" >> $GITHUB_ENV
- name: Build packages
run: |
export cache_dir="${{ env.CACHE_DIR }}"
Expand Down
5 changes: 2 additions & 3 deletions build_tools/ci/build_test_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,10 @@ if [[ "$OSTYPE" != "darwin"* ]]; then
-DCMAKE_CXX_COMPILER="${CXX}" \
-DLLVM_TARGET_ARCH=X86 \
-DLLVM_TARGETS_TO_BUILD=X86 \
-DIREE_EXTERNAL_HAL_DRIVERS=hsa \
-DIREE_EXTERNAL_HAL_DRIVERS=${IREE_EXTERNAL_HAL_DRIVER:-xrt} \
-S $iree_dir -B $build_dir
else
cmake $CMAKE_ARGS \
-S $iree_dir -B $build_dir
cmake $CMAKE_ARGS -S $iree_dir -B $build_dir
fi

echo "Building all"
Expand Down

0 comments on commit e29775b

Please sign in to comment.