From e29775b598a965eabfbde919460dec80799dc05b Mon Sep 17 00:00:00 2001
From: Maksim Levental
Date: Tue, 3 Sep 2024 12:30:35 -0500
Subject: [PATCH] add hsa CI

---
 .github/workflows/ci-hsa-linux.yml | 304 +++++++++++++++++++++++++++++
 .github/workflows/ci-linux.yml     |   8 +-
 build_tools/ci/build_test_cpp.sh   |   5 +-
 3 files changed, 307 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/ci-hsa-linux.yml

diff --git a/.github/workflows/ci-hsa-linux.yml b/.github/workflows/ci-hsa-linux.yml
new file mode 100644
index 000000000..94de75315
--- /dev/null
+++ b/.github/workflows/ci-hsa-linux.yml
@@ -0,0 +1,304 @@
+# Builds the ROCR (HSA) runtime, then builds and tests IREE against it with
+# the HSA HAL driver and runs the E2E tests on a linux-phoenix runner.
+name: CI Linux HSA
+
+on:
+  workflow_call:
+  workflow_dispatch:
+  pull_request:
+  merge_group:
+  push:
+    branches:
+      - main
+
+concurrency:
+  group: ci-build-test-cpp-linux-hsa-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build_hsa:
+    name: Build HSA (linux)
+    runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
+    steps:
+      - name: Configure local git mirrors
+        run: |
+          /gitmirror/scripts/trigger_update_mirrors.sh
+          /gitmirror/scripts/git_config.sh
+
+      - name: "Checking out repository"
+        env:
+          BRANCH_NAME: ${{ github.ref }}
+          REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }}
+        run: |
+          git init
+          git remote add origin "$REPO_ADDRESS"
+          git -c protocol.version=2 fetch --depth 1 origin "$BRANCH_NAME"
+          git reset --hard FETCH_HEAD
+          git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10
+
+      - name: Install deps
+        run: |
+          dnf install -y almalinux-release-devel
+          yum install -y elfutils-libelf-devel p7zip p7zip-plugins \
+            sudo ncurses-compat-libs openssh vim-common
+
+      - name: Build and install libnuma
+        working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
+        run: |
+          curl --silent -L \
+            https://github.com/numactl/numactl/releases/download/v2.0.18/numactl-2.0.18.tar.gz \
+            -o numactl-2.0.18.tar.gz
+          tar -xf numactl-2.0.18.tar.gz
+          pushd numactl-2.0.18
+          ./configure
+          # i have no idea why this is necessary
+          # but without it you get something about "can't cd into dir"
+          sed -i '7563s/`cd "$dir" && pwd`/$dir/g' libtool
+          make install
+          popd
+
+      # Append -nogpulib after every amdgcn-amd-amdhsa occurrence in three ROCR CMakeLists.txt files.
+      - name: Hack ROCR
+        working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
+        run: |
+          sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt
+          sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/core/runtime/trap_handler/CMakeLists.txt
+          sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/image/blit_src/CMakeLists.txt
+
+      - name: Get compatible Clang
+        working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
+        run: |
+          pip download mlir==20.0.0.2024090301+amdgpu.b6597f52 -f https://makslevental.github.io/wheels
+          unzip -q mlir-*.whl
+
+      - name: Build ROCR distro
+        working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
+        run: |
+          rocr_dir="$PWD"
+          build_rocr_dir="$PWD/rocr-build"
+          mkdir -p "$build_rocr_dir"
+          build_rocr_dir="$(cd "$build_rocr_dir" && pwd)"
+          rocr_install_dir="$PWD/rocr-install"
+
+          cmake -GNinja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_INSTALL_PREFIX="$rocr_install_dir" \
+            -DClang_DIR="$PWD/mlir/lib/cmake/clang" \
+            -DLLVM_DIR="$PWD/mlir/lib/cmake/mlir" \
+            -DIMAGE_SUPPORT=OFF \
+            -S "$rocr_dir" -B "$build_rocr_dir"
+
+          cmake --build "$build_rocr_dir" --target install
+          tar -cf rocr-${GITHUB_SHA::8}.tar rocr-install
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: linux_hsa_x86_64_release_packages
+          path: ${{ github.workspace }}/third_party/ROCR-Runtime/rocr-*.tar
+          if-no-files-found: error
+
+  build_and_ctest:
+    name: Build and Test with HSA (linux, ASSERTIONS)
+    needs: [build_hsa]
+    runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
+    strategy:
+      fail-fast: true
+    env:
+      CACHE_DIR: ${{ github.workspace }}/.container-cache
+      # either the PR number or `branch-N` where N always increments
+      CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}
+    steps:
+      - name: Set unified TZ
+        uses: szenius/set-timezone@v2.0
+        with:
+          # this is an arbitrary choice
+          timezoneLinux: "Asia/Singapore"
+          timezoneMacos: "Asia/Singapore"
+          timezoneWindows: "Singapore Standard Time"
+
+      - name: Configure local git mirrors
+        run: |
+          /gitmirror/scripts/trigger_update_mirrors.sh
+          /gitmirror/scripts/git_config.sh
+
+      - name: "Checking out repository"
+        env:
+          BRANCH_NAME: ${{ github.ref }}
+          REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }}
+        run: |
+          git init
+          git remote add origin "$REPO_ADDRESS"
+          git -c protocol.version=2 fetch --depth 1 origin "$BRANCH_NAME"
+          git reset --hard FETCH_HEAD
+          git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10
+
+      - name: Install deps
+        run: |
+          dnf install -y almalinux-release-devel epel-release
+          yum remove -y openssl-devel zlib-devel || true
+          yum install -y protobuf-devel protobuf-compiler tmate
+
+      - name: Python deps
+        run: |
+          pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind
+
+      - name: Enable cache
+        uses: actions/cache/restore@v3
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: ${{ env.CACHE_KEY }}
+          restore-keys: linux-build-test-cpp-
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: linux_hsa_x86_64_release_packages
+
+      # Expose the ROCR install dir to later steps and select the hsa driver;
+      # build_tools/ci/build_test_cpp.sh falls back to xrt when it is unset.
+      - name: Extract artifact
+        run: |
+          tar -xvf rocr-*.tar
+          echo "hsa-runtime64_ROOT=$PWD/rocr-install" >> "$GITHUB_ENV"
+          echo IREE_EXTERNAL_HAL_DRIVER=hsa >> "$GITHUB_ENV"
+
+      - name: Build packages
+        run: |
+          export cache_dir="${{ env.CACHE_DIR }}"
+          export CCACHE_COMPILERCHECK="string:$(clang --version)"
+          bash build_tools/ci/build_test_cpp.sh
+
+      - name: Create artifacts
+        if: ${{ !cancelled() }}
+        run: |
+          tar cf iree-dist-linux.tar iree-install
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        if: ${{ !cancelled() }}
+        with:
+          name: linux_x86_64_release_packages
+          path: iree-dist-linux.tar
+          if-no-files-found: warn
+
+      - name: Save cache
+        uses: actions/cache/save@v3
+        if: ${{ !cancelled() && github.event_name == 'push' && github.ref_name == 'main' }}
+        with:
+          path: ${{ env.CACHE_DIR }}
+          key: ${{ env.CACHE_KEY }}
+
+  test_linux:
+    name: E2E Test linux with HSA
+    needs: build_and_ctest
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [linux-phoenix]
+    runs-on: ${{ matrix.runs-on }}
+    env:
+      XILINXD_LICENSE_FILE: /opt/xilinx/Xilinx.lic
+    steps:
+      - name: "Checking out repository"  # for test scripts
+        uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3  # v3.5.0
+        with:
+          submodules: false  # not required for testbench
+
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: linux_x86_64_release_packages
+
+      - name: Extract artifact
+        run: |
+          tar -xvf iree-dist-linux.tar
+          bash build_tools/download_peano.sh
+
+      - name: Create venv and install dependencies
+        run: |
+          python3 -m venv .venv
+          source .venv/bin/activate
+          pip install -r tests/matmul/requirements.txt
+
+      - name: E2E correctness matmul test
+        run: |
+          source .venv/bin/activate
+          # Without this additional line an error like
+          #
+          # [XRT] ERROR: Failed to allocate host memory buffer (mmap(len=10616832, prot=3, flags=8193, offset=4294967296)
+          # failed (err=11): Resource temporarily unavailable), make sure host bank is enabled (see xbutil configure --host-mem)
+          # iree-amd-aie/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc:179: RESOURCE_EXHAUSTED; could not allocate
+          # memory for buffer; while invoking C++ function matmul_test.generate_random_matrix; while calling import;
+          #
+          # might be observed when too much memory is allocated. For example this
+          # error was seen when running a bf16->f32 matmul with m=n=k=2304.
+          #
+          # This line was suggested at https://github.com/Xilinx/mlir-air/issues/566
+          #
+          # Note that this is only half of the fix. It is also necessary that
+          # the machine that CI is running on has permission to run this line.
+          #
+          # This permission can be added by adding the line
+          # ```
+          # %github ALL=(ALL) NOPASSWD: /usr/bin/prlimit *
+          # ```
+          #
+          # to the file /etc/sudoers.d/github, which can be done by running
+          # ```
+          # sudo visudo -f /etc/sudoers.d/github
+          # ```
+          # on the GitHub CI machine.
+          sudo prlimit -lunlimited --pid $$
+
+          source /opt/xilinx/xrt/setup.sh
+          bash build_tools/ci/run_matmul_test.sh \
+            test_matmuls \
+            iree-install \
+            "$PWD/llvm-aie" \
+            /opt/xilinx/xrt \
+            /opt/Xilinx/Vitis/2024.2
+
+
+      - name: Smoke E2E comparison flag test
+        run: |
+          source .venv/bin/activate
+          source /opt/xilinx/xrt/setup.sh
+          python3 build_tools/ci/cpu_comparison/run_test.py \
+            test_aie_vs_cpu \
+            iree-install \
+            "$PWD/llvm-aie" \
+            --xrt-dir /opt/xilinx/xrt \
+            --test-set='Smoke' \
+            --do-not-run-aie
+
+          # Assert that output.log is empty (because verbose=0)
+          if [ -s output.log ]; then
+            echo "output.log is not empty:"
+            cat output.log
+            exit 1
+          else
+            echo "output.log is empty"
+          fi
+
+      - name: E2E comparison of AIE to llvm-cpu
+        run: |
+          source .venv/bin/activate
+          source /opt/xilinx/xrt/setup.sh
+          python3 build_tools/ci/cpu_comparison/run_test.py \
+            test_aie_vs_cpu \
+            "$PWD/iree-install" \
+            "$PWD/llvm-aie" \
+            --xrt-dir /opt/xilinx/xrt \
+            --vitis-dir /opt/Xilinx/Vitis/2024.2 \
+            --reset-npu-between-runs -v
+
+      - name: Printing IR from aie2xclbin
+        run: |
+          source .venv/bin/activate
+          source /opt/xilinx/xrt/setup.sh
+          bash build_tools/ci/print_ir_aie2xclbin/print_ir_aie2xclbin.sh \
+            iree-install \
+            print_ir_aie2xclbin_results \
+            "$PWD/llvm-aie"
diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml
index f3b1e83bc..b14c2125a 100644
--- a/.github/workflows/ci-linux.yml
+++ b/.github/workflows/ci-linux.yml
@@ -55,7 +55,7 @@ jobs:
         run: |
           dnf install -y almalinux-release-devel epel-release
           yum remove -y openssl-devel zlib-devel || true
-          yum install -y protobuf-devel protobuf-compiler libnuma-devel tmate
+          yum install -y protobuf-devel protobuf-compiler tmate
 
       - name: Python deps
         run: |
@@ -68,12 +68,6 @@ jobs:
           key: ${{ env.CACHE_KEY }}
           restore-keys: linux-build-test-cpp-
 
-      - name: Build ROCT/ROCR
-        run: |
-          export cache_dir="${{ env.CACHE_DIR }}"
-          bash build_tools/ci/build_roct_rocr.sh
-          echo "hsa-runtime64_ROOT=$PWD/rocr-install" >> $GITHUB_ENV
-
       - name: Build packages
         run: |
           export cache_dir="${{ env.CACHE_DIR }}"
diff --git a/build_tools/ci/build_test_cpp.sh b/build_tools/ci/build_test_cpp.sh
index 8ad78c03f..38eafa641 100644
--- a/build_tools/ci/build_test_cpp.sh
+++ b/build_tools/ci/build_test_cpp.sh
@@ -91,11 +91,10 @@ if [[ "$OSTYPE" != "darwin"* ]]; then
     -DCMAKE_CXX_COMPILER="${CXX}" \
     -DLLVM_TARGET_ARCH=X86 \
     -DLLVM_TARGETS_TO_BUILD=X86 \
-    -DIREE_EXTERNAL_HAL_DRIVERS=hsa \
+    -DIREE_EXTERNAL_HAL_DRIVERS=${IREE_EXTERNAL_HAL_DRIVER:-xrt} \
     -S $iree_dir -B $build_dir
 else
-  cmake $CMAKE_ARGS \
-    -S $iree_dir -B $build_dir
+  cmake $CMAKE_ARGS -S $iree_dir -B $build_dir
 fi
 
 echo "Building all"