-
Notifications
You must be signed in to change notification settings - Fork 30
300 lines (265 loc) · 10.8 KB
/
ci-linux.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
name: CI Linux
on:
workflow_call:
workflow_dispatch:
pull_request:
merge_group:
push:
branches:
- main
concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
# queued and in-progress runs for the same PR (presubmit) or commit
# (postsubmit).
group: ci-build-test-cpp-linux-${{ github.event.number || github.sha }}
cancel-in-progress: true
jobs:
build_hsa:
name: Build HSA (linux)
runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
steps:
- name: Configure local git mirrors
run: |
/gitmirror/scripts/trigger_update_mirrors.sh
/gitmirror/scripts/git_config.sh
- name: "Checking out repository"
env:
BRANCH_NAME: ${{ github.ref }}
REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }}
run: |
git init
git remote add origin $REPO_ADDRESS
git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME
git reset --hard FETCH_HEAD
git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10
- name: Install deps
run: |
dnf install -y almalinux-release-devel
yum install -y elfutils-libelf-devel p7zip p7zip-plugins \
sudo ncurses-compat-libs openssh vim-common
- name: Build and install libnuma
working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
run: |
curl --silent -L \
https://github.com/numactl/numactl/releases/download/v2.0.18/numactl-2.0.18.tar.gz \
-o numactl-2.0.18.tar.gz
tar -xf numactl-2.0.18.tar.gz
pushd numactl-2.0.18
./configure
# i have no idea why this is necessary
# but without it you get something about "can't cd into dir"
sed -i '7563s/`cd "$dir" && pwd`/$dir/g' libtool
make install
popd
- name: Hack ROCR
working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
run: |
sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/core/runtime/blit_shaders/CMakeLists.txt
sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/core/runtime/trap_handler/CMakeLists.txt
sed -i 's/amdgcn-amd-amdhsa/amdgcn-amd-amdhsa -nogpulib/g' runtime/hsa-runtime/image/blit_src/CMakeLists.txt
- name: Get compatible Clang
working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
run: |
pip download mlir==20.0.0.2024090301+amdgpu.b6597f52 -f https://makslevental.github.io/wheels
unzip -q mlir-*.whl
- name: Build ROCR distro
working-directory: ${{ github.workspace }}/third_party/ROCR-Runtime
run: |
rocr_dir="$PWD"
build_rocr_dir="$PWD/rocr-build"
mkdir -p "$build_rocr_dir"
build_rocr_dir="$(cd $build_rocr_dir && pwd)"
rocr_install_dir="$PWD/rocr-install"
cmake -GNinja \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX="$rocr_install_dir" \
-DClang_DIR=$PWD/mlir/lib/cmake/clang \
-DLLVM_DIR=$PWD/mlir/lib/cmake/mlir \
-DIMAGE_SUPPORT=OFF \
-S "$rocr_dir" -B "$build_rocr_dir"
cmake --build "$build_rocr_dir" --target install
tar -cf rocr-${GITHUB_SHA::8}.tar rocr-install
- name: Upload artifacts
uses: actions/upload-artifact@v4
if: ${{ !cancelled() }}
with:
name: linux_hsa_x86_64_release_packages
path: ${{ github.workspace }}/third_party/ROCR-Runtime/rocr-*.tar
if-no-files-found:
build_and_ctest:
name: Build and Test (linux, ASSERTIONS)
runs-on: nod-ai-shared-cpubuilder-manylinux-x86_64
strategy:
fail-fast: true
env:
CACHE_DIR: ${{ github.workspace }}/.container-cache
# either the PR number or `branch-N` where N always increments
CACHE_KEY: linux-build-test-cpp-asserts-manylinux-v2-${{ format('{0}-{1}', github.ref_name, github.run_number) }}
steps:
- name: Set unified TZ
uses: szenius/[email protected]
with:
# this is an arbitrary choice
timezoneLinux: "Asia/Singapore"
timezoneMacos: "Asia/Singapore"
timezoneWindows: "Singapore Standard Time"
- name: Configure local git mirrors
run: |
/gitmirror/scripts/trigger_update_mirrors.sh
/gitmirror/scripts/git_config.sh
- name: "Checking out repository"
env:
BRANCH_NAME: ${{ github.ref }}
REPO_ADDRESS: ${{ github.server_url }}/${{ github.repository }}
run: |
git init
git remote add origin $REPO_ADDRESS
git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME
git reset --hard FETCH_HEAD
git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10
- name: Install deps
run: |
dnf install -y almalinux-release-devel epel-release
yum remove -y openssl-devel zlib-devel || true
yum install -y protobuf-devel protobuf-compiler tmate
- name: Python deps
run: |
pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind
- name: Enable cache
uses: actions/cache/restore@v3
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}
restore-keys: linux-build-test-cpp-
- name: Download artifacts
uses: actions/download-artifact@v4
with:
name: linux_hsa_x86_64_release_packages
- name: Extract artifact
run: |
tar -xvf rocr-*.tar
echo "hsa-runtime64_ROOT=$PWD/rocr-install" >> $GITHUB_ENV
- name: Build packages
run: |
export cache_dir="${{ env.CACHE_DIR }}"
export CCACHE_COMPILERCHECK="string:$(clang --version)"
bash build_tools/ci/build_test_cpp.sh
- name: Create artifacts
if: ${{ !cancelled() }}
run: |
tar cf iree-dist-linux.tar iree-install
- name: Upload artifacts
uses: actions/upload-artifact@v4
if: ${{ !cancelled() }}
with:
name: linux_x86_64_release_packages
path: iree-dist-linux.tar
if-no-files-found: warn
- name: Save cache
uses: actions/cache/save@v3
if: ${{ !cancelled() && github.event_name == 'push' && github.ref_name == 'main' }}
with:
path: ${{ env.CACHE_DIR }}
key: ${{ env.CACHE_KEY }}
test_linux:
name: E2E Test linux
needs: build_and_ctest
strategy:
fail-fast: false
matrix:
runs-on: [linux-phoenix]
runs-on: ${{ matrix.runs-on }}
env:
XILINXD_LICENSE_FILE: /opt/xilinx/Xilinx.lic
steps:
- name: "Checking out repository" # for test scripts
uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0
with:
submodules: false # not required for testbench
- name: Download artifacts
uses: actions/download-artifact@v4
with:
name: linux_x86_64_release_packages
- name: Extract artifact
run: |
tar -xvf iree-dist-linux.tar
bash build_tools/download_peano.sh
- name: Create venv and install dependencies
run: |
python3 -m venv .venv
source .venv/bin/activate
pip install -r tests/matmul/requirements.txt
- name: E2E correctness matmul test
run: |
source .venv/bin/activate
# Without this additional line an error like
#
# [XRT] ERROR: Failed to allocate host memory buffer (mmap(len=10616832, prot=3, flags=8193, offset=4294967296)
# failed (err=11): Resource temporarily unavailable), make sure host bank is enabled (see xbutil configure --host-mem)
# iree-amd-aie/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc:179: RESOURCE_EXHAUSTED; could not allocate
# memory for buffer; while invoking C++ function matmul_test.generate_random_matrix; while calling import;
#
# might be observed when too much memory is allocated. For example this
# error was seen when running a bf16->f32 matmul with m=n=k=2304.
#
# This line was suggested at https://github.com/Xilinx/mlir-air/issues/566
#
# Note that this is only half of the fix. It is also necessary that
# the machine that CI is running on has permission to run this line.
#
# This permission can be adding by adding the line
# ```
# %github ALL=(ALL) NOPASSWD: /usr/bin/prlimit *
# ```
#
# to the file /etc/sudoers.d/github, which can be done by running
# ```
# sudo visudo -f /etc/sudoers.d/github
# ```
# on the guthub CI machine.
sudo prlimit -lunlimited --pid $$
source /opt/xilinx/xrt/setup.sh
bash build_tools/ci/run_matmul_test.sh \
test_matmuls \
iree-install \
$PWD/llvm-aie \
/opt/xilinx/xrt \
/opt/Xilinx/Vitis/2024.2
- name : Smoke E2E comparison flag test
run: |
source .venv/bin/activate
source /opt/xilinx/xrt/setup.sh
python3 build_tools/ci/cpu_comparison/run_test.py \
test_aie_vs_cpu \
iree-install \
$PWD/llvm-aie \
--xrt-dir /opt/xilinx/xrt \
--test-set='Smoke' \
--do-not-run-aie
# Assert that output.log is empty (because verbose=0)
if [ -s output.log ]; then
echo "output.log is not empty:"
cat output.log
exit 1
else
echo "output.log is empty"
fi
- name : E2E comparison of AIE to llvm-cpu
run: |
source .venv/bin/activate
source /opt/xilinx/xrt/setup.sh
python3 build_tools/ci/cpu_comparison/run_test.py \
test_aie_vs_cpu \
$PWD/iree-install \
$PWD/llvm-aie \
--xrt-dir /opt/xilinx/xrt \
--vitis-dir /opt/Xilinx/Vitis/2024.2 \
--reset-npu-between-runs -v
- name: Printing IR from aie2xclbin
run: |
source .venv/bin/activate
source /opt/xilinx/xrt/setup.sh
bash build_tools/ci/print_ir_aie2xclbin/print_ir_aie2xclbin.sh \
iree-install \
print_ir_aie2xclbin_results \
$PWD/llvm-aie