From 9317f2303baf13cbe9d68877ffe6b23d8f255084 Mon Sep 17 00:00:00 2001 From: Paul Balanca Date: Tue, 17 Oct 2023 12:24:21 +0000 Subject: [PATCH] wip --- .../core/vertex/tile_jacobi_vertex.cpp | 100 ++++-------------- tests/linalg/test_tile_linalg_jacobi.py | 2 +- 2 files changed, 24 insertions(+), 78 deletions(-) diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp index 7c7cb4d..8310b28 100644 --- a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp +++ b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp @@ -206,89 +206,43 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr, unsigned short wend) noexcept { using IndexType = unsigned short; const unsigned wsize = (wend - wstart) / 2; + // Necessary for generating `rpt` loop. __builtin_assume(wsize < 4096); using T2 = float2; rotset_sorted_arr += 2 * wstart; - const T2 zeros{0, 0}; - // __builtin_ipu_f32v2gina(zeros, 0); zeroFpAccumulators(); const T2* cs_arr_ptr = reinterpret_cast(cs_arr) + wstart; - - // T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1; - // unsigned k0, l0, k1, l1; - - // // Parallized loop on update using other columns coefficients - // for (unsigned half_idx = 0; half_idx != (wsize / 2); ++half_idx) { - // k0 = ipu::load_postinc(&rotset_sorted_arr, 1); - // l0 = ipu::load_postinc(&rotset_sorted_arr, 1); - // cs0 = ipu::load_postinc(&cs_arr_ptr, 1); - - // // 4 coefficients updates! - // Sp0 = {pcol[k0], pcol[l0]}; - // Sq0 = {qcol[k0], qcol[l0]}; - - // // Using outer-product for parallel multiplications. - // // Note: saving 1 AUX cycle only, but helping a lot on register pressure. - // __builtin_ipu_f32v2aop(cs0, Sp0, 0); - // tmp0 = __builtin_ipu_f32v2gina(zeros, 0); - // tmp1 = __builtin_ipu_f32v2gina(zeros, 0); - - // pcol_updated[k0] = tmp0[0] - tmp1[1]; - // pcol_updated[l0] = tmp0[1] + tmp1[0]; - - // __builtin_ipu_f32v2aop(cs0, Sq0, 0); - // tmp0 = __builtin_ipu_f32v2gina(zeros, 0); - // tmp1 = __builtin_ipu_f32v2gina(zeros, 0); - - // qcol_updated[k0] = tmp0[0] - tmp1[1]; - // qcol_updated[l0] = tmp0[1] + tmp1[0]; - - // // Unrolling: second part. - // // Unrolling: second part. - // // Unrolling: second part. - // k1 = ipu::load_postinc(&rotset_sorted_arr, 1); - // l1 = ipu::load_postinc(&rotset_sorted_arr, 1); - // cs1 = ipu::load_postinc(&cs_arr_ptr, 1); - - // // TODO: vectorization?! - // Sp1 = {pcol[k1], pcol[l1]}; - // Sq1 = {qcol[k1], qcol[l1]}; - - // // Using outer-product for parallel multiplications. - // // Note: saving 1 AUX cycle only, but helping a lot on register pressure. - // __builtin_ipu_f32v2aop(cs1, Sp1, 0); - // tmp0 = __builtin_ipu_f32v2gina(zeros, 0); - // tmp1 = __builtin_ipu_f32v2gina(zeros, 0); - - // pcol_updated[k1] = tmp0[0] - tmp1[1]; - // pcol_updated[l1] = tmp0[1] + tmp1[0]; - - // __builtin_ipu_f32v2aop(cs1, Sq1, 0); - // tmp0 = __builtin_ipu_f32v2gina(zeros, 0); - // tmp1 = __builtin_ipu_f32v2gina(zeros, 0); - - // qcol_updated[k1] = tmp0[0] - tmp1[1]; - // qcol_updated[l1] = tmp0[1] + tmp1[0]; - // } + const T2 zeros{0, 0}; T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1; unsigned k0, l0, k1, l1; + // The loop body is roughly the following equations: + // const T Spk = pcol_ptr[k]; + // const T Spl = pcol_ptr[l]; + // const T Sqk = qcol_ptr[k]; + // const T Sql = qcol_ptr[l]; + + // pcol_updated_ptr[k] = c * Spk - s * Spl; + // pcol_updated_ptr[l] = s * Spk + c * Spl; + // qcol_updated_ptr[k] = c * Sqk - s * Sql; + // qcol_updated_ptr[l] = s * Sqk + c * Sql; + + // Problem: generate poor bundling of operations in the loop. + // Solution: unroll 2 steps + f32v2aop + manual re-ordering. + // NOTE: f32v2aop mostly useful for reducing register pressure, + // as results are stored in AACC registers (not AUX). Just saving 1 compute + // cycle. + + // Pre-loading due to unrolling + reordering. k0 = ipu::load_postinc(&rotset_sorted_arr, 1); l0 = ipu::load_postinc(&rotset_sorted_arr, 1); cs0 = ipu::load_postinc(&cs_arr_ptr, 1); - Sp0 = {pcol[k0], pcol[l0]}; - - // Parallized loop on update using other columns coefficients for (unsigned half_idx = 0; half_idx < wsize; ++half_idx) { - // Using outer-product for parallel multiplications. - // Note: saving 1 AUX cycle only, but helping a lot on register pressure. - // 4 coefficients updates! - // Sp0 = {pcol[k0], pcol[l0]}; - + // Pseudo bundling of instructions, to help popc. { Sq0[0] = qcol[k0]; __builtin_ipu_f32v2aop(cs0, Sp0, 0); @@ -321,16 +275,12 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr, Sp1[1] = pcol[l1]; qcol_updated[k0] = tmp0[0] - tmp1[1]; } + // Unrolling: second part. + // NOTE: inputs already (partially) loaded. { qcol_updated[l0] = tmp0[1] + tmp1[0]; __builtin_ipu_f32v2aop(cs1, Sp1, 0); } - // Unrolling: second part. - // Unrolling: second part. - // Unrolling: second part. - - // TODO: vectorization?! - // Sq1 = {qcol[k1], qcol[l1]}; { Sq1[0] = qcol[k1]; tmp0 = __builtin_ipu_f32v2gina(zeros, 0); @@ -339,9 +289,6 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr, Sq1[1] = qcol[l1]; tmp1 = __builtin_ipu_f32v2gina(zeros, 0); } - - // Using outer-product for parallel multiplications. - // Note: saving 1 AUX cycle only, but helping a lot on register pressure. { k0 = ipu::load_postinc(&rotset_sorted_arr, 1); pcol_updated[k1] = tmp0[0] - tmp1[1]; @@ -402,7 +349,6 @@ class JacobiUpdateSecondStep : public MultiVertex { // Worker load: start + end vectorized indexes. const IndexType wstart = worker_offsets[wid]; const IndexType wend = worker_offsets[wid + 1]; - const IndexType wsize = wend - wstart; // Forward pq indices. pcol_updated[0] = pcol[0]; diff --git a/tests/linalg/test_tile_linalg_jacobi.py b/tests/linalg/test_tile_linalg_jacobi.py index c411bb4..4528dec 100644 --- a/tests/linalg/test_tile_linalg_jacobi.py +++ b/tests/linalg/test_tile_linalg_jacobi.py @@ -164,7 +164,7 @@ def jacobi_update_eigenvectors_fn(cs, pcol, qcol): @unittest.skipUnless(ipu_num_tiles >= 64, "Requires IPU with 64 tiles") def test__jacobi_eigh__single_iteration(self): - N = 128 + N = 6 * 8 x = np.random.randn(N, N).astype(np.float32) x = (x + x.T) / 2.0