From 9317f2303baf13cbe9d68877ffe6b23d8f255084 Mon Sep 17 00:00:00 2001
From: Paul Balanca <paulb@graphcore.ai>
Date: Tue, 17 Oct 2023 12:24:21 +0000
Subject: [PATCH] Clean up Jacobi second-step vertex comments, shrink test

---
 .../core/vertex/tile_jacobi_vertex.cpp        | 100 ++++--------------
 tests/linalg/test_tile_linalg_jacobi.py       |   2 +-
 2 files changed, 24 insertions(+), 78 deletions(-)
diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
index 7c7cb4d..8310b28 100644
--- a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
+++ b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
@@ -206,89 +206,43 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
                                unsigned short wend) noexcept {
   using IndexType = unsigned short;
   const unsigned wsize = (wend - wstart) / 2;
+  // Necessary for generating `rpt` loop.
   __builtin_assume(wsize < 4096);
   using T2 = float2;
 
   rotset_sorted_arr += 2 * wstart;
-  const T2 zeros{0, 0};
-
   // __builtin_ipu_f32v2gina(zeros, 0);
   zeroFpAccumulators();
 
   const T2* cs_arr_ptr = reinterpret_cast<const T2*>(cs_arr) + wstart;
-
-  // T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1;
-  // unsigned k0, l0, k1, l1;
-
-  // // Parallized loop on update using other columns coefficients
-  // for (unsigned half_idx = 0; half_idx != (wsize / 2); ++half_idx) {
-  //   k0 = ipu::load_postinc(&rotset_sorted_arr, 1);
-  //   l0 = ipu::load_postinc(&rotset_sorted_arr, 1);
-  //   cs0 = ipu::load_postinc(&cs_arr_ptr, 1);
-
-  //   // 4 coefficients updates!
-  //   Sp0 = {pcol[k0], pcol[l0]};
-  //   Sq0 = {qcol[k0], qcol[l0]};
-
-  //   // Using outer-product for parallel multiplications.
-  //   // Note: saving 1 AUX cycle only, but helping a lot on register pressure.
-  //   __builtin_ipu_f32v2aop(cs0, Sp0, 0);
-  //   tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
-  //   tmp1 = __builtin_ipu_f32v2gina(zeros, 0);
-
-  //   pcol_updated[k0] = tmp0[0] - tmp1[1];
-  //   pcol_updated[l0] = tmp0[1] + tmp1[0];
-
-  //   __builtin_ipu_f32v2aop(cs0, Sq0, 0);
-  //   tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
-  //   tmp1 = __builtin_ipu_f32v2gina(zeros, 0);
-
-  //   qcol_updated[k0] = tmp0[0] - tmp1[1];
-  //   qcol_updated[l0] = tmp0[1] + tmp1[0];
-
-  //   // Unrolling: second part.
-  //   // Unrolling: second part.
-  //   // Unrolling: second part.
-  //   k1 = ipu::load_postinc(&rotset_sorted_arr, 1);
-  //   l1 = ipu::load_postinc(&rotset_sorted_arr, 1);
-  //   cs1 = ipu::load_postinc(&cs_arr_ptr, 1);
-
-  //   // TODO: vectorization?!
-  //   Sp1 = {pcol[k1], pcol[l1]};
-  //   Sq1 = {qcol[k1], qcol[l1]};
-
-  //   // Using outer-product for parallel multiplications.
-  //   // Note: saving 1 AUX cycle only, but helping a lot on register pressure.
-  //   __builtin_ipu_f32v2aop(cs1, Sp1, 0);
-  //   tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
-  //   tmp1 = __builtin_ipu_f32v2gina(zeros, 0);
-
-  //   pcol_updated[k1] = tmp0[0] - tmp1[1];
-  //   pcol_updated[l1] = tmp0[1] + tmp1[0];
-
-  //   __builtin_ipu_f32v2aop(cs1, Sq1, 0);
-  //   tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
-  //   tmp1 = __builtin_ipu_f32v2gina(zeros, 0);
-
-  //   qcol_updated[k1] = tmp0[0] - tmp1[1];
-  //   qcol_updated[l1] = tmp0[1] + tmp1[0];
-  // }
+  const T2 zeros{0, 0};
   T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1;
   unsigned k0, l0, k1, l1;
 
+  // The loop body roughly implements the following equations:
+  // const T Spk = pcol_ptr[k];
+  // const T Spl = pcol_ptr[l];
+  // const T Sqk = qcol_ptr[k];
+  // const T Sql = qcol_ptr[l];
+
+  // pcol_updated_ptr[k] = c * Spk - s * Spl;
+  // pcol_updated_ptr[l] = s * Spk + c * Spl;
+  // qcol_updated_ptr[k] = c * Sqk - s * Sql;
+  // qcol_updated_ptr[l] = s * Sqk + c * Sql;
+
+  // Problem: written directly, this results in poor bundling of operations
+  // in the loop. Solution: unroll 2 steps + f32v2aop + manual re-ordering.
+  // NOTE: f32v2aop is mostly useful for reducing register pressure, as
+  // results are stored in AACC registers (not AUX). It only saves 1 compute
+  // cycle.
+
+  // Pre-loading due to unrolling + reordering.
   k0 = ipu::load_postinc(&rotset_sorted_arr, 1);
   l0 = ipu::load_postinc(&rotset_sorted_arr, 1);
   cs0 = ipu::load_postinc(&cs_arr_ptr, 1);
-
   Sp0 = {pcol[k0], pcol[l0]};
-
-  // Parallized loop on update using other columns coefficients
   for (unsigned half_idx = 0; half_idx < wsize; ++half_idx) {
-    // Using outer-product for parallel multiplications.
-    // Note: saving 1 AUX cycle only, but helping a lot on register pressure.
-    // 4 coefficients updates!
-    // Sp0 = {pcol[k0], pcol[l0]};
-
+    // Pseudo bundling of instructions, to help popc.
     {
       Sq0[0] = qcol[k0];
       __builtin_ipu_f32v2aop(cs0, Sp0, 0);
@@ -321,16 +275,12 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
       Sp1[1] = pcol[l1];
       qcol_updated[k0] = tmp0[0] - tmp1[1];
     }
+    // Unrolling: second part.
+    // NOTE: inputs already (partially) loaded.
     {
       qcol_updated[l0] = tmp0[1] + tmp1[0];
       __builtin_ipu_f32v2aop(cs1, Sp1, 0);
     }
-    // Unrolling: second part.
-    // Unrolling: second part.
-    // Unrolling: second part.
-
-    // TODO: vectorization?!
-    // Sq1 = {qcol[k1], qcol[l1]};
     {
       Sq1[0] = qcol[k1];
       tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
@@ -339,9 +289,6 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
       Sq1[1] = qcol[l1];
       tmp1 = __builtin_ipu_f32v2gina(zeros, 0);
     }
-
-    // Using outer-product for parallel multiplications.
-    // Note: saving 1 AUX cycle only, but helping a lot on register pressure.
     {
       k0 = ipu::load_postinc(&rotset_sorted_arr, 1);
       pcol_updated[k1] = tmp0[0] - tmp1[1];
@@ -402,7 +349,6 @@ class JacobiUpdateSecondStep : public MultiVertex {
     // Worker load: start + end vectorized indexes.
     const IndexType wstart = worker_offsets[wid];
     const IndexType wend = worker_offsets[wid + 1];
-    const IndexType wsize = wend - wstart;
 
     // Forward pq indices.
     pcol_updated[0] = pcol[0];
diff --git a/tests/linalg/test_tile_linalg_jacobi.py b/tests/linalg/test_tile_linalg_jacobi.py
index c411bb4..4528dec 100644
--- a/tests/linalg/test_tile_linalg_jacobi.py
+++ b/tests/linalg/test_tile_linalg_jacobi.py
@@ -164,7 +164,7 @@ def jacobi_update_eigenvectors_fn(cs, pcol, qcol):
 
     @unittest.skipUnless(ipu_num_tiles >= 64, "Requires IPU with 64 tiles")
     def test__jacobi_eigh__single_iteration(self):
-        N = 128
+        N = 6 * 8
         x = np.random.randn(N, N).astype(np.float32)
         x = (x + x.T) / 2.0