Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
balancap committed Oct 17, 2023
1 parent bafc610 commit 9317f23
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 78 deletions.
100 changes: 23 additions & 77 deletions tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,89 +206,43 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
unsigned short wend) noexcept {
using IndexType = unsigned short;
const unsigned wsize = (wend - wstart) / 2;
// Necessary for generating `rpt` loop.
__builtin_assume(wsize < 4096);
using T2 = float2;

rotset_sorted_arr += 2 * wstart;
const T2 zeros{0, 0};

// __builtin_ipu_f32v2gina(zeros, 0);
zeroFpAccumulators();

const T2* cs_arr_ptr = reinterpret_cast<const T2*>(cs_arr) + wstart;

// T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1;
// unsigned k0, l0, k1, l1;

// // Parallized loop on update using other columns coefficients
// for (unsigned half_idx = 0; half_idx != (wsize / 2); ++half_idx) {
// k0 = ipu::load_postinc(&rotset_sorted_arr, 1);
// l0 = ipu::load_postinc(&rotset_sorted_arr, 1);
// cs0 = ipu::load_postinc(&cs_arr_ptr, 1);

// // 4 coefficients updates!
// Sp0 = {pcol[k0], pcol[l0]};
// Sq0 = {qcol[k0], qcol[l0]};

// // Using outer-product for parallel multiplications.
// // Note: saving 1 AUX cycle only, but helping a lot on register pressure.
// __builtin_ipu_f32v2aop(cs0, Sp0, 0);
// tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
// tmp1 = __builtin_ipu_f32v2gina(zeros, 0);

// pcol_updated[k0] = tmp0[0] - tmp1[1];
// pcol_updated[l0] = tmp0[1] + tmp1[0];

// __builtin_ipu_f32v2aop(cs0, Sq0, 0);
// tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
// tmp1 = __builtin_ipu_f32v2gina(zeros, 0);

// qcol_updated[k0] = tmp0[0] - tmp1[1];
// qcol_updated[l0] = tmp0[1] + tmp1[0];

// // Unrolling: second part.
// // Unrolling: second part.
// // Unrolling: second part.
// k1 = ipu::load_postinc(&rotset_sorted_arr, 1);
// l1 = ipu::load_postinc(&rotset_sorted_arr, 1);
// cs1 = ipu::load_postinc(&cs_arr_ptr, 1);

// // TODO: vectorization?!
// Sp1 = {pcol[k1], pcol[l1]};
// Sq1 = {qcol[k1], qcol[l1]};

// // Using outer-product for parallel multiplications.
// // Note: saving 1 AUX cycle only, but helping a lot on register pressure.
// __builtin_ipu_f32v2aop(cs1, Sp1, 0);
// tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
// tmp1 = __builtin_ipu_f32v2gina(zeros, 0);

// pcol_updated[k1] = tmp0[0] - tmp1[1];
// pcol_updated[l1] = tmp0[1] + tmp1[0];

// __builtin_ipu_f32v2aop(cs1, Sq1, 0);
// tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
// tmp1 = __builtin_ipu_f32v2gina(zeros, 0);

// qcol_updated[k1] = tmp0[0] - tmp1[1];
// qcol_updated[l1] = tmp0[1] + tmp1[0];
// }
const T2 zeros{0, 0};
T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1;
unsigned k0, l0, k1, l1;

// The loop body is roughly the following equations:
// const T Spk = pcol_ptr[k];
// const T Spl = pcol_ptr[l];
// const T Sqk = qcol_ptr[k];
// const T Sql = qcol_ptr[l];

// pcol_updated_ptr[k] = c * Spk - s * Spl;
// pcol_updated_ptr[l] = s * Spk + c * Spl;
// qcol_updated_ptr[k] = c * Sqk - s * Sql;
// qcol_updated_ptr[l] = s * Sqk + c * Sql;

// Problem: this generates poor bundling of operations in the loop.
// Solution: unroll 2 steps + f32v2aop + manual re-ordering.
// NOTE: f32v2aop mostly useful for reducing register pressure,
// as results are stored in AACC registers (not AUX). Just saving 1 compute
// cycle.

// Pre-loading due to unrolling + reordering.
k0 = ipu::load_postinc(&rotset_sorted_arr, 1);
l0 = ipu::load_postinc(&rotset_sorted_arr, 1);
cs0 = ipu::load_postinc(&cs_arr_ptr, 1);

Sp0 = {pcol[k0], pcol[l0]};

// Parallelized loop on update using other columns' coefficients
for (unsigned half_idx = 0; half_idx < wsize; ++half_idx) {
// Using outer-product for parallel multiplications.
// Note: saving 1 AUX cycle only, but helping a lot on register pressure.
// 4 coefficients updates!
// Sp0 = {pcol[k0], pcol[l0]};

// Pseudo bundling of instructions, to help popc.
{
Sq0[0] = qcol[k0];
__builtin_ipu_f32v2aop(cs0, Sp0, 0);
Expand Down Expand Up @@ -321,16 +275,12 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
Sp1[1] = pcol[l1];
qcol_updated[k0] = tmp0[0] - tmp1[1];
}
// Unrolling: second part.
// NOTE: inputs already (partially) loaded.
{
qcol_updated[l0] = tmp0[1] + tmp1[0];
__builtin_ipu_f32v2aop(cs1, Sp1, 0);
}
// Unrolling: second part.
// Unrolling: second part.
// Unrolling: second part.

// TODO: vectorization?!
// Sq1 = {qcol[k1], qcol[l1]};
{
Sq1[0] = qcol[k1];
tmp0 = __builtin_ipu_f32v2gina(zeros, 0);
Expand All @@ -339,9 +289,6 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
Sq1[1] = qcol[l1];
tmp1 = __builtin_ipu_f32v2gina(zeros, 0);
}

// Using outer-product for parallel multiplications.
// Note: saving 1 AUX cycle only, but helping a lot on register pressure.
{
k0 = ipu::load_postinc(&rotset_sorted_arr, 1);
pcol_updated[k1] = tmp0[0] - tmp1[1];
Expand Down Expand Up @@ -402,7 +349,6 @@ class JacobiUpdateSecondStep : public MultiVertex {
// Worker load: start + end vectorized indexes.
const IndexType wstart = worker_offsets[wid];
const IndexType wend = worker_offsets[wid + 1];
const IndexType wsize = wend - wstart;

// Forward pq indices.
pcol_updated[0] = pcol[0];
Expand Down
2 changes: 1 addition & 1 deletion tests/linalg/test_tile_linalg_jacobi.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def jacobi_update_eigenvectors_fn(cs, pcol, qcol):

@unittest.skipUnless(ipu_num_tiles >= 64, "Requires IPU with 64 tiles")
def test__jacobi_eigh__single_iteration(self):
N = 128
N = 6 * 8
x = np.random.randn(N, N).astype(np.float32)
x = (x + x.T) / 2.0

Expand Down

0 comments on commit 9317f23

Please sign in to comment.