From 5971b65b8771ea1f5bcc6195af963f03292c0b9b Mon Sep 17 00:00:00 2001 From: Paul Balanca Date: Mon, 2 Oct 2023 16:00:48 +0100 Subject: [PATCH] wip --- .../core/tile_interpreter_vertex_utils.py | 1 + .../core/vertex/tile_jacobi_vertex.cpp | 89 +++++++++++++------ tessellate_ipu/linalg/tile_linalg_jacobi.py | 6 +- 3 files changed, 67 insertions(+), 29 deletions(-) diff --git a/tessellate_ipu/core/tile_interpreter_vertex_utils.py b/tessellate_ipu/core/tile_interpreter_vertex_utils.py index db7ac6a..2ba3c2a 100644 --- a/tessellate_ipu/core/tile_interpreter_vertex_utils.py +++ b/tessellate_ipu/core/tile_interpreter_vertex_utils.py @@ -39,6 +39,7 @@ def make_ipu_vector1d_worker_offsets( Returns: (6,) number of data vectors per thread. """ + def make_offsets_fn(sizes): sizes = [0] + sizes offsets = np.cumsum(np.array(sizes, wdtype), dtype=wdtype) diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp index d9a73a6..18ccde5 100644 --- a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp +++ b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp @@ -177,23 +177,24 @@ class [[poplar::constraint("elem(*pcol) != elem(*qcol)")]] JacobiUpdateFirstStep const unsigned p = pindex[0]; const unsigned q = qindex[0]; - const IndexType wstart = worker_offsets[wid]; const IndexType wend = worker_offsets[wid + 1]; if (p <= q) { // Proper ordering of p and q already. jacob_update_first_step(pcol.data(), qcol.data(), pcol_updated.data(), - qcol_updated.data(), cs.data(), p, q, wstart, wend); + qcol_updated.data(), cs.data(), p, q, wstart, + wend); rotset_sorted[0] = p; rotset_sorted[1] = q; - } - else { + } else { // Swap p and q columns as q < p jacob_update_first_step(qcol.data(), pcol.data(), qcol_updated.data(), - pcol_updated.data(), cs.data(), q, p, wstart, wend); + pcol_updated.data(), cs.data(), q, p, wstart, + wend); // jacob_update_first_step(pcol.data(), qcol.data(), pcol_updated.data(), - // qcol_updated.data(), cs.data(), q, p, wstart, wend); + // qcol_updated.data(), cs.data(), q, p, wstart, + // wend); rotset_sorted[0] = q; rotset_sorted[1] = p; } @@ -271,6 +272,37 @@ class JacobiUpdateSecondStep : public MultiVertex { } }; +template +void jacob_update_eigenvectors(const T* vpcol, const T* vqcol, T* vpcol_updated, + T* vqcol_updated, T c, T s, + unsigned short wstart, + unsigned short wend) noexcept { + using T2 = float2; + // Using `uint16` seems to be generating more efficient loops? + using IndexType = unsigned short; + + const T2 cvec = T2{c, c}; + const T2 svec = T2{s, s}; + const IndexType wsize = wend - wstart; + + // pcol, qcol and results pointers. + const T2* ptr_pcol = reinterpret_cast(vpcol) + wstart; + const T2* ptr_qcol = reinterpret_cast(vqcol) + wstart; + T2* ptr_pcol_updated = reinterpret_cast(vpcol_updated) + wstart; + T2* ptr_qcol_updated = reinterpret_cast(vqcol_updated) + wstart; + + for (IndexType idx = 0; idx != wsize; ++idx) { + const T2 vpvec = ipu::load_postinc(&ptr_pcol, 1); + const T2 vqvec = ipu::load_postinc(&ptr_qcol, 1); + + const T2 vpvec_updated = cvec * vpvec - svec * vqvec; + const T2 vqvec_updated = svec * vpvec + cvec * vqvec; + + ipu::store_postinc(&ptr_qcol_updated, vqvec_updated, 1); + ipu::store_postinc(&ptr_pcol_updated, vpvec_updated, 1); + } +} + /** * @brief Jacobi algorithm, update of eigen vectors matrix. * @@ -304,31 +336,36 @@ class [[poplar::constraint( bool compute(unsigned wid) { const T c = cs[0]; const T s = cs[1]; - const T2 cvec = T2{c, c}; - const T2 svec = T2{s, s}; - - // Worker load: start + end vectorized indexes. - constexpr unsigned ptr_step = 1; const IndexType wstart = worker_offsets[wid]; const IndexType wend = worker_offsets[wid + 1]; - const IndexType wsize = wend - wstart; - // pcol, qcol and results pointers. - const T2* ptr_pcol = reinterpret_cast(vpcol.data()) + wstart; - const T2* ptr_qcol = reinterpret_cast(vqcol.data()) + wstart; - T2* ptr_pcol_updated = reinterpret_cast(vpcol_out.data()) + wstart; - T2* ptr_qcol_updated = reinterpret_cast(vqcol_out.data()) + wstart; + jacob_update_eigenvectors(vpcol.data(), vqcol.data(), vpcol_out.data(), + vqcol_out.data(), c, s, wstart, wend); + return true; - for (IndexType idx = 0; idx != wsize; ++idx) { - const T2 vpvec = ipu::load_postinc(&ptr_pcol, 1); - const T2 vqvec = ipu::load_postinc(&ptr_qcol, 1); + // const T2 cvec = T2{c, c}; + // const T2 svec = T2{s, s}; - const T2 vpvec_updated = cvec * vpvec - svec * vqvec; - const T2 vqvec_updated = svec * vpvec + cvec * vqvec; + // // Worker load: start + end vectorized indexes. + // constexpr unsigned ptr_step = 1; + // const IndexType wsize = wend - wstart; - ipu::store_postinc(&ptr_qcol_updated, vqvec_updated, 1); - ipu::store_postinc(&ptr_pcol_updated, vpvec_updated, 1); - } - return true; + // // pcol, qcol and results pointers. + // const T2* ptr_pcol = reinterpret_cast(vpcol.data()) + wstart; + // const T2* ptr_qcol = reinterpret_cast(vqcol.data()) + wstart; + // T2* ptr_pcol_updated = reinterpret_cast(vpcol_out.data()) + wstart; + // T2* ptr_qcol_updated = reinterpret_cast(vqcol_out.data()) + wstart; + + // for (IndexType idx = 0; idx != wsize; ++idx) { + // const T2 vpvec = ipu::load_postinc(&ptr_pcol, 1); + // const T2 vqvec = ipu::load_postinc(&ptr_qcol, 1); + + // const T2 vpvec_updated = cvec * vpvec - svec * vqvec; + // const T2 vqvec_updated = svec * vpvec + cvec * vqvec; + + // ipu::store_postinc(&ptr_qcol_updated, vqvec_updated, 1); + // ipu::store_postinc(&ptr_pcol_updated, vpvec_updated, 1); + // } + // return true; } }; diff --git a/tessellate_ipu/linalg/tile_linalg_jacobi.py b/tessellate_ipu/linalg/tile_linalg_jacobi.py index 6dc9c61..6e44965 100644 --- a/tessellate_ipu/linalg/tile_linalg_jacobi.py +++ b/tessellate_ipu/linalg/tile_linalg_jacobi.py @@ -193,8 +193,8 @@ def ipu_jacobi_eigh_iteration(all_AV_cols: Tuple[Array, ...], Atiles: Any, Vtile with jax.named_scope("cs_replicated_sharded"): cs_replicated = tile_put_replicated(cs_per_tile.array, tiles=Atiles) # Just copy Schur decomposition to associated V tiles. - cs_Vtiles = tile_put_sharded(cs_per_tile.array, tiles=Vtiles) - cs_replicated, cs_Vtiles = tile_data_barrier(cs_replicated, cs_Vtiles) + cs_sharded_Vtiles = tile_put_sharded(cs_per_tile.array, tiles=Vtiles) + cs_replicated, cs_sharded_Vtiles = tile_data_barrier(cs_replicated, cs_sharded_Vtiles) # Second Jacobi update step. # Note: does not require sorting of pcols and qcols. @@ -210,7 +210,7 @@ def ipu_jacobi_eigh_iteration(all_AV_cols: Tuple[Array, ...], Atiles: Any, Vtile # Jacobi eigenvectors update step. Vpcols, Vqcols = tile_map( # type:ignore jacobi_update_eigenvectors_p, - cs_Vtiles, + cs_sharded_Vtiles, Vpcols, Vqcols, )