diff --git a/tessellate_ipu/core/tile_interpreter_vertex_utils.py b/tessellate_ipu/core/tile_interpreter_vertex_utils.py index 777bf09..1389281 100644 --- a/tessellate_ipu/core/tile_interpreter_vertex_utils.py +++ b/tessellate_ipu/core/tile_interpreter_vertex_utils.py @@ -1,6 +1,6 @@ # Copyright (c) 2022 Graphcore Ltd. All rights reserved. import math -from typing import List +from typing import List, Optional import numpy as np from numpy.typing import DTypeLike, NDArray @@ -26,9 +26,14 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]: def make_ipu_vector1d_worker_offsets( - size: int, vector_size: int = 2, num_workers: int = 6, wdtype: DTypeLike = np.uint16 + size: int, + vector_size: int = 2, + num_workers: int = 6, + wdtype: DTypeLike = np.uint16, + allow_overlap: bool = False, + grain_size: Optional[int] = None, ) -> NDArray[np.int_]: - """Make the QR householder row update worker sizes, i.e. how many + """Make worker sizes/offsets for a 1D array workload, i.e. how many data vectors per worker thread? Args: @@ -36,27 +41,38 @@ def make_ipu_vector1d_worker_offsets( vector_size: Vector size (2: float, 4: half). num_workers: Number of workers. wdtype: Worklists dtype. + allow_overlap: Allow overlap between workers. Makes it easier to deal with the remainder term. + grain_size: Optional grain size. vector_size by default. Returns: (6,) number of data vectors per thread. """ + grain_size = grain_size or vector_size + grain_scale = grain_size // vector_size def make_offsets_fn(sizes): sizes = [0] + sizes - offsets = np.cumsum(np.array(sizes, wdtype), dtype=wdtype) - # print("OFFSETS:", offsets) + offsets = np.cumsum(np.array(sizes, wdtype) * grain_scale, dtype=wdtype) return offsets - assert size % vector_size == 0 + # TODO: properly support odd sizes. + assert size % 2 == 0, "Not supporting odd sizing at the moment." + # Base checks! 
+ assert grain_size % vector_size == 0 + assert size >= grain_size, f"Requires at least a size of {grain_size}." + assert ( + size % grain_size == 0 or allow_overlap + ), f"Requires the size, {size}, divisible by the grain size {grain_size}, (or allowing overlap)." + # Base worksize on the first few workers. - base_worksize: int = math.ceil(size / (vector_size * num_workers)) - num_base_workers = size // (vector_size * base_worksize) + base_worksize: int = math.ceil(size / (grain_size * num_workers)) + num_base_workers = size // (grain_size * base_worksize) worker_sizes: List[int] = [base_worksize] * num_base_workers if num_base_workers == num_workers: return make_offsets_fn(worker_sizes) # Remainer term, for the next thread. - rem_worksize = size - base_worksize * vector_size * num_base_workers - rem_worksize = rem_worksize // vector_size + rem_worksize = size - base_worksize * grain_size * num_base_workers + rem_worksize = rem_worksize // grain_size worker_sizes += [rem_worksize] # Fill the rest with zeros. unused_workers = num_workers - num_base_workers - 1 diff --git a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp index 8310b28..e35a166 100644 --- a/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp +++ b/tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp @@ -209,12 +209,11 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr, // Necessary for generating `rpt` loop. __builtin_assume(wsize < 4096); using T2 = float2; - + // Increment pointers. NOTE: unrolling creating "4x" factor. 
rotset_sorted_arr += 2 * wstart; - // __builtin_ipu_f32v2gina(zeros, 0); + const T2* cs_arr_ptr = reinterpret_cast(cs_arr) + wstart; zeroFpAccumulators(); - const T2* cs_arr_ptr = reinterpret_cast(cs_arr) + wstart; const T2 zeros{0, 0}; T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1; unsigned k0, l0, k1, l1; diff --git a/tessellate_ipu/linalg/tile_linalg_jacobi.py b/tessellate_ipu/linalg/tile_linalg_jacobi.py index e1ae0d7..0a67f0c 100644 --- a/tessellate_ipu/linalg/tile_linalg_jacobi.py +++ b/tessellate_ipu/linalg/tile_linalg_jacobi.py @@ -70,8 +70,10 @@ def get_jacobi_vertex_gp_filename() -> str: inputs=["cs_arr", "rotset_sorted_arr", "rotset_idx_ignored", "pcol", "qcol"], outputs={"cs_arr": 0, "pcol_updated": 3, "qcol_updated": 4}, constants={ + # NOTE: using grain_size=4 because of partial loop unrolling + # TODO: support overlap properly. "worker_offsets": lambda inavals, *_: make_ipu_vector1d_worker_offsets( - inavals[3].size - INDEX_PREFIX, vector_size=2, wdtype=np.uint16 + inavals[3].size - INDEX_PREFIX, vector_size=2, wdtype=np.uint16, allow_overlap=False, grain_size=4 ) }, gp_filename=get_jacobi_vertex_gp_filename(), diff --git a/tests/linalg/test_tile_linalg_jacobi.py b/tests/linalg/test_tile_linalg_jacobi.py index 4528dec..689eb38 100644 --- a/tests/linalg/test_tile_linalg_jacobi.py +++ b/tests/linalg/test_tile_linalg_jacobi.py @@ -178,7 +178,7 @@ def test__jacobi_eigh__single_iteration(self): @unittest.skipUnless(ipu_num_tiles >= 16, "Requires IPU with 16 tiles") def test__jacobi_eigh_raw__proper_eigh_result(self): - N = 8 * 6 + N = 8 x = np.random.randn(N, N).astype(np.float32) x = (x + x.T) / 2.0