Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
balancap committed Oct 17, 2023
1 parent 9317f23 commit d32eef2
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 15 deletions.
36 changes: 26 additions & 10 deletions tessellate_ipu/core/tile_interpreter_vertex_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
import math
from typing import List
from typing import List, Optional

import numpy as np
from numpy.typing import DTypeLike, NDArray
Expand All @@ -26,37 +26,53 @@ def make_num_elements_per_worker(N: int, num_workers: int) -> NDArray[np.int32]:


def make_ipu_vector1d_worker_offsets(
size: int, vector_size: int = 2, num_workers: int = 6, wdtype: DTypeLike = np.uint16
size: int,
vector_size: int = 2,
num_workers: int = 6,
wdtype: DTypeLike = np.uint16,
allow_overlap: bool = False,
grain_size: Optional[int] = None,
) -> NDArray[np.int_]:
"""Make the QR householder row update worker sizes, i.e. how many
"""Make worker sizes/offsets for a 1D array workload, i.e. how many
data vectors per worker thread?
Args:
size: Size of the vector to divide.
vector_size: Vector size (2: float, 4: half).
num_workers: Number of workers.
wdtype: Worklists dtype.
allow_overlap: Allow overlap between workers. Makes it easier to deal with the remainder term.
grain_size: Optional grain size. vector_size by default.
Returns:
(6,) number of data vectors per thread.
"""
grain_size = grain_size or vector_size
grain_scale = grain_size // vector_size

def make_offsets_fn(sizes):
sizes = [0] + sizes
offsets = np.cumsum(np.array(sizes, wdtype), dtype=wdtype)
# print("OFFSETS:", offsets)
offsets = np.cumsum(np.array(sizes, wdtype) * grain_scale, dtype=wdtype)
return offsets

assert size % vector_size == 0
# TODO: support properly odd size.
assert size % 2 == 0, "Not supporting odd sizing at the moment."
# Base checks!
assert grain_size % vector_size == 0
assert size >= grain_size, f"Requires at least a size of {grain_size}."
assert (
size % grain_size == 0 or allow_overlap
), f"Requires the size, {size}, divisible by the grain size {grain_size}, (or allowing overlap)."

# Base worksize on the first few workers.
base_worksize: int = math.ceil(size / (vector_size * num_workers))
num_base_workers = size // (vector_size * base_worksize)
base_worksize: int = math.ceil(size / (grain_size * num_workers))
num_base_workers = size // (grain_size * base_worksize)
worker_sizes: List[int] = [base_worksize] * num_base_workers
if num_base_workers == num_workers:
return make_offsets_fn(worker_sizes)

# Remainder term, for the next thread.
rem_worksize = size - base_worksize * vector_size * num_base_workers
rem_worksize = rem_worksize // vector_size
rem_worksize = size - base_worksize * grain_size * num_base_workers
rem_worksize = rem_worksize // grain_size
worker_sizes += [rem_worksize]
# Fill the rest with zeros.
unused_workers = num_workers - num_base_workers - 1
Expand Down
5 changes: 2 additions & 3 deletions tessellate_ipu/core/vertex/tile_jacobi_vertex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -209,12 +209,11 @@ void jacobi_update_second_step(const unsigned* rotset_sorted_arr,
// Necessary for generating `rpt` loop.
__builtin_assume(wsize < 4096);
using T2 = float2;

// Increment pointers. NOTE: loop unrolling creates a "4x" factor.
rotset_sorted_arr += 2 * wstart;
// __builtin_ipu_f32v2gina(zeros, 0);
const T2* cs_arr_ptr = reinterpret_cast<const T2*>(cs_arr) + wstart;
zeroFpAccumulators();

const T2* cs_arr_ptr = reinterpret_cast<const T2*>(cs_arr) + wstart;
const T2 zeros{0, 0};
T2 res, cs0, cs1, Sp0, Sq0, Sp1, Sq1, tmp0, tmp1;
unsigned k0, l0, k1, l1;
Expand Down
4 changes: 3 additions & 1 deletion tessellate_ipu/linalg/tile_linalg_jacobi.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ def get_jacobi_vertex_gp_filename() -> str:
inputs=["cs_arr", "rotset_sorted_arr", "rotset_idx_ignored", "pcol", "qcol"],
outputs={"cs_arr": 0, "pcol_updated": 3, "qcol_updated": 4},
constants={
# NOTE: using grain_size=4 because of partial loop unrolling
# TODO: support overlap properly.
"worker_offsets": lambda inavals, *_: make_ipu_vector1d_worker_offsets(
inavals[3].size - INDEX_PREFIX, vector_size=2, wdtype=np.uint16
inavals[3].size - INDEX_PREFIX, vector_size=2, wdtype=np.uint16, allow_overlap=False, grain_size=4
)
},
gp_filename=get_jacobi_vertex_gp_filename(),
Expand Down
2 changes: 1 addition & 1 deletion tests/linalg/test_tile_linalg_jacobi.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test__jacobi_eigh__single_iteration(self):

@unittest.skipUnless(ipu_num_tiles >= 16, "Requires IPU with 16 tiles")
def test__jacobi_eigh_raw__proper_eigh_result(self):
N = 8 * 6
N = 8
x = np.random.randn(N, N).astype(np.float32)
x = (x + x.T) / 2.0

Expand Down

0 comments on commit d32eef2

Please sign in to comment.