From a1bc514b3f2ce53613c6c6edc20c15c5b8cfc55a Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Thu, 12 Sep 2024 14:23:40 +0200 Subject: [PATCH] [software] Improve performance of axpy and dotp --- software/apps/baremetal/axpy_f16/main.c | 22 +-- software/apps/baremetal/axpy_f32/main.c | 21 ++- software/apps/baremetal/dotp_f16/main.c | 4 +- software/apps/baremetal/dotp_f32/main.c | 4 +- software/data/data_axpy_f16.h.tpl | 6 +- software/data/data_axpy_f32.h.tpl | 7 +- software/data/generate_dotp.py | 40 +++--- software/kernels/baremetal/mempool_axpy_f16.h | 103 +++++++------- software/kernels/baremetal/mempool_axpy_f32.h | 128 ++++++++---------- software/kernels/baremetal/mempool_dotp_f16.h | 54 +++++--- software/kernels/baremetal/mempool_dotp_f32.h | 51 ++++--- 11 files changed, 221 insertions(+), 219 deletions(-) diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c index 9fe49d299..ff13cb879 100644 --- a/software/apps/baremetal/axpy_f16/main.c +++ b/software/apps/baremetal/axpy_f16/main.c @@ -18,9 +18,8 @@ #define NUM_BANKS (NUM_CORES * BANKING_FACTOR) // Vectors for kernel computation -__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); -__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); -__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); +__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); #include "baremetal/mempool_axpy_f16.h" #include "baremetal/mempool_checks.h" @@ -35,25 +34,28 @@ int main() { time_init = 0; time_end = 0; if (core_id == 0) { - dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t)); - dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t)); - dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int16_t)); + dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t)); + dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t)); } + uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF; mempool_barrier(num_cores); // // SINGLE // time_init = mempool_get_timer(); - // axpy_f16s(l1_A, l1_B, l1_C, LEN); + // axpy_f16s(A, l1_X, l1_Y, LEN); // time_end = mempool_get_timer(); // // PARALLEL // time_init = mempool_get_timer(); - // axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores); + // axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores); // time_end = mempool_get_timer(); // PARALLEL, LOCAL ACCESSES time_init = mempool_get_timer(); - axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN); + // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN); + mempool_start_benchmark(); + axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN); + mempool_stop_benchmark(); time_end = mempool_get_timer(); mempool_barrier(num_cores); @@ -62,7 +64,7 @@ int main() { uint32_t clock_cycles = (time_end - time_init); printf("\nKernel execution takes %d clock cycles\n", clock_cycles); } - mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0); + mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c index 262342fb2..1b1bef859 100644 --- a/software/apps/baremetal/axpy_f32/main.c +++ b/software/apps/baremetal/axpy_f32/main.c @@ -16,13 +16,10 @@ #include "data_axpy_f32.h" #define NUM_BANKS (NUM_CORES * BANKING_FACTOR) -#define SINGLE_CORE_REDUCTION -// #define BINARY_REDUCTION // Vectors for kernel computation -float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); -float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); -float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); +float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); #include "baremetal/mempool_axpy_f32.h" #include "baremetal/mempool_checks.h" @@ -37,17 +34,17 @@ int main() { time_init = 0; time_end = 0; if (core_id == 0) { - dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t)); - dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t)); - dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int32_t)); + dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t)); + dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t)); } + float register volatile a = A; mempool_barrier(num_cores); // PARALLEL time_init = mempool_get_timer(); - // axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores); - // axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores); - axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN); + // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores); + // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores); + axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN); time_end = mempool_get_timer(); // Check results @@ -55,7 +52,7 @@ int main() { uint32_t clock_cycles = (time_end - time_init); printf("\nKernel execution takes %d clock cycles\n", clock_cycles); } - mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0); + mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c index 36a7f8f99..c579c8151 100644 --- a/software/apps/baremetal/dotp_f16/main.c +++ b/software/apps/baremetal/dotp_f16/main.c @@ -19,8 +19,8 @@ #define BINARY_REDUCTION // Vectors for kernel computation -__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); -__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); +__fp16 l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +__fp16 l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); uint32_t red_barrier[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); __fp16 sum[2 * NUM_BANKS] diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c index 8c3c7e8cd..731942eb7 100644 --- a/software/apps/baremetal/dotp_f32/main.c +++ b/software/apps/baremetal/dotp_f32/main.c @@ -20,8 +20,8 @@ #define BINARY_REDUCTION // Vectors for kernel computation -float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); -float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio"))); +float l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); +float l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); uint32_t red_barrier[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio"))); diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl index 09ea72cbf..4c6034baf 100644 --- a/software/data/data_axpy_f16.h.tpl +++ b/software/data/data_axpy_f16.h.tpl @@ -17,10 +17,10 @@ #define LEN (${Len}) -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; +__fp16 __attribute__((section(".l2"))) A = ${'(__fp16){:.4f}'.format(A)}; -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; +__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)}; -__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)}; +__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)}; __fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)}; diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl index 2efe34b45..f3fdc8b6a 100644 --- a/software/data/data_axpy_f32.h.tpl +++ b/software/data/data_axpy_f32.h.tpl @@ -15,12 +15,13 @@ return out %> \ + #define LEN (${Len}) -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)}; +float __attribute__((section(".l2"))) A = ${'(float){:.8f}'.format(A)}; -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)}; +float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)}; -float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)}; +float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)}; float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)}; diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py index 64170f573..b5e7410af 100644 --- a/software/data/generate_dotp.py +++ b/software/data/generate_dotp.py @@ -26,8 +26,8 @@ def generate_dotp_i32(Len): def generate_dotp_f32(Len): # Create matrix - A = np.random.rand(Len).astype(np.float32) - B = np.random.rand(Len).astype(np.float32) + A = np.random.randn(Len).astype(np.float32) + B = np.random.randn(Len).astype(np.float32) C = (np.dot(A, B)).astype(np.float32) return A, B, C @@ -35,8 +35,8 @@ def generate_dotp_f32(Len): def generate_dotp_f16(Len): # Create matrix - A = np.random.rand(Len).astype(np.float16) - B = np.random.rand(Len).astype(np.float16) + A = np.random.randn(Len).astype(np.float16) + B = np.random.randn(Len).astype(np.float16) C = (np.dot(A, B)).astype(np.float16) return A, B, C @@ -44,21 +44,21 @@ def generate_dotp_f16(Len): def generate_axpy_f32(Len): # Create matrix - A = np.random.rand(Len).astype(np.float32) - B = np.random.rand(Len).astype(np.float32) - C = np.random.rand(Len).astype(np.float32) - out = C + A * B - return A, B, C, out + X = np.random.rand(Len).astype(np.float32) + Y = np.random.rand(Len).astype(np.float32) + A = np.float32(3.14) + out = Y + A * X + return A, X, Y, out def generate_axpy_f16(Len): # Create matrix - A = np.random.rand(Len).astype(np.float16) - B = np.random.rand(Len).astype(np.float16) - C = np.random.rand(Len).astype(np.float16) - out = C + A * B - return A, B, C, out + X = np.random.rand(Len).astype(np.float16) + Y = np.random.rand(Len).astype(np.float16) + A = np.float16(3.14) + out = Y + A * X + return A, X, Y, out ################## # compute_result # @@ -130,24 +130,24 @@ def main(): 'Len': Len} gen_data_header_file(args.outdir, tpl, **kwargs) - A, B, C, out = generate_axpy_f32(Len) + A, X, Y, out = generate_axpy_f32(Len) tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl" kwargs = { 'name': 'data_axpy_f32', 'A': A, - 'B': B, - 'C': C, + 'X': X, + 'Y': Y, 'out': out, 'Len': Len} gen_data_header_file(args.outdir, tpl, **kwargs) - A, B, C, out = generate_axpy_f16(Len) + A, X, Y, out = generate_axpy_f16(Len) tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl" kwargs = { 'name': 'data_axpy_f16', 'A': A, - 'B': B, - 'C': C, + 'X': X, + 'Y': Y, 'out': out, 'Len': Len} gen_data_header_file(args.outdir, tpl, **kwargs) diff --git a/software/kernels/baremetal/mempool_axpy_f16.h b/software/kernels/baremetal/mempool_axpy_f16.h index e54331d2d..f123166ed 100644 --- a/software/kernels/baremetal/mempool_axpy_f16.h +++ b/software/kernels/baremetal/mempool_axpy_f16.h @@ -9,37 +9,43 @@ #define AXPYF16VEC_UNROLLED4_LOOP \ { \ - a01 = (*(v2h *)&in_a[i]); \ - a23 = (*(v2h *)&in_a[i + 2]); \ - b01 = (*(v2h *)&in_b[i]); \ - b23 = (*(v2h *)&in_b[i + 2]); \ - c01 = (*(v2h *)&in_c[i]); \ - c23 = (*(v2h *)&in_c[i + 2]); \ - asm volatile( \ - "vfmac.h %[c01], %[a01], %[b01];" \ - "vfmac.h %[c23], %[a23], %[b23];" \ - : [c01] "+&r"(c01), [c23] "+&r"(c23) \ - : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23)); \ - (*(v2h *)&in_c[i]) = c01; \ - (*(v2h *)&in_c[i + 2]) = c23; \ + x01 = (*(v2h *)&in_x[i]); \ + x23 = (*(v2h *)&in_x[i + 2]); \ + x45 = (*(v2h *)&in_x[i + 4]); \ + x67 = (*(v2h *)&in_x[i + 6]); \ + y01 = (*(v2h *)&in_y[i]); \ + y23 = (*(v2h *)&in_y[i + 2]); \ + y45 = (*(v2h *)&in_y[i + 4]); \ + y67 = (*(v2h *)&in_y[i + 6]); \ + asm volatile("vfmac.h %[y01], %[x01], %[aa];" \ + "vfmac.h %[y23], %[x23], %[aa];" \ + "vfmac.h %[y45], %[x45], %[aa];" \ + "vfmac.h %[y67], %[x67], %[aa];" \ + : [y01] "+&r"(y01), [y23] "+&r"(y23), [y45] "+&r"(y45), \ + [y67] "+&r"(y67) \ + : [x01] "r"(x01), [x23] "r"(x23), [x45] "r"(x45), \ + [x67] "r"(x67), [aa] "r"(aa)); \ + (*(v2h *)&in_y[i]) = y01; \ + (*(v2h *)&in_y[i + 2]) = y23; \ + (*(v2h *)&in_y[i + 4]) = y45; \ + (*(v2h *)&in_y[i + 6]) = y67; \ } /* Single-core dot-product */ -void axpy_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len) { +void axpy_f16s(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); if (core_id == 0) { mempool_start_benchmark(); // Kernel execution - __fp16 *end = in_a + Len / 2; + __fp16 *end = in_x + Len / 2; do { asm volatile("fmadd.h %0, %1, %2, %0;" - : "+&r"(*in_c) - : "r"(*in_a), "r"(*in_b)); - in_a++; - in_b++; - in_c++; - } while (in_a < end); + : "+&r"(*in_y) + : "r"(a), "r"(*in_x)); + in_x++; + in_y++; + } while (in_x < end); mempool_stop_benchmark(); } @@ -47,17 +53,16 @@ void axpy_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len) { } /* Single-core dot-product unrolled4 */ -void axpy_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, - uint32_t Len) { +void axpy_f16s_unrolled4(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); if (core_id == 0) { mempool_start_benchmark(); uint32_t i = 0; - v2h a01, a23; - v2h b01, b23; - v2h c01, c23; - for (i = 0; i < Len; i += 4) { + uint32_t aa = (a << 16U) | a; + v2h x01, x23, x45, x67; + v2h y01, y23, y45, y67; + for (i = 0; i < Len; i += 8) { AXPYF16VEC_UNROLLED4_LOOP; } mempool_stop_benchmark(); @@ -67,58 +72,56 @@ void axpy_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, } /* Parallel dot-product */ -void axpy_f16p(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len, +void axpy_f16p(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len, uint32_t nPE) { - uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = mempool_get_core_id(); uint32_t step = Len / nPE; - __fp16 a, b, c; + __fp16 x, y; for (uint32_t i = core_id * step; i < core_id * step + step; i++) { - a = in_a[i]; - b = in_b[i]; - c = in_c[i]; - asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b)); - in_c[i] = c; + x = in_x[i]; + y = in_y[i]; + asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(y) : "r"(a), "r"(x)); + in_y[i] = y; } - mempool_barrier(num_cores); + mempool_log_barrier(2, core_id); return; } /* Parallel dot-product with loop unrolling*/ -void axpy_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, +void axpy_f16vecp_unrolled4(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len, uint32_t nPE) { - uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = mempool_get_core_id(); uint32_t step = Len / nPE; uint32_t i; - v2h a01, a23; - v2h b01, b23; - v2h c01, c23; - for (i = core_id * step; i < core_id * step + step; i += 4) { + + uint32_t aa = (a << 16U) | a; + v2h x01, x23, x45, x67; + v2h y01, y23, y45, y67; + for (i = core_id * step; i < (core_id * step + step); i += 8) { AXPYF16VEC_UNROLLED4_LOOP; } - mempool_barrier(num_cores); + mempool_log_barrier(2, core_id); return; } /* Parallel dot-product with loop unrolling */ /* Load and stores only in local memory */ -void axpy_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, +void axpy_f16vecp_local_unrolled4(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len) { - uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = mempool_get_core_id(); - v2h a01, a23; - v2h b01, b23; - v2h c01, c23; - for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) { + + uint32_t aa = (a << 16U) | a; + v2h x01, x23, x45, x67; + v2h y01, y23, y45, y67; + for (uint32_t i = 2 * core_id * BANKING_FACTOR; i < Len; i += 2 * NUM_BANKS) { AXPYF16VEC_UNROLLED4_LOOP; } - mempool_barrier(num_cores); + mempool_log_barrier(2, core_id); return; } diff --git a/software/kernels/baremetal/mempool_axpy_f32.h b/software/kernels/baremetal/mempool_axpy_f32.h index ff069524c..27962d5cb 100644 --- a/software/kernels/baremetal/mempool_axpy_f32.h +++ b/software/kernels/baremetal/mempool_axpy_f32.h @@ -6,55 +6,49 @@ #define AXPYF32_UNROLLED4_LOOP \ { \ - a0 = in_a[i]; \ - b0 = in_b[i]; \ - c0 = in_c[i]; \ - a1 = in_a[i + 1]; \ - b1 = in_b[i + 1]; \ - c1 = in_c[i + 1]; \ - a2 = in_a[i + 2]; \ - b2 = in_b[i + 2]; \ - c2 = in_c[i + 2]; \ - a3 = in_a[i + 3]; \ - b3 = in_b[i + 3]; \ - c3 = in_c[i + 3]; \ + x0 = in_x[i]; \ + y0 = in_y[i]; \ + x1 = in_x[i + 1]; \ + y1 = in_y[i + 1]; \ + x2 = in_x[i + 2]; \ + y2 = in_y[i + 2]; \ + x3 = in_x[i + 3]; \ + y3 = in_y[i + 3]; \ asm volatile( \ - "fmadd.s %[c0], %[a0], %[b0], %[c0];" \ - "fmadd.s %[c1], %[a1], %[b1], %[c1];" \ - "fmadd.s %[c2], %[a2], %[b2], %[c2];" \ - "fmadd.s %[c3], %[a3], %[b3], %[c3];" \ - : [c0] "+&r"(c0), [c1] "+&r"(c1), [c2] "+&r"(c2), [c3] "+&r"(c3) \ - : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), \ - [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3)); \ - in_c[i] = c0; \ - in_c[i + 1] = c1; \ - in_c[i + 2] = c2; \ - in_c[i + 3] = c3; \ + "fmadd.s %[y0], %[a], %[x0], %[y0];" \ + "fmadd.s %[y1], %[a], %[x1], %[y1];" \ + "fmadd.s %[y2], %[a], %[x2], %[y2];" \ + "fmadd.s %[y3], %[a], %[x3], %[y3];" \ + : [y0] "+&r"(y0), [y1] "+&r"(y1), [y2] "+&r"(y2), [y3] "+&r"(y3) \ + : [x0] "r"(x0), [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [a] "r"(a)); \ + in_y[i] = y0; \ + in_y[i + 1] = y1; \ + in_y[i + 2] = y2; \ + in_y[i + 3] = y3; \ } /* Single-core dot-product */ -void axpy_f32s(float *in_a, float *in_b, float *in_c, uint32_t Len) { +void axpy_f32s(float a, float *in_x, float *in_y, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); if (core_id == 0) { mempool_start_benchmark(); // Kernel execution - float *end = in_a + Len; + float *end = in_x + Len; do { asm volatile("fmadd.s %0, %1, %2, %0;" - : "+&r"(*in_c) - : "r"(*in_a), "r"(*in_b)); - in_a++; - in_b++; - in_c++; - } while (in_a < end); + : "+&r"(*in_y) + : "r"(a), "r"(*in_x)); + in_x++; + in_y++; + } while (in_x < end); mempool_stop_benchmark(); } return; } /* Single-core dot-product unrolled4 */ -void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) { +void axpy_f32s_unrolled4(float a, float *in_x, float *in_y, uint32_t Len) { uint32_t core_id = mempool_get_core_id(); if (core_id == 0) { @@ -62,19 +56,16 @@ void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) { uint32_t reminder = Len % 4; uint32_t i = 0; - register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; - register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; - register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f; - + register float x2, x1, x0, x3; + register float y2, y1, y0, y3; for (i = 0; i < (Len - reminder); i += 4) { AXPYF32_UNROLLED4_LOOP; } while (i < Len) { - a0 = in_a[i]; - b0 = in_b[i]; - c0 = in_c[i]; - asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0)); - in_c[i] = c0; + x0 = in_x[i]; + y0 = in_y[i]; + asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y0) : "r"(a), "r"(x0)); + in_y[i] = y0; i++; } mempool_stop_benchmark(); @@ -83,83 +74,72 @@ void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) { } /* Parallel dot-product */ -void axpy_f32p(float *in_a, float *in_b, float *in_c, uint32_t Len, - uint32_t nPE) { +void axpy_f32p(float a, float *in_x, float *in_y, uint32_t Len, uint32_t nPE) { - uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = mempool_get_core_id(); uint32_t step = Len / nPE; - register float a, b, c; + register float x, y; for (uint32_t i = core_id * step; i < core_id * step + step; i++) { - a = in_a[i]; - b = in_b[i]; - c = in_c[i]; - asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b)); - in_c[i] = c; + x = in_x[i]; + y = in_y[i]; + asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y) : "r"(a), "r"(x)); + in_y[i] = y; } - mempool_barrier(num_cores); + mempool_log_barrier(2, core_id); return; } /* Parallel dot-product with loop unrolling*/ -void axpy_f32p_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len, +void axpy_f32p_unrolled4(float a, float *in_x, float *in_y, uint32_t Len, uint32_t nPE) { - uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = mempool_get_core_id(); uint32_t step = Len / nPE; uint32_t reminder = step % 4; uint32_t i; - register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; - register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; - register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f; - + register float x2, x1, x0, x3; + register float y2, y1, y0, y3; for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) { AXPYF32_UNROLLED4_LOOP; } i = core_id * step + step - reminder; while (i < step) { - a0 = in_a[i]; - b0 = in_b[i]; - c0 = in_c[i]; - asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0)); - in_c[i] = c0; + x0 = in_x[i]; + y0 = in_y[i]; + asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y0) : "r"(a), "r"(x0)); + in_y[i] = y0; i++; } - mempool_barrier(num_cores); + mempool_log_barrier(2, core_id); return; } /* Parallel dot-product with loop unrolling */ /* Load and stores only in local memory */ -void axpy_f32p_local_unrolled4(float *in_a, float *in_b, float *in_c, +void axpy_f32p_local_unrolled4(float a, float *in_x, float *in_y, uint32_t Len) { - uint32_t num_cores = mempool_get_core_count(); uint32_t core_id = mempool_get_core_id(); uint32_t const remainder = Len % BANKING_FACTOR; uint32_t const idx_stop = Len - remainder; - register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; - register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; - register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f; - + register float x2, x1, x0, x3; + register float y2, y1, y0, y3; for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) { AXPYF32_UNROLLED4_LOOP; } if (core_id == ((Len % NUM_BANKS) / 4)) { for (uint32_t i = Len - remainder; i < Len; i++) { - a0 = in_a[i]; - b0 = in_b[i]; - asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0)); - in_c[i] = c0; + x0 = in_x[i]; + asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y0) : "r"(a), "r"(x0)); + in_y[i] = y0; } } - mempool_barrier(num_cores); + mempool_log_barrier(2, core_id); return; } diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h index 17d13df24..791b7c68e 100644 --- a/software/kernels/baremetal/mempool_dotp_f16.h +++ b/software/kernels/baremetal/mempool_dotp_f16.h @@ -11,13 +11,21 @@ { \ a01 = (*(v2h *)&in_a[i]); \ a23 = (*(v2h *)&in_a[i + 2]); \ + a45 = (*(v2h *)&in_a[i + 4]); \ + a67 = (*(v2h *)&in_a[i + 6]); \ b01 = (*(v2h *)&in_b[i]); \ b23 = (*(v2h *)&in_b[i + 2]); \ + b45 = (*(v2h *)&in_b[i + 4]); \ + b67 = (*(v2h *)&in_b[i + 6]); \ asm volatile( \ "vfdotpex.s.h %[local_sum0], %[a01], %[b01];" \ "vfdotpex.s.h %[local_sum1], %[a23], %[b23];" \ - : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1) \ - : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23)); \ + "vfdotpex.s.h %[local_sum2], %[a45], %[b45];" \ + "vfdotpex.s.h %[local_sum3], %[a67], %[b67];" \ + : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1), \ + [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3) \ + : [a01] "r"(a01), [a23] "r"(a23), [a45] "r"(a45), [a67] "r"(a67), \ + [b01] "r"(b01), [b23] "r"(b23), [b45] "r"(b45), [b67] "r"(b67)); \ } /* Single core reduction */ @@ -116,20 +124,23 @@ void dotp_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len) { if (core_id == 0) { mempool_start_benchmark(); uint32_t i = 0; - - v2h a01, a23; - v2h b01, b23; + v2h a01, a23, a45, a67; + v2h b01, b23, b45, b67; float local_sum0 = 0.0f; float local_sum1 = 0.0f; - - for (i = 0; i < Len; i += 4) { + float local_sum2 = 0.0f; + float local_sum3 = 0.0f; + for (i = 0; i < Len; i += 8) { DOTPF16VEC_UNROLLED4_LOOP; } // Reduction asm volatile( "fadd.s %[local_sum0], %[local_sum0], %[local_sum1];" + "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];" + "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];" "fcvt.h.s %[local_sum0], %[local_sum0];" - : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1) + : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1), + [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3) :); s[0] = *(__fp16 *)&local_sum0; mempool_stop_benchmark(); @@ -168,17 +179,21 @@ void dotp_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len, uint32_t step = Len / nPE; uint32_t i; - v2h a01, a23; - v2h b01, b23; + v2h a01, a23, a45, a67; + v2h b01, b23, b45, b67; float local_sum0 = 0.0f; float local_sum1 = 0.0f; - - for (i = core_id * step; i < core_id * step + step; i += 4) { + float local_sum2 = 0.0f; + float local_sum3 = 0.0f; + for (i = core_id * step; i < (core_id * step + step); i += 8) { DOTPF16VEC_UNROLLED4_LOOP; } asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];" + "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];" + "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];" "fcvt.h.s %[local_sum0], %[local_sum0];" - : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1) + : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1), + [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3) :); s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0; uint32_t num_cores = mempool_get_core_count(); @@ -194,16 +209,21 @@ void dotp_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t core_id = mempool_get_core_id(); - v2h a01, a23; - v2h b01, b23; + v2h a01, a23, a45, a67; + v2h b01, b23, b45, b67; float local_sum0 = 0.0f; float local_sum1 = 0.0f; - for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) { + float local_sum2 = 0.0f; + float local_sum3 = 0.0f; + for (uint32_t i = 2 * core_id * BANKING_FACTOR; i < Len; i += 2 * NUM_BANKS) { DOTPF16VEC_UNROLLED4_LOOP; } asm volatile("fadd.s %[local_sum0], %[local_sum0], %[local_sum1];" + "fadd.s %[local_sum2], %[local_sum2], %[local_sum3];" + "fadd.s %[local_sum0], %[local_sum0], %[local_sum2];" "fcvt.h.s %[local_sum0], %[local_sum0];" - : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1) + : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1), + [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3) :); s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0; diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h index 58fa0e9d5..290b96d59 100644 --- a/software/kernels/baremetal/mempool_dotp_f32.h +++ b/software/kernels/baremetal/mempool_dotp_f32.h @@ -7,12 +7,12 @@ #define DOTPF32_UNROLLED4_LOOP \ { \ a0 = in_a[i]; \ - b0 = in_b[i]; \ a1 = in_a[i + 1]; \ - b1 = in_b[i + 1]; \ a2 = in_a[i + 2]; \ - b2 = in_b[i + 2]; \ a3 = in_a[i + 3]; \ + b0 = in_b[i]; \ + b1 = in_b[i + 1]; \ + b2 = in_b[i + 2]; \ b3 = in_b[i + 3]; \ asm volatile( \ "fmadd.s %[local_sum0], %[a0], %[b0], %[local_sum0];" \ @@ -95,7 +95,7 @@ void dotp_f32s(float *in_a, float *in_b, float *s, uint32_t Len) { if (core_id == 0) { mempool_start_benchmark(); // Kernel execution - register float local_sum = 0; + float local_sum = 0; float *end = in_a + Len; do { asm volatile("fmadd.s %0, %1, %2, %0;" @@ -120,13 +120,12 @@ void dotp_f32s_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len) { uint32_t reminder = Len % 4; uint32_t i = 0; - register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; - register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; - register float local_sum0 = 0.0f; - register float local_sum1 = 0.0f; - register float local_sum2 = 0.0f; - register float local_sum3 = 0.0f; - + float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; + float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; + float local_sum0 = 0.0f; + float local_sum1 = 0.0f; + float local_sum2 = 0.0f; + float local_sum3 = 0.0f; for (i = 0; i < (Len - reminder); i += 4) { DOTPF32_UNROLLED4_LOOP; } @@ -158,8 +157,8 @@ void dotp_f32p(float *in_a, float *in_b, float *s, uint32_t Len, uint32_t nPE) { uint32_t core_id = mempool_get_core_id(); uint32_t step = Len / nPE; - register float local_sum = 0; - register float a, b; + float local_sum = 0; + float a, b; for (uint32_t i = core_id * step; i < core_id * step + step; i++) { a = in_a[i]; b = in_b[i]; @@ -182,12 +181,12 @@ void dotp_f32p_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len, uint32_t reminder = step % 4; uint32_t i; - register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; - register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; - register float local_sum0 = 0.0f; - register float local_sum1 = 0.0f; - register float local_sum2 = 0.0f; - register float local_sum3 = 0.0f; + float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; + float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; + float local_sum0 = 0.0f; + float local_sum1 = 0.0f; + float local_sum2 = 0.0f; + float local_sum3 = 0.0f; for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) { DOTPF32_UNROLLED4_LOOP; @@ -223,18 +222,18 @@ void dotp_f32p_local_unrolled4(float *in_a, float *in_b, float *s, uint32_t const remainder = Len % BANKING_FACTOR; uint32_t const idx_stop = Len - remainder; - register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f; - register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f; - register float local_sum0 = 0.0f; - register float local_sum1 = 0.0f; - register float local_sum2 = 0.0f; - register float local_sum3 = 0.0f; + float a0, a1, a2, a3; + float b2, b1, b0, b3; + float local_sum0 = 0.0f; + float local_sum1 = 0.0f; + float local_sum2 = 0.0f; + float local_sum3 = 0.0f; for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) { DOTPF32_UNROLLED4_LOOP; } if (core_id == ((Len % NUM_BANKS) / 4)) { - for (uint32_t i = Len - remainder; i < Len; i++) { + for (uint32_t i = idx_stop; i < Len; i++) { a0 = in_a[i]; b0 = in_b[i]; asm volatile("fmadd.s %0, %1, %2, %0;"