From a1bc514b3f2ce53613c6c6edc20c15c5b8cfc55a Mon Sep 17 00:00:00 2001
From: mbertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 12 Sep 2024 14:23:40 +0200
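Subject: [PATCH] [software] Improve performance of axpy and dotp

Rework the axpy kernels to compute Y = a * X + Y in place with a scalar
coefficient a, instead of the element-wise C += A * B on three vectors.
The data generator, the header templates and the test applications are
updated accordingly: l2_A/l2_B/l2_C become the scalar A and the vectors
l2_X/l2_Y, and the result is checked against l1_Y.

Further changes:
- Process 8 half-precision elements per iteration (previously 4) in the
  unrolled f16 axpy and dotp loops; dotp uses four partial sums.
- Replace mempool_barrier(num_cores) with mempool_log_barrier(2, core_id)
  at the end of the parallel axpy kernels.
- Align the L1 vectors to NUM_BANKS instead of LEN.
- Draw the dotp test vectors from randn instead of rand.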
---
 software/apps/baremetal/axpy_f16/main.c       |  22 +--
 software/apps/baremetal/axpy_f32/main.c       |  21 ++-
 software/apps/baremetal/dotp_f16/main.c       |   4 +-
 software/apps/baremetal/dotp_f32/main.c       |   4 +-
 software/data/data_axpy_f16.h.tpl             |   6 +-
 software/data/data_axpy_f32.h.tpl             |   7 +-
 software/data/generate_dotp.py                |  40 +++---
 software/kernels/baremetal/mempool_axpy_f16.h | 103 +++++++-------
 software/kernels/baremetal/mempool_axpy_f32.h | 128 ++++++++----------
 software/kernels/baremetal/mempool_dotp_f16.h |  54 +++++---
 software/kernels/baremetal/mempool_dotp_f32.h |  51 ++++---
 11 files changed, 221 insertions(+), 219 deletions(-)

diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
index 9fe49d299..ff13cb879 100644
--- a/software/apps/baremetal/axpy_f16/main.c
+++ b/software/apps/baremetal/axpy_f16/main.c
@@ -18,9 +18,8 @@
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
-__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f16.h"
 #include "baremetal/mempool_checks.h"
@@ -35,25 +34,28 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
   }
+  uint32_t register volatile a = *(uint16_t *)&(A); // raw fp16 bits of A
   mempool_barrier(num_cores);
 
   //  // SINGLE
   //  time_init = mempool_get_timer();
-  //  axpy_f16s(l1_A, l1_B, l1_C, LEN);
+  //  axpy_f16s(A, l1_X, l1_Y, LEN);
   //  time_end = mempool_get_timer();
 
   //  // PARALLEL
   //  time_init = mempool_get_timer();
-  //  axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
+  //  axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores);
   //  time_end = mempool_get_timer();
 
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  mempool_start_benchmark();
+  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  mempool_stop_benchmark();
   time_end = mempool_get_timer();
 
   mempool_barrier(num_cores);
@@ -62,7 +64,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
index 262342fb2..1b1bef859 100644
--- a/software/apps/baremetal/axpy_f32/main.c
+++ b/software/apps/baremetal/axpy_f32/main.c
@@ -16,13 +16,10 @@
 
 #include "data_axpy_f32.h"
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-#define SINGLE_CORE_REDUCTION
-// #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f32.h"
 #include "baremetal/mempool_checks.h"
@@ -37,17 +34,17 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t));
   }
+  float register volatile a = A;
   mempool_barrier(num_cores);
 
   // PARALLEL
   time_init = mempool_get_timer();
-  // axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores);
-  // axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
-  axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
+  // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
+  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
   time_end = mempool_get_timer();
 
   // Check results
@@ -55,7 +52,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
index 36a7f8f99..c579c8151 100644
--- a/software/apps/baremetal/dotp_f16/main.c
+++ b/software/apps/baremetal/dotp_f16/main.c
@@ -19,8 +19,8 @@
 #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 __fp16 sum[2 * NUM_BANKS]
diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
index 8c3c7e8cd..731942eb7 100644
--- a/software/apps/baremetal/dotp_f32/main.c
+++ b/software/apps/baremetal/dotp_f32/main.c
@@ -20,8 +20,8 @@
 #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl
index 09ea72cbf..4c6034baf 100644
--- a/software/data/data_axpy_f16.h.tpl
+++ b/software/data/data_axpy_f16.h.tpl
@@ -17,10 +17,10 @@
 
 #define LEN (${Len})
 
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+__fp16 __attribute__((section(".l2"))) A = ${'(__fp16){:.4f}'.format(A)};
 
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};
 
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};
 
 __fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl
index 2efe34b45..f3fdc8b6a 100644
--- a/software/data/data_axpy_f32.h.tpl
+++ b/software/data/data_axpy_f32.h.tpl
@@ -15,12 +15,13 @@
     return out
 %> \
 
+
 #define LEN (${Len})
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+float __attribute__((section(".l2"))) A = ${'(float){:.8f}'.format(A)};
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};
 
 float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
index 64170f573..b5e7410af 100644
--- a/software/data/generate_dotp.py
+++ b/software/data/generate_dotp.py
@@ -26,8 +26,8 @@ def generate_dotp_i32(Len):
 def generate_dotp_f32(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float32)
-    B = np.random.rand(Len).astype(np.float32)
+    A = np.random.randn(Len).astype(np.float32)
+    B = np.random.randn(Len).astype(np.float32)
     C = (np.dot(A, B)).astype(np.float32)
     return A, B, C
 
@@ -35,8 +35,8 @@ def generate_dotp_f32(Len):
 def generate_dotp_f16(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float16)
-    B = np.random.rand(Len).astype(np.float16)
+    A = np.random.randn(Len).astype(np.float16)
+    B = np.random.randn(Len).astype(np.float16)
     C = (np.dot(A, B)).astype(np.float16)
     return A, B, C
 
@@ -44,21 +44,21 @@ def generate_dotp_f16(Len):
 def generate_axpy_f32(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float32)
-    B = np.random.rand(Len).astype(np.float32)
-    C = np.random.rand(Len).astype(np.float32)
-    out = C + A * B
-    return A, B, C, out
+    X = np.random.rand(Len).astype(np.float32)
+    Y = np.random.rand(Len).astype(np.float32)
+    A = np.float32(3.14)
+    out = Y + A * X
+    return A, X, Y, out
 
 
 def generate_axpy_f16(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float16)
-    B = np.random.rand(Len).astype(np.float16)
-    C = np.random.rand(Len).astype(np.float16)
-    out = C + A * B
-    return A, B, C, out
+    X = np.random.rand(Len).astype(np.float16)
+    Y = np.random.rand(Len).astype(np.float16)
+    A = np.float16(3.14)
+    out = Y + A * X
+    return A, X, Y, out
 
 ##################
 # compute_result #
@@ -130,24 +130,24 @@ def main():
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
-    A, B, C, out = generate_axpy_f32(Len)
+    A, X, Y, out = generate_axpy_f32(Len)
     tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl"
     kwargs = {
         'name': 'data_axpy_f32',
         'A': A,
-        'B': B,
-        'C': C,
+        'X': X,
+        'Y': Y,
         'out': out,
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
-    A, B, C, out = generate_axpy_f16(Len)
+    A, X, Y, out = generate_axpy_f16(Len)
     tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl"
     kwargs = {
         'name': 'data_axpy_f16',
         'A': A,
-        'B': B,
-        'C': C,
+        'X': X,
+        'Y': Y,
         'out': out,
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
diff --git a/software/kernels/baremetal/mempool_axpy_f16.h b/software/kernels/baremetal/mempool_axpy_f16.h
index e54331d2d..f123166ed 100644
--- a/software/kernels/baremetal/mempool_axpy_f16.h
+++ b/software/kernels/baremetal/mempool_axpy_f16.h
@@ -9,37 +9,43 @@
 
 #define AXPYF16VEC_UNROLLED4_LOOP                                              \
   {                                                                            \
-    a01 = (*(v2h *)&in_a[i]);                                                  \
-    a23 = (*(v2h *)&in_a[i + 2]);                                              \
-    b01 = (*(v2h *)&in_b[i]);                                                  \
-    b23 = (*(v2h *)&in_b[i + 2]);                                              \
-    c01 = (*(v2h *)&in_c[i]);                                                  \
-    c23 = (*(v2h *)&in_c[i + 2]);                                              \
-    asm volatile(                                                              \
-        "vfmac.h %[c01], %[a01], %[b01];"                                      \
-        "vfmac.h %[c23], %[a23], %[b23];"                                      \
-        : [c01] "+&r"(c01), [c23] "+&r"(c23)                                   \
-        : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23));     \
-    (*(v2h *)&in_c[i]) = c01;                                                  \
-    (*(v2h *)&in_c[i + 2]) = c23;                                              \
+    x01 = (*(v2h *)&in_x[i]);                                                  \
+    x23 = (*(v2h *)&in_x[i + 2]);                                              \
+    x45 = (*(v2h *)&in_x[i + 4]);                                              \
+    x67 = (*(v2h *)&in_x[i + 6]);                                              \
+    y01 = (*(v2h *)&in_y[i]);                                                  \
+    y23 = (*(v2h *)&in_y[i + 2]);                                              \
+    y45 = (*(v2h *)&in_y[i + 4]);                                              \
+    y67 = (*(v2h *)&in_y[i + 6]);                                              \
+    asm volatile("vfmac.h %[y01], %[x01], %[aa];"                              \
+                 "vfmac.h %[y23], %[x23], %[aa];"                              \
+                 "vfmac.h %[y45], %[x45], %[aa];"                              \
+                 "vfmac.h %[y67], %[x67], %[aa];"                              \
+                 : [y01] "+&r"(y01), [y23] "+&r"(y23), [y45] "+&r"(y45),       \
+                   [y67] "+&r"(y67)                                            \
+                 : [x01] "r"(x01), [x23] "r"(x23), [x45] "r"(x45),             \
+                   [x67] "r"(x67), [aa] "r"(aa));                              \
+    (*(v2h *)&in_y[i]) = y01;                                                  \
+    (*(v2h *)&in_y[i + 2]) = y23;                                              \
+    (*(v2h *)&in_y[i + 4]) = y45;                                              \
+    (*(v2h *)&in_y[i + 6]) = y67;                                              \
   }
 
 /* Single-core dot-product */
-void axpy_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len) {
+void axpy_f16s(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   if (core_id == 0) {
     mempool_start_benchmark();
     // Kernel execution
-    __fp16 *end = in_a + Len / 2;
+    __fp16 *end = in_x + Len; // one scalar fmadd.h per element
     do {
       asm volatile("fmadd.h %0, %1, %2, %0;"
-                   : "+&r"(*in_c)
-                   : "r"(*in_a), "r"(*in_b));
-      in_a++;
-      in_b++;
-      in_c++;
-    } while (in_a < end);
+                   : "+&r"(*in_y)
+                   : "r"(a), "r"(*in_x));
+      in_x++;
+      in_y++;
+    } while (in_x < end);
     mempool_stop_benchmark();
   }
 
@@ -47,17 +53,16 @@ void axpy_f16s(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len) {
 }
 
 /* Single-core dot-product unrolled4 */
-void axpy_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
-                         uint32_t Len) {
+void axpy_f16s_unrolled4(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   if (core_id == 0) {
     mempool_start_benchmark();
     uint32_t i = 0;
-    v2h a01, a23;
-    v2h b01, b23;
-    v2h c01, c23;
-    for (i = 0; i < Len; i += 4) {
+    uint32_t aa = (a << 16U) | a;
+    v2h x01, x23, x45, x67;
+    v2h y01, y23, y45, y67;
+    for (i = 0; i < Len; i += 8) {
       AXPYF16VEC_UNROLLED4_LOOP;
     }
     mempool_stop_benchmark();
@@ -67,58 +72,56 @@ void axpy_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
 }
 
 /* Parallel dot-product */
-void axpy_f16p(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c, uint32_t Len,
+void axpy_f16p(uint32_t a, __fp16 *in_x, __fp16 *in_y, uint32_t Len,
                uint32_t nPE) {
 
-  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = mempool_get_core_id();
   uint32_t step = Len / nPE;
-  __fp16 a, b, c;
+  __fp16 x, y;
   for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
-    a = in_a[i];
-    b = in_b[i];
-    c = in_c[i];
-    asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b));
-    in_c[i] = c;
+    x = in_x[i];
+    y = in_y[i];
+    asm volatile("fmadd.h %0, %1, %2, %0;" : "+&r"(y) : "r"(a), "r"(x));
+    in_y[i] = y;
   }
-  mempool_barrier(num_cores);
+  mempool_log_barrier(2, core_id);
 
   return;
 }
 
 /* Parallel dot-product with loop unrolling*/
-void axpy_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+void axpy_f16vecp_unrolled4(uint32_t a, __fp16 *in_x, __fp16 *in_y,
                             uint32_t Len, uint32_t nPE) {
 
-  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = mempool_get_core_id();
   uint32_t step = Len / nPE;
   uint32_t i;
-  v2h a01, a23;
-  v2h b01, b23;
-  v2h c01, c23;
-  for (i = core_id * step; i < core_id * step + step; i += 4) {
+
+  uint32_t aa = (a << 16U) | a;
+  v2h x01, x23, x45, x67;
+  v2h y01, y23, y45, y67;
+  for (i = core_id * step; i < (core_id * step + step); i += 8) {
     AXPYF16VEC_UNROLLED4_LOOP;
   }
-  mempool_barrier(num_cores);
+  mempool_log_barrier(2, core_id);
 
   return;
 }
 
 /* Parallel dot-product with loop unrolling */
 /* Load and stores only in local memory */
-void axpy_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *in_c,
+void axpy_f16vecp_local_unrolled4(uint32_t a, __fp16 *in_x, __fp16 *in_y,
                                   uint32_t Len) {
 
-  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = mempool_get_core_id();
-  v2h a01, a23;
-  v2h b01, b23;
-  v2h c01, c23;
-  for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) {
+
+  uint32_t aa = (a << 16U) | a;
+  v2h x01, x23, x45, x67;
+  v2h y01, y23, y45, y67;
+  for (uint32_t i = 2 * core_id * BANKING_FACTOR; i < Len; i += 2 * NUM_BANKS) {
     AXPYF16VEC_UNROLLED4_LOOP;
   }
-  mempool_barrier(num_cores);
+  mempool_log_barrier(2, core_id);
 
   return;
 }
diff --git a/software/kernels/baremetal/mempool_axpy_f32.h b/software/kernels/baremetal/mempool_axpy_f32.h
index ff069524c..27962d5cb 100644
--- a/software/kernels/baremetal/mempool_axpy_f32.h
+++ b/software/kernels/baremetal/mempool_axpy_f32.h
@@ -6,55 +6,49 @@
 
 #define AXPYF32_UNROLLED4_LOOP                                                 \
   {                                                                            \
-    a0 = in_a[i];                                                              \
-    b0 = in_b[i];                                                              \
-    c0 = in_c[i];                                                              \
-    a1 = in_a[i + 1];                                                          \
-    b1 = in_b[i + 1];                                                          \
-    c1 = in_c[i + 1];                                                          \
-    a2 = in_a[i + 2];                                                          \
-    b2 = in_b[i + 2];                                                          \
-    c2 = in_c[i + 2];                                                          \
-    a3 = in_a[i + 3];                                                          \
-    b3 = in_b[i + 3];                                                          \
-    c3 = in_c[i + 3];                                                          \
+    x0 = in_x[i];                                                              \
+    y0 = in_y[i];                                                              \
+    x1 = in_x[i + 1];                                                          \
+    y1 = in_y[i + 1];                                                          \
+    x2 = in_x[i + 2];                                                          \
+    y2 = in_y[i + 2];                                                          \
+    x3 = in_x[i + 3];                                                          \
+    y3 = in_y[i + 3];                                                          \
     asm volatile(                                                              \
-        "fmadd.s %[c0], %[a0], %[b0], %[c0];"                                  \
-        "fmadd.s %[c1], %[a1], %[b1], %[c1];"                                  \
-        "fmadd.s %[c2], %[a2], %[b2], %[c2];"                                  \
-        "fmadd.s %[c3], %[a3], %[b3], %[c3];"                                  \
-        : [c0] "+&r"(c0), [c1] "+&r"(c1), [c2] "+&r"(c2), [c3] "+&r"(c3)       \
-        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3),              \
-          [b0] "r"(b0), [b1] "r"(b1), [b2] "r"(b2), [b3] "r"(b3));             \
-    in_c[i] = c0;                                                              \
-    in_c[i + 1] = c1;                                                          \
-    in_c[i + 2] = c2;                                                          \
-    in_c[i + 3] = c3;                                                          \
+        "fmadd.s %[y0], %[a], %[x0], %[y0];"                                   \
+        "fmadd.s %[y1], %[a], %[x1], %[y1];"                                   \
+        "fmadd.s %[y2], %[a], %[x2], %[y2];"                                   \
+        "fmadd.s %[y3], %[a], %[x3], %[y3];"                                   \
+        : [y0] "+&r"(y0), [y1] "+&r"(y1), [y2] "+&r"(y2), [y3] "+&r"(y3)       \
+        : [x0] "r"(x0), [x1] "r"(x1), [x2] "r"(x2), [x3] "r"(x3), [a] "r"(a)); \
+    in_y[i] = y0;                                                              \
+    in_y[i + 1] = y1;                                                          \
+    in_y[i + 2] = y2;                                                          \
+    in_y[i + 3] = y3;                                                          \
   }
 
 /* Single-core dot-product */
-void axpy_f32s(float *in_a, float *in_b, float *in_c, uint32_t Len) {
+void axpy_f32s(float a, float *in_x, float *in_y, uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   if (core_id == 0) {
     mempool_start_benchmark();
     // Kernel execution
-    float *end = in_a + Len;
+    float *end = in_x + Len;
     do {
       asm volatile("fmadd.s %0, %1, %2, %0;"
-                   : "+&r"(*in_c)
-                   : "r"(*in_a), "r"(*in_b));
-      in_a++;
-      in_b++;
-      in_c++;
-    } while (in_a < end);
+                   : "+&r"(*in_y)
+                   : "r"(a), "r"(*in_x));
+      in_x++;
+      in_y++;
+    } while (in_x < end);
     mempool_stop_benchmark();
   }
   return;
 }
 
 /* Single-core dot-product unrolled4 */
-void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) {
+void axpy_f32s_unrolled4(float a, float *in_x, float *in_y, uint32_t Len) {
 
   uint32_t core_id = mempool_get_core_id();
   if (core_id == 0) {
@@ -62,19 +56,16 @@ void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) {
     uint32_t reminder = Len % 4;
     uint32_t i = 0;
 
-    register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
-    register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
-    register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
-
+    register float x2, x1, x0, x3;
+    register float y2, y1, y0, y3;
     for (i = 0; i < (Len - reminder); i += 4) {
       AXPYF32_UNROLLED4_LOOP;
     }
     while (i < Len) {
-      a0 = in_a[i];
-      b0 = in_b[i];
-      c0 = in_c[i];
-      asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
-      in_c[i] = c0;
+      x0 = in_x[i];
+      y0 = in_y[i];
+      asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y0) : "r"(a), "r"(x0));
+      in_y[i] = y0;
       i++;
     }
     mempool_stop_benchmark();
@@ -83,83 +74,72 @@ void axpy_f32s_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len) {
 }
 
 /* Parallel dot-product */
-void axpy_f32p(float *in_a, float *in_b, float *in_c, uint32_t Len,
-               uint32_t nPE) {
+void axpy_f32p(float a, float *in_x, float *in_y, uint32_t Len, uint32_t nPE) {
 
-  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = mempool_get_core_id();
   uint32_t step = Len / nPE;
 
-  register float a, b, c;
+  register float x, y;
   for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
-    a = in_a[i];
-    b = in_b[i];
-    c = in_c[i];
-    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c) : "r"(a), "r"(b));
-    in_c[i] = c;
+    x = in_x[i];
+    y = in_y[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y) : "r"(a), "r"(x));
+    in_y[i] = y;
   }
-  mempool_barrier(num_cores);
+  mempool_log_barrier(2, core_id);
 
   return;
 }
 
 /* Parallel dot-product with loop unrolling*/
-void axpy_f32p_unrolled4(float *in_a, float *in_b, float *in_c, uint32_t Len,
+void axpy_f32p_unrolled4(float a, float *in_x, float *in_y, uint32_t Len,
                          uint32_t nPE) {
 
-  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = mempool_get_core_id();
   uint32_t step = Len / nPE;
   uint32_t reminder = step % 4;
   uint32_t i;
 
-  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
-  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
-  register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
-
+  register float x2, x1, x0, x3;
+  register float y2, y1, y0, y3;
   for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
     AXPYF32_UNROLLED4_LOOP;
   }
   i = core_id * step + step - reminder;
   while (i < step) {
-    a0 = in_a[i];
-    b0 = in_b[i];
-    c0 = in_c[i];
-    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
-    in_c[i] = c0;
+    x0 = in_x[i];
+    y0 = in_y[i];
+    asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(y0) : "r"(a), "r"(x0));
+    in_y[i] = y0;
     i++;
   }
-  mempool_barrier(num_cores);
+  mempool_log_barrier(2, core_id);
 
   return;
 }
 
 /* Parallel dot-product with loop unrolling */
 /* Load and stores only in local memory */
-void axpy_f32p_local_unrolled4(float *in_a, float *in_b, float *in_c,
+void axpy_f32p_local_unrolled4(float a, float *in_x, float *in_y,
                                uint32_t Len) {
 
-  uint32_t num_cores = mempool_get_core_count();
   uint32_t core_id = mempool_get_core_id();
   uint32_t const remainder = Len % BANKING_FACTOR;
   uint32_t const idx_stop = Len - remainder;
 
-  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
-  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
-  register float c2 = 0.0f, c1 = 0.0f, c0 = 0.0f, c3 = 0.0f;
-
+  register float x2, x1, x0, x3;
+  register float y2, y1, y0, y3;
   for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
     AXPYF32_UNROLLED4_LOOP;
   }
   if (core_id == ((Len % NUM_BANKS) / 4)) {
     for (uint32_t i = Len - remainder; i < Len; i++) {
-      a0 = in_a[i];
-      b0 = in_b[i];
-      asm volatile("fmadd.s %0, %1, %2, %0;" : "+&r"(c0) : "r"(a0), "r"(b0));
-      in_c[i] = c0;
+      asm volatile("fmadd.s %0, %1, %2, %0;"
+                   : "+&r"(in_y[i])
+                   : "r"(a), "r"(in_x[i])); // y[i] += a * x[i]
     }
   }
-  mempool_barrier(num_cores);
+  mempool_log_barrier(2, core_id);
 
   return;
 }
diff --git a/software/kernels/baremetal/mempool_dotp_f16.h b/software/kernels/baremetal/mempool_dotp_f16.h
index 17d13df24..791b7c68e 100644
--- a/software/kernels/baremetal/mempool_dotp_f16.h
+++ b/software/kernels/baremetal/mempool_dotp_f16.h
@@ -11,13 +11,21 @@
   {                                                                            \
     a01 = (*(v2h *)&in_a[i]);                                                  \
     a23 = (*(v2h *)&in_a[i + 2]);                                              \
+    a45 = (*(v2h *)&in_a[i + 4]);                                              \
+    a67 = (*(v2h *)&in_a[i + 6]);                                              \
     b01 = (*(v2h *)&in_b[i]);                                                  \
     b23 = (*(v2h *)&in_b[i + 2]);                                              \
+    b45 = (*(v2h *)&in_b[i + 4]);                                              \
+    b67 = (*(v2h *)&in_b[i + 6]);                                              \
     asm volatile(                                                              \
         "vfdotpex.s.h %[local_sum0], %[a01], %[b01];"                          \
         "vfdotpex.s.h %[local_sum1], %[a23], %[b23];"                          \
-        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)       \
-        : [a01] "r"(a01), [a23] "r"(a23), [b01] "r"(b01), [b23] "r"(b23));     \
+        "vfdotpex.s.h %[local_sum2], %[a45], %[b45];"                          \
+        "vfdotpex.s.h %[local_sum3], %[a67], %[b67];"                          \
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),      \
+          [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)       \
+        : [a01] "r"(a01), [a23] "r"(a23), [a45] "r"(a45), [a67] "r"(a67),      \
+          [b01] "r"(b01), [b23] "r"(b23), [b45] "r"(b45), [b67] "r"(b67));     \
   }
 
 /* Single core reduction */
@@ -116,20 +124,23 @@ void dotp_f16s_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len) {
   if (core_id == 0) {
     mempool_start_benchmark();
     uint32_t i = 0;
-
-    v2h a01, a23;
-    v2h b01, b23;
+    v2h a01, a23, a45, a67;
+    v2h b01, b23, b45, b67;
     float local_sum0 = 0.0f;
     float local_sum1 = 0.0f;
-
-    for (i = 0; i < Len; i += 4) {
+    float local_sum2 = 0.0f;
+    float local_sum3 = 0.0f;
+    for (i = 0; i < Len; i += 8) {
       DOTPF16VEC_UNROLLED4_LOOP;
     }
     // Reduction
     asm volatile(
         "fadd.s   %[local_sum0], %[local_sum0], %[local_sum1];"
+        "fadd.s   %[local_sum2], %[local_sum2], %[local_sum3];"
+        "fadd.s   %[local_sum0], %[local_sum0], %[local_sum2];"
         "fcvt.h.s %[local_sum0], %[local_sum0];"
-        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+        : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+          [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
         :);
     s[0] = *(__fp16 *)&local_sum0;
     mempool_stop_benchmark();
@@ -168,17 +179,21 @@ void dotp_f16vecp_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s, uint32_t Len,
   uint32_t step = Len / nPE;
   uint32_t i;
 
-  v2h a01, a23;
-  v2h b01, b23;
+  v2h a01, a23, a45, a67;
+  v2h b01, b23, b45, b67;
   float local_sum0 = 0.0f;
   float local_sum1 = 0.0f;
-
-  for (i = core_id * step; i < core_id * step + step; i += 4) {
+  float local_sum2 = 0.0f;
+  float local_sum3 = 0.0f;
+  for (i = core_id * step; i < (core_id * step + step); i += 8) {
     DOTPF16VEC_UNROLLED4_LOOP;
   }
   asm volatile("fadd.s   %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fadd.s   %[local_sum2], %[local_sum2], %[local_sum3];"
+               "fadd.s   %[local_sum0], %[local_sum0], %[local_sum2];"
                "fcvt.h.s %[local_sum0], %[local_sum0];"
-               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+                 [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
                :);
   s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0;
   uint32_t num_cores = mempool_get_core_count();
@@ -194,16 +209,21 @@ void dotp_f16vecp_local_unrolled4(__fp16 *in_a, __fp16 *in_b, __fp16 *s,
 
   uint32_t core_id = mempool_get_core_id();
 
-  v2h a01, a23;
-  v2h b01, b23;
+  v2h a01, a23, a45, a67;
+  v2h b01, b23, b45, b67;
   float local_sum0 = 0.0f;
   float local_sum1 = 0.0f;
-  for (uint32_t i = core_id * BANKING_FACTOR; i < Len; i += NUM_BANKS) {
+  float local_sum2 = 0.0f;
+  float local_sum3 = 0.0f;
+  for (uint32_t i = 2 * core_id * BANKING_FACTOR; i < Len; i += 2 * NUM_BANKS) {
     DOTPF16VEC_UNROLLED4_LOOP;
   }
   asm volatile("fadd.s   %[local_sum0], %[local_sum0], %[local_sum1];"
+               "fadd.s   %[local_sum2], %[local_sum2], %[local_sum3];"
+               "fadd.s   %[local_sum0], %[local_sum0], %[local_sum2];"
                "fcvt.h.s %[local_sum0], %[local_sum0];"
-               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1)
+               : [local_sum0] "+&r"(local_sum0), [local_sum1] "+&r"(local_sum1),
+                 [local_sum2] "+&r"(local_sum2), [local_sum3] "+&r"(local_sum3)
                :);
   s[2 * core_id * BANKING_FACTOR] = *(__fp16 *)&local_sum0;
 
diff --git a/software/kernels/baremetal/mempool_dotp_f32.h b/software/kernels/baremetal/mempool_dotp_f32.h
index 58fa0e9d5..290b96d59 100644
--- a/software/kernels/baremetal/mempool_dotp_f32.h
+++ b/software/kernels/baremetal/mempool_dotp_f32.h
@@ -7,12 +7,12 @@
 #define DOTPF32_UNROLLED4_LOOP                                                 \
   {                                                                            \
     a0 = in_a[i];                                                              \
-    b0 = in_b[i];                                                              \
     a1 = in_a[i + 1];                                                          \
-    b1 = in_b[i + 1];                                                          \
     a2 = in_a[i + 2];                                                          \
-    b2 = in_b[i + 2];                                                          \
     a3 = in_a[i + 3];                                                          \
+    b0 = in_b[i];                                                              \
+    b1 = in_b[i + 1];                                                          \
+    b2 = in_b[i + 2];                                                          \
     b3 = in_b[i + 3];                                                          \
     asm volatile(                                                              \
         "fmadd.s %[local_sum0], %[a0], %[b0], %[local_sum0];"                  \
@@ -95,7 +95,7 @@ void dotp_f32s(float *in_a, float *in_b, float *s, uint32_t Len) {
   if (core_id == 0) {
     mempool_start_benchmark();
     // Kernel execution
-    register float local_sum = 0;
+    float local_sum = 0;
     float *end = in_a + Len;
     do {
       asm volatile("fmadd.s %0, %1, %2, %0;"
@@ -120,13 +120,12 @@ void dotp_f32s_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len) {
     uint32_t reminder = Len % 4;
     uint32_t i = 0;
 
-    register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
-    register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
-    register float local_sum0 = 0.0f;
-    register float local_sum1 = 0.0f;
-    register float local_sum2 = 0.0f;
-    register float local_sum3 = 0.0f;
-
+    float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+    float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+    float local_sum0 = 0.0f;
+    float local_sum1 = 0.0f;
+    float local_sum2 = 0.0f;
+    float local_sum3 = 0.0f;
     for (i = 0; i < (Len - reminder); i += 4) {
       DOTPF32_UNROLLED4_LOOP;
     }
@@ -158,8 +157,8 @@ void dotp_f32p(float *in_a, float *in_b, float *s, uint32_t Len, uint32_t nPE) {
 
   uint32_t core_id = mempool_get_core_id();
   uint32_t step = Len / nPE;
-  register float local_sum = 0;
-  register float a, b;
+  float local_sum = 0;
+  float a, b;
   for (uint32_t i = core_id * step; i < core_id * step + step; i++) {
     a = in_a[i];
     b = in_b[i];
@@ -182,12 +181,12 @@ void dotp_f32p_unrolled4(float *in_a, float *in_b, float *s, uint32_t Len,
   uint32_t reminder = step % 4;
   uint32_t i;
 
-  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
-  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
-  register float local_sum0 = 0.0f;
-  register float local_sum1 = 0.0f;
-  register float local_sum2 = 0.0f;
-  register float local_sum3 = 0.0f;
+  float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
+  float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
+  float local_sum0 = 0.0f;
+  float local_sum1 = 0.0f;
+  float local_sum2 = 0.0f;
+  float local_sum3 = 0.0f;
 
   for (i = core_id * step; i < (core_id * step + step) - reminder; i += 4) {
     DOTPF32_UNROLLED4_LOOP;
@@ -223,18 +222,18 @@ void dotp_f32p_local_unrolled4(float *in_a, float *in_b, float *s,
   uint32_t const remainder = Len % BANKING_FACTOR;
   uint32_t const idx_stop = Len - remainder;
 
-  register float a0 = 0.0f, a1 = 0.0f, a2 = 0.0f, a3 = 0.0f;
-  register float b2 = 0.0f, b1 = 0.0f, b0 = 0.0f, b3 = 0.0f;
-  register float local_sum0 = 0.0f;
-  register float local_sum1 = 0.0f;
-  register float local_sum2 = 0.0f;
-  register float local_sum3 = 0.0f;
+  float a0, a1, a2, a3;
+  float b2, b1, b0, b3;
+  float local_sum0 = 0.0f;
+  float local_sum1 = 0.0f;
+  float local_sum2 = 0.0f;
+  float local_sum3 = 0.0f;
 
   for (uint32_t i = core_id * BANKING_FACTOR; i < idx_stop; i += NUM_BANKS) {
     DOTPF32_UNROLLED4_LOOP;
   }
   if (core_id == ((Len % NUM_BANKS) / 4)) {
-    for (uint32_t i = Len - remainder; i < Len; i++) {
+    for (uint32_t i = idx_stop; i < Len; i++) {
       a0 = in_a[i];
       b0 = in_b[i];
       asm volatile("fmadd.s %0, %1, %2, %0;"