From c27e56cc737a3d5e99bd8ef898196dbdfb4ba5de Mon Sep 17 00:00:00 2001
From: mbertuletti <mbertuletti@iis.ee.ethz.ch>
Date: Thu, 25 Apr 2024 09:13:29 +0200
Subject: [PATCH] [software] Adapt to new folder structure

---
 Makefile                                      |   2 +-
 software/apps/baremetal/Makefile              |   4 +-
 .../apps/baremetal/cfft_radix4_f16/main.c     | 117 +--
 software/apps/baremetal/chest_f16/main.c      |   7 +-
 software/apps/baremetal/chest_q16/main.c      |   4 +-
 software/apps/baremetal/cholesky_f16/main.c   |   1 -
 software/apps/baremetal/cholesky_q16/main.c   |   5 +-
 software/apps/baremetal/cmatmul_f16/main.c    |   7 +-
 .../apps/{ => baremetal}/cmatmul_q16/main.c   |   6 +-
 software/apps/baremetal/mimo_mmse_f16/main.c  |  29 +-
 software/apps/baremetal/mimo_mmse_f32/main.c  |  18 +-
 software/apps/baremetal/mimo_mmse_q16/main.c  |  10 +-
 software/apps/{ => baremetal}/ofdm/main.c     |  53 +-
 software/apps/cfft_radix4_q16/main.c          |   0
 software/apps/chest_q16/main.c                |   0
 .../{runtime => }/data/data_cfft_f16.h.tpl    |   0
 .../{runtime => }/data/data_cfft_q16.h.tpl    |   0
 software/data/data_cfft_radix2_q16.h.tpl      |  55 --
 software/data/data_cfft_radix2_q16.py         | 200 -----
 software/data/data_cfft_radix4_f16.h.tpl      |  45 -
 software/data/data_cfft_radix4_f16.py         | 121 ---
 software/data/data_cfft_radix4_q16.h.tpl      |  57 --
 software/data/data_cfft_radix4_q16.py         | 200 -----
 software/data/data_chest_f16.py               | 132 ---
 software/data/data_chest_q16.py               | 160 ----
 software/data/data_cholesky_f16.py            | 108 ---
 software/data/data_cholesky_q16.py            | 117 ---
 software/data/data_cholesky_q32.py            | 106 ---
 software/data/data_cmatmul_f16.py             | 117 ---
 .../{runtime => }/data/data_cmatmul_q16.h.tpl |   0
 software/data/data_matmulf16.py               | 111 ---
 software/data/data_matmulf32.py               | 112 ---
 software/data/data_mimo_mmse_f16.py           | 185 ----
 software/data/data_mimo_mmse_f32.py           | 154 ----
 software/data/data_mimo_mmse_q16.py           | 166 ----
 software/data/data_ofdm.py                    | 137 ---
 software/{runtime => }/data/generate_cfft.py  |   0
 software/{runtime => }/data/generate_chest.py |   4 +-
 .../{runtime => }/data/generate_cholesky.py   |   9 +-
 .../{runtime => }/data/generate_matmul.py     |   0
 .../{runtime => }/data/generate_mimo_mmse.py  |  25 +-
 software/{runtime => }/data/generate_ofdm.py  |   0
 .../mempool_cfft_radix4_butterfly_f16.h       | 199 -----
 .../baremetal/mempool_cfft_radix4_f16p.h      | 526 -----------
 .../mempool_cfft_radix4_q16_bitreversal.h     |  23 -
 .../kernels/baremetal/mempool_chest_f16.h     | 382 ++++++++
 .../kernels/baremetal/mempool_chest_f16p.h    |  62 --
 .../kernels/baremetal/mempool_chest_f16s.h    | 194 -----
 .../kernels/baremetal/mempool_chest_q16.h     |   2 +-
 .../kernels/baremetal/mempool_cholesky_f16s.h |   1 +
 .../kernels/baremetal/mempool_cholesky_f32s.h |   4 +
 .../baremetal}/mempool_cholesky_q16s.h        |   3 +-
 .../kernels/baremetal/mempool_cmatmul_f16.h   | 814 ++++++++++--------
 .../baremetal}/mempool_cmatmul_q16.h          |   2 +-
 .../baremetal/mempool_linearsolver_f32s.h     |   4 +
 .../baremetal}/mempool_linearsolver_q16s.h    |   0
 .../baremetal/mempool_mimo_mmse_f16s.h        |   1 +
 .../baremetal}/mempool_mimo_mmse_q16s.h       |   0
 .../mempool_radix4_cfft_butterfly_f16.h       |   2 +-
 .../baremetal}/mempool_radix4_cfft_f16p.h     | 173 ++--
 .../runtime/data/data_cfft_radix4_f16.h.tpl   |   1 -
 .../runtime/data/data_cfft_radix4_q16.h.tpl   |   1 -
 software/runtime/data/data_ofdm.py            |   1 -
 software/runtime/kernel/mempool_checks.h      |   0
 software/runtime/kernel/mempool_chest_f16.h   | 372 --------
 software/runtime/kernel/mempool_chest_q16.h   | 245 ------
 software/runtime/kernel/mempool_chest_q16p.h  |   1 -
 software/runtime/kernel/mempool_chest_q16s.h  |   1 -
 .../runtime/kernel/mempool_radix2_cfft_q16s.h |   0
 69 files changed, 1102 insertions(+), 4496 deletions(-)
 rename software/apps/{ => baremetal}/cmatmul_q16/main.c (93%)
 rename software/apps/{ => baremetal}/ofdm/main.c (74%)
 delete mode 100644 software/apps/cfft_radix4_q16/main.c
 delete mode 100644 software/apps/chest_q16/main.c
 rename software/{runtime => }/data/data_cfft_f16.h.tpl (100%)
 rename software/{runtime => }/data/data_cfft_q16.h.tpl (100%)
 delete mode 100644 software/data/data_cfft_radix2_q16.h.tpl
 delete mode 100644 software/data/data_cfft_radix2_q16.py
 delete mode 100644 software/data/data_cfft_radix4_f16.h.tpl
 delete mode 100644 software/data/data_cfft_radix4_f16.py
 delete mode 100644 software/data/data_cfft_radix4_q16.h.tpl
 delete mode 100755 software/data/data_cfft_radix4_q16.py
 delete mode 100644 software/data/data_chest_f16.py
 delete mode 100755 software/data/data_chest_q16.py
 delete mode 100644 software/data/data_cholesky_f16.py
 delete mode 100644 software/data/data_cholesky_q16.py
 delete mode 100644 software/data/data_cholesky_q32.py
 delete mode 100644 software/data/data_cmatmul_f16.py
 rename software/{runtime => }/data/data_cmatmul_q16.h.tpl (100%)
 delete mode 100644 software/data/data_matmulf16.py
 delete mode 100644 software/data/data_matmulf32.py
 delete mode 100644 software/data/data_mimo_mmse_f16.py
 delete mode 100644 software/data/data_mimo_mmse_f32.py
 delete mode 100644 software/data/data_mimo_mmse_q16.py
 delete mode 100644 software/data/data_ofdm.py
 rename software/{runtime => }/data/generate_cfft.py (100%)
 rename software/{runtime => }/data/generate_chest.py (98%)
 rename software/{runtime => }/data/generate_cholesky.py (95%)
 rename software/{runtime => }/data/generate_matmul.py (100%)
 rename software/{runtime => }/data/generate_mimo_mmse.py (93%)
 rename software/{runtime => }/data/generate_ofdm.py (100%)
 delete mode 100644 software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h
 delete mode 100644 software/kernels/baremetal/mempool_cfft_radix4_f16p.h
 delete mode 100644 software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h
 create mode 100644 software/kernels/baremetal/mempool_chest_f16.h
 delete mode 100644 software/kernels/baremetal/mempool_chest_f16p.h
 delete mode 100644 software/kernels/baremetal/mempool_chest_f16s.h
 rename software/{runtime/kernel => kernels/baremetal}/mempool_cholesky_q16s.h (97%)
 rename software/{runtime/kernel => kernels/baremetal}/mempool_cmatmul_q16.h (99%)
 rename software/{runtime/kernel => kernels/baremetal}/mempool_linearsolver_q16s.h (100%)
 rename software/{runtime/kernel => kernels/baremetal}/mempool_mimo_mmse_q16s.h (100%)
 rename software/{runtime/kernel => kernels/baremetal}/mempool_radix4_cfft_butterfly_f16.h (99%)
 rename software/{runtime/kernel => kernels/baremetal}/mempool_radix4_cfft_f16p.h (72%)
 delete mode 100644 software/runtime/data/data_cfft_radix4_f16.h.tpl
 delete mode 100644 software/runtime/data/data_cfft_radix4_q16.h.tpl
 delete mode 100644 software/runtime/data/data_ofdm.py
 delete mode 100644 software/runtime/kernel/mempool_checks.h
 delete mode 100644 software/runtime/kernel/mempool_chest_f16.h
 delete mode 100644 software/runtime/kernel/mempool_chest_q16.h
 delete mode 100644 software/runtime/kernel/mempool_chest_q16p.h
 delete mode 100644 software/runtime/kernel/mempool_chest_q16s.h
 delete mode 100644 software/runtime/kernel/mempool_radix2_cfft_q16s.h

diff --git a/Makefile b/Makefile
index 6afa3eaba..9e5ff6e60 100644
--- a/Makefile
+++ b/Makefile
@@ -183,7 +183,7 @@ toolchain/riscv-opcodes/*:
 
 format:
 	$(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR)
-	find ./software/runtime/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
+	find ./software/data -name '*.py' -exec autopep8 --in-place --aggressive {} +
 
 clean: clean-riscv-tests
 	rm -rf $(INSTALL_DIR)
diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile
index cc9e2db7a..b4b2ee496 100644
--- a/software/apps/baremetal/Makefile
+++ b/software/apps/baremetal/Makefile
@@ -22,8 +22,8 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
 BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
 ALL := $(APPS)
 
-ALL_GCC := $(filter-out matmul_f16 matmul_f32, $(ALL))
-ALL_LLVM := $(filter-out synth_i32 chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL))
+ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL))
+ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL))
 
 # Make all applications
 all: $(ALL_GCC)
diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
index 2f4270c80..c459d062a 100644
--- a/software/apps/baremetal/cfft_radix4_f16/main.c
+++ b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -10,35 +10,53 @@
 #include <string.h>
 
 /* Mempool runtime libraries */
+#include "builtins_v2.h"
 #include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "xpulp/builtins_v2.h"
 
 /* CFFT data libraries */
-#include "data/data_cfft_radix4_f16.h"
-
-/*
-   - FOLDED:    Parallel FFT with "memory-aware" load/store scheme
-   - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
-   load/store scheme
-      - N_FFTs_COL: Independent FFTs scheduled on one row (default 1)
-      - N_FFTs_ROW: Independent FFTs scheduled on columns (default 1)
-      - FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores
-*/
-
-#define FOLDED
+#include "data_cfft_radix4_f16.h"
+
+/* CHOOSE ONE */
+//#define SINGLE // Single core FFT.
+//#define PARALLEL // Parallel FFT not "memory-aware".
+//#define FOLDED // Parallel FFT with "memory-aware" load/store.
+#define SCHEDULED // Folded FFTs arranged in rows and cols.
+
+// Bitreversal index from table.
+#define BITREVERSETABLE
+// Independent FFTs scheduled on one row (default 1).
+#define N_FFTs_ROW 2
+// Independent FFTs scheduled on columns (default 1).
+#define N_FFTs_COL 2
+#if (N_FFTs_COL > MAX_COL)
+#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
+#endif
+// Also the twiddles have "memory-aware" load/stores.
 #define FOLDED_TWIDDLES
-#define N_FFTs_ROW 1
-#define N_FFTs_COL 1
 
-#include "kernel/mempool_checks.h"
-#include "kernel/mempool_radix4_cfft_butterfly_f16.h"
-#include "kernel/mempool_radix4_cfft_f16p.h"
-#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_radix4_cfft_butterfly_f16.h"
+#include "baremetal/mempool_radix4_cfft_f16p.h"
+
+#if (defined(SINGLE) || defined(PARALLEL))
+__fp16 l1_pSrc[2 * N_CSAMPLES]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_pDst[2 * N_CSAMPLES]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+#endif
 
+#if (defined(SCHEDULED) || defined(FOLDED))
 __fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
 __fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
@@ -49,49 +67,44 @@ __fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
 uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+#endif
 
 int main() {
-
   uint32_t core_id = mempool_get_core_id();
   uint32_t num_cores = mempool_get_core_count();
-  mempool_barrier_init(core_id);
   __fp16 *pRes = (__fp16 *)0;
+  mempool_barrier_init(core_id);
+
+  /* INITIALIZATION */
 
   if (core_id == 0) {
-    // Each FFT is folded over 4 memory rows
-    // Each memory row is 2 * N_BANKS (real-imag) samples
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
-      dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
-                          (N_CSAMPLES * N_FFTs_COL) * sizeof(int32_t));
+      for (uint32_t i = 0; i < N_FFTs_COL; i++) {
+        dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS),
+                            l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+      }
     }
-    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
-                        BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
-    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
-                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+                        BITREVINDEXTABLE_LENGTH * sizeof(uint16_t));
   }
-// Initialize the Twiddles folded
-#ifdef FOLDED_TWIDDLES
+  mempool_barrier(num_cores);
   for (uint32_t j = 0; j < N_FFTs_COL; j++) {
-    uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
+    uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
-      *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * N_WORDS_COL)] =
-          *(v2h *)&l2_twiddleCoef_f16[2U * i];
-      *(v2h *)&l1_twiddleCoef_f16_src[2U *
-                                      (i + j * N_WORDS_COL + 1 * N_BANKS)] =
-          *(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)];
-      *(v2h *)&l1_twiddleCoef_f16_src[2U *
-                                      (i + j * N_WORDS_COL + 2 * N_BANKS)] =
-          *(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * i];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)];
+      *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] =
+          *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
-#endif
   if (core_id == 0) {
     printf("01: END INITIALIZATION\n");
   }
   mempool_barrier(num_cores);
 
-#if (defined(FOLDED) && defined(FOLDED_TWIDDLES))
+#ifdef FOLDED
   if (core_id < (N_CSAMPLES / 16)) {
     mempool_start_benchmark();
     mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, N_CSAMPLES,
@@ -105,26 +118,28 @@ int main() {
 #endif
 
 #ifdef SCHEDULED
-  uint32_t nPE = (N_CSAMPLES / 16);
-  if (core_id < N_FFTs_COL * nPE) {
+  uint32_t CORES_USED = (N_CSAMPLES / 4) / BANKING_FACTOR;
+  if (core_id < N_FFTs_COL * CORES_USED) {
     mempool_start_benchmark();
-    uint32_t N_WORDS_COL = N_CSAMPLES / 4;
-    uint32_t col_id = core_id / nPE;
     mempool_radix4_cfft_f16p_scheduler(
         l1_pSrc, l1_pDst, N_CSAMPLES, N_FFTs_ROW, N_FFTs_COL,
-        l1_twiddleCoef_f16_src + 2 * col_id * N_WORDS_COL,
-        l1_twiddleCoef_f16_dst + 2 * col_id * N_WORDS_COL, l1_BitRevIndexTable,
-        BITREVINDEXTABLE_LENGTH, 1, nPE);
-    pRes = l1_pDst;
-    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * nPE);
+        l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, CORES_USED);
+    mempool_log_partial_barrier(2, core_id, N_FFTs_COL * CORES_USED);
     mempool_stop_benchmark();
   }
+#ifdef BITREVERSETABLE
+  pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+#else
+  pRes = ((LOG2 / 2) % 2) == 0 ? l1_pDst : l1_pSrc;
+#endif
 #endif
 
   mempool_barrier(num_cores);
   if (core_id == 0) {
     printf("02: END COMPUTATION\n");
   }
+
   mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
   mempool_barrier(num_cores);
   return 0;
diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c
index 7abfa8add..e0feb90c7 100644
--- a/software/apps/baremetal/chest_f16/main.c
+++ b/software/apps/baremetal/chest_f16/main.c
@@ -8,17 +8,16 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "builtins_v2.h"
 #include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "builtins_v2.h"
 
-#include "data_chest_f16.h"
 #include "baremetal/mempool_checks.h"
-#include "baremetal/mempool_chest_f16p.h"
-#include "baremetal/mempool_chest_f16s.h"
+#include "baremetal/mempool_chest_f16.h"
+#include "data_chest_f16.h"
 
 //#define SINGLE
 #define PARALLEL
diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c
index eecac204a..9288bb4d7 100644
--- a/software/apps/baremetal/chest_q16/main.c
+++ b/software/apps/baremetal/chest_q16/main.c
@@ -55,8 +55,8 @@ int main() {
 #endif
 #ifdef PARALLEL
   mempool_start_benchmark();
-  mempool_chest_q16p_unrolled4_local(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX,
-                                     N_TX, N_SAMPLES, core_id, num_cores);
+  mempool_chest_q16p_unrolled4(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX, N_TX,
+                               N_SAMPLES, core_id, num_cores);
   mempool_stop_benchmark();
   mempool_barrier(num_cores);
 #endif
diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c
index d27af143c..908ca99fa 100644
--- a/software/apps/baremetal/cholesky_f16/main.c
+++ b/software/apps/baremetal/cholesky_f16/main.c
@@ -11,7 +11,6 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "builtins_v2.h"
 
 #include "data_cholesky_f16.h"
 
diff --git a/software/apps/baremetal/cholesky_q16/main.c b/software/apps/baremetal/cholesky_q16/main.c
index 2f30ae94a..3c382c500 100644
--- a/software/apps/baremetal/cholesky_q16/main.c
+++ b/software/apps/baremetal/cholesky_q16/main.c
@@ -9,11 +9,10 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "xpulp/builtins_v2.h"
 
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_cholesky_q16s.h"
 #include "data_cholesky_q16.h"
-#include "kernel/mempool_checks.h"
-#include "kernel/mempool_cholesky_q16s.h"
 
 #define SINGLE
 
diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c
index 2ca261599..2bfbcb144 100644
--- a/software/apps/baremetal/cmatmul_f16/main.c
+++ b/software/apps/baremetal/cmatmul_f16/main.c
@@ -13,9 +13,11 @@
 #include "synchronization.h"
 
 #include "data_cmatmul_f16.h"
+
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cmatmul_f16.h"
-#define PARALLEL_2x2
+#define PARALLEL_2x4
+#define TEST
 
 __fp16 matrix_a[2 * dim_M * dim_N]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
@@ -26,7 +28,7 @@ __fp16 matrix_b[2 * dim_N * dim_P]
 __fp16 matrix_c[2 * dim_M * dim_P]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
                    section(".l1_prio")));
-__fp16 matrix_a_folded[2 * dim_M * (4 * NUM_CORES)]
+__fp16 matrix_a_folded[2 * (BANKING_FACTOR * NUM_CORES)]
     __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)),
                    section(".l1_prio")));
 
@@ -73,7 +75,6 @@ int main() {
     mempool_start_benchmark();
     cmatmul_2x4_f16p(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P, core_id,
                      nPE);
-    mempool_log_partial_barrier(2, core_id, nPE);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
diff --git a/software/apps/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c
similarity index 93%
rename from software/apps/cmatmul_q16/main.c
rename to software/apps/baremetal/cmatmul_q16/main.c
index b3e4e7503..f7a6bd31d 100644
--- a/software/apps/cmatmul_q16/main.c
+++ b/software/apps/baremetal/cmatmul_q16/main.c
@@ -12,9 +12,9 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "data/data_cmatmul_q16.h"
-#include "kernel/mempool_checks.h"
-#include "kernel/mempool_cmatmul_q16.h"
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_cmatmul_q16.h"
+#include "data_cmatmul_q16.h"
 
 #define PARALLEL
 
diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c
index 1ff7f43f4..4389a0f3e 100644
--- a/software/apps/baremetal/mimo_mmse_f16/main.c
+++ b/software/apps/baremetal/mimo_mmse_f16/main.c
@@ -11,19 +11,26 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "builtins_v2.h"
 
-#include "data_mimo_mmse_f16.h"
 #include "baremetal/mempool_checks.h"
 #include "baremetal/mempool_cholesky_f16s.h"
 #include "baremetal/mempool_linearsolver_f16s.h"
 #include "baremetal/mempool_mimo_mmse_f16s.h"
 
-//#define DOUBLE_BUFFERING
+#include "data_mimo_mmse_f16.h"
+
+// #define DOUBLE_BUFFERING
+// #define N_ROUNDS (1)
+// #define DMA_TRANSFER2
+
 #ifndef DOUBLE_BUFFERING
 
-#define SINGLE
-//#define PARALLEL
+/**********************************************/
+/* TEST OF THE KERNELS WITH NO DATA MOVEMENTS */
+/**********************************************/
+
+//#define SINGLE
+#define PARALLEL
 //#define FOLDED
 
 __fp16 l1_H[2 * N_TX * N_RX * N_ITR]
@@ -110,7 +117,7 @@ int main() {
       Ptrx += 2 * itr_bg * N_TX_bg;
     }
   }
-  mempool_log_barrier(2, core_id);
+  mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
@@ -139,7 +146,7 @@ int main() {
     mempool_Ltrisol_folded_f16s(PtrL, Ptry2, Ptry3, N_TX);
     mempool_Lttrisol_folded_f16s(PtrL, Ptry3, Ptrx, N_TX);
   }
-  mempool_log_barrier(2, core_id);
+  mempool_barrier(num_cores);
   mempool_stop_benchmark();
 #endif
 
@@ -244,8 +251,8 @@ int main() {
       __fp16 *PtrL = L + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
       __fp16 *Ptry3 = y3 + itr * (2 * N_TX);
-      mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
-      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
+      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
       mempool_cholesky_f16vecs(PtrG, PtrL, N_TX);
       mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX);
       mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX);
@@ -294,8 +301,8 @@ int main() {
       __fp16 *PtrSigma = cmpt_Sigma + itr * (2 * N_TX);
       __fp16 *PtrG = G + itr * (2 * N_TX * N_TX);
       __fp16 *Ptry2 = y2 + itr * (2 * N_TX);
-      mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0);
-      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0);
+      mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX);
+      mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX);
     }
     mempool_log_barrier(2, core_id);
 
diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c
index 8e5f5f80f..aa95b1919 100644
--- a/software/apps/baremetal/mimo_mmse_f32/main.c
+++ b/software/apps/baremetal/mimo_mmse_f32/main.c
@@ -10,13 +10,17 @@
 #include "runtime.h"
 #include "synchronization.h"
 
-#include "data_mimo_mmse_f32.h"
 #include "baremetal/mempool_checks.h"
-#include "baremetal/mempool_cholesky_f32s.h"
-#include "baremetal/mempool_linearsolver_f32s.h"
 #include "baremetal/mempool_mimo_mmse_f32p.h"
 #include "baremetal/mempool_mimo_mmse_f32s.h"
 
+#if defined(__XDIVSQRT)
+#include "baremetal/mempool_cholesky_f32s.h"
+#include "baremetal/mempool_linearsolver_f32s.h"
+#endif
+
+#include "data_mimo_mmse_f32.h"
+
 //#define SINGLE
 //#define JACOBI
 #define PARALLEL
@@ -52,7 +56,7 @@ int main() {
   }
   mempool_barrier(num_cores);
 
-#ifdef SINGLE
+#if defined(SINGLE) && defined(__XDIVSQRT)
   /* Benchmark */
   if (core_id == 0) {
     mempool_start_benchmark();
@@ -80,7 +84,7 @@ int main() {
   mempool_barrier(num_cores);
 #endif
 
-#ifdef PARALLEL
+#if defined(PARALLEL) && defined(__XDIVSQRT)
   // Each iteration is assigned to a processor
   mempool_start_benchmark();
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
@@ -104,7 +108,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-#ifdef PARALLEL_HERMITIAN
+#if defined(PARALLEL_HERMITIAN) && defined(__XDIVSQRT)
   mempool_start_benchmark();
   // Each iteration is assigned to a pool of processors
   // In a pool each PE gets a column of the H matrix, accumulating a row of the
@@ -139,7 +143,7 @@ int main() {
   mempool_stop_benchmark();
 #endif
 
-#ifdef FOLDED
+#if defined(FOLDED) && defined(__XDIVSQRT)
   mempool_start_benchmark();
   for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) {
     // Inputs
diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c
index dff61adf8..c7dcda78d 100644
--- a/software/apps/baremetal/mimo_mmse_q16/main.c
+++ b/software/apps/baremetal/mimo_mmse_q16/main.c
@@ -9,12 +9,12 @@
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "xpulp/builtins_v2.h"
 
-#include "data/data_mimo_mmse_q16.h"
-#include "kernel/mempool_cholesky_q16s.h"
-#include "kernel/mempool_linearsolver_q16s.h"
-#include "kernel/mempool_mimo_mmse_q16s.h"
+#include "data_mimo_mmse_q16.h"
+
+#include "baremetal/mempool_cholesky_q16s.h"
+#include "baremetal/mempool_linearsolver_q16s.h"
+#include "baremetal/mempool_mimo_mmse_q16s.h"
 
 #define PARALLEL
 
diff --git a/software/apps/ofdm/main.c b/software/apps/baremetal/ofdm/main.c
similarity index 74%
rename from software/apps/ofdm/main.c
rename to software/apps/baremetal/ofdm/main.c
index 8408c1035..59b1835d7 100644
--- a/software/apps/ofdm/main.c
+++ b/software/apps/baremetal/ofdm/main.c
@@ -10,14 +10,14 @@
 #include <string.h>
 
 /* Mempool runtime libraries */
+#include "builtins_v2.h"
 #include "dma.h"
 #include "encoding.h"
 #include "printf.h"
 #include "runtime.h"
 #include "synchronization.h"
-#include "xpulp/builtins_v2.h"
 
-#include "data/data_ofdm.h"
+#include "data_ofdm.h"
 
 // CFFT Parameters
 #define SCHEDULED
@@ -28,17 +28,20 @@
 #define N_FFTs_ROW (N_RX / N_FFTs_COL)
 // CMATMUL Parameters
 #define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX))
+#define dim_M (N_BEAMS)
+#define dim_N (N_RX)
+#define dim_P (N_SC)
 
 #define ROUNDS 3
-dump(prova, 1);
+dump(checkpoint, 1);
 
-#include "kernel/mempool_cmatmul_f16.h"
-#include "kernel/mempool_radix4_cfft_butterfly_f16.h"
-#include "kernel/mempool_radix4_cfft_f16p.h"
-#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_cfft_q16_bitreversal.h"
+#include "baremetal/mempool_cmatmul_f16.h"
+#include "baremetal/mempool_radix4_cfft_butterfly_f16.h"
+#include "baremetal/mempool_radix4_cfft_f16p.h"
 
 uint32_t arrival_index __attribute__((section(".l1_prio")));
-__fp16 l1_pBF_Coef_folded[2 * N_BEAMS * N_RX * NUM_COPIES]
+__fp16 l1_pBF_Coef_folded[2 * BANKING_FACTOR * NUM_CORES]
     __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
 
 __fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS]
@@ -69,9 +72,9 @@ int main() {
                         (N_RX * N_SC) * sizeof(int32_t));
     dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                         BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
-    for (uint32_t i = 0; i < NUM_COPIES; i++) {
-      dma_memcpy_blocking(l1_pBF_Coef_folded + i * (N_BEAMS * N_RX),
-                          l2_pBF_Coef, (N_BEAMS * N_RX) * sizeof(int32_t));
+    for (uint32_t i = 0; i < BANKING_FACTOR * NUM_CORES; i += dim_M * dim_N) {
+      dma_memcpy_blocking(&l1_pBF_Coef_folded[2 * i], l2_pBF_Coef,
+                          dim_M * dim_N * sizeof(int32_t));
     }
     for (uint32_t i = 0; i < N_FFTs_COL; i++) {
       dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS),
@@ -80,31 +83,27 @@ int main() {
   }
   mempool_barrier(num_cores);
   mempool_stop_benchmark();
-  dump_prova(0);
-
-  //  // Start of the iterations
-  //  for (uint32_t round = 0; round < ROUNDS; round++) {
+  dump_checkpoint(0);
 
   /* FFT */
   mempool_start_benchmark();
-  uint32_t col_fftLen = N_SC / 4;
-  uint32_t col_id = core_id / (N_SC / 16);
+  uint32_t CORES_USED = (N_SC / 4) / BANKING_FACTOR;
   // Distribute FFTs over columns
   mempool_radix4_cfft_f16p_scheduler(
-      l1_pFFT_Src, l1_pFFT_Dst, N_SC,
-      l1_twiddleCoef_f16_src + 2 * col_id * col_fftLen,
-      l1_twiddleCoef_f16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable,
-      BITREVINDEXTABLE_LENGTH, 1, (N_SC / 16));
+      l1_pFFT_Src, l1_pFFT_Dst, N_SC, N_FFTs_ROW, N_FFTs_COL,
+      l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable,
+      BITREVINDEXTABLE_LENGTH, 1, CORES_USED);
   mempool_log_barrier(2, core_id);
   mempool_stop_benchmark();
-  dump_prova(1);
+  dump_checkpoint(1);
 
   /* BEAMFORMING */
   mempool_start_benchmark();
-  cmatmul_2x4_folded_f16p(l1_pBF_Coef_folded, l1_pBF_Coef_folded, l1_pFFT_Src,
-                          l1_pFFT_Dst, N_BEAMS, N_RX, N_SC, core_id, num_cores);
+  cmatmul_4x4_f16p((int32_t *)l1_pBF_Coef_folded, (int32_t *)l1_pFFT_Src,
+                   (int32_t *)l1_pFFT_Dst, dim_M, dim_N, dim_P, core_id,
+                   num_cores);
   mempool_stop_benchmark();
-  dump_prova(2);
+  dump_checkpoint(2);
 
   mempool_start_benchmark();
   // Transfer and synchronization
@@ -124,9 +123,7 @@ int main() {
   }
   mempool_wfi();
   mempool_stop_benchmark();
-  dump_prova(3);
-
-  //  }
+  dump_checkpoint(3);
 
   return 0;
 }
diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c
deleted file mode 100644
index e69de29bb..000000000
diff --git a/software/apps/chest_q16/main.c b/software/apps/chest_q16/main.c
deleted file mode 100644
index e69de29bb..000000000
diff --git a/software/runtime/data/data_cfft_f16.h.tpl b/software/data/data_cfft_f16.h.tpl
similarity index 100%
rename from software/runtime/data/data_cfft_f16.h.tpl
rename to software/data/data_cfft_f16.h.tpl
diff --git a/software/runtime/data/data_cfft_q16.h.tpl b/software/data/data_cfft_q16.h.tpl
similarity index 100%
rename from software/runtime/data/data_cfft_q16.h.tpl
rename to software/data/data_cfft_q16.h.tpl
diff --git a/software/data/data_cfft_radix2_q16.h.tpl b/software/data/data_cfft_radix2_q16.h.tpl
deleted file mode 100644
index 6044e424d..000000000
--- a/software/data/data_cfft_radix2_q16.h.tpl
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_cfft_radix2_q16.py
-
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-// Tolerance for correctness check
-#define TOLERANCE (${tolerance})
-
-% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
-
-// Data arrays for matrix ${m_str}
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)};
-
-% endfor \
-
-// Twiddles
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
-
-// Bitreversal
-uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/data/data_cfft_radix2_q16.py b/software/data/data_cfft_radix2_q16.py
deleted file mode 100644
index e1615e53e..000000000
--- a/software/data/data_cfft_radix2_q16.py
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the cfft kernel.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-
-##################
-# compute_result #
-##################
-
-
-def compute_result(inp, len):
-    """
-    Funciton to generate the expected result of the testcase.
-
-    Arguments
-    ---------
-    input: numpy array of inputs
-    env: Length of the input transform.
-    """
-
-    # Q16:
-    # len=16:    Q1.15 -> Q5.11
-    # len=32:    Q1.15 -> Q6.10
-    # len=64:    Q1.15 -> Q7.9
-    # len=128:   Q1.15 -> Q8.8
-    # len=256:   Q1.15 -> Q9.7
-    # len=512:   Q1.15 -> Q10.6
-    # len=1024:  Q1.15 -> Q11.5
-    # len=2048:  Q1.15 -> Q12.4
-    # len=4096:  Q1.15 -> Q13.3
-    bit_shift_dict_q16 = {
-        16: 11,
-        32: 10,
-        64: 9,
-        128: 8,
-        256: 7,
-        512: 6,
-        1024: 5,
-        2048: 4,
-        4096: 3}
-    my_type = np.int16
-    my_fixpoint = 15
-    bit_shift_dict = bit_shift_dict_q16
-    a = inp.astype(my_type)
-    result = np.zeros(a.size, dtype=my_type)
-    complex_a = np.zeros(int(a.size / 2), dtype=np.csingle)
-    complex_result = np.zeros(a.size >> 1, dtype=np.csingle)
-    for i in range(a.size >> 1):
-        complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + (
-            a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j
-    complex_result = np.fft.fft(complex_a)
-    for i in range(int(a.size / 2)):
-        result[2 * i] = (np.real(complex_result[i]) *
-                         (2**(bit_shift_dict[int(a.size / 2)]))
-                         ).astype(my_type)
-        result[2 * i + 1] = (np.imag(complex_result[i]) *
-                             (2**(bit_shift_dict[int(a.size / 2)]))
-                             ).astype(my_type)
-
-    return result
-
-
-def compute_twiddles(length):
-    PI = 3.14159265358979
-    N = length
-    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
-    for i in range(0, (int)(3 * N / 4)):
-        twiddleCoefq15_cos = M.cos(i * 2 * PI / N)
-        twiddleCoefq15_sin = M.sin(i * 2 * PI / N)
-        twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1)))
-        twiddleCoefq15[2 * i +
-                       1] = int(round(twiddleCoefq15_sin * (2**15 - 1)))
-    return twiddleCoefq15
-
-
-def compute_bitreversal(N, R):
-
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-
-    return tps
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_radix2_q16.h.tpl",
-        help='Path to mako template')
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-d",
-        "--dimension",
-        type=int,
-        required=False,
-        default=64,
-        help='Input dimension'
-    )
-
-    args = parser.parse_args()
-
-    # Create sparse matrix
-    Len = args.dimension
-    Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16)
-    Result = compute_result(Input, Len)
-    Twiddles = compute_twiddles(Len)
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2)))
-
-    tolerance = {
-        16: 16,
-        32: 20,
-        64: 24,
-        128: 28,
-        256: 32,
-        512: 48,
-        1024: 64,
-        2048: 96,
-        4096: 128}
-
-    kwargs = {'name': 'data_cfft_radix2_q16',
-              'vector_inp': Input,
-              'vector_res': Result,
-              'vector_twi': Twiddles,
-              'vector_bitrev': Bitreversal,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': int(2 * len(Bitreversal)),
-              'tolerance': tolerance[int(Len)]}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cfft_radix4_f16.h.tpl b/software/data/data_cfft_radix4_f16.h.tpl
deleted file mode 100644
index 883049a44..000000000
--- a/software/data/data_cfft_radix4_f16.h.tpl
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(__fp16){:0.4}f, '.format(a)
-        i += 1
-        if i % 8 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_RSAMPLES (2 * N_CSAMPLES)
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-__fp16 l2_pSrc[${2 * Len}] = ${array_to_cstr(src)};
-
-__fp16 l2_pRes[${2 * Len}] = ${array_to_cstr(dst)};
-
-__fp16 l2_twiddleCoef_f16[${2 * Len}] = ${array_to_cstr(twi)};
-
-// Bitreversal
-uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(bitrev)};
diff --git a/software/data/data_cfft_radix4_f16.py b/software/data/data_cfft_radix4_f16.py
deleted file mode 100644
index ca90265c8..000000000
--- a/software/data/data_cfft_radix4_f16.py
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 FFT.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-
-def compute_bitreversal(N, R):
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-    return tps
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_radix4_f16.h.tpl",
-        help='Path to mako template')
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-d",
-        "--dimension",
-        type=int,
-        required=False,
-        default=4096,
-        help='FFT dimension'
-    )
-
-    args = parser.parse_args()
-    Len = args.dimension
-
-    src = np.random.rand(Len).astype(np.float16)
-    src = src + 1.j * np.random.rand(Len).astype(np.float16)
-    dst = np.fft.fft(src)
-    src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten()
-    dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten()
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2)))
-
-    twi = np.zeros(int(2 * 3 * Len / 4), np.float16)
-    for i in range(0, int(3 * Len / 4)):
-        twi[2 * i] = np.sin(i * 2 * np.pi / Len).astype(np.float16)
-        twi[2 * i + 1] = np.cos(i * 2 * np.pi / Len).astype(np.float16)
-
-    kwargs = {'name': 'data_cfft_radix4_f16',
-              'src': src,
-              'dst': dst,
-              'twi': twi,
-              'bitrev': Bitreversal,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': len(Bitreversal)}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cfft_radix4_q16.h.tpl b/software/data/data_cfft_radix4_q16.h.tpl
deleted file mode 100644
index 3af1b764d..000000000
--- a/software/data/data_cfft_radix4_q16.h.tpl
+++ /dev/null
@@ -1,57 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Automatically generated by:
-// data/data_cfft_radix4_q16.py
-
-\
-<% def array_to_cstr(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-<% def array_to_str(array):
-    out = '{'
-    i = 0
-    out += '\n'
-    for a in array:
-        out += '{}, '.format(a)
-        i += 1
-        if i % 16 == 0:
-            out += '\n'
-    out = out[:-2] + '}'
-    return out
-%> \
-
-#define LOG2 (${Log2Len})
-#define N_CSAMPLES (${Len})
-#define N_TWIDDLES (3 * N_CSAMPLES / 4)
-#define N_BANKS (NUM_CORES * BANKING_FACTOR)
-#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
-
-// Maximum number of independent FFT columns allowed
-#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
-// Tolerance for correctness check
-#define TOLERANCE (${tolerance})
-
-% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
-
-// Data arrays for matrix ${m_str}
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)};
-
-% endfor \
-
-// Twiddles
-int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
-
-// Bitreversal
-uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/data/data_cfft_radix4_q16.py b/software/data/data_cfft_radix4_q16.py
deleted file mode 100755
index b394a2884..000000000
--- a/software/data/data_cfft_radix4_q16.py
+++ /dev/null
@@ -1,200 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the cfft kernel.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-
-##################
-# compute_result #
-##################
-
-
-def compute_result(inp, len):
-    """
-    Funciton to generate the expected result of the testcase.
-
-    Arguments
-    ---------
-    input: numpy array of inputs
-    env: Length of the input transform.
-    """
-
-    # Q16:
-    # len=16:    Q1.15 -> Q5.11
-    # len=32:    Q1.15 -> Q6.10
-    # len=64:    Q1.15 -> Q7.9
-    # len=128:   Q1.15 -> Q8.8
-    # len=256:   Q1.15 -> Q9.7
-    # len=512:   Q1.15 -> Q10.6
-    # len=1024:  Q1.15 -> Q11.5
-    # len=2048:  Q1.15 -> Q12.4
-    # len=4096:  Q1.15 -> Q13.3
-    bit_shift_dict_q16 = {
-        16: 11,
-        32: 10,
-        64: 9,
-        128: 8,
-        256: 7,
-        512: 6,
-        1024: 5,
-        2048: 4,
-        4096: 3}
-    my_type = np.int16
-    my_fixpoint = 15
-    bit_shift_dict = bit_shift_dict_q16
-    a = inp.astype(my_type)
-    result = np.zeros(a.size, dtype=my_type)
-    complex_a = np.zeros(int(a.size / 2), dtype=np.csingle)
-    complex_result = np.zeros(a.size >> 1, dtype=np.csingle)
-    for i in range(a.size >> 1):
-        complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + (
-            a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j
-    complex_result = np.fft.fft(complex_a)
-    for i in range(int(a.size / 2)):
-        result[2 * i] = (np.real(complex_result[i]) *
-                         (2**(bit_shift_dict[int(a.size / 2)]))
-                         ).astype(my_type)
-        result[2 * i + 1] = (np.imag(complex_result[i]) *
-                             (2**(bit_shift_dict[int(a.size / 2)]))
-                             ).astype(my_type)
-
-    return result
-
-
-def compute_twiddles(length):
-    PI = 3.14159265358979
-    N = length
-    twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16)
-    for i in range(0, (int)(3 * N / 4)):
-        twiddleCoefq15_cos = M.cos(i * 2 * PI / N)
-        twiddleCoefq15_sin = M.sin(i * 2 * PI / N)
-        twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1)))
-        twiddleCoefq15[2 * i +
-                       1] = int(round(twiddleCoefq15_sin * (2**15 - 1)))
-    return twiddleCoefq15
-
-
-def compute_bitreversal(N, R):
-
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-
-    return tps
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cfft_radix4_q16.h.tpl",
-        help='Path to mako template')
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-d",
-        "--dimension",
-        type=int,
-        required=False,
-        default=64,
-        help='Input dimension'
-    )
-
-    args = parser.parse_args()
-
-    # Create sparse matrix
-    Len = args.dimension
-    Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16)
-    Result = compute_result(Input, Len)
-    Twiddles = compute_twiddles(Len)
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2)))
-
-    tolerance = {
-        16: 16,
-        32: 20,
-        64: 24,
-        128: 28,
-        256: 32,
-        512: 48,
-        1024: 64,
-        2048: 96,
-        4096: 128}
-
-    kwargs = {'name': 'data_cfft_radix4_q16',
-              'vector_inp': Input,
-              'vector_res': Result,
-              'vector_twi': Twiddles,
-              'vector_bitrev': Bitreversal,
-              'Len': Len,
-              'Log2Len': int(np.log2(Len)),
-              'BitrevLen': len(Bitreversal),
-              'tolerance': tolerance[int(Len)]}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_chest_f16.py b/software/data/data_chest_f16.py
deleted file mode 100644
index 29c19e4a3..000000000
--- a/software/data/data_chest_f16.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the Channel estimation.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-
-from mako.template import Template
-
-##################
-#  write_result  #
-##################
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_chest_f16.h.tpl",
-        help='Path to mako template')
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-b",
-        "--num_beams",
-        type=int,
-        required=False,
-        default=4,
-        help='Number beams'
-    )
-    parser.add_argument(
-        "-l",
-        "--num_layers",
-        type=int,
-        required=False,
-        default=4,
-        help='Number layers'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=32,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    nb_rx = args.num_beams
-    nb_tx = args.num_layers
-    nb_samples = args.num_samples
-
-    H = np.random.randn(nb_rx, nb_tx) + 1j * np.random.randn(nb_rx, nb_tx)
-
-    vector_pilot_tx = []
-    vector_pilot_rx = []
-    vector_Hest = []
-    for k in range(nb_samples):
-
-        # Compute data
-        pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx))
-        pilot_rx = np.dot(H, pilot_tx)
-        Hest = pilot_rx[:, np.newaxis] / pilot_tx[np.newaxis, :]
-
-        # Interleaved real and imaginary parts
-        pilot_tx = np.column_stack(
-            (pilot_tx.real, pilot_tx.imag)).astype(np.float16).flatten()
-        pilot_rx = np.column_stack(
-            (pilot_rx.real, pilot_rx.imag)).astype(np.float16).flatten()
-        Hest = Hest.flatten()
-        Hest = np.column_stack((Hest.real, Hest.imag)
-                               ).astype(np.float16).flatten()
-
-        # Output vectors
-        vector_pilot_tx.append(pilot_tx)
-        vector_pilot_rx.append(pilot_rx)
-        vector_Hest.append(Hest)
-
-    vector_pilot_rx = np.concatenate(vector_pilot_rx, axis=0)
-    vector_pilot_tx = np.concatenate(vector_pilot_tx, axis=0)
-    vector_Hest = np.concatenate(vector_Hest, axis=0)
-
-    kwargs = {'name': 'data_chest_f16',
-              'pilot_rx': vector_pilot_rx,
-              'pilot_tx': vector_pilot_tx,
-              'Hest': vector_Hest,
-              'nb_tx': nb_tx,
-              'nb_rx': nb_rx,
-              'nb_samples': nb_samples}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_chest_q16.py b/software/data/data_chest_q16.py
deleted file mode 100755
index e1fca8649..000000000
--- a/software/data/data_chest_q16.py
+++ /dev/null
@@ -1,160 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the Channel estimation.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-
-from mako.template import Template
-
-##################
-#  write_result  #
-##################
-
-
-def gen_data_header_file(
-        outdir: pathlib.Path.cwd(),
-        tpl: pathlib.Path.cwd(),
-        **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-######################
-# Fixpoint Functions #
-######################
-
-
-def q_sat(x):
-    if x > 2**15 - 1:
-        return x - 2**16
-    elif x < -2**15:
-        return x + 2**16
-    else:
-        return x
-
-
-def compute_chest_q16(in_rx, in_tx, p):
-    n_rx = in_rx.size
-    n_tx = in_tx.size
-    result = np.zeros(2 * (n_tx * n_rx), dtype=np.int16)
-    for i in range(n_rx):
-        a_r = in_rx[i].real
-        a_i = in_rx[i].imag
-        for j in range(n_tx):
-            b_r = in_tx[j].real
-            b_i = in_tx[j].imag
-
-#            # Compute data division
-#            den = (2**16) // (b_r * b_r + b_i * b_i)
-#            num_r = (a_r * b_r) + (a_i * b_i)
-#            num_i = (a_i * b_r) - (a_r * b_i)
-#            result[2 * (i * n_tx + j)] = q_sat((num_r * den) // 2**p)
-#            result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p)
-
-            # Compute data multiplication
-            num_r = (a_r * b_r) - (a_i * b_i)
-            num_i = (a_i * b_r) + (a_r * b_i)
-            result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p)
-            result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p)
-    return result
-
-
-def generate_chest_q16(nb_tx, nb_rx, nb_samples):
-    FIXED_POINT = 8
-    MAX = 2**7
-
-    qvector_pilot_tx = []
-    qvector_pilot_rx = []
-    qvector_Hest = []
-    for k in range(nb_samples):
-        # Create pilots
-        pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_rx)
-        pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \
-            np.random.randint(-MAX, MAX - 1, size=nb_tx)
-        # Compute Hest
-        Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT)
-
-        pilot_tx = np.column_stack(
-            (pilot_tx.imag, pilot_tx.real)).astype(
-            np.int16).flatten()
-        pilot_rx = np.column_stack(
-            (pilot_rx.imag, pilot_rx.real)).astype(
-            np.int16).flatten()
-        qvector_pilot_tx.append(pilot_tx)
-        qvector_pilot_rx.append(pilot_rx)
-        qvector_Hest.append(Hest)
-
-    qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples])
-    qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples])
-    qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples])
-    return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-b",
-        "--num_rx",
-        type=int,
-        required=False,
-        default=32,
-        help='Number beams'
-    )
-    parser.add_argument(
-        "-l",
-        "--num_tx",
-        type=int,
-        required=False,
-        default=4,
-        help='Number layers'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=32,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    nb_tx = args.num_tx
-    nb_rx = args.num_rx
-    nb_samples = args.num_samples
-
-    pilot_tx, pilot_rx, Hest = generate_chest_q16(nb_tx, nb_rx, nb_samples)
-    tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_q16.h.tpl"
-    kwargs = {'name': 'data_chest_q16',
-              'pilot_tx': pilot_tx,
-              'pilot_rx': pilot_rx,
-              'Hest': Hest,
-              'nb_tx': nb_tx,
-              'nb_rx': nb_rx,
-              'nb_samples': nb_samples}
-    gen_data_header_file(args.outdir, tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cholesky_f16.py b/software/data/data_cholesky_f16.py
deleted file mode 100644
index 32dfa8df9..000000000
--- a/software/data/data_cholesky_f16.py
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 cholesky.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cholesky_f16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-n",
-        "--dimension",
-        type=int,
-        required=False,
-        default=4,
-        help='Matrix dimension'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=256,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    n_matrix = args.dimension
-    n_samples = args.num_samples
-
-    vector_G = []
-    vector_L = []
-    for k in range(n_samples):
-        # Create hermitian matrix
-        H = np.random.rand(n_matrix, n_matrix) + 1.j * \
-            np.random.rand(n_matrix, n_matrix)
-        # Matrix to be inverted
-        # H_H = np.asmatrix(H).H
-        G = np.matmul(H, np.asmatrix(H).H)
-        # Cholesky decomposition
-        L = np.linalg.cholesky(G)
-        # Reshape
-        G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C')
-        L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C')
-        G = np.column_stack((G.real, G.imag)).astype(np.float16).flatten()
-        L = np.column_stack((L.real, L.imag)).astype(np.float16).flatten()
-        # Output vectors
-        vector_G.append(G)
-        vector_L.append(L)
-
-    vector_G = np.concatenate(vector_G, axis=0)
-    vector_L = np.concatenate(vector_L, axis=0)
-
-    kwargs = {'name': 'data_cholesky_f16', 'G': vector_G,
-              'L': vector_L, 'n_matrix': n_matrix, 'n_samples': n_samples}
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cholesky_q16.py b/software/data/data_cholesky_q16.py
deleted file mode 100644
index d342f3fb9..000000000
--- a/software/data/data_cholesky_q16.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 cholesky.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cholesky_q16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-n",
-        "--dimension",
-        type=int,
-        required=False,
-        default=4,
-        help='Matrix dimension'
-    )
-    parser.add_argument(
-        "-s",
-        "--num_samples",
-        type=int,
-        required=False,
-        default=256,
-        help='Number samples'
-    )
-
-    args = parser.parse_args()
-    n_matrix = args.dimension
-    n_samples = args.num_samples
-
-    vector_G = []
-    vector_L = []
-    for k in range(n_samples):
-        # Create hermitian matrix
-        H = np.random.randint(-2**(15), 2**(15) - 1,
-                              n_matrix * n_matrix, dtype=np.int16) \
-            + 1.j * np.random.randint(-2**(15), 2**(15) - 1,
-                                      n_matrix * n_matrix, dtype=np.int16)
-        H = H.reshape(n_matrix, n_matrix)
-        # Matrix to be inverted
-        H_h = (np.asmatrix(H).H)
-        # H_H = np.asmatrix(H).H
-        G = H_h * H
-        # Cholesky decomposition
-        L = np.linalg.cholesky(G)
-        # Reshape
-        G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C')
-        L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C')
-        G = np.column_stack((G.real, G.imag)).astype(np.int16).flatten()
-        L = np.column_stack((L.real, L.imag)).astype(np.int16).flatten()
-        # Output vectors
-        vector_G.append(G)
-        vector_L.append(L)
-
-    vector_G = np.concatenate(vector_G, axis=0)
-    vector_L = np.concatenate(vector_L, axis=0)
-
-    kwargs = {'name': 'data_cholesky_q16',
-              'G': vector_G,
-              'L': vector_L,
-              'n_matrix': n_matrix,
-              'n_samples': n_samples}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cholesky_q32.py b/software/data/data_cholesky_q32.py
deleted file mode 100644
index acadcc135..000000000
--- a/software/data/data_cholesky_q32.py
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 cholesky.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from scipy.linalg import solve_triangular
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cholesky_q32.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-n",
-        "--dimension",
-        type=int,
-        required=False,
-        default=4,
-        help='Matrix dimension'
-    )
-
-    args = parser.parse_args()
-    n_matrix = args.dimension
-
-    # Create hermitian matrix
-    L = np.random.randint(-2**(15), 2**(15) - 1,
-                          size=(n_matrix, n_matrix), dtype=np.int32)
-    L = np.tril(L).astype(np.int32)
-    G = np.dot(np.asmatrix(L), np.asmatrix(L).transpose())
-
-    y = np.random.randint(-2**(15), 2**(15) - 1, n_matrix, dtype=np.int32)
-
-    # Linear system solution
-    y = solve_triangular(L, y, lower=True)
-    # x = solve_triangular(np.asmatrix(L).T, y)
-
-    # Reshape
-    G = np.reshape(
-        np.asarray(G),
-        (n_matrix * n_matrix),
-        order='C').astype(
-        np.int32)
-    L = np.reshape(
-        np.asarray(L),
-        (n_matrix * n_matrix),
-        order='C').astype(
-        np.int32)
-    y = np.reshape(np.asarray(y), (n_matrix), order='C').astype(np.int32)
-
-    kwargs = {'name': 'data_cholesky_q32',
-              'G': G,
-              'L': L,
-              'y': y,
-              'n_matrix': n_matrix}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_cmatmul_f16.py b/software/data/data_cmatmul_f16.py
deleted file mode 100644
index b3010977b..000000000
--- a/software/data/data_cmatmul_f16.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_cmatmul_f16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create sparse matrix
-    A = np.random.rand(matrix_M, matrix_N) + 1j * \
-        np.random.rand(matrix_M, matrix_N)
-    B = np.random.rand(matrix_N, matrix_P) + 1j * \
-        np.random.rand(matrix_N, matrix_P)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C')
-    B = np.reshape(B, (matrix_N * matrix_P), order='C')
-    C = np.reshape(C, (matrix_M * matrix_P), order='C')
-
-    A = np.column_stack((A.imag, A.real)).astype(np.float16).flatten()
-    B = np.column_stack((B.imag, B.real)).astype(np.float16).flatten()
-    C = np.column_stack((C.imag, C.real)).astype(np.float16).flatten()
-
-    kwargs = {
-        'name': 'data_cmatmul_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/runtime/data/data_cmatmul_q16.h.tpl b/software/data/data_cmatmul_q16.h.tpl
similarity index 100%
rename from software/runtime/data/data_cmatmul_q16.h.tpl
rename to software/data/data_cmatmul_q16.h.tpl
diff --git a/software/data/data_matmulf16.py b/software/data/data_matmulf16.py
deleted file mode 100644
index 2c362208b..000000000
--- a/software/data/data_matmulf16.py
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_matmul_f16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create matrix
-    A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16)
-    B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16)
-
-    kwargs = {
-        'name': 'data_matmul_f16',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_matmulf32.py b/software/data/data_matmulf32.py
deleted file mode 100644
index 15086d0fc..000000000
--- a/software/data/data_matmulf32.py
+++ /dev/null
@@ -1,112 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp32 matmul.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_matmul_f32.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-
-    parser.add_argument(
-        "-m",
-        "--dim_m",
-        type=int,
-        required=False,
-        default=16,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-n",
-        "--dim_n",
-        type=int,
-        required=False,
-        default=16,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-p",
-        "--dim_p",
-        type=int,
-        required=False,
-        default=16,
-        help='Third dimension.'
-    )
-
-    args = parser.parse_args()
-
-    matrix_M = args.dim_m
-    matrix_N = args.dim_n
-    matrix_P = args.dim_p
-
-    # Create matrix
-    A = np.random.rand(matrix_M, matrix_N)
-    B = np.random.rand(matrix_N, matrix_P)
-    C = np.matmul(A, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32)
-    B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32)
-    C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32)
-
-    kwargs = {
-        'name': 'data_matmul_f32',
-        'A': A,
-        'B': B,
-        'C': C,
-        'matrix_M': matrix_M,
-        'matrix_N': matrix_N,
-        'matrix_P': matrix_P}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_mimo_mmse_f16.py b/software/data/data_mimo_mmse_f16.py
deleted file mode 100644
index ff41e7e18..000000000
--- a/software/data/data_mimo_mmse_f16.py
+++ /dev/null
@@ -1,185 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 mmse.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-from scipy.linalg import solve_triangular
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def gen_input_data(N_rx, N_tx, y):
-    # Create channel matrix
-    H = np.random.rand(N_rx, N_tx).astype(np.float16) + 1.j * \
-        np.random.rand(N_rx, N_tx).astype(np.float16)
-    # Generate noise variance
-    sigma = np.diag(np.random.rand(N_tx, N_tx).astype(np.float16))
-
-    # Matrix to be inverted in MMSE estimator
-    H_h = (np.asmatrix(H).H)
-
-    G = H_h * H
-    G = G + np.diag(sigma)
-    # Cholesky decomposition
-    L = np.linalg.cholesky(G)
-    # Linear system solution
-    y1 = np.transpose(np.dot(H_h, y))
-    y2 = solve_triangular(L, y1, lower=True)
-    x = solve_triangular(np.asmatrix(L).H, y2)
-
-    sigma = sigma + 0j
-    H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C')
-    G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C')
-    L = np.reshape(np.asarray(L), (N_tx * N_tx), order='C')
-    sigma = np.column_stack((sigma.real, sigma.imag)
-                            ).astype(np.float16).flatten()
-    H = np.column_stack((H.real, H.imag)).astype(np.float16).flatten()
-    G = np.column_stack((G.real, G.imag)).astype(np.float16).flatten()
-    L = np.column_stack((L.real, L.imag)).astype(np.float16).flatten()
-
-    y = np.column_stack((y.real, y.imag)).astype(np.float16).flatten()
-    x = np.column_stack((x.real, x.imag)).astype(np.float16).flatten()
-
-    return sigma, H, G, y, x
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_mimo_mmse_f16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-n",
-        "--transmitters",
-        type=int,
-        required=False,
-        default=4,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-m",
-        "--receivers",
-        type=int,
-        required=False,
-        default=32,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-k",
-        "--iterations",
-        type=int,
-        required=False,
-        default=256,
-        help='Iterations.'
-    )
-    parser.add_argument(
-        "-r",
-        "--randomize",
-        type=int,
-        required=False,
-        default=0,
-        help='Randomizes the number of beamgroups on each subcarrier.'
-    )
-
-    args = parser.parse_args()
-    N_tx = args.transmitters
-    N_rx = args.receivers
-    N_itr = args.iterations
-
-    sigma = np.zeros([N_itr, 2 * N_tx])
-    H_RI = np.zeros([N_itr, 2 * N_tx * N_rx])
-    G_RI = np.zeros([N_itr, 2 * N_tx * N_tx])
-    y_RI = np.zeros([N_itr, 2 * N_rx])
-    x_RI = np.zeros([N_itr, 2 * N_tx])
-    beamgroups = np.zeros(N_itr)
-
-    for k in range(N_itr):
-
-        # Create input vector
-        y_bg = np.random.rand(N_rx).astype(np.float16) + 1.j * \
-            np.random.rand(N_rx).astype(np.float16)
-        if (args.randomize == 1):
-            N_beamgroups = 2 ** np.random.randint(0, np.log2(2 * N_tx))
-        else:
-            N_beamgroups = 1
-        N_tx_itr = N_tx // N_beamgroups
-        beamgroups[k] = N_beamgroups
-
-        for i in range(N_beamgroups):
-
-            sigma_itr, H_itr, G_itr, y_itr, x_itr = gen_input_data(
-                N_rx, N_tx_itr, y_bg)
-            sigma[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = sigma_itr
-            H_RI[k, (i * 2 * N_tx_itr * N_rx)
-                     :((i + 1) * 2 * N_tx_itr * N_rx)] = H_itr
-            G_RI[k, (i * 2 * N_tx_itr * N_tx_itr)
-                     :((i + 1) * 2 * N_tx_itr * N_tx_itr)] = G_itr
-            y_RI[k, :] = y_itr
-            x_RI[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = x_itr
-
-    sigma = np.reshape(sigma, (2 * N_tx * N_itr)).astype(np.float16)
-    H_RI = np.reshape(H_RI, (2 * N_rx * N_tx * N_itr)).astype(np.float16)
-    G_RI = np.reshape(G_RI, (2 * N_tx * N_tx * N_itr)).astype(np.float16)
-    y_RI = np.reshape(y_RI, (2 * N_rx * N_itr)).astype(np.float16)
-    x_RI = np.reshape(x_RI, (2 * N_tx * N_itr)).astype(np.float16)
-    beamgroups = beamgroups.astype(np.int32)
-
-    kwargs = {'name': 'data_mimo_mmse_f16',
-              'H': H_RI,
-              'G': G_RI,
-              'sigma': sigma,
-              'y': y_RI,
-              'x': x_RI,
-              'beamgroups': beamgroups,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': N_itr}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_mimo_mmse_f32.py b/software/data/data_mimo_mmse_f32.py
deleted file mode 100644
index 26515e03d..000000000
--- a/software/data/data_mimo_mmse_f32.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp32 mmse.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-from scipy.linalg import solve_triangular
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def gen_input_data(N_rx, N_tx):
-
-    # Create channel matrix
-    H = np.random.rand(N_rx, N_tx).astype(np.float32) + 1.j * \
-        np.random.rand(N_rx, N_tx).astype(np.float32)
-    # Create input vector
-    y = np.random.rand(N_rx).astype(np.float32) + 1.j * \
-        np.random.rand(N_rx).astype(np.float32)
-    # Generate noise variance
-    sigma = np.diag(np.random.rand(N_tx, N_tx).astype(np.float32))
-
-    # Matrix to be inverted in MMSE estimator
-    H_h = np.asmatrix(H).H
-
-    G = H_h * H
-    G = G + np.diag(sigma)
-    # Cholesky decomposition
-    L = np.linalg.cholesky(G)
-    # Linear system solution
-    y1 = np.transpose(np.dot(H_h, y))
-    y2 = solve_triangular(L, y1, lower=True)
-    x = solve_triangular(np.asmatrix(L).H, y2)
-
-    H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C')
-    G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C')
-    L = np.reshape(np.asarray(L), (N_tx * N_tx), order='C')
-    H = np.column_stack((H.real, H.imag)).astype(np.float32).flatten()
-    G = np.column_stack((G.real, G.imag)).astype(np.float32).flatten()
-    L = np.column_stack((L.real, L.imag)).astype(np.float32).flatten()
-
-    y = np.column_stack((y.real, y.imag)).astype(np.float32).flatten()
-    x = np.column_stack((x.real, x.imag)).astype(np.float32).flatten()
-
-    return sigma, H, G, y, x
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_mimo_mmse_f32.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-n",
-        "--transmitters",
-        type=int,
-        required=False,
-        default=4,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-m",
-        "--receivers",
-        type=int,
-        required=False,
-        default=32,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-k",
-        "--iterations",
-        type=int,
-        required=False,
-        default=256,
-        help='Iterations.'
-    )
-
-    args = parser.parse_args()
-    N_tx = args.transmitters
-    N_rx = args.receivers
-    itr = args.iterations
-
-    sigma = np.zeros([itr, N_tx])
-    H_RI = np.zeros([itr, 2 * N_tx * N_rx])
-    G_RI = np.zeros([itr, 2 * N_tx * N_tx])
-    y_RI = np.zeros([itr, 2 * N_rx])
-    x_RI = np.zeros([itr, 2 * N_tx])
-    for k in range(itr):
-        sigma[k, :], H_RI[k, :], G_RI[k, :], \
-            y_RI[k, :], x_RI[k, :] = gen_input_data(N_rx, N_tx)
-
-    sigma = np.reshape(sigma, (N_tx * itr))
-    H_RI = np.reshape(H_RI, (2 * N_rx * N_tx * itr))
-    G_RI = np.reshape(G_RI, (2 * N_tx * N_tx * itr))
-    y_RI = np.reshape(y_RI, (2 * N_rx * itr))
-    x_RI = np.reshape(x_RI, (2 * N_tx * itr))
-
-    kwargs = {'name': 'data_mimo_mmse_f32',
-              'H': H_RI,
-              'G': G_RI,
-              'sigma': sigma,
-              'y': y_RI,
-              'x': x_RI,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': itr}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_mimo_mmse_q16.py b/software/data/data_mimo_mmse_q16.py
deleted file mode 100644
index 718978824..000000000
--- a/software/data/data_mimo_mmse_q16.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Solderpad Hardware License, Version 0.51, see LICENSE for details.
-# SPDX-License-Identifier: SHL-0.51
-
-# This script generates data for the fp16 mmse.
-# Author: Marco Bertuletti <mbertuletti@iis.ee.ethz.ch>
-
-import numpy as np
-import argparse
-import pathlib
-from mako.template import Template
-from scipy.linalg import solve_triangular
-
-
-##################
-# compute_result #
-##################
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def gen_input_data(N_rx, N_tx):
-
-    # Create channel matrix
-    H = np.random.randint(-2**(15), 2**(15) - 1,
-                          N_rx * N_tx, dtype=np.int16) \
-        + 1.j * np.random.randint(-2**(15), 2**(15) - 1,
-                                  N_rx * N_tx, dtype=np.int16)
-    H = H.reshape(N_rx, N_tx)
-    # Create input vector
-    y = np.random.randint(-2**(15), 2**(15) - 1, N_rx, dtype=np.int16) + \
-        1.j * np.random.randint(-2**(15), 2**(15) - 1, N_rx, dtype=np.int16)
-    # Generate noise variance
-    sigma = np.random.randint(-2**(15), 2**(15) - 1, N_tx, dtype=np.int16)
-
-    # Matrix to be inverted in MMSE estimator
-    H_h = (np.asmatrix(H).H)
-
-    # Hermitian
-    G = H_h * H + np.diag(sigma)
-    # Matrix vector product
-    y1 = np.transpose(np.dot(H_h, y))
-
-    # Cholesky decomposition
-    # L = np.linalg.cholesky(G)
-    L = G
-    # Linear system solution
-    y2 = solve_triangular(L, y1, lower=True)
-    x = solve_triangular(np.asmatrix(L).H, y2)
-
-    sigma = sigma + 0j
-    H = np.reshape(np.asarray(H), (N_rx * N_tx), order='C')
-    G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C')
-    L = np.reshape(np.asarray(L), (N_tx * N_tx), order='C')
-    sigma = np.column_stack(
-        (sigma.real, sigma.imag)).astype(
-        np.int16).flatten()
-    H = np.column_stack((H.real, H.imag)).astype(np.int16).flatten()
-    G = np.column_stack((G.real, G.imag)).astype(np.int16).flatten()
-    L = np.column_stack((L.real, L.imag)).astype(np.int16).flatten()
-    y = np.column_stack((y.real, y.imag)).astype(np.int16).flatten()
-    x = np.column_stack((x.real, x.imag)).astype(np.int16).flatten()
-
-    return sigma, H, G, y, x
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() /
-        "data_mimo_mmse_q16.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-n",
-        "--transmitters",
-        type=int,
-        required=False,
-        default=4,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-m",
-        "--receivers",
-        type=int,
-        required=False,
-        default=32,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-k",
-        "--iterations",
-        type=int,
-        required=False,
-        default=1,
-        help='Iterations.'
-    )
-
-    args = parser.parse_args()
-    N_tx = args.transmitters
-    N_rx = args.receivers
-    itr = args.iterations
-
-    sigma = np.zeros([itr, 2 * N_tx], dtype=np.int16)
-    H_RI = np.zeros([itr, 2 * N_tx * N_rx], dtype=np.int16)
-    G_RI = np.zeros([itr, 2 * N_tx * N_tx], dtype=np.int16)
-    y_RI = np.zeros([itr, 2 * N_rx], dtype=np.int16)
-    x_RI = np.zeros([itr, 2 * N_tx], dtype=np.int16)
-    for k in range(itr):
-        [sigma[k, :],
-            H_RI[k, :],
-            G_RI[k, :],
-            y_RI[k, :],
-            x_RI[k, :]] = gen_input_data(N_rx, N_tx)
-
-    sigma = np.reshape(sigma, (2 * N_tx * itr)).astype(np.int16)
-    H_RI = np.reshape(H_RI, (2 * N_rx * N_tx * itr)).astype(np.int16)
-    G_RI = np.reshape(G_RI, (2 * N_tx * N_tx * itr)).astype(np.int16)
-    y_RI = np.reshape(y_RI, (2 * N_rx * itr)).astype(np.int16)
-    x_RI = np.reshape(x_RI, (2 * N_tx * itr)).astype(np.int16)
-
-    kwargs = {'name': 'data_mimo_mmse_q16',
-              'H': H_RI,
-              'G': G_RI,
-              'sigma': sigma,
-              'y': y_RI,
-              'x': x_RI,
-              'N_tx': N_tx,
-              'N_rx': N_rx,
-              'N_itr': itr}
-
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/data/data_ofdm.py b/software/data/data_ofdm.py
deleted file mode 100644
index 64b0a7ca6..000000000
--- a/software/data/data_ofdm.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2022 ETH Zurich and University of Bologna.
-# Licensed under the Apache License, Version 2.0, see LICENSE for details.
-# SPDX-License-Identifier: Apache-2.0
-
-# Author: Marco Bertuletti, ETH Zurich
-
-import numpy as np
-import math as M
-import argparse
-import pathlib
-from mako.template import Template
-from sympy.combinatorics import Permutation
-
-##################
-# compute_result #
-##################
-
-
-def compute_bitreversal(N, R):
-    # Decompose
-    logR2 = []
-    idx = N
-    while (idx >= R):
-        logR2.append(int(M.log2(R)))
-        idx = idx // R
-    if (idx > 1):
-        logR2.append(int(M.log2(idx)))
-    # Bitreversal
-    indexes = []
-    for x in range(N):
-        result = 0
-        for bits in logR2:
-            mask = (0xffffffff >> (32 - bits))
-            result = (result << bits) | (x & mask)
-            x = x >> bits
-        indexes.append(result)
-
-    # Create transpositions table
-    tps = []
-    for c in Permutation.from_sequence(indexes).cyclic_form:
-        for i in range(len(c) - 1):
-            tps.append([c[i] * 8, c[-1] * 8])
-    return tps
-
-
-def gen_data_header_file(outdir: pathlib.Path.cwd(),
-                         tpl: pathlib.Path.cwd(), **kwargs):
-
-    file = outdir / f"data_{kwargs['name']}.h"
-
-    print(tpl, outdir, kwargs['name'])
-
-    template = Template(filename=str(tpl))
-    with file.open('w') as f:
-        f.write(template.render(**kwargs))
-
-
-def main():
-
-    parser = argparse.ArgumentParser(description='Generate data for kernels')
-    parser.add_argument(
-        "-o",
-        "--outdir",
-        type=pathlib.Path,
-        default=pathlib.Path(__file__).parent.absolute(),
-        required=False,
-        help='Select out directory of generated data files'
-    )
-    parser.add_argument(
-        "-t",
-        "--tpl",
-        type=pathlib.Path,
-        required=False,
-        default=pathlib.Path(__file__).parent.absolute() / "data_ofdm.h.tpl",
-        help='Path to mako template'
-    )
-    parser.add_argument(
-        "-v",
-        "--verbose",
-        action='store_true',
-        help='Set verbose'
-    )
-    parser.add_argument(
-        "-rx",
-        "--receivers",
-        type=int,
-        required=False,
-        default=64,
-        help='First dimension.'
-    )
-    parser.add_argument(
-        "-bs",
-        "--beams",
-        type=int,
-        required=False,
-        default=32,
-        help='Second dimension.'
-    )
-    parser.add_argument(
-        "-sc",
-        "--subcarriers",
-        type=int,
-        required=False,
-        default=4096,
-        help='Iterations.'
-    )
-
-    args = parser.parse_args()
-    N_rx = args.receivers
-    N_bs = args.beams
-    N_sc = args.subcarriers
-
-    pFFT_src = (np.random.rand(2 * N_rx * N_sc)).astype(np.float16)
-    pTw_coef = (np.random.rand(int(3 * N_sc / 4))).astype(np.float16)
-    pBF_coef = (np.random.rand(2 * N_rx * N_bs)).astype(np.float16)
-    pBF_dst = (np.random.rand(2 * N_bs * N_sc)).astype(np.float16)
-
-    Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(N_sc, 2)))
-
-    kwargs = {'name': 'ofdm',
-              'pFFT_src': pFFT_src,
-              'pTw_coef': pTw_coef,
-              'pBF_coef': pBF_coef,
-              'pBF_dst': pBF_dst,
-              'bitrev': Bitreversal,
-              'N_rx': N_rx,
-              'N_bs': N_bs,
-              'N_sc': N_sc,
-              'Log2Len': int(np.log2(N_sc)),
-              'BitrevLen': len(Bitreversal)}
-    gen_data_header_file(args.outdir, args.tpl, **kwargs)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/software/runtime/data/generate_cfft.py b/software/data/generate_cfft.py
similarity index 100%
rename from software/runtime/data/generate_cfft.py
rename to software/data/generate_cfft.py
diff --git a/software/runtime/data/generate_chest.py b/software/data/generate_chest.py
similarity index 98%
rename from software/runtime/data/generate_chest.py
rename to software/data/generate_chest.py
index 058ce2abf..e11eb8b62 100755
--- a/software/runtime/data/generate_chest.py
+++ b/software/data/generate_chest.py
@@ -103,8 +103,8 @@ def compute_chest_q16(in_rx, in_tx, p):
 #            result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p)
 
             # Compute data multiplication
-            num_r = (a_r * b_r) + (a_i * b_i)
-            num_i = (a_i * b_r) - (a_r * b_i)
+            num_r = (a_r * b_r) - (a_i * b_i)
+            num_i = (a_i * b_r) + (a_r * b_i)
             result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p)
             result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p)
     return result
diff --git a/software/runtime/data/generate_cholesky.py b/software/data/generate_cholesky.py
similarity index 95%
rename from software/runtime/data/generate_cholesky.py
rename to software/data/generate_cholesky.py
index a72dc210f..1a25c4206 100644
--- a/software/runtime/data/generate_cholesky.py
+++ b/software/data/generate_cholesky.py
@@ -64,11 +64,10 @@ def generate_cholesky_q16(n_matrix, n_samples):
     vector_L = []
     for k in range(n_samples):
         # Create hermitian matrix
-        H = np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix, dtype=np.int16) \
-            + 1.j * np.random.randint(-2**(15),
-                                      2**(15) - 1,
-                                      n_matrix * n_matrix,
-                                      dtype=np.int16)
+        H = np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix,
+                              dtype=np.int16) + \
+            1.j * np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix,
+                                    dtype=np.int16)
         H = H.reshape(n_matrix, n_matrix)
         # Matrix to be inverted
         H_h = (np.asmatrix(H).H)
diff --git a/software/runtime/data/generate_matmul.py b/software/data/generate_matmul.py
similarity index 100%
rename from software/runtime/data/generate_matmul.py
rename to software/data/generate_matmul.py
diff --git a/software/runtime/data/generate_mimo_mmse.py b/software/data/generate_mimo_mmse.py
similarity index 93%
rename from software/runtime/data/generate_mimo_mmse.py
rename to software/data/generate_mimo_mmse.py
index 454976f27..5e95d3ef6 100644
--- a/software/runtime/data/generate_mimo_mmse.py
+++ b/software/data/generate_mimo_mmse.py
@@ -137,9 +137,10 @@ def generate_mimo_mmse_f16(N_tx, N_rx, N_itr, randomize):
             x = np.column_stack((x.real, x.imag)).astype(np.float16).flatten()
 
             vSigma[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = sigma
-            vH[k, (i * 2 * N_tx_itr * N_rx):((i + 1) * 2 * N_tx_itr * N_rx)] = H
-            vG[k, (i * 2 * N_tx_itr * N_tx_itr)
-                   :((i + 1) * 2 * N_tx_itr * N_tx_itr)] = G
+            vH[k, (i * 2 * N_tx_itr * N_rx):(
+                (i + 1) * 2 * N_tx_itr * N_rx)] = H
+            vG[k, (i * 2 * N_tx_itr * N_tx_itr):(
+                (i + 1) * 2 * N_tx_itr * N_tx_itr)] = G
             vy[k, :] = y
             vx[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = x
 
@@ -162,13 +163,15 @@ def generate_mimo_mmse_q16(N_tx, N_rx, N_itr):
     vx = np.zeros([N_itr, 2 * N_tx], dtype=np.int16)
     for k in range(N_itr):
         # Create channel matrix
-        H = np.random.randint(-2**(15), 2**(15) - 1, N_rx * N_tx, dtype=np.int16) \
-            + 1.j * np.random.randint(-2**(15), 2 **
-                                      (15) - 1, N_rx * N_tx, dtype=np.int16)
+        H = np.random.randint(-2**(15), 2**(15) - 1, N_rx * N_tx,
+                              dtype=np.int16) + \
+            1.j * np.random.randint(-2**(15), 2 ** (15) - 1,
+                                    N_rx * N_tx, dtype=np.int16)
         # Create input vector
-        y = np.random.randint(-2**(15), 2**(15) - 1, N_rx, dtype=np.int16) \
-            + 1.j * np.random.randint(-2**(15), 2 **
-                                      (15) - 1, N_rx, dtype=np.int16)
+        y = np.random.randint(-2**(15), 2**(15) - 1, N_rx,
+                              dtype=np.int16) + \
+            1.j * np.random.randint(-2**(15), 2 ** (15) - 1, N_rx,
+                                    dtype=np.int16)
         # Generate noise variance
         sigma = np.random.randint(-2**(15), 2**(15) - 1, N_tx, dtype=np.int16)
 
@@ -238,7 +241,7 @@ def main():
         "--iterations",
         type=int,
         required=False,
-        default=1,
+        default=32,
         help='Iterations.'
     )
 
@@ -261,7 +264,7 @@ def main():
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
     vSigma, vH, vG, vy, vx, beamgroups = generate_mimo_mmse_f16(
-        N_tx, N_rx, N_itr, 1)
+        N_tx, N_rx, N_itr, 0)
     tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f16.h.tpl"
     kwargs = {'name': 'data_mimo_mmse_f16',
               'H': vH,
diff --git a/software/runtime/data/generate_ofdm.py b/software/data/generate_ofdm.py
similarity index 100%
rename from software/runtime/data/generate_ofdm.py
rename to software/data/generate_ofdm.py
diff --git a/software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h b/software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h
deleted file mode 100644
index 5196fc30d..000000000
--- a/software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#include "xpulp/builtins_v2.h"
-
-/**
-  @brief         First butterfly stage.
-  @param[in]     pIn  points to input buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[out]    pOut  points to output buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[in]     i0 points to the first element to be processed
-  @param[in]     n2 number of elements in the first wing of the butterfly
-  @param[in]     CoSi1 packed cosine and sine first twiddle
-  @param[in]     CoSi2 packed cosine and sine second twiddle
-  @param[in]     CoSi3 packed cosine and sine third twiddle
-  @param[in]     C1 packed sine and cosine first twiddle
-  @param[in]     C2 packed sine and cosine second twiddle
-  @param[in]     C3 packed sine and cosine third twiddle
-  @return        none
-*/
-static inline void radix4_butterfly(__fp16 *pIn, __fp16 *pOut,
-                                    uint32_t i0, uint32_t n2, v2h CoSi1,
-                                    v2h CoSi2, v2h CoSi3, v2h C1, v2h C2,
-                                    v2h C3) {
-  uint32_t i1, i2, i3;
-  __fp16 t0, t1, t2, t3, t4, t5;
-  v2h A, B, C, D, E, F, G, H;
-
-#if defined(FOLDED) || defined(SCHEDULED)
-  /* index calculation for the input as, */
-  /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
-  i1 = i0 + N_BANKS;
-  i2 = i1 + N_BANKS;
-  i3 = i2 + N_BANKS;
-  uint32_t n2_store = n2 >> 2U;
-  uint32_t i0_store =
-      (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS;
-  uint32_t i1_store = i0_store + n2_store;
-  uint32_t i2_store = i1_store + n2_store;
-  uint32_t i3_store = i2_store + n2_store;
-#else
-  /* index calculation for the input as, */
-  /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
-  i1 = i0 + n2;
-  i2 = i1 + n2;
-  i3 = i2 + n2;
-#endif
-  /* Read ya (real), xa (imag) input */
-  A = *(v2h *)&pIn[i0 * 2U];
-  /* Read yb (real), xb(imag) input */
-  B = *(v2h *)&pIn[i1 * 2U];
-  /* Read yc (real), xc(imag) input */
-  C = *(v2h *)&pIn[i2 * 2U];
-  /* Read yd (real), xd(imag) input */
-  D = *(v2h *)&pIn[i3 * 2U];
-  asm volatile(
-               // xa + xc, ya + yc
-               "vfadd.h  %[E],%[A],%[C];"
-               // xa - xc, ya - yc
-               "vfsub.h  %[F],%[A],%[C];"
-               // xb + xd, yd + yd
-               "vfadd.h  %[G],%[B],%[D];"
-               // xb - xd, yb - yd
-               "vfsub.h  %[H],%[B],%[D];"
-               "pv.extract.h  %[t0],%[H],0;"
-               "pv.extract.h  %[t1],%[H],1;"
-               "fsub.h %[t3],zero,%[t1];"
-               "fsub.h %[t4],zero,%[t0];"
-               // yd - yb, xb - xd
-               "pv.pack.h %[C],%[t0],%[t3];"
-               // yb - yd, xd - xb
-               "pv.pack.h %[D],%[t4],%[t1];"
-               // xa + xc + xb + xd, ya + yb + yc + yd
-               "vfadd.h  %[A],%[E],%[G];"
-               // xa - xc + yb - yd, ya - yc + xd - xb
-               "vfadd.h  %[D],%[F],%[D];"
-               // xa + xc - xb - xd, ya + yc - yb - yd
-               "vfsub.h  %[B],%[E],%[G];"
-               // xa - xc - yb + yd, ya - yc + xb - xd
-               "vfadd.h  %[C],%[F],%[C];"
-               "vfdotpex.s.h  %[t0],%[CoSi1],%[D];"
-               "vfdotpex.s.h  %[t2],%[CoSi2],%[B];"
-               "vfdotpex.s.h  %[t4],%[CoSi3],%[C];"
-               "vfdotpex.s.h  %[t1],%[C1],%[D];"
-               "vfdotpex.s.h  %[t3],%[C1],%[B];"
-               "vfdotpex.s.h  %[t5],%[C3],%[C];"
-               "fcvt.h.s %[t0],%[t0];"
-               "fcvt.h.s %[t1],%[t1];"
-               "fcvt.h.s %[t2],%[t2];"
-               "fcvt.h.s %[t3],%[t3];"
-               "fcvt.h.s %[t4],%[t4];"
-               "fcvt.h.s %[t5],%[t5];"
-               "pv.pack.h %[E],%[t1],%[t0];"
-               "pv.pack.h %[F],%[t3],%[t2];"
-               "pv.pack.h %[G],%[t5],%[t4];"
-               : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D),
-                 [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H),
-                 [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3),
-                 [t4] "=&r"(t4), [t5] "=&r"(t5)
-               : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
-                 [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
-               :);
-#if defined(FOLDED) || defined(SCHEDULED)
-  *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = E;
-  *((v2h *)&pOut[i2_store * 2U]) = F;
-  *((v2h *)&pOut[i3_store * 2U]) = G;
-#else
-  *((v2h *)&pOut[i0 * 2U]) = A;
-  *((v2h *)&pOut[i1 * 2U]) = E;
-  *((v2h *)&pOut[i2 * 2U]) = F;
-  *((v2h *)&pOut[i3 * 2U]) = G;
-#endif
-
-}
-
-/**
-  @brief         Last butterfly stage.
-  @param[in]     pIn  points to input buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[out]    pOut  points to output buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[in]     i0 points to the first element to be processed
-  @return        none
-*/
-static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
-                                         uint32_t i0) {
-  __fp16 t0, t1;
-  uint32_t i1, i2, i3;
-  v2h A, B, C, D, E, F, G, H;
-
-#if defined(FOLDED) || defined(SCHEDULED)
-  /*  index calculation for the input as, */
-  /*  pIn[i0 + 0], pIn[i0 + fftLen/4],
-      pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
-  i1 = i0 + N_BANKS;
-  i2 = i1 + N_BANKS;
-  i3 = i2 + N_BANKS;
-#ifndef SCHEDULED
-  uint32_t i0_store = i0 * 4;
-  uint32_t i1_store = i0_store + 1;
-  uint32_t i2_store = i1_store + 1;
-  uint32_t i3_store = i2_store + 1;
-#endif
-#else
-  /*  index calculation for the input as, */
-  /*  pIn[i0 + 0], pIn[i0 + fftLen/4],
-      pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */
-  i1 = i0 + 1U;
-  i2 = i1 + 1U;
-  i3 = i2 + 1U;
-#endif
-
-  /* Read ya (real), xa(imag) input */
-  A = *(v2h *)&pIn[i0 * 2U];
-  /* Read yb (real), xb(imag) input */
-  B = *(v2h *)&pIn[i1 * 2U];
-  /* Read yc (real), xc(imag) input */
-  C = *(v2h *)&pIn[i2 * 2U];
-  /* Read yd (real), xd(imag) input */
-  D = *(v2h *)&pIn[i3 * 2U];
-  __fp16 t2, t3;
-  asm volatile(
-      "vfsub.h  %[H],%[B],%[D];"
-      "vfadd.h  %[G],%[B],%[D];"
-      "vfadd.h  %[E],%[A],%[C];"
-      "vfsub.h  %[F],%[A],%[C];"
-      "pv.extract.h  %[t0],%[H],0;"
-      "pv.extract.h  %[t1],%[H],1;"
-      "fsub.h %[t2], zero, %[t0];"
-      "fsub.h %[t3], zero, %[t1];"
-      "pv.pack.h %[A],%[t2],%[t1];"
-      "pv.pack.h %[B],%[t0],%[t3];"
-      "vfadd.h  %[H],%[E],%[G];"
-      "vfsub.h  %[E],%[E],%[G];"
-      "vfadd.h  %[A],%[F],%[A];"
-      "vfadd.h  %[B],%[F],%[B];"
-      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
-        [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [t0] "=&r"(t0),
-        [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3)
-      :
-      :);
-#if defined(FOLDED)
-  *((v2h *)&pOut[i0_store * 2U]) = H;
-  *((v2h *)&pOut[i1_store * 2U]) = E;
-  *((v2h *)&pOut[i2_store * 2U]) = A;
-  *((v2h *)&pOut[i3_store * 2U]) = B;
-#else
-  *((v2h *)&pOut[i0 * 2U]) = H;
-  *((v2h *)&pOut[i1 * 2U]) = E;
-  *((v2h *)&pOut[i2 * 2U]) = A;
-  *((v2h *)&pOut[i3 * 2U]) = B;
-#endif
-
-}
diff --git a/software/kernels/baremetal/mempool_cfft_radix4_f16p.h b/software/kernels/baremetal/mempool_cfft_radix4_f16p.h
deleted file mode 100644
index d2220d090..000000000
--- a/software/kernels/baremetal/mempool_cfft_radix4_f16p.h
+++ /dev/null
@@ -1,526 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#include "xpulp/builtins_v2.h"
-#define MIN(x, y) (((x) < (y)) ? (x) : (y))
-
-/**
-  @brief         Folding in local memory function
-  @param[in]     pSrc16  points to input buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[in]     fftLen  Length of the complex input vector
-  @param[in]     nPE Number of PE
-  @return        none
-*/
-
-static inline void fold_radix4(__fp16 *pSrc16, uint32_t fftLen,
-                               uint32_t core_id, uint32_t nPE) {
-  uint32_t n2, i0, i1, i2, i3;
-  uint32_t i1_store, i2_store, i3_store;
-  volatile v2h A, B, C;
-  n2 = fftLen >> 2U;
-  for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) {
-    i1 = i0 + n2;
-    i2 = i1 + n2;
-    i3 = i2 + n2;
-    A = *(v2h *)&pSrc16[i1 * 2U];
-    B = *(v2h *)&pSrc16[i2 * 2U];
-    C = *(v2h *)&pSrc16[i3 * 2U];
-    i1_store = i0 + N_BANKS;
-    i2_store = i1_store + N_BANKS;
-    i3_store = i2_store + N_BANKS;
-    *(v2h *)&pSrc16[i1_store * 2U] = A;
-    *(v2h *)&pSrc16[i2_store * 2U] = B;
-    *(v2h *)&pSrc16[i3_store * 2U] = C;
-  }
-  mempool_log_partial_barrier(2 * WU_STRIDE, WU_STRIDE * core_id,
-                              nPE * WU_STRIDE);
-}
-
-#ifdef FOLDED_TWIDDLES
-/**
-  @brief         Full FFT butterfly
-  @param[in]     pSrc16  points to input buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[out]    pDst16  points to output buffer of 16b data, Re and Im parts
-  are interleaved
-  @param[in]     fftLen  Length of the complex input vector
-  @param[in]     pCoef_src Twiddle coefficients vector
-  @param[in]     pCoef_dst Auxiliary twiddle coefficients vector
-  @param[in]     nPE Number of PE
-  @return        pointer to output vector
-*/
-__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
-                                         uint32_t fftLen, __fp16 *pCoef_src,
-                                         __fp16 *pCoef_dst, uint32_t nPE)
-#else
-/**
-  Twiddles are not folded in memory
-  @brief         Full FFT butterfly
-  @param[in]     pSrc16  points to input buffer of 16b data, Re and Im parts are
-  interleaved
-  @param[out]    pDst16  points to output buffer of 16b data, Re and Im parts
-  are interleaved
-  @param[in]     fftLen  Length of the complex input vector
-  @param[in]     pCoef_src Twiddle coefficients vector
-  @param[in]     nPE Number of PE
-  @return        pointer to output vector
-*/
-__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16,
-                                         uint32_t fftLen, __fp16 *pCoef_src,
-                                         uint32_t nPE)
-#endif
-{
-
-#ifdef FOLDED_TWIDDLES
-  uint32_t absolute_core_id = mempool_get_core_id();
-  uint32_t core_id = absolute_core_id / WU_STRIDE;
-  __fp16 t0, t1, t2, t3, t4, t5;
-  v2h CoSi1, CoSi2, CoSi3;
-  v2h C1, C2, C3;
-  uint32_t n1, n2, n2_store, i0, j, k;
-  uint32_t ic, offset, wing_idx;
-  __fp16 *pTmp;
-#else
-  uint32_t absolute_core_id = mempool_get_core_id();
-  uint32_t core_id = absolute_core_id / WU_STRIDE;
-  __fp16 t0, t1, t2, t3, t4, t5;
-  v2h CoSi1, CoSi2, CoSi3;
-  v2h C1, C2, C3;
-  uint32_t n1, n2, n2_store, i0, j, k;
-  uint32_t ic, offset, wing_id, bank_id;
-  __fp16 *pTmp;
-  uint32_t twidCoefModifier = 1U;
-#endif
-
-  if (fftLen <= N_BANKS)
-    fold_radix4(pSrc16, fftLen, core_id, nPE);
-
-  /* START OF FIRST STAGE PROCESS */
-  n1 = fftLen;
-  n2 = n1 >> 2U;
-  n2_store = n2 >> 2U;
-  for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) {
-
-#ifdef FOLDED_TWIDDLES
-    CoSi1 = *(v2h *)&pCoef_src[2U * i0];
-    CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)];
-    CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)];
-    if (i0 % 4 == 0) {
-      ic = i0 >> 2U;
-      *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1;
-      ic += N_BANKS;
-      *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2;
-      ic += N_BANKS;
-      *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3;
-    }
-#else
-    CoSi1 = *(v2h *)&pCoef_src[2U * i0];
-    CoSi2 = *(v2h *)&pCoef_src[2U * (i0 * 2U)];
-    CoSi3 = *(v2h *)&pCoef_src[2U * (i0 * 3U)];
-#endif
-    asm volatile("pv.extract.h  %[t1],%[CoSi1],0;"
-                 "pv.extract.h  %[t3],%[CoSi2],0;"
-                 "pv.extract.h  %[t5],%[CoSi3],0;"
-                 "pv.extract.h  %[t0],%[CoSi1],1;"
-                 "pv.extract.h  %[t2],%[CoSi2],1;"
-                 "pv.extract.h  %[t4],%[CoSi3],1;"
-                 "fsub.h           %[t0],zero,%[t0];"
-                 "fsub.h           %[t2],zero,%[t2];"
-                 "fsub.h           %[t4],zero,%[t4];"
-                 "pv.pack.h %[C1],%[t1],%[t0];"
-                 "pv.pack.h %[C2],%[t3],%[t2];"
-                 "pv.pack.h %[C3],%[t5],%[t4];"
-                 : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0),
-                   [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3),
-                   [t4] "=&r"(t4), [t5] "=&r"(t5)
-                 : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
-                 :);
-    radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
-                           C3);
-  }
-  pTmp = pSrc16;
-  pSrc16 = pDst16;
-  pDst16 = pTmp;
-#ifdef FOLDED_TWIDDLES
-  pTmp = pCoef_src;
-  pCoef_src = pCoef_dst;
-  pCoef_dst = pTmp;
-#else
-  twidCoefModifier <<= 2U;
-#endif
-  mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE);
-  /* END OF FIRST STAGE PROCESSING */
-
-  /* START OF MIDDLE STAGE PROCESS */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
-    n1 = n2;
-    n2 >>= 2U;
-    n2_store = n2 >> 2U;
-
-#ifdef FOLDED_TWIDDLES
-    for (j = core_id * STEP; j < core_id * STEP + STEP; j++) {
-      CoSi1 = *(v2h *)&pCoef_src[2U * j];
-      CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)];
-      CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)];
-      if (j % 4 == 0) {
-        wing_idx = j % n2;
-        offset = (j / n2);
-        ic = wing_idx >> 2U;
-        ic += offset * n2;
-        *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1;
-        ic += N_BANKS;
-        *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2;
-        ic += N_BANKS;
-        *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3;
-      }
-#else
-    bank_id = core_id / n2_store;
-    wing_id = core_id % n2_store;
-    offset = bank_id * n2;
-    for (j = wing_id * 4; j < MIN(wing_id * 4 + 4, n2); j++) {
-      ic = j * twidCoefModifier;
-      CoSi1 = *(v2h *)&pCoef_src[2U * ic];
-      CoSi2 = *(v2h *)&pCoef_src[2U * (ic * 2U)];
-      CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)];
-#endif
-      asm volatile("pv.extract.h  %[t1],%[CoSi1],0;"
-                   "pv.extract.h  %[t3],%[CoSi2],0;"
-                   "pv.extract.h  %[t5],%[CoSi3],0;"
-                   "pv.extract.h  %[t0],%[CoSi1],1;"
-                   "pv.extract.h  %[t2],%[CoSi2],1;"
-                   "pv.extract.h  %[t4],%[CoSi3],1;"
-                   "fsub.h           %[t0],zero,%[t0];"
-                   "fsub.h           %[t2],zero,%[t2];"
-                   "fsub.h           %[t4],zero,%[t4];"
-                   "pv.pack %[C1],%[t1],%[t0];"
-                   "pv.pack %[C2],%[t3],%[t2];"
-                   "pv.pack %[C3],%[t5],%[t4];"
-                   : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3),
-                     [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2),
-                     [t3] "=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5)
-                   : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
-                   :);
-#ifdef FOLDED_TWIDDLES
-      i0 = j;
-      radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
-                              C2, C3);
-    }
-#else
-      i0 = offset + j;
-      radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
-                              C2, C3);
-    }
-#endif
-    pTmp = pSrc16;
-    pSrc16 = pDst16;
-    pDst16 = pTmp;
-#ifdef FOLDED_TWIDDLES
-    pTmp = pCoef_src;
-    pCoef_src = pCoef_dst;
-    pCoef_dst = pTmp;
-#else
-    twidCoefModifier <<= 2U;
-#endif
-    mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id,
-                                nPE * WU_STRIDE);
-  }
-  /* END OF MIDDLE STAGE PROCESSING */
-
-  /* START OF LAST STAGE PROCESSING */
-  n1 = n2;
-  n2 >>= 2U;
-  for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, fftLen >> 2U);
-       i0++) {
-    radix4_butterfly_last(pSrc16, pDst16, i0);
-  }
-  mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE);
-  /* END OF LAST STAGE PROCESSING */
-
-  return pDst16;
-}
-
-/**
-  SCHEDULER OF MULTIPLE FOLDED FFTS
-  Memory:
-
-  1st row of FFTS
-
-  col_idx1     col_idx2     col_idx3
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-
-  2nd row of FFTS
-
-  col_idx1     col_idx2     col_idx3
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
-
-  ...
-
-  @brief         Scheduler of folded FFTs
-  @param[in]     column index of the current FFT
-  @param[in]     pSrc16  input buffer of 16b data, Re and Im are interleaved
-  @param[out]    pDst16  output buffer of 16b data, Re and Im are interleaved
-  @param[in]     fftLen  Length of the complex input vector
-  @param[in]     pCoef_src Twiddle coefficients vector
-  @param[in]     pCoef_dst Twiddle coefficients vector
-  @param[in]     pBitRevTable Bitreversal table
-  @param[in]     bitReverseLen Length of bitreversal table
-  @param[in]     bitReverseFlag Flag for bitreversal
-  @param[in]     nPE Number of PE
-  @return        void
-*/
-
-void mempool_radix4_cfft_q16p_scheduler(uint32_t col_id, __fp16 *pSrc16,
-                                        __fp16 *pDst16, uint32_t fftLen,
-                                        __fp16 *pCoef_src, __fp16 *pCoef_dst,
-                                        __attribute__((unused))
-                                        uint16_t *pBitRevTable,
-                                        __attribute__((unused))
-                                        uint16_t bitReverseLen,
-                                        uint8_t bitReverseFlag, uint32_t nPE) {
-
-  uint32_t absolute_core_id = mempool_get_core_id();
-  uint32_t core_id = absolute_core_id % (fftLen >> 4U);
-
-  uint32_t n1, n2, i0, ic, j, k;
-  uint32_t n2_store;
-  uint32_t offset, wing_idx;
-  __fp16 *pTmp;
-  int32_t t0, t1, t2, t3, t4, t5;
-  v2h CoSi1, CoSi2, CoSi3;
-  v2h C1, C2, C3;
-
-  /* FIRST STAGE */
-  n1 = fftLen;
-  n2 = n1 >> 2U;
-  n2_store = n2 >> 2U;
-  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
-    CoSi1 = *(v2h *)&pCoef_src[2U * i0];
-    CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)];
-    CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)];
-    if (i0 % 4 == 0) {
-      ic = i0 / 4;
-      *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1;
-      ic += N_BANKS;
-      *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2;
-      ic += N_BANKS;
-      *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3;
-      *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3;
-    }
-    asm volatile("pv.extract.h  %[t1],%[CoSi1],0;"
-                 "pv.extract.h  %[t3],%[CoSi2],0;"
-                 "pv.extract.h  %[t5],%[CoSi3],0;"
-                 "pv.extract.h  %[t0],%[CoSi1],1;"
-                 "pv.extract.h  %[t2],%[CoSi2],1;"
-                 "pv.extract.h  %[t4],%[CoSi3],1;"
-                 "fsub.h           %[t0],zero,%[t0];"
-                 "fsub.h           %[t2],zero,%[t2];"
-                 "fsub.h           %[t4],zero,%[t4];"
-                 "pv.pack.h %[C1],%[t1],%[t0];"
-                 "pv.pack.h %[C2],%[t3],%[t2];"
-                 "pv.pack.h %[C3],%[t5],%[t4];"
-                 : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0),
-                   [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3),
-                   [t4] "=&r"(t4), [t5] "=&r"(t5)
-                 : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
-                 :);
-    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-      __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8);
-      __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8);
-      radix4_butterfly(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
-                             C3);
-    }
-  }
-  pTmp = pSrc16;
-  pSrc16 = pDst16;
-  pDst16 = pTmp;
-  pTmp = pCoef_src;
-  pCoef_src = pCoef_dst;
-  pCoef_dst = pTmp;
-  mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-  /* MIDDLE STAGE */
-  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
-    n1 = n2;
-    n2 >>= 2U;
-    n2_store = n2 >> 2U;
-
-    for (j = core_id * 4; j < core_id * 4 + 4; j++) {
-      CoSi1 = *(v2h *)&pCoef_src[2U * (j)];
-      CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)];
-      CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)];
-      if (j % 4 == 0) {
-
-        wing_idx = j % n2;
-        offset = (j / n2);
-        ic = wing_idx >> 2U;
-        ic += offset * n2;
-
-        *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1;
-        ic += N_BANKS;
-        *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2;
-        ic += N_BANKS;
-        *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3;
-        *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3;
-      }
-      asm volatile("pv.extract.h  %[t1],%[CoSi1],0;"
-                   "pv.extract.h  %[t3],%[CoSi2],0;"
-                   "pv.extract.h  %[t5],%[CoSi3],0;"
-                   "pv.extract.h  %[t0],%[CoSi1],1;"
-                   "pv.extract.h  %[t2],%[CoSi2],1;"
-                   "pv.extract.h  %[t4],%[CoSi3],1;"
-                   "fsub.h           %[t0],zero,%[t0];"
-                   "fsub.h           %[t2],zero,%[t2];"
-                   "fsub.h           %[t4],zero,%[t4];"
-                   "pv.pack.h %[C1],%[t1],%[t0];"
-                   "pv.pack.h %[C2],%[t3],%[t2];"
-                   "pv.pack.h %[C3],%[t5],%[t4];"
-                   : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3),
-                     [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2),
-                     [t3] "=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5)
-                   : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3)
-                   :);
-      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-        __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8);
-        __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8);
-        radix4_butterfly(pIn, pOut, j, n2, CoSi1, CoSi2, CoSi3, C1, C2,
-                                C3);
-      }
-    }
-    pTmp = pSrc16;
-    pSrc16 = pDst16;
-    pDst16 = pTmp;
-    pTmp = pCoef_src;
-    pCoef_src = pCoef_dst;
-    pCoef_dst = pTmp;
-    mempool_log_partial_barrier(2, absolute_core_id, nPE);
-  }
-
-  /*  LAST STAGE */
-  n1 = n2;
-  n2 >>= 2U;
-  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
-    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-      __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8);
-      __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8);
-      radix4_butterfly_last(pIn, pOut, i0);
-    }
-  }
-  pTmp = pSrc16;
-  pSrc16 = pDst16;
-  pDst16 = pTmp;
-  mempool_log_partial_barrier(2, absolute_core_id, nPE);
-
-  mempool_stop_benchmark();
-  mempool_start_benchmark();
-
-  /* BITREVERSAL */
-  // Bitreversal stage stores in the sequential addresses
-  if (bitReverseFlag) {
-#ifdef BITREVERSETABLE
-    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U));
-    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2)));
-    for (j = 2 * core_id; j < bitReverseLen; j += 2 * nPE) {
-      v2h addr, tmpa, tmpb;
-      addr = __SRA2(*(v2h *)&pBitRevTable[j], ((v2h){2, 2}));
-      for (int32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-        int32_t a0 = addr[0] / 4 + (addr[0] % 4) * N_BANKS;
-        int32_t a1 = addr[1] / 4 + (addr[0] % 4) * N_BANKS;
-        tmpa = *(v2h *)&ptr1[a0 + idx_row * (N_BANKS * 8)];
-        tmpb = *(v2h *)&ptr1[a1 + idx_row * (N_BANKS * 8)];
-        *((v2h *)&ptr2[addr[0] + idx_row * (N_BANKS * 8)]) = tmpb;
-        *((v2h *)&ptr2[addr[1] + idx_row * (N_BANKS * 8)]) = tmpa;
-      }
-    }
-#else
-    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U));
-    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2)));
-    for (j = core_id * 16; j < MIN(core_id * 16 + 16, fftLen >> 2U); j += 4) {
-      uint32_t idx0 = j;
-      uint32_t idx1 = j + 1;
-      uint32_t idx2 = j + 2;
-      uint32_t idx3 = j + 3;
-      uint32_t idx_result0 = 0;
-      uint32_t idx_result1 = 0;
-      uint32_t idx_result2 = 0;
-      uint32_t idx_result3 = 0;
-      for (k = 0; k < LOG2; k++) {
-        idx_result0 = (idx_result0 << 1U) | (idx0 & 1U);
-        idx_result1 = (idx_result1 << 1U) | (idx1 & 1U);
-        idx_result2 = (idx_result2 << 1U) | (idx2 & 1U);
-        idx_result3 = (idx_result3 << 1U) | (idx3 & 1U);
-        idx0 = idx0 >> 1U;
-        idx1 = idx1 >> 1U;
-        idx2 = idx2 >> 1U;
-        idx3 = idx3 >> 1U;
-      }
-      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
-        uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
-        uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
-        uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;
-        uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS;
-        uint32_t addr_dst0 = idx_result0;
-        uint32_t addr_dst1 = idx_result1;
-        uint32_t addr_dst2 = idx_result2;
-        uint32_t addr_dst3 = idx_result3;
-        addr_src0 += idx_row * (N_BANKS * 8);
-        addr_src1 += idx_row * (N_BANKS * 8);
-        addr_src2 += idx_row * (N_BANKS * 8);
-        addr_src3 += idx_row * (N_BANKS * 8);
-        addr_dst0 += idx_row * (N_BANKS * 8);
-        addr_dst1 += idx_row * (N_BANKS * 8);
-        addr_dst2 += idx_row * (N_BANKS * 8);
-        addr_dst3 += idx_row * (N_BANKS * 8);
-        *((uint32_t *)&ptr2[addr_dst0]) = (uint32_t)ptr1[addr_src0];
-        *((uint32_t *)&ptr2[addr_dst1]) = (uint32_t)ptr1[addr_src1];
-        *((uint32_t *)&ptr2[addr_dst2]) = (uint32_t)ptr1[addr_src2];
-        *((uint32_t *)&ptr2[addr_dst3]) = (uint32_t)ptr1[addr_src3];
-      }
-    }
-#endif
-  }
-  mempool_log_partial_barrier(2, absolute_core_id, nPE);
-}
diff --git a/software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h b/software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h
deleted file mode 100644
index 32f7a5265..000000000
--- a/software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-void mempool_bitrev_q16p_xpulpimg(uint16_t *pSrc, uint16_t *pDst,
-                                  const uint16_t fftLen, const uint32_t nPE) {
-  uint32_t absolute_core_id = mempool_get_core_id();
-  uint32_t core_id = absolute_core_id / WU_STRIDE;
-  uint32_t idx_result, idx, i, j;
-  for (i = core_id; i < fftLen; i += nPE) {
-    idx_result = 0;
-    idx = i;
-    for (j = 0; j < LOG2; j++) {
-      idx_result = (idx_result << 1U) | (idx & 1U);
-      idx = idx >> 1U;
-    }
-    pDst[2 * idx_result] = pSrc[2 * i];
-    pDst[2 * idx_result + 1] = pSrc[2 * i + 1];
-  }
-  mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE);
-}
diff --git a/software/kernels/baremetal/mempool_chest_f16.h b/software/kernels/baremetal/mempool_chest_f16.h
new file mode 100644
index 000000000..7d53afc65
--- /dev/null
+++ b/software/kernels/baremetal/mempool_chest_f16.h
@@ -0,0 +1,382 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#pragma once
+#define __CDOTP
+#define __MUL
+
+/* DIV_LOOP(ab, ab_n, i): ab is one packed complex a = ar + 1i * ai, ab_n its
+   SHUFFLE_A companion; c[j..j+3] are read from pPilotTX_itr. Row i of pH_itr:
+   out[i][j] = a / c[j]
+   out[i][j + 1] = a / c[j + 1]
+   out[i][j + 2] = a / c[j + 2]
+   out[i][j + 3] = a / c[j + 3] */
+
+#ifdef __XDIVSQRT
+#define DIV_LOOP(ab, ab_n, i) /* variant: division by fdiv.s in the asm */     \
+  {                                                                            \
+    re0 = 0; /* all "+&r" operands are read by the asm: zero them first */     \
+    re1 = 0;                                                                   \
+    re2 = 0;                                                                   \
+    re3 = 0;                                                                   \
+    im0 = 0;                                                                   \
+    im1 = 0;                                                                   \
+    im2 = 0;                                                                   \
+    im3 = 0;                                                                   \
+    D0 = 0;                                                                    \
+    D1 = 0;                                                                    \
+    D2 = 0;                                                                    \
+    D3 = 0;                                                                    \
+    cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                  \
+    cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                            \
+    cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                            \
+    cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                            \
+    asm volatile("vfdotpex.s.h   %[D0],  %[cd0], %[cd0];"                      \
+                 "vfdotpex.s.h   %[D1],  %[cd1], %[cd1];"                      \
+                 "vfdotpex.s.h   %[D2],  %[cd2], %[cd2];"                      \
+                 "vfdotpex.s.h   %[D3],  %[cd3], %[cd3];"                      \
+                 "vfdotpex.s.h   %[re0], %[x],   %[cd0];"                      \
+                 "vfdotpex.s.h   %[re1], %[x],   %[cd1];"                      \
+                 "vfdotpex.s.h   %[re2], %[x],   %[cd2];"                      \
+                 "vfdotpex.s.h   %[re3], %[x],   %[cd3];"                      \
+                 "vfdotpex.s.h   %[im0], %[y],   %[cd0];"                      \
+                 "vfdotpex.s.h   %[im1], %[y],   %[cd1];"                      \
+                 "vfdotpex.s.h   %[im2], %[y],   %[cd2];"                      \
+                 "vfdotpex.s.h   %[im3], %[y],   %[cd3];"                      \
+                 "fdiv.s         %[re0], %[re0], %[D0];"                       \
+                 "fdiv.s         %[re1], %[re1], %[D1];"                       \
+                 "fdiv.s         %[re2], %[re2], %[D2];"                       \
+                 "fdiv.s         %[re3], %[re3], %[D3];"                       \
+                 "fdiv.s         %[im0], %[im0], %[D0];"                       \
+                 "fdiv.s         %[im1], %[im1], %[D1];"                       \
+                 "fdiv.s         %[im2], %[im2], %[D2];"                       \
+                 "fdiv.s         %[im3], %[im3], %[D3];"                       \
+                 "vfcpka.h.s     %[re0], %[re0], %[im0];"                      \
+                 "vfcpka.h.s     %[re1], %[re1], %[im1];"                      \
+                 "vfcpka.h.s     %[re2], %[re2], %[im2];"                      \
+                 "vfcpka.h.s     %[re3], %[re3], %[im3];"                      \
+                 : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2),             \
+                   [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1),         \
+                   [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0),       \
+                   [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)        \
+                 : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),             \
+                   [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                  \
+                 :);                                                           \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0;                           \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1;                       \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2;                       \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3;                       \
+  }
+#else
+#define DIV_LOOP(ab, ab_n, i) /* variant: division by C '/' operator */        \
+  {                                                                            \
+    re0 = 0; /* all "+&r" operands are read by the asm: zero them first */     \
+    re1 = 0;                                                                   \
+    re2 = 0;                                                                   \
+    re3 = 0;                                                                   \
+    im0 = 0;                                                                   \
+    im1 = 0;                                                                   \
+    im2 = 0;                                                                   \
+    im3 = 0;                                                                   \
+    D0 = 0;                                                                    \
+    D1 = 0;                                                                    \
+    D2 = 0;                                                                    \
+    D3 = 0;                                                                    \
+    cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                  \
+    cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                            \
+    cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                            \
+    cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                            \
+    asm volatile("vfdotpex.s.h   %[D0],  %[cd0], %[cd0];"                      \
+                 "vfdotpex.s.h   %[D1],  %[cd1], %[cd1];"                      \
+                 "vfdotpex.s.h   %[D2],  %[cd2], %[cd2];"                      \
+                 "vfdotpex.s.h   %[D3],  %[cd3], %[cd3];"                      \
+                 "vfdotpex.s.h   %[re0], %[x],   %[cd0];"                      \
+                 "vfdotpex.s.h   %[re1], %[x],   %[cd1];"                      \
+                 "vfdotpex.s.h   %[re2], %[x],   %[cd2];"                      \
+                 "vfdotpex.s.h   %[re3], %[x],   %[cd3];"                      \
+                 "vfdotpex.s.h   %[im0], %[y],   %[cd0];"                      \
+                 "vfdotpex.s.h   %[im1], %[y],   %[cd1];"                      \
+                 "vfdotpex.s.h   %[im2], %[y],   %[cd2];"                      \
+                 "vfdotpex.s.h   %[im3], %[y],   %[cd3];"                      \
+                 : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2),             \
+                   [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1),         \
+                   [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0),       \
+                   [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)        \
+                 : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),             \
+                   [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                  \
+                 :);                                                           \
+    re0 = re0 / D0; /* NOTE(review): re/im/D are uint32_t in both */           \
+    re1 = re1 / D1; /* callers here, so '/' looks like an integer */           \
+    re2 = re2 / D2; /* divide of float bit patterns -- confirm.   */           \
+    re3 = re3 / D3;                                                            \
+    im0 = im0 / D0;                                                            \
+    im1 = im1 / D1;                                                            \
+    im2 = im2 / D2;                                                            \
+    im3 = im3 / D3;                                                            \
+    asm volatile("vfcpka.h.s %[re0], %[re0], %[im0];"                          \
+                 "vfcpka.h.s %[re1], %[re1], %[im1];"                          \
+                 "vfcpka.h.s %[re2], %[re2], %[im2];"                          \
+                 "vfcpka.h.s %[re3], %[re3], %[im3];"                          \
+                 : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2),       \
+                   [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1),       \
+                   [im2] "+&r"(im2), [im3] "+&r"(im3)                          \
+                 : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),             \
+                   [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                  \
+                 :);                                                           \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0;                           \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1;                       \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2;                       \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3;                       \
+  }
+#endif
+
+/* MUL_LOOP(ab, ab_n, i): ab is one packed complex a = ar + 1i * ai, ab_n its
+   SHUFFLE_A companion; c[j..j+3] are read from pPilotTX_itr. Row i of pH_itr:
+   out[i][j] = a * c[j]
+   out[i][j + 1] = a * c[j + 1]
+   out[i][j + 2] = a * c[j + 2]
+   out[i][j + 3] = a * c[j + 3] */
+
+#define MUL_LOOP(ab, ab_n, i)                                                  \
+  {                                                                            \
+    re0 = 0; /* all "+&r" operands are read by the asm: zero them first */     \
+    re1 = 0;                                                                   \
+    re2 = 0;                                                                   \
+    re3 = 0;                                                                   \
+    im0 = 0;                                                                   \
+    im1 = 0;                                                                   \
+    im2 = 0;                                                                   \
+    im3 = 0;                                                                   \
+    cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                  \
+    cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                            \
+    cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                            \
+    cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                            \
+    asm volatile("vfdotpex.s.h   %[re0], %[x], %[cd0];"                        \
+                 "vfdotpex.s.h   %[re1], %[x], %[cd1];"                        \
+                 "vfdotpex.s.h   %[re2], %[x], %[cd2];"                        \
+                 "vfdotpex.s.h   %[re3], %[x], %[cd3];"                        \
+                 "vfdotpex.s.h   %[im0], %[y], %[cd0];"                        \
+                 "vfdotpex.s.h   %[im1], %[y], %[cd1];"                        \
+                 "vfdotpex.s.h   %[im2], %[y], %[cd2];"                        \
+                 "vfdotpex.s.h   %[im3], %[y], %[cd3];"                        \
+                 : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2),       \
+                   [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1),       \
+                   [im2] "+&r"(im2), [im3] "+&r"(im3)                          \
+                 : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),             \
+                   [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                  \
+                 :);                                                           \
+    asm volatile(                                                              \
+        "vfcpka.h.s       %[re0], %[re0], %[im0];"                             \
+        "vfcpka.h.s       %[re1], %[re1], %[im1];"                             \
+        "vfcpka.h.s       %[re2], %[re2], %[im2];"                             \
+        "vfcpka.h.s       %[re3], %[re3], %[im3];"                             \
+        : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2),                \
+          [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1),                \
+          [im2] "+&r"(im2), [im3] "+&r"(im3)                                   \
+        : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3)       \
+        :); /* the cd0..cd3 inputs are not referenced by this template */      \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0;                           \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1;                       \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2;                       \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3;                       \
+  }
+/* CMUL_LOOP(ab, i): out[i][j..j+3] from ab and c[j..j+3] via fcdotpex.s.h. */
+#define CMUL_LOOP(ab, i)                                                       \
+  {                                                                            \
+    sum0 = 0; /* "+&r" operands are read by the asm: zero them first */        \
+    sum1 = 0;                                                                  \
+    sum2 = 0;                                                                  \
+    sum3 = 0;                                                                  \
+    cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                  \
+    cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                            \
+    cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                            \
+    cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                            \
+    asm volatile("fcdotpex.s.h   %[sum0], %[x], %[cd0];"                       \
+                 "fcdotpex.s.h   %[sum1], %[x], %[cd1];"                       \
+                 "fcdotpex.s.h   %[sum2], %[x], %[cd2];"                       \
+                 "fcdotpex.s.h   %[sum3], %[x], %[cd3];"                       \
+                 : [sum0] "+&r"(sum0), [sum1] "+&r"(sum1), [sum2] "+&r"(sum2), \
+                   [sum3] "+&r"(sum3)                                          \
+                 : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),             \
+                   [cd3] "r"(cd3), [x] "r"(ab)                                 \
+                 :);                                                           \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = sum0;                          \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = sum1;                      \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = sum2;                      \
+    *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = sum3;                      \
+  }
+/* SHUFFLE_A: ab_nK = abK ^ 0x00008000, then pv.shuffle2.h by 0x00020003. */
+#define SHUFFLE_A                                                              \
+  {                                                                            \
+    asm volatile(                                                              \
+        "xor           %[ab_n0], %[ab0],   %[neg_mask];"                       \
+        "xor           %[ab_n1], %[ab1],   %[neg_mask];"                       \
+        "xor           %[ab_n2], %[ab2],   %[neg_mask];"                       \
+        "xor           %[ab_n3], %[ab3],   %[neg_mask];"                       \
+        "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];"                           \
+        "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];"                           \
+        "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];"                           \
+        "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];"                           \
+        : [ab_n0] "=&r"(ab_n0), [ab_n1] "=&r"(ab_n1), [ab_n2] "=&r"(ab_n2),    \
+          [ab_n3] "=&r"(ab_n3) /* write-only: each is set by xor first */      \
+        : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3),      \
+          [neg_mask] "r"(0x00008000), [mask] "r"(0x00020003)                   \
+        :);                                                                    \
+  }
+
+/**
+  @brief         Block-type channel estimation (no work split across cores).
+  @param[out]    pH  points to output channel
+  @param[in]     pPilotRX points to received symbol
+  @param[in]     pPilotTX points to sent pilot
+  @param[in]     nRX Number of receivers (consumed 4 per iteration)
+  @param[in]     nTX Number of transmitters (consumed 4 per iteration)
+  @param[in]     nSc Number of Subcarriers
+  @return        none
+*/
+void mempool_chest_f16s_unrolled4(__fp16 *pH, __fp16 *pPilotRX,
+                                  __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX,
+                                  uint32_t nSc) {
+  // Locals keep these exact names: SHUFFLE_A and DIV_LOOP refer to them.
+  uint32_t ab0, ab1, ab2, ab3;
+  uint32_t cd0, cd1, cd2, cd3;
+  uint32_t re0, re1, re2, re3;
+  uint32_t im0, im1, im2, im3;
+  uint32_t D0, D1, D2, D3;
+  uint32_t ab_n0, ab_n1, ab_n2, ab_n3;
+  __fp16 *pPilotTX_itr;
+  __fp16 *pPilotRX_itr;
+  __fp16 *pH_itr;
+  // Each pass loads pilots i..i+3 and writes H rows i..i+3, so i steps by 4.
+  for (uint32_t k = 0; k < nSc; k++) {
+    pPilotTX_itr = pPilotTX + k * (2 * nTX);
+    pPilotRX_itr = pPilotRX + k * (2 * nRX);
+    pH_itr = pH + k * 2 * (nTX * nRX);
+    for (uint32_t i = 0; i < nRX; i += 4) {
+      ab0 = *(uint32_t *)&pPilotRX_itr[2U * i];
+      ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)];
+      ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)];
+      ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)];
+      SHUFFLE_A;
+      for (uint32_t j = 0; j < nTX; j += 4) {
+        DIV_LOOP(ab0, ab_n0, i);
+        DIV_LOOP(ab1, ab_n1, i + 1);
+        DIV_LOOP(ab2, ab_n2, i + 2);
+        DIV_LOOP(ab3, ab_n3, i + 3);
+      }
+    }
+  }
+  return;
+}
+
+/**
+  @brief         Block-type channel estimation, subcarriers split across PEs.
+  @param[out]    pH  points to output channel
+  @param[in]     pPilotRX points to received symbol
+  @param[in]     pPilotTX points to sent pilot
+  @param[in]     nRX Number of receivers (consumed 4 per iteration)
+  @param[in]     nTX Number of transmitters (consumed 4 per iteration)
+  @param[in]     nSc Number of Subcarriers
+  @param[in]     core_id ID of the PE
+  @param[in]     nPE Number of PEs
+  @return        none
+*/
+void mempool_chest_f16p_unrolled4(__fp16 *pH, __fp16 *pPilotRX,
+                                  __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX,
+                                  uint32_t nSc, uint32_t core_id,
+                                  uint32_t nPE) {
+  uint32_t ab0, ab1, ab2, ab3;
+  uint32_t cd0, cd1, cd2, cd3;
+#if !defined(__CDOTP) || !defined(__MUL) // operands of MUL_LOOP and DIV_LOOP
+  uint32_t ab_n0, ab_n1, ab_n2, ab_n3;
+  uint32_t re0, re1, re2, re3;
+  uint32_t im0, im1, im2, im3;
+#else
+  uint32_t sum0, sum1, sum2, sum3;
+#endif
+
+#ifndef __MUL
+  uint32_t D0, D1, D2, D3;
+#endif
+
+  __fp16 *pPilotTX_itr;
+  __fp16 *pPilotRX_itr;
+  __fp16 *pH_itr;
+  // Subcarriers are interleaved across PEs: core_id, core_id + nPE, ...
+  for (uint32_t k = core_id; k < nSc; k += nPE) {
+    pPilotTX_itr = pPilotTX + k * (2 * nTX);
+    pPilotRX_itr = pPilotRX + k * (2 * nRX);
+    pH_itr = pH + k * 2 * (nTX * nRX);
+    for (uint32_t i = 0; i < nRX; i += 4) {
+      ab0 = *(uint32_t *)&pPilotRX_itr[2U * i];
+      ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)];
+      ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)];
+      ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)];
+#if !defined(__CDOTP) || !defined(__MUL) // ab_n is needed by MUL/DIV_LOOP
+      SHUFFLE_A;
+#endif
+
+      for (uint32_t j = 0; j < nTX; j += 4) {
+#if (defined(__CDOTP) && defined(__MUL))
+        CMUL_LOOP(ab0, i);
+        CMUL_LOOP(ab1, i + 1);
+        CMUL_LOOP(ab2, i + 2);
+        CMUL_LOOP(ab3, i + 3);
+#elif (!defined(__CDOTP) && defined(__MUL))
+        MUL_LOOP(ab0, ab_n0, i);
+        MUL_LOOP(ab1, ab_n1, i + 1);
+        MUL_LOOP(ab2, ab_n2, i + 2);
+        MUL_LOOP(ab3, ab_n3, i + 3);
+#else
+        DIV_LOOP(ab0, ab_n0, i)
+        DIV_LOOP(ab1, ab_n1, i + 1)
+        DIV_LOOP(ab2, ab_n2, i + 2)
+        DIV_LOOP(ab3, ab_n3, i + 3)
+#endif
+      }
+    }
+  }
+  return;
+}
+/* Parallel CMUL_LOOP kernel walking a flattened (subcarrier, RX) index. */
+void mempool_chest_f16p_unrolled4_local(__fp16 *volatile pH,
+                                        __fp16 *volatile pPilotRX,
+                                        __fp16 *volatile pPilotTX, uint32_t nRX,
+                                        uint32_t nTX, uint32_t nSc,
+                                        uint32_t core_id, uint32_t nPE) {
+  uint32_t ab0, ab1, ab2, ab3;
+  uint32_t cd0, cd1, cd2, cd3;
+  uint32_t sum0, sum1, sum2, sum3;
+  __fp16 *pPilotTX_itr;
+  __fp16 *pPilotRX_itr;
+  __fp16 *pH_itr;
+  uint32_t itr, i, j;
+
+  // Each core starts at flat index 4 * core_id; stride BANKING_FACTOR * NUM_CORES
+  for (itr = core_id * 4; itr < (nSc * nRX);
+       itr += (BANKING_FACTOR * NUM_CORES)) {
+    // Subcarrier of this chunk (author: received pilots are aligned to cores)
+    uint32_t sc_RX = itr / nRX;
+    pPilotTX_itr = pPilotTX + sc_RX * (2 * nTX);
+    pPilotRX_itr = pPilotRX + sc_RX * (2 * nRX);
+    pH_itr = pH + sc_RX * 2 * (nTX * nRX);
+
+    // Load 4 consecutive packed RX pilots starting at i = itr % nRX
+    i = itr % nRX;
+    ab0 = *(uint32_t *)&pPilotRX_itr[2U * i];
+    ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)];
+    ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)];
+    ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)];
+    for (j = 0; j < nTX; j += 4) {
+      CMUL_LOOP(ab0, i);
+      CMUL_LOOP(ab1, i + 1);
+      CMUL_LOOP(ab2, i + 2);
+      CMUL_LOOP(ab3, i + 3);
+    }
+  }
+  mempool_barrier(nPE);
+  return;
+}
diff --git a/software/kernels/baremetal/mempool_chest_f16p.h b/software/kernels/baremetal/mempool_chest_f16p.h
deleted file mode 100644
index 835b26237..000000000
--- a/software/kernels/baremetal/mempool_chest_f16p.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-// Includes inner loop
-#include "mempool_chest_f16s.h"
-
-/**
-  @brief         Block-type channel estimation.
-  @param[in]     pH  points to output channel
-  @param[in]     pPilotRX points to received symbol
-  @param[in]     pPilotTX points to sent pilot
-  @param[in]     nTX Number of transmitters
-  @param[in]     nRX Number of receivers
-  @param[in]     nSc Number of Subcarriers
-  @return        none
-*/
-void mempool_chest_f16p_unrolled4(__fp16 *pH, __fp16 *pPilotRX,
-                                  __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX,
-                                  uint32_t nSc, uint32_t core_id,
-                                  uint32_t nPE) {
-  uint32_t ab0, ab1, ab2, ab3;
-  uint32_t ab_n0, ab_n1, ab_n2, ab_n3;
-  __fp16 *pTX;
-  __fp16 *pRX;
-  __fp16 *pOut;
-  for (uint32_t k = core_id; k < nSc; k += nPE) {
-    pTX = pPilotTX + k * (2 * nTX);
-    pRX = pPilotRX + k * (2 * nRX);
-    pOut = pH + k * 2 * (nTX * nRX);
-    for (uint32_t i = 0; i < nRX; i += 4) {
-      ab0 = *(uint32_t *)&pRX[2U * i];
-      ab1 = *(uint32_t *)&pRX[2U * (i + 1)];
-      ab2 = *(uint32_t *)&pRX[2U * (i + 2)];
-      ab3 = *(uint32_t *)&pRX[2U * (i + 3)];
-      asm volatile(
-          "xor      %[ab_n0], %[ab0], %[neg_mask];"
-          "xor      %[ab_n1], %[ab1], %[neg_mask];"
-          "xor      %[ab_n2], %[ab2], %[neg_mask];"
-          "xor      %[ab_n3], %[ab3], %[neg_mask];"
-          "pv.shuffle2.h %[ab_n0], %[ab_n0],  %[mask];"
-          "pv.shuffle2.h %[ab_n1], %[ab_n1],  %[mask];"
-          "pv.shuffle2.h %[ab_n2], %[ab_n2],  %[mask];"
-          "pv.shuffle2.h %[ab_n3], %[ab_n3],  %[mask];"
-          : [ab_n0] "=&r"(ab_n0), [ab_n1] "=&r"(ab_n1), [ab_n2] "=&r"(ab_n2),
-            [ab_n3] "=&r"(ab_n3)
-          : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3),
-            [neg_mask] "r"(0x00008000), [mask] "r"(0x00020001)
-          :);
-      for (uint32_t j = 0; j < nTX; j += 4) {
-        chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab0, ab_n0, i, j);
-        chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab1, ab_n1, i + 1, j);
-        chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab2, ab_n2, i + 2, j);
-        chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab3, ab_n3, i + 3, j);
-      }
-    }
-  }
-  mempool_log_partial_barrier(2, core_id, nPE);
-  return;
-}
diff --git a/software/kernels/baremetal/mempool_chest_f16s.h b/software/kernels/baremetal/mempool_chest_f16s.h
deleted file mode 100644
index 4830ebc6f..000000000
--- a/software/kernels/baremetal/mempool_chest_f16s.h
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#pragma once
-
-#ifdef __XDIVSQRT
-
-/* a[i] = ar[i] + i * ai[j]
-
-   out[i][j] = a[i] / c[j]
-   out[i][j + 1] = a[i] / c[j + 1h
-   out[i][j + 2] = a[i] / c[j + 2]
-   out[i][j + 3] = a[i] / c[j + 3]*/
-
-static inline void chest_unrolled4_inner_loop_f16(__fp16 *pPilotTX, __fp16 *pH,
-                                                  uint32_t nTX, uint32_t ab,
-                                                  uint32_t ab_n, uint32_t i,
-                                                  uint32_t j) {
-
-  uint32_t cd0, cd1, cd2, cd3;
-  float re0 = 0.0f, re1 = 0.0f, re2 = 0.0f, re3 = 0.0f;
-  float im0 = 0.0f, im1 = 0.0f, im2 = 0.0f, im3 = 0.0f;
-  float D0 = 0.0f, D1 = 0.0f, D2 = 0.0f, D3 = 0.0f;
-  cd0 = *(uint32_t *)&pPilotTX[2U * j];
-  cd1 = *(uint32_t *)&pPilotTX[2U * (j + 1)];
-  cd2 = *(uint32_t *)&pPilotTX[2U * (j + 2)];
-  cd3 = *(uint32_t *)&pPilotTX[2U * (j + 3)];
-  asm volatile(
-      // Compute denominator
-      "vfdotpex.s.h   %[D0],  %[cd0],  %[cd0];"
-      "vfdotpex.s.h   %[D1],  %[cd1],  %[cd1];"
-      "vfdotpex.s.h   %[D2],  %[cd2],  %[cd2];"
-      "vfdotpex.s.h   %[D3],  %[cd3],  %[cd3];"
-      // Compute numerator
-      "vfdotpex.s.h   %[re0], %[ab],   %[cd0];"
-      "vfdotpex.s.h   %[re1], %[ab],   %[cd1];"
-      "vfdotpex.s.h   %[re2], %[ab],   %[cd2];"
-      "vfdotpex.s.h   %[re3], %[ab],   %[cd3];"
-      "vfdotpex.s.h   %[im0], %[ab_n], %[cd0];"
-      "vfdotpex.s.h   %[im1], %[ab_n], %[cd1];"
-      "vfdotpex.s.h   %[im2], %[ab_n], %[cd2];"
-      "vfdotpex.s.h   %[im3], %[ab_n], %[cd3];"
-      "fdiv.s         %[re0], %[re0],  %[D0];"
-      "fdiv.s         %[re1], %[re1],  %[D1];"
-      "fdiv.s         %[re2], %[re2],  %[D2];"
-      "fdiv.s         %[re3], %[re3],  %[D3];"
-      "fdiv.s         %[im0], %[im0],  %[D0];"
-      "fdiv.s         %[im1], %[im1],  %[D1];"
-      "fdiv.s         %[im2], %[im2],  %[D2];"
-      "fdiv.s         %[im3], %[im3],  %[D3];"
-      // Pack in 32b word
-      "vfcpka.h.s       %[re0], %[re0], %[im0];"
-      "vfcpka.h.s       %[re1], %[re1], %[im1];"
-      "vfcpka.h.s       %[re2], %[re2], %[im2];"
-      "vfcpka.h.s       %[re3], %[re3], %[im3];"
-      : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), [D3] "+&r"(D3),
-        [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), [re3] "+&r"(re3),
-        [im0] "+&r"(im0), [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)
-      : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3),
-        [ab] "r"(ab), [ab_n] "r"(ab_n)
-      :);
-  *((uint32_t *)&pH[2 * (i * nTX + j)]) = *(uint32_t *)&re0;
-  *((uint32_t *)&pH[2 * (i * nTX + j + 1)]) = *(uint32_t *)&re1;
-  *((uint32_t *)&pH[2 * (i * nTX + j + 2)]) = *(uint32_t *)&re2;
-  *((uint32_t *)&pH[2 * (i * nTX + j + 3)]) = *(uint32_t *)&re3;
-  return;
-}
-
-#else
-
-/* a[i] = ar[i] + i * ai[j]
-
-   out[i][j] = a[i] / c[j]
-   out[i][j + 1] = a[i] / c[j + 1h
-   out[i][j + 2] = a[i] / c[j + 2]
-   out[i][j + 3] = a[i] / c[j + 3]*/
-
-static inline void chest_unrolled4_inner_loop_f16(__fp16 *pPilotTX, __fp16 *pH,
-                                                  uint32_t nTX, uint32_t ab,
-                                                  uint32_t ab_n, uint32_t i,
-                                                  uint32_t j) {
-
-  uint32_t cd0, cd1, cd2, cd3;
-  float re0 = 0.0f, re1 = 0.0f, re2 = 0.0f, re3 = 0.0f;
-  float im0 = 0.0f, im1 = 0.0f, im2 = 0.0f, im3 = 0.0f;
-  float D0 = 0.0f, D1 = 0.0f, D2 = 0.0f, D3 = 0.0f;
-  cd0 = *(uint32_t *)&pPilotTX[2U * j];
-  cd1 = *(uint32_t *)&pPilotTX[2U * (j + 1)];
-  cd2 = *(uint32_t *)&pPilotTX[2U * (j + 2)];
-  cd3 = *(uint32_t *)&pPilotTX[2U * (j + 3)];
-  asm volatile(
-      // Compute denominator
-      "vfdotpex.s.h   %[D0],  %[cd0],  %[cd0];"
-      "vfdotpex.s.h   %[D1],  %[cd1],  %[cd1];"
-      "vfdotpex.s.h   %[D2],  %[cd2],  %[cd2];"
-      "vfdotpex.s.h   %[D3],  %[cd3],  %[cd3];"
-      // Compute numerator
-      "vfdotpex.s.h   %[re0], %[ab],   %[cd0];"
-      "vfdotpex.s.h   %[re1], %[ab],   %[cd1];"
-      "vfdotpex.s.h   %[re2], %[ab],   %[cd2];"
-      "vfdotpex.s.h   %[re3], %[ab],   %[cd3];"
-      "vfdotpex.s.h   %[im0], %[ab_n], %[cd0];"
-      "vfdotpex.s.h   %[im1], %[ab_n], %[cd1];"
-      "vfdotpex.s.h   %[im2], %[ab_n], %[cd2];"
-      "vfdotpex.s.h   %[im3], %[ab_n], %[cd3];"
-      : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), [D3] "+&r"(D3),
-        [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), [re3] "+&r"(re3),
-        [im0] "+&r"(im0), [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)
-      : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3),
-        [ab] "r"(ab), [ab_n] "r"(ab_n)
-      :);
-  re0 = re0 / D0;
-  re1 = re1 / D1;
-  re2 = re2 / D2;
-  re3 = re3 / D3;
-  im0 = im0 / D0;
-  im1 = im1 / D1;
-  im2 = im2 / D2;
-  im3 = im3 / D3;
-  asm volatile(
-      // Pack in 32b word
-      "vfcpka.h.s       %[re0], %[re0], %[im0];"
-      "vfcpka.h.s       %[re1], %[re1], %[im1];"
-      "vfcpka.h.s       %[re2], %[re2], %[im2];"
-      "vfcpka.h.s       %[re3], %[re3], %[im3];"
-      : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), [D3] "+&r"(D3),
-        [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), [re3] "+&r"(re3),
-        [im0] "+&r"(im0), [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)
-      : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3),
-        [ab] "r"(ab), [ab_n] "r"(ab_n)
-      :);
-
-  *((uint32_t *)&pH[2 * (i * nTX + j)]) = *(uint32_t *)&re0;
-  *((uint32_t *)&pH[2 * (i * nTX + j + 1)]) = *(uint32_t *)&re1;
-  *((uint32_t *)&pH[2 * (i * nTX + j + 2)]) = *(uint32_t *)&re2;
-  *((uint32_t *)&pH[2 * (i * nTX + j + 3)]) = *(uint32_t *)&re3;
-  return;
-}
-
-#endif
-
-/**
-  @brief         Block-type channel estimation.
-  @param[in]     pH  points to output channel
-  @param[in]     pPilotRX points to received symbol
-  @param[in]     pPilotTX points to sent pilot
-  @param[in]     nTX Number of transmitters
-  @param[in]     nRX Number of receivers
-  @param[in]     nSc Number of Subcarriers
-  @return        none
-*/
-void mempool_chest_f16s_unrolled4(__fp16 *pH, __fp16 *pPilotRX,
-                                  __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX,
-                                  uint32_t nSc) {
-
-  uint32_t ab0, ab1, ab2, ab3;
-  uint32_t ab_n0, ab_n1, ab_n2, ab_n3;
-
-  for (uint32_t k = 0; k < nSc; k++) {
-    for (uint32_t i = 0; i < nRX; i++) {
-      ab0 = *(uint32_t *)&pPilotRX[2U * i];
-      ab1 = *(uint32_t *)&pPilotRX[2U * (i + 1)];
-      ab2 = *(uint32_t *)&pPilotRX[2U * (i + 2)];
-      ab3 = *(uint32_t *)&pPilotRX[2U * (i + 3)];
-      asm volatile(
-          "xor           %[ab_n0], %[ab0], %[neg_mask];"
-          "xor           %[ab_n1], %[ab1], %[neg_mask];"
-          "xor           %[ab_n2], %[ab2], %[neg_mask];"
-          "xor           %[ab_n3], %[ab3], %[neg_mask];"
-          "pv.shuffle2.h %[ab_n0], %[ab_n0],  %[mask];"
-          "pv.shuffle2.h %[ab_n1], %[ab_n1],  %[mask];"
-          "pv.shuffle2.h %[ab_n2], %[ab_n2],  %[mask];"
-          "pv.shuffle2.h %[ab_n3], %[ab_n3],  %[mask];"
-          : [ab_n0] "+&r"(ab_n0), [ab_n1] "+&r"(ab_n1), [ab_n2] "+&r"(ab_n2),
-            [ab_n3] "+&r"(ab_n3)
-          : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3),
-            [neg_mask] "r"(0x00008000), [mask] "r"(0x00020003)
-          :);
-      for (uint32_t j = 0; j < nTX; j += 4) {
-        chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab0, ab_n0, i, j);
-        chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab1, ab_n1, i + 1, j);
-        chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab2, ab_n2, i + 2, j);
-        chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab3, ab_n3, i + 3, j);
-      }
-    }
-    pPilotTX += 2 * nTX;
-    pPilotRX += 2 * nRX;
-    pH += 2 * (nTX * nRX);
-  }
-  return;
-}
diff --git a/software/kernels/baremetal/mempool_chest_q16.h b/software/kernels/baremetal/mempool_chest_q16.h
index 914a8e09e..a91aa0097 100644
--- a/software/kernels/baremetal/mempool_chest_q16.h
+++ b/software/kernels/baremetal/mempool_chest_q16.h
@@ -307,7 +307,7 @@ void mempool_chest_q16p_unrolled4(int16_t *volatile pH,
       }
     }
   }
-  mempool_barrier(nPE);
+  mempool_log_partial_barrier(2, core_id, nPE);
   return;
 }
 
diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h
index c870782da..bb6143ed7 100644
--- a/software/kernels/baremetal/mempool_cholesky_f16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f16s.h
@@ -6,6 +6,7 @@
 // Author: Bowen Wang, ETH Zurich
 
 #pragma once
+#include "builtins_v2.h"
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 #ifdef __XDIVSQRT
diff --git a/software/kernels/baremetal/mempool_cholesky_f32s.h b/software/kernels/baremetal/mempool_cholesky_f32s.h
index 6cd54fe69..2bad891f5 100644
--- a/software/kernels/baremetal/mempool_cholesky_f32s.h
+++ b/software/kernels/baremetal/mempool_cholesky_f32s.h
@@ -165,4 +165,8 @@ void mempool_cholesky_folded_f32s(float *pSrc, float *pL, const uint32_t n) {
   return;
 }
 
+#else
+
+#error "ERROR: f32 Cholesky functions available only for __XDIVSQRT."
+
 #endif
diff --git a/software/runtime/kernel/mempool_cholesky_q16s.h b/software/kernels/baremetal/mempool_cholesky_q16s.h
similarity index 97%
rename from software/runtime/kernel/mempool_cholesky_q16s.h
rename to software/kernels/baremetal/mempool_cholesky_q16s.h
index 65f8e0b42..dc20a2b94 100644
--- a/software/runtime/kernel/mempool_cholesky_q16s.h
+++ b/software/kernels/baremetal/mempool_cholesky_q16s.h
@@ -5,7 +5,8 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 #pragma once
-#include "kernel/mempool_sqrt_q32s.h"
+#include "baremetal/mempool_sqrt_q32s.h"
+#include "builtins_v2.h"
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /** VECTORIZED CODE
diff --git a/software/kernels/baremetal/mempool_cmatmul_f16.h b/software/kernels/baremetal/mempool_cmatmul_f16.h
index c83443076..f795a82ba 100644
--- a/software/kernels/baremetal/mempool_cmatmul_f16.h
+++ b/software/kernels/baremetal/mempool_cmatmul_f16.h
@@ -14,318 +14,453 @@
 #include "builtins_v2.h"
 
 #define CMATMUL_2x2_LOOP                                                       \
-  float sum00_real = 0.0f;                                                     \
-  float sum01_real = 0.0f;                                                     \
-  float sum10_real = 0.0f;                                                     \
-  float sum11_real = 0.0f;                                                     \
-  float sum00_imag = 0.0f;                                                     \
-  float sum01_imag = 0.0f;                                                     \
-  float sum10_imag = 0.0f;                                                     \
-  float sum11_imag = 0.0f;                                                     \
-  v2h a00s, a01s, a10s, a11s;                                                  \
-  v2h res00, res01, res10, res11;                                              \
-  for (j = 0; j < N; j += 2) {                                                 \
-    v2h a00 = *(v2h *)&A[2 * ((i + 0) * N + (j + 0))];                         \
-    v2h a01 = *(v2h *)&A[2 * ((i + 0) * N + (j + 1))];                         \
-    v2h a10 = *(v2h *)&A[2 * ((i + 1) * N + (j + 0))];                         \
-    v2h a11 = *(v2h *)&A[2 * ((i + 1) * N + (j + 1))];                         \
-    v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))];                         \
-    v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))];                         \
-    v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))];                         \
-    v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))];                         \
-    asm volatile("pv.shuffle2.h  %[a00s], %[a00], %[mask];"                    \
-                 "pv.shuffle2.h  %[a10s], %[a10], %[mask];"                    \
-                 "pv.shuffle2.h  %[a01s], %[a01], %[mask];"                    \
-                 "pv.shuffle2.h  %[a11s], %[a11], %[mask];"                    \
-                 : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \
-                   [a11s] "=&r"(a11s)                                          \
-                 : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10),             \
-                   [a11] "r"(a11), [mask] "r"(0x00020003)                      \
+  {                                                                            \
+    float sum00_real = 0.0f;                                                   \
+    float sum01_real = 0.0f;                                                   \
+    float sum10_real = 0.0f;                                                   \
+    float sum11_real = 0.0f;                                                   \
+    float sum00_imag = 0.0f;                                                   \
+    float sum01_imag = 0.0f;                                                   \
+    float sum10_imag = 0.0f;                                                   \
+    float sum11_imag = 0.0f;                                                   \
+    v2h a00s, a01s, a10s, a11s;                                                \
+    v2h res00, res01, res10, res11;                                            \
+    for (j = 0; j < N; j += 2) {                                               \
+      v2h a00 = *(v2h *)&A[2 * ((i + 0) * N + (j + 0))];                       \
+      v2h a01 = *(v2h *)&A[2 * ((i + 0) * N + (j + 1))];                       \
+      v2h a10 = *(v2h *)&A[2 * ((i + 1) * N + (j + 0))];                       \
+      v2h a11 = *(v2h *)&A[2 * ((i + 1) * N + (j + 1))];                       \
+      v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))];                       \
+      v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))];                       \
+      v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))];                       \
+      v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))];                       \
+      asm volatile("pv.shuffle2.h  %[a00s], %[a00], %[mask];"                  \
+                   "pv.shuffle2.h  %[a10s], %[a10], %[mask];"                  \
+                   "pv.shuffle2.h  %[a01s], %[a01], %[mask];"                  \
+                   "pv.shuffle2.h  %[a11s], %[a11], %[mask];"                  \
+                   : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s),                   \
+                     [a10s] "=&r"(a10s), [a11s] "=&r"(a11s)                    \
+                   : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10),           \
+                     [a11] "r"(a11), [mask] "r"(0x00020003)                    \
+                   :);                                                         \
+      asm volatile(                                                            \
+          "vfdotpex.s.h  %[sum00_imag], %[a00s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum10_imag], %[a10s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum01_imag], %[a00s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum11_imag], %[a10s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum00_imag], %[a01s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum10_imag], %[a11s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum01_imag], %[a01s], %[b11];"                      \
+          "vfdotpex.s.h  %[sum11_imag], %[a11s], %[b11];"                      \
+          : [sum00_imag] "+&r"(sum00_imag), [sum01_imag] "+&r"(sum01_imag),    \
+            [sum10_imag] "+&r"(sum10_imag), [sum11_imag] "+&r"(sum11_imag)     \
+          : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s),              \
+            [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10),  \
+            [b11] "r"(b11)                                                     \
+          :);                                                                  \
+      asm volatile("xor  %[a00s], %[a00], %[mask];"                            \
+                   "xor  %[a10s], %[a10], %[mask];"                            \
+                   "xor  %[a01s], %[a01], %[mask];"                            \
+                   "xor  %[a11s], %[a11], %[mask];"                            \
+                   : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s),                   \
+                     [a10s] "=&r"(a10s), [a11s] "=&r"(a11s)                    \
+                   : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10),           \
+                     [a11] "r"(a11), [mask] "r"(0x00008000)                    \
+                   :);                                                         \
+      asm volatile(                                                            \
+          "vfdotpex.s.h  %[sum00_real], %[a00s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum10_real], %[a10s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum01_real], %[a00s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum11_real], %[a10s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum00_real], %[a01s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum10_real], %[a11s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum01_real], %[a01s], %[b11];"                      \
+          "vfdotpex.s.h  %[sum11_real], %[a11s], %[b11];"                      \
+          : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real),    \
+            [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real)     \
+          : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s),              \
+            [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10),  \
+            [b11] "r"(b11)                                                     \
+          :);                                                                  \
+    }                                                                          \
+    asm volatile("vfcpka.h.s %[res00], %[sum00_imag], %[sum00_real];"          \
+                 "vfcpka.h.s %[res01], %[sum01_imag], %[sum01_real];"          \
+                 "vfcpka.h.s %[res10], %[sum10_imag], %[sum10_real];"          \
+                 "vfcpka.h.s %[res11], %[sum11_imag], %[sum11_real];"          \
+                 : [res00] "=&r"(res00), [res01] "=&r"(res01),                 \
+                   [res10] "=&r"(res10), [res11] "=&r"(res11)                  \
+                 : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag), \
+                   [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag), \
+                   [sum00_real] "r"(sum00_real), [sum01_real] "r"(sum01_real), \
+                   [sum10_real] "r"(sum10_real), [sum11_real] "r"(sum11_real)  \
                  :);                                                           \
-    asm volatile(                                                              \
-        "vfdotpex.s.h  %[sum00_imag], %[a00s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum10_imag], %[a10s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum01_imag], %[a00s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum11_imag], %[a10s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum00_imag], %[a01s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum10_imag], %[a11s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum01_imag], %[a01s], %[b11];"                        \
-        "vfdotpex.s.h  %[sum11_imag], %[a11s], %[b11];"                        \
-        : [sum00_imag] "+&r"(sum00_imag), [sum01_imag] "+&r"(sum01_imag),      \
-          [sum10_imag] "+&r"(sum10_imag), [sum11_imag] "+&r"(sum11_imag)       \
-        : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s),                \
-          [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10),    \
-          [b11] "r"(b11)                                                       \
-        :);                                                                    \
-    asm volatile("xor  %[a00s], %[a00], %[mask];"                              \
-                 "xor  %[a10s], %[a10], %[mask];"                              \
-                 "xor  %[a01s], %[a01], %[mask];"                              \
-                 "xor  %[a11s], %[a11], %[mask];"                              \
-                 : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \
-                   [a11s] "=&r"(a11s)                                          \
-                 : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10),             \
-                   [a11] "r"(a11), [mask] "r"(0x00008000)                      \
-                 :);                                                           \
-    asm volatile(                                                              \
-        "vfdotpex.s.h  %[sum00_real], %[a00s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum10_real], %[a10s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum01_real], %[a00s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum11_real], %[a10s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum00_real], %[a01s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum10_real], %[a11s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum01_real], %[a01s], %[b11];"                        \
-        "vfdotpex.s.h  %[sum11_real], %[a11s], %[b11];"                        \
-        : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real),      \
-          [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real)       \
-        : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s),                \
-          [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10),    \
-          [b11] "r"(b11)                                                       \
-        :);                                                                    \
-  }                                                                            \
-  asm volatile("vfcpka.h.s %[res00], %[sum00_imag], %[sum00_real];"            \
-               "vfcpka.h.s %[res01], %[sum01_imag], %[sum01_real];"            \
-               "vfcpka.h.s %[res10], %[sum10_imag], %[sum10_real];"            \
-               "vfcpka.h.s %[res11], %[sum11_imag], %[sum11_real];"            \
-               : [res00] "=r"(res00), [res01] "=r"(res01),                     \
-                 [res10] "=r"(res10), [res11] "=r"(res11)                      \
-               : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag),   \
-                 [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag),   \
-                 [sum00_real] "r"(sum00_real), [sum01_real] "r"(sum01_real),   \
-                 [sum10_real] "r"(sum10_real), [sum11_real] "r"(sum11_real)    \
-               :);                                                             \
-  (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = res00;                             \
-  (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = res01;                             \
-  (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = res10;                             \
-  (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = res11;
+    (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = res00;                           \
+    (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = res01;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = res10;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = res11;                           \
+  }
 
 #define CMATMUL_2x4_LOOP                                                       \
-  float register volatile sum00_real = 0.0f;                                   \
-  float register volatile sum01_real = 0.0f;                                   \
-  float register volatile sum02_real = 0.0f;                                   \
-  float register volatile sum03_real = 0.0f;                                   \
-  float register volatile sum10_real = 0.0f;                                   \
-  float register volatile sum11_real = 0.0f;                                   \
-  float register volatile sum12_real = 0.0f;                                   \
-  float register volatile sum13_real = 0.0f;                                   \
-  float register volatile sum00_imag = 0.0f;                                   \
-  float register volatile sum01_imag = 0.0f;                                   \
-  float register volatile sum02_imag = 0.0f;                                   \
-  float register volatile sum03_imag = 0.0f;                                   \
-  float register volatile sum10_imag = 0.0f;                                   \
-  float register volatile sum11_imag = 0.0f;                                   \
-  float register volatile sum12_imag = 0.0f;                                   \
-  float register volatile sum13_imag = 0.0f;                                   \
-  v2h a00s, a01s, a10s, a11s;                                                  \
-  for (j = 0; j < N; j += 2) {                                                 \
-    v2h a00 = A[(i + 0) * N + (j + 0)];                                        \
-    v2h a01 = A[(i + 0) * N + (j + 1)];                                        \
-    v2h a10 = A[(i + 1) * N + (j + 0)];                                        \
-    v2h a11 = A[(i + 1) * N + (j + 1)];                                        \
-    v2h b00 = B[(j + 0) * P + (k + 0)];                                        \
-    v2h b01 = B[(j + 0) * P + (k + 1)];                                        \
-    v2h b02 = B[(j + 0) * P + (k + 2)];                                        \
-    v2h b03 = B[(j + 0) * P + (k + 3)];                                        \
-    v2h b10 = B[(j + 1) * P + (k + 0)];                                        \
-    v2h b11 = B[(j + 1) * P + (k + 1)];                                        \
-    v2h b12 = B[(j + 1) * P + (k + 2)];                                        \
-    v2h b13 = B[(j + 1) * P + (k + 3)];                                        \
+  {                                                                            \
+    float register volatile sum00_real = 0.0f;                                 \
+    float register volatile sum01_real = 0.0f;                                 \
+    float register volatile sum02_real = 0.0f;                                 \
+    float register volatile sum03_real = 0.0f;                                 \
+    float register volatile sum10_real = 0.0f;                                 \
+    float register volatile sum11_real = 0.0f;                                 \
+    float register volatile sum12_real = 0.0f;                                 \
+    float register volatile sum13_real = 0.0f;                                 \
+    float register volatile sum00_imag = 0.0f;                                 \
+    float register volatile sum01_imag = 0.0f;                                 \
+    float register volatile sum02_imag = 0.0f;                                 \
+    float register volatile sum03_imag = 0.0f;                                 \
+    float register volatile sum10_imag = 0.0f;                                 \
+    float register volatile sum11_imag = 0.0f;                                 \
+    float register volatile sum12_imag = 0.0f;                                 \
+    float register volatile sum13_imag = 0.0f;                                 \
+    v2h a00s, a01s, a10s, a11s;                                                \
+    for (j = 0; j < N; j += 2) {                                               \
+      v2h a00 = A[(i + 0) * N + (j + 0)];                                      \
+      v2h a01 = A[(i + 0) * N + (j + 1)];                                      \
+      v2h a10 = A[(i + 1) * N + (j + 0)];                                      \
+      v2h a11 = A[(i + 1) * N + (j + 1)];                                      \
+      v2h b00 = B[(j + 0) * P + (k + 0)];                                      \
+      v2h b01 = B[(j + 0) * P + (k + 1)];                                      \
+      v2h b02 = B[(j + 0) * P + (k + 2)];                                      \
+      v2h b03 = B[(j + 0) * P + (k + 3)];                                      \
+      v2h b10 = B[(j + 1) * P + (k + 0)];                                      \
+      v2h b11 = B[(j + 1) * P + (k + 1)];                                      \
+      v2h b12 = B[(j + 1) * P + (k + 2)];                                      \
+      v2h b13 = B[(j + 1) * P + (k + 3)];                                      \
+      asm volatile(                                                            \
+          "pv.shuffle2.h  %[a00s], %[a00], %[mask];"                           \
+          "pv.shuffle2.h  %[a10s], %[a10], %[mask];"                           \
+          "pv.shuffle2.h  %[a01s], %[a01], %[mask];"                           \
+          "pv.shuffle2.h  %[a11s], %[a11], %[mask];"                           \
+          "vfdotpex.s.h  %[sum00_imag], %[a00s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum10_imag], %[a10s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum01_imag], %[a00s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum11_imag], %[a10s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum02_imag], %[a00s], %[b02];"                      \
+          "vfdotpex.s.h  %[sum12_imag], %[a10s], %[b02];"                      \
+          "vfdotpex.s.h  %[sum03_imag], %[a00s], %[b03];"                      \
+          "vfdotpex.s.h  %[sum13_imag], %[a10s], %[b03];"                      \
+          "vfdotpex.s.h  %[sum00_imag], %[a01s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum10_imag], %[a11s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum01_imag], %[a01s], %[b11];"                      \
+          "vfdotpex.s.h  %[sum11_imag], %[a11s], %[b11];"                      \
+          "vfdotpex.s.h  %[sum02_imag], %[a01s], %[b12];"                      \
+          "vfdotpex.s.h  %[sum12_imag], %[a11s], %[b12];"                      \
+          "vfdotpex.s.h  %[sum03_imag], %[a01s], %[b13];"                      \
+          "vfdotpex.s.h  %[sum13_imag], %[a11s], %[b13];"                      \
+          : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s),        \
+            [a11s] "=&r"(a11s), [sum00_imag] "+&r"(sum00_imag),                \
+            [sum01_imag] "+&r"(sum01_imag), [sum02_imag] "+&r"(sum02_imag),    \
+            [sum03_imag] "+&r"(sum03_imag), [sum10_imag] "+&r"(sum10_imag),    \
+            [sum11_imag] "+&r"(sum11_imag), [sum12_imag] "+&r"(sum12_imag),    \
+            [sum13_imag] "+&r"(sum13_imag)                                     \
+          : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),    \
+            [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03),    \
+            [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13),    \
+            [mask] "r"(0x00020003)                                             \
+          :);                                                                  \
+      asm volatile(                                                            \
+          "xor  %[a00s], %[a00], %[maskn];"                                    \
+          "xor  %[a10s], %[a10], %[maskn];"                                    \
+          "xor  %[a01s], %[a01], %[maskn];"                                    \
+          "xor  %[a11s], %[a11], %[maskn];"                                    \
+          "vfdotpex.s.h  %[sum00_real], %[a00s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum10_real], %[a10s], %[b00];"                      \
+          "vfdotpex.s.h  %[sum01_real], %[a00s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum11_real], %[a10s], %[b01];"                      \
+          "vfdotpex.s.h  %[sum02_real], %[a00s], %[b02];"                      \
+          "vfdotpex.s.h  %[sum12_real], %[a10s], %[b02];"                      \
+          "vfdotpex.s.h  %[sum03_real], %[a00s], %[b03];"                      \
+          "vfdotpex.s.h  %[sum13_real], %[a10s], %[b03];"                      \
+          "vfdotpex.s.h  %[sum00_real], %[a01s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum10_real], %[a11s], %[b10];"                      \
+          "vfdotpex.s.h  %[sum01_real], %[a01s], %[b11];"                      \
+          "vfdotpex.s.h  %[sum11_real], %[a11s], %[b11];"                      \
+          "vfdotpex.s.h  %[sum02_real], %[a01s], %[b12];"                      \
+          "vfdotpex.s.h  %[sum12_real], %[a11s], %[b12];"                      \
+          "vfdotpex.s.h  %[sum03_real], %[a01s], %[b13];"                      \
+          "vfdotpex.s.h  %[sum13_real], %[a11s], %[b13];"                      \
+          : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s),        \
+            [a11s] "=&r"(a11s), [sum00_real] "+&r"(sum00_real),                \
+            [sum01_real] "+&r"(sum01_real), [sum02_real] "+&r"(sum02_real),    \
+            [sum03_real] "+&r"(sum03_real), [sum10_real] "+&r"(sum10_real),    \
+            [sum11_real] "+&r"(sum11_real), [sum12_real] "+&r"(sum12_real),    \
+            [sum13_real] "+&r"(sum13_real)                                     \
+          : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),    \
+            [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03),    \
+            [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13),    \
+            [maskn] "r"(0x00008000)                                            \
+          :);                                                                  \
+    }                                                                          \
     asm volatile(                                                              \
-        "pv.shuffle2.h  %[a00s], %[a00], %[mask];"                             \
-        "pv.shuffle2.h  %[a10s], %[a10], %[mask];"                             \
-        "pv.shuffle2.h  %[a01s], %[a01], %[mask];"                             \
-        "pv.shuffle2.h  %[a11s], %[a11], %[mask];"                             \
-        "vfdotpex.s.h  %[sum00_imag], %[a00s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum10_imag], %[a10s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum01_imag], %[a00s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum11_imag], %[a10s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum02_imag], %[a00s], %[b02];"                        \
-        "vfdotpex.s.h  %[sum12_imag], %[a10s], %[b02];"                        \
-        "vfdotpex.s.h  %[sum03_imag], %[a00s], %[b03];"                        \
-        "vfdotpex.s.h  %[sum13_imag], %[a10s], %[b03];"                        \
-        "vfdotpex.s.h  %[sum00_imag], %[a01s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum10_imag], %[a11s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum01_imag], %[a01s], %[b11];"                        \
-        "vfdotpex.s.h  %[sum11_imag], %[a11s], %[b11];"                        \
-        "vfdotpex.s.h  %[sum02_imag], %[a01s], %[b12];"                        \
-        "vfdotpex.s.h  %[sum12_imag], %[a11s], %[b12];"                        \
-        "vfdotpex.s.h  %[sum03_imag], %[a01s], %[b13];"                        \
-        "vfdotpex.s.h  %[sum13_imag], %[a11s], %[b13];"                        \
-        : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s),          \
-          [a11s] "=&r"(a11s), [sum00_imag] "+&r"(sum00_imag),                  \
-          [sum01_imag] "+&r"(sum01_imag), [sum02_imag] "+&r"(sum02_imag),      \
-          [sum03_imag] "+&r"(sum03_imag), [sum10_imag] "+&r"(sum10_imag),      \
-          [sum11_imag] "+&r"(sum11_imag), [sum12_imag] "+&r"(sum12_imag),      \
-          [sum13_imag] "+&r"(sum13_imag)                                       \
-        : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),      \
-          [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03),      \
-          [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13),      \
-          [mask] "r"(0x00020003)                                               \
-        :);                                                                    \
-    asm volatile(                                                              \
-        "xor  %[a00s], %[a00], %[maskn];"                                      \
-        "xor  %[a10s], %[a10], %[maskn];"                                      \
-        "xor  %[a01s], %[a01], %[maskn];"                                      \
-        "xor  %[a11s], %[a11], %[maskn];"                                      \
-        "vfdotpex.s.h  %[sum00_real], %[a00s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum10_real], %[a10s], %[b00];"                        \
-        "vfdotpex.s.h  %[sum01_real], %[a00s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum11_real], %[a10s], %[b01];"                        \
-        "vfdotpex.s.h  %[sum02_real], %[a00s], %[b02];"                        \
-        "vfdotpex.s.h  %[sum12_real], %[a10s], %[b02];"                        \
-        "vfdotpex.s.h  %[sum03_real], %[a00s], %[b03];"                        \
-        "vfdotpex.s.h  %[sum13_real], %[a10s], %[b03];"                        \
-        "vfdotpex.s.h  %[sum00_real], %[a01s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum10_real], %[a11s], %[b10];"                        \
-        "vfdotpex.s.h  %[sum01_real], %[a01s], %[b11];"                        \
-        "vfdotpex.s.h  %[sum11_real], %[a11s], %[b11];"                        \
-        "vfdotpex.s.h  %[sum02_real], %[a01s], %[b12];"                        \
-        "vfdotpex.s.h  %[sum12_real], %[a11s], %[b12];"                        \
-        "vfdotpex.s.h  %[sum03_real], %[a01s], %[b13];"                        \
-        "vfdotpex.s.h  %[sum13_real], %[a11s], %[b13];"                        \
-        : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s),          \
-          [a11s] "=&r"(a11s), [sum00_real] "+&r"(sum00_real),                  \
-          [sum01_real] "+&r"(sum01_real), [sum02_real] "+&r"(sum02_real),      \
-          [sum03_real] "+&r"(sum03_real), [sum10_real] "+&r"(sum10_real),      \
-          [sum11_real] "+&r"(sum11_real), [sum12_real] "+&r"(sum12_real),      \
-          [sum13_real] "+&r"(sum13_real)                                       \
-        : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),      \
-          [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03),      \
-          [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13),      \
-          [maskn] "r"(0x00008000)                                              \
+        "vfcpka.h.s %[sum00_real], %[sum00_imag], %[sum00_real];"              \
+        "vfcpka.h.s %[sum01_real], %[sum01_imag], %[sum01_real];"              \
+        "vfcpka.h.s %[sum02_real], %[sum02_imag], %[sum02_real];"              \
+        "vfcpka.h.s %[sum03_real], %[sum03_imag], %[sum03_real];"              \
+        "vfcpka.h.s %[sum10_real], %[sum10_imag], %[sum10_real];"              \
+        "vfcpka.h.s %[sum11_real], %[sum11_imag], %[sum11_real];"              \
+        "vfcpka.h.s %[sum12_real], %[sum12_imag], %[sum12_real];"              \
+        "vfcpka.h.s %[sum13_real], %[sum13_imag], %[sum13_real];"              \
+        : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real),      \
+          [sum02_real] "+&r"(sum02_real), [sum03_real] "+&r"(sum03_real),      \
+          [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real),      \
+          [sum12_real] "+&r"(sum12_real), [sum13_real] "+&r"(sum13_real)       \
+        : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag),          \
+          [sum02_imag] "r"(sum02_imag), [sum03_imag] "r"(sum03_imag),          \
+          [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag),          \
+          [sum12_imag] "r"(sum12_imag), [sum13_imag] "r"(sum13_imag)           \
         :);                                                                    \
-  }                                                                            \
-  asm volatile(                                                                \
-      "vfcpka.h.s %[sum00_real], %[sum00_imag], %[sum00_real];"                \
-      "vfcpka.h.s %[sum01_real], %[sum01_imag], %[sum01_real];"                \
-      "vfcpka.h.s %[sum02_real], %[sum02_imag], %[sum02_real];"                \
-      "vfcpka.h.s %[sum03_real], %[sum03_imag], %[sum03_real];"                \
-      "vfcpka.h.s %[sum10_real], %[sum10_imag], %[sum10_real];"                \
-      "vfcpka.h.s %[sum11_real], %[sum11_imag], %[sum11_real];"                \
-      "vfcpka.h.s %[sum12_real], %[sum12_imag], %[sum12_real];"                \
-      "vfcpka.h.s %[sum13_real], %[sum13_imag], %[sum13_real];"                \
-      : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real),        \
-        [sum02_real] "+&r"(sum02_real), [sum03_real] "+&r"(sum03_real),        \
-        [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real),        \
-        [sum12_real] "+&r"(sum12_real), [sum13_real] "+&r"(sum13_real)         \
-      : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag),            \
-        [sum02_imag] "r"(sum02_imag), [sum03_imag] "r"(sum03_imag),            \
-        [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag),            \
-        [sum12_imag] "r"(sum12_imag), [sum13_imag] "r"(sum13_imag)             \
-      :);                                                                      \
-  C[(i + 0) * P + k + 0] = (v2h)sum00_real;                                    \
-  C[(i + 0) * P + k + 1] = (v2h)sum01_real;                                    \
-  C[(i + 0) * P + k + 2] = (v2h)sum02_real;                                    \
-  C[(i + 0) * P + k + 3] = (v2h)sum03_real;                                    \
-  C[(i + 1) * P + k + 0] = (v2h)sum10_real;                                    \
-  C[(i + 1) * P + k + 1] = (v2h)sum11_real;                                    \
-  C[(i + 1) * P + k + 2] = (v2h)sum12_real;                                    \
-  C[(i + 1) * P + k + 3] = (v2h)sum13_real;
+    C[(i + 0) * P + k + 0] = (v2h)sum00_real;                                  \
+    C[(i + 0) * P + k + 1] = (v2h)sum01_real;                                  \
+    C[(i + 0) * P + k + 2] = (v2h)sum02_real;                                  \
+    C[(i + 0) * P + k + 3] = (v2h)sum03_real;                                  \
+    C[(i + 1) * P + k + 0] = (v2h)sum10_real;                                  \
+    C[(i + 1) * P + k + 1] = (v2h)sum11_real;                                  \
+    C[(i + 1) * P + k + 2] = (v2h)sum12_real;                                  \
+    C[(i + 1) * P + k + 3] = (v2h)sum13_real;                                  \
+  }
 
 /**************************************************************************/
 /**************************************************************************/
 // COMPLEX DOTP INSTRUCTIONS
 
 #define CMATMUL_CDOTP_1x1_LOOP                                                 \
-  v2h sum = (v2h)0.0f;                                                         \
-  for (j = 0; j < N; j++) {                                                    \
-    v2h a = *(v2h *)&A[2 * (i * M + j)];                                       \
-    v2h b = *(v2h *)&B[2 * (j * P + k)];                                       \
-    asm volatile("fcdotpex.s.h  %[sum], %[a], %[b];"                           \
-                 : [sum] "+&r"(sum)                                            \
-                 : [a] "r"(a), [b] "r"(b)                                      \
-                 :);                                                           \
-  }                                                                            \
-  (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum;
+  { /* accumulate A(i,j) * B(j,k) over j in [0, N) into one v2h of C */        \
+    v2h sum = (v2h)0.0f;                                                       \
+    for (j = 0; j < N; j++) {                                                  \
+      v2h a = *(v2h *)&A[2 * (i * N + j)]; /* row i of A has N entries */      \
+      v2h b = *(v2h *)&B[2 * (j * P + k)];                                     \
+      asm volatile("fcdotpex.s.h  %[sum], %[a], %[b];"                         \
+                   : [sum] "+&r"(sum)                                          \
+                   : [a] "r"(a), [b] "r"(b)                                    \
+                   :);                                                         \
+    }                                                                          \
+    (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum;                             \
+  }
 
 #define CMATMUL_CDOTP_2x2_LOOP                                                 \
-  v2h sum00 = (v2h)0.0f;                                                       \
-  v2h sum01 = (v2h)0.0f;                                                       \
-  v2h sum10 = (v2h)0.0f;                                                       \
-  v2h sum11 = (v2h)0.0f;                                                       \
-  for (j = 0; j < N; j += 2) {                                                 \
-    v2h a00 = *(v2h *)&A[2 * ((i + 0) * M + (j + 0))];                         \
-    v2h a01 = *(v2h *)&A[2 * ((i + 0) * M + (j + 1))];                         \
-    v2h a10 = *(v2h *)&A[2 * ((i + 1) * M + (j + 0))];                         \
-    v2h a11 = *(v2h *)&A[2 * ((i + 1) * M + (j + 1))];                         \
-    v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))];                         \
-    v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))];                         \
-    v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))];                         \
-    v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))];                         \
-    asm volatile(                                                              \
-        "fcdotpex.s.h  %[sum00], %[a00], %[b00];"                              \
-        "fcdotpex.s.h  %[sum10], %[a10], %[b00];"                              \
-        "fcdotpex.s.h  %[sum01], %[a00], %[b01];"                              \
-        "fcdotpex.s.h  %[sum11], %[a10], %[b01];"                              \
-        "fcdotpex.s.h  %[sum00], %[a01], %[b10];"                              \
-        "fcdotpex.s.h  %[sum10], %[a11], %[b10];"                              \
-        "fcdotpex.s.h  %[sum01], %[a01], %[b11];"                              \
-        "fcdotpex.s.h  %[sum11], %[a11], %[b11];"                              \
-        : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum10] "+&r"(sum10),    \
-          [sum11] "+&r"(sum11)                                                 \
-        : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),      \
-          [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), [b11] "r"(b11)       \
-        :);                                                                    \
-  }                                                                            \
-  (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum00;                             \
-  (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = sum01;                             \
-  (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = sum10;                             \
-  (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = sum11;
+  { /* 2x2 tile of C: rows i, i+1 and columns k, k+1; j advances by 2 */       \
+    v2h sum00 = (v2h)0.0f;                                                     \
+    v2h sum01 = (v2h)0.0f;                                                     \
+    v2h sum10 = (v2h)0.0f;                                                     \
+    v2h sum11 = (v2h)0.0f;                                                     \
+    for (j = 0; j < N; j += 2) {                                               \
+      v2h a00 = *(v2h *)&A[2 * ((i + 0) * N + (j + 0))]; /* row stride N */    \
+      v2h a01 = *(v2h *)&A[2 * ((i + 0) * N + (j + 1))];                       \
+      v2h a10 = *(v2h *)&A[2 * ((i + 1) * N + (j + 0))];                       \
+      v2h a11 = *(v2h *)&A[2 * ((i + 1) * N + (j + 1))];                       \
+      v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))];                       \
+      v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))];                       \
+      v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))];                       \
+      v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))];                       \
+      asm volatile(                                                            \
+          "fcdotpex.s.h  %[sum00], %[a00], %[b00];"                            \
+          "fcdotpex.s.h  %[sum10], %[a10], %[b00];"                            \
+          "fcdotpex.s.h  %[sum01], %[a00], %[b01];"                            \
+          "fcdotpex.s.h  %[sum11], %[a10], %[b01];"                            \
+          "fcdotpex.s.h  %[sum00], %[a01], %[b10];"                            \
+          "fcdotpex.s.h  %[sum10], %[a11], %[b10];"                            \
+          "fcdotpex.s.h  %[sum01], %[a01], %[b11];"                            \
+          "fcdotpex.s.h  %[sum11], %[a11], %[b11];"                            \
+          : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum10] "+&r"(sum10),  \
+            [sum11] "+&r"(sum11)                                               \
+          : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),    \
+            [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), [b11] "r"(b11)     \
+          :);                                                                  \
+    }                                                                          \
+    (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum00;                           \
+    (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = sum01;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = sum10;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = sum11;                           \
+  }
 
 #define CMATMUL_CDOTP_2x4_LOOP                                                 \
-  v2h sum00 = (v2h)0.0f;                                                       \
-  v2h sum01 = (v2h)0.0f;                                                       \
-  v2h sum02 = (v2h)0.0f;                                                       \
-  v2h sum03 = (v2h)0.0f;                                                       \
-  v2h sum10 = (v2h)0.0f;                                                       \
-  v2h sum11 = (v2h)0.0f;                                                       \
-  v2h sum12 = (v2h)0.0f;                                                       \
-  v2h sum13 = (v2h)0.0f;                                                       \
-  for (j = 0; j < N; j += 2) {                                                 \
-    v2h a00 = A[i * M + j + 0];                                                \
-    v2h a01 = A[i * M + j + 1];                                                \
-    v2h a10 = A[(i + 1) * M + j + 0];                                          \
-    v2h a11 = A[(i + 1) * M + j + 1];                                          \
-    v2h b00 = B[j * P + k + 0];                                                \
-    v2h b01 = B[j * P + k + 1];                                                \
-    v2h b02 = B[j * P + k + 2];                                                \
-    v2h b03 = B[j * P + k + 3];                                                \
-    v2h b10 = B[(j + 1) * P + k + 0];                                          \
-    v2h b11 = B[(j + 1) * P + k + 1];                                          \
-    v2h b12 = B[(j + 1) * P + k + 2];                                          \
-    v2h b13 = B[(j + 1) * P + k + 3];                                          \
-    asm volatile(                                                              \
-        "fcdotpex.s.h  %[sum00], %[a00], %[b00];"                              \
-        "fcdotpex.s.h  %[sum10], %[a10], %[b00];"                              \
-        "fcdotpex.s.h  %[sum01], %[a00], %[b01];"                              \
-        "fcdotpex.s.h  %[sum11], %[a10], %[b01];"                              \
-        "fcdotpex.s.h  %[sum02], %[a00], %[b02];"                              \
-        "fcdotpex.s.h  %[sum12], %[a10], %[b02];"                              \
-        "fcdotpex.s.h  %[sum03], %[a00], %[b03];"                              \
-        "fcdotpex.s.h  %[sum13], %[a10], %[b03];"                              \
-        "fcdotpex.s.h  %[sum00], %[a01], %[b10];"                              \
-        "fcdotpex.s.h  %[sum10], %[a11], %[b10];"                              \
-        "fcdotpex.s.h  %[sum01], %[a01], %[b11];"                              \
-        "fcdotpex.s.h  %[sum11], %[a11], %[b11];"                              \
-        "fcdotpex.s.h  %[sum02], %[a01], %[b12];"                              \
-        "fcdotpex.s.h  %[sum12], %[a11], %[b12];"                              \
-        "fcdotpex.s.h  %[sum03], %[a01], %[b13];"                              \
-        "fcdotpex.s.h  %[sum13], %[a11], %[b13];"                              \
-        : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum02] "+&r"(sum02),    \
-          [sum03] "+&r"(sum03), [sum10] "+&r"(sum10), [sum11] "+&r"(sum11),    \
-          [sum12] "+&r"(sum12), [sum13] "+&r"(sum13)                           \
-        : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),      \
-          [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03),      \
-          [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13)       \
-        :);                                                                    \
-  }                                                                            \
-  C[i * P + k + 0] = sum00;                                                    \
-  C[i * P + k + 1] = sum01;                                                    \
-  C[i * P + k + 2] = sum02;                                                    \
-  C[i * P + k + 3] = sum03;                                                    \
-  C[(i + 1) * P + k + 0] = sum10;                                              \
-  C[(i + 1) * P + k + 1] = sum11;                                              \
-  C[(i + 1) * P + k + 2] = sum12;                                              \
-  C[(i + 1) * P + k + 3] = sum13;
+  { /* 2x4 tile of C: rows i, i+1 and columns k..k+3; j advances by 2 */       \
+    v2h sum00 = (v2h)0.0f;                                                     \
+    v2h sum01 = (v2h)0.0f;                                                     \
+    v2h sum02 = (v2h)0.0f;                                                     \
+    v2h sum03 = (v2h)0.0f;                                                     \
+    v2h sum10 = (v2h)0.0f;                                                     \
+    v2h sum11 = (v2h)0.0f;                                                     \
+    v2h sum12 = (v2h)0.0f;                                                     \
+    v2h sum13 = (v2h)0.0f;                                                     \
+    for (j = 0; j < N; j += 2) {                                               \
+      v2h a00 = *(v2h *)&A[2 * (i * N + j + 0)]; /* row stride N */            \
+      v2h a01 = *(v2h *)&A[2 * (i * N + j + 1)];                               \
+      v2h a10 = *(v2h *)&A[2 * ((i + 1) * N + j + 0)];                         \
+      v2h a11 = *(v2h *)&A[2 * ((i + 1) * N + j + 1)];                         \
+      v2h b00 = *(v2h *)&B[2 * (j * P + k + 0)];                               \
+      v2h b01 = *(v2h *)&B[2 * (j * P + k + 1)];                               \
+      v2h b02 = *(v2h *)&B[2 * (j * P + k + 2)];                               \
+      v2h b03 = *(v2h *)&B[2 * (j * P + k + 3)];                               \
+      v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + k + 0)];                         \
+      v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + k + 1)];                         \
+      v2h b12 = *(v2h *)&B[2 * ((j + 1) * P + k + 2)];                         \
+      v2h b13 = *(v2h *)&B[2 * ((j + 1) * P + k + 3)];                         \
+      asm volatile(                                                            \
+          "fcdotpex.s.h  %[sum00], %[a00], %[b00];"                            \
+          "fcdotpex.s.h  %[sum10], %[a10], %[b00];"                            \
+          "fcdotpex.s.h  %[sum01], %[a00], %[b01];"                            \
+          "fcdotpex.s.h  %[sum11], %[a10], %[b01];"                            \
+          "fcdotpex.s.h  %[sum02], %[a00], %[b02];"                            \
+          "fcdotpex.s.h  %[sum12], %[a10], %[b02];"                            \
+          "fcdotpex.s.h  %[sum03], %[a00], %[b03];"                            \
+          "fcdotpex.s.h  %[sum13], %[a10], %[b03];"                            \
+          "fcdotpex.s.h  %[sum00], %[a01], %[b10];"                            \
+          "fcdotpex.s.h  %[sum10], %[a11], %[b10];"                            \
+          "fcdotpex.s.h  %[sum01], %[a01], %[b11];"                            \
+          "fcdotpex.s.h  %[sum11], %[a11], %[b11];"                            \
+          "fcdotpex.s.h  %[sum02], %[a01], %[b12];"                            \
+          "fcdotpex.s.h  %[sum12], %[a11], %[b12];"                            \
+          "fcdotpex.s.h  %[sum03], %[a01], %[b13];"                            \
+          "fcdotpex.s.h  %[sum13], %[a11], %[b13];"                            \
+          : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum02] "+&r"(sum02),  \
+            [sum03] "+&r"(sum03), [sum10] "+&r"(sum10), [sum11] "+&r"(sum11),  \
+            [sum12] "+&r"(sum12), [sum13] "+&r"(sum13)                         \
+          : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11),    \
+            [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03),    \
+            [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13)     \
+          :);                                                                  \
+    }                                                                          \
+    (*(v2h *)&C[2 * (i * P + k + 0)]) = sum00;                                 \
+    (*(v2h *)&C[2 * (i * P + k + 1)]) = sum01;                                 \
+    (*(v2h *)&C[2 * (i * P + k + 2)]) = sum02;                                 \
+    (*(v2h *)&C[2 * (i * P + k + 3)]) = sum03;                                 \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = sum10;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = sum11;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 2)]) = sum12;                           \
+    (*(v2h *)&C[2 * ((i + 1) * P + k + 3)]) = sum13;                           \
+  }
+
+#define CMATMUL_CDOTP_4x4_LOOP                                                 \
+  {                                                                            \
+    int32_t const *addr_a = &A[i * N];                                         \
+    int32_t const *addr_b = &B[j];                                             \
+    int32_t const *end_b = &B[N * P + j];                                      \
+    int32_t const *addr_c = &C[i * P + j];                                     \
+    int32_t const P3 = ((int32_t)P - 3) * 4;                                   \
+    int32_t const N31 = (-3 * (int32_t)N + 1) * 4;                             \
+    register int32_t k asm("x1") = (int32_t)end_b;                             \
+    __asm__ volatile(                                                          \
+        ".balign 16 \n\t"                                                      \
+        "p.lw  x3, %[N](%[addr_a]!) \n\t"                                      \
+        "p.lw x12, 4(%[addr_b]!) \n\t"                                         \
+        "p.lw x13, 4(%[addr_b]!) \n\t"                                         \
+        "p.lw x14, 4(%[addr_b]!) \n\t"                                         \
+        "p.lw x15, %[P3](%[addr_b]!) \n\t"                                     \
+        "p.lw  x4, %[N](%[addr_a]!) \n\t"                                      \
+        "p.lw x10, %[N](%[addr_a]!) \n\t"                                      \
+        "p.lw x11, %[N31](%[addr_a]!) \n\t"                                    \
+        "mv x16, zero \n\t"                                                    \
+        "mv x17, zero \n\t"                                                    \
+        "mv x18, zero \n\t"                                                    \
+        "mv x19, zero \n\t"                                                    \
+        "mv x20, zero \n\t"                                                    \
+        "mv x21, zero \n\t"                                                    \
+        "mv x22, zero \n\t"                                                    \
+        "mv x23, zero \n\t"                                                    \
+        "mv x24, zero \n\t"                                                    \
+        "mv x25, zero \n\t"                                                    \
+        "mv x26, zero \n\t"                                                    \
+        "mv x27, zero \n\t"                                                    \
+        "mv x28, zero \n\t"                                                    \
+        "mv x29, zero \n\t"                                                    \
+        "mv x30, zero \n\t"                                                    \
+        "mv x31, zero \n\t"                                                    \
+        "fcdotpex.s.h x16,  x3, x12 \n\t"                                      \
+        "fcdotpex.s.h x17,  x3, x13 \n\t"                                      \
+        "fcdotpex.s.h x18,  x3, x14 \n\t"                                      \
+        "fcdotpex.s.h x19,  x3, x15 \n\t"                                      \
+        "p.lw  x3, %[N](%[addr_a]!) \n\t"                                      \
+        "fcdotpex.s.h x20,  x4, x12 \n\t"                                      \
+        "fcdotpex.s.h x21,  x4, x13 \n\t"                                      \
+        "fcdotpex.s.h x22,  x4, x14 \n\t"                                      \
+        "fcdotpex.s.h x23,  x4, x15 \n\t"                                      \
+        "p.lw  x4, %[N](%[addr_a]!) \n\t"                                      \
+        "fcdotpex.s.h x24, x10, x12 \n\t"                                      \
+        "fcdotpex.s.h x25, x10, x13 \n\t"                                      \
+        "fcdotpex.s.h x26, x10, x14 \n\t"                                      \
+        "fcdotpex.s.h x27, x10, x15 \n\t"                                      \
+        "p.lw x10, %[N](%[addr_a]!) \n\t"                                      \
+        "fcdotpex.s.h x28, x11, x12 \n\t"                                      \
+        "p.lw x12, 4(%[addr_b]!) \n\t"                                         \
+        "fcdotpex.s.h x29, x11, x13 \n\t"                                      \
+        "p.lw x13, 4(%[addr_b]!) \n\t"                                         \
+        "fcdotpex.s.h x30, x11, x14 \n\t"                                      \
+        "p.lw x14, 4(%[addr_b]!) \n\t"                                         \
+        "fcdotpex.s.h x31, x11, x15 \n\t"                                      \
+        "p.lw x15, %[P3](%[addr_b]!) \n\t"                                     \
+        "p.lw x11, %[N31](%[addr_a]!) \n\t"                                    \
+        "1: \n\t"                                                              \
+        "fcdotpex.s.h x16,  x3, x12 \n\t"                                      \
+        "fcdotpex.s.h x17,  x3, x13 \n\t"                                      \
+        "fcdotpex.s.h x20,  x4, x12 \n\t"                                      \
+        "fcdotpex.s.h x21,  x4, x13 \n\t"                                      \
+        "fcdotpex.s.h x18,  x3, x14 \n\t"                                      \
+        "fcdotpex.s.h x22,  x4, x14 \n\t"                                      \
+        "fcdotpex.s.h x19,  x3, x15 \n\t"                                      \
+        "p.lw  x3, %[N](%[addr_a]!) \n\t"                                      \
+        "fcdotpex.s.h x23,  x4, x15 \n\t"                                      \
+        "p.lw  x4, %[N](%[addr_a]!) \n\t"                                      \
+        "fcdotpex.s.h x24, x10, x12 \n\t"                                      \
+        "fcdotpex.s.h x28, x11, x12 \n\t"                                      \
+        "p.lw x12, 4(%[addr_b]!) \n\t"                                         \
+        "fcdotpex.s.h x25, x10, x13 \n\t"                                      \
+        "fcdotpex.s.h x29, x11, x13 \n\t"                                      \
+        "p.lw x13, 4(%[addr_b]!) \n\t"                                         \
+        "fcdotpex.s.h x26, x10, x14 \n\t"                                      \
+        "fcdotpex.s.h x30, x11, x14 \n\t"                                      \
+        "p.lw x14, 4(%[addr_b]!) \n\t"                                         \
+        "fcdotpex.s.h x27, x10, x15 \n\t"                                      \
+        "fcdotpex.s.h x31, x11, x15 \n\t"                                      \
+        "p.lw x15, %[P3](%[addr_b]!) \n\t"                                     \
+        "p.lw x10, %[N](%[addr_a]!) \n\t"                                      \
+        "p.lw x11, %[N31](%[addr_a]!) \n\t"                                    \
+        "bne %[addr_b], x1, 1b \n\t"                                           \
+        "fcdotpex.s.h x16,  x3, x12 \n\t"                                      \
+        "fcdotpex.s.h x17,  x3, x13 \n\t"                                      \
+        "fcdotpex.s.h x18,  x3, x14 \n\t"                                      \
+        "p.sw x16, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x19,  x3, x15 \n\t"                                      \
+        "p.sw x17, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x20,  x4, x12 \n\t"                                      \
+        "p.sw x18, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x21,  x4, x13 \n\t"                                      \
+        "p.sw x19, %[P3](%[addr_c]!) \n\t"                                     \
+        "fcdotpex.s.h x22,  x4, x14 \n\t"                                      \
+        "p.sw x20, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x23,  x4, x15 \n\t"                                      \
+        "p.sw x21, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x24, x10, x12 \n\t"                                      \
+        "p.sw x22, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x25, x10, x13 \n\t"                                      \
+        "p.sw x23, %[P3](%[addr_c]!) \n\t"                                     \
+        "fcdotpex.s.h x26, x10, x14 \n\t"                                      \
+        "p.sw x24, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x27, x10, x15 \n\t"                                      \
+        "p.sw x25, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x28, x11, x12 \n\t"                                      \
+        "p.sw x26, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x29, x11, x13 \n\t"                                      \
+        "p.sw x27, %[P3](%[addr_c]!) \n\t"                                     \
+        "fcdotpex.s.h x30, x11, x14 \n\t"                                      \
+        "p.sw x28, 4(%[addr_c]!) \n\t"                                         \
+        "fcdotpex.s.h x31, x11, x15 \n\t"                                      \
+        "p.sw x29, 4(%[addr_c]!) \n\t"                                         \
+        "p.sw x30, 4(%[addr_c]!) \n\t"                                         \
+        "p.sw x31, %[P3](%[addr_c]!) \n\t"                                     \
+        :                                                                      \
+        [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b), [addr_c] "+&r"(addr_c) \
+        : [N31] "r"(N31), [P3] "r"(P3), [x1] "r"(k), [N] "I"(dim_N * 4)        \
+        : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17",  \
+          "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26",       \
+          "x27", "x28", "x29", "x30", "x31", "memory");                        \
+  }
 
 #define __CDOTP
 void cmatmul_2x2_f16s(__fp16 const *__restrict__ A,
@@ -364,14 +499,13 @@ void cmatmul_2x2_f16p(__fp16 const *__restrict__ A,
 #endif
     }
   }
-  mempool_log_partial_barrier(2, core_id, numThreads);
   return;
 }
 
 #define __SHIFT_A
-void cmatmul_2x4_f16p(v2h *__restrict__ A, v2h const *__restrict__ B,
-                      v2h *__restrict__ C, uint32_t M, uint32_t N, uint32_t P,
-                      uint32_t core_id, uint32_t numThreads) {
+void cmatmul_2x4_f16p(__fp16 *__restrict__ A, __fp16 const *__restrict__ B,
+                      __fp16 *__restrict__ C, uint32_t M, uint32_t N,
+                      uint32_t P, uint32_t core_id, uint32_t numThreads) {
   uint32_t i = 0; // loop counter for M
   uint32_t j = 0; // loop counter for N
   uint32_t k = 0; // loop counter for P
@@ -386,7 +520,7 @@ void cmatmul_2x4_f16p(v2h *__restrict__ A, v2h const *__restrict__ B,
     }
   }
 #else
-  uint32_t shift_id = 2 * (core_id % NUM_CORES_PER_TILE);
+  uint32_t shift_id = (2 * (core_id % NUM_CORES_PER_TILE)) % M;
   for (k = core_id * 4; k < P; k += 4 * numThreads) {
     for (i = shift_id; i < M; i += 2) {
 #ifdef __CDOTP
@@ -404,62 +538,42 @@ void cmatmul_2x4_f16p(v2h *__restrict__ A, v2h const *__restrict__ B,
     }
   }
 #endif
-  mempool_log_partial_barrier(2, core_id, numThreads);
   return;
 }
 
-void cmatmul_2x4_folded_f16p(v2h *A, v2h const *__restrict__ B,
-                             v2h *__restrict__ A_folded, v2h *__restrict__ C,
-                             uint32_t M, uint32_t N, uint32_t P,
-                             uint32_t core_id, uint32_t numThreads) {
-  uint32_t i = 0; // loop counter for M
-  uint32_t j = 0; // loop counter for N
-  uint32_t k = 0; // loop counter for P
-  // Copy multiple A matrices in memory
-  uint32_t num_copy = NUM_BANKS / (N * M);
-  for (k = core_id * 4; k < N * M; k += 4 * numThreads) {
-    v2h a0 = A[k];
-    v2h a1 = A[k + 1];
-    v2h a2 = A[k + 2];
-    v2h a3 = A[k + 3];
-    i = k / N; // row_index
-    j = k % N; // col_index
-    for (uint32_t idx_copy = 0; idx_copy < num_copy; idx_copy++) {
-      A_folded[idx_copy * N * M + i * N + j] = a0;
-      A_folded[idx_copy * N * M + i * N + j + 1] = a1;
-      A_folded[idx_copy * N * M + i * N + j + 2] = a2;
-      A_folded[idx_copy * N * M + i * N + j + 3] = a3;
+// 4x4 MATMUL
+void cmatmul_4x4_f16p(int32_t const *__restrict__ A,
+                      int32_t const *__restrict__ B, int32_t *__restrict__ C,
+                      uint32_t M, uint32_t N, uint32_t P, uint32_t id,
+                      uint32_t numThreads) {
+  uint32_t shift_id = (4 * (id % NUM_CORES_PER_TILE)) % M;
+  for (uint32_t j = 4 * id; j < P; j += 4 * numThreads) {
+    for (uint32_t i = shift_id; i < M; i += 4) {
+      CMATMUL_CDOTP_4x4_LOOP
     }
-  }
-  A = A_folded + (N * M) * ((core_id * BANKING_FACTOR) / (N * M));
-  mempool_log_partial_barrier(2, core_id, numThreads);
-  // Compute
-#ifndef __SHIFT_A
-  for (k = core_id * 4; k < P; k += 4 * numThreads) {
-    for (i = 0; i < M; i += 2) {
-#ifdef __CDOTP
-      CMATMUL_CDOTP_2x4_LOOP;
-#else
-      CMATMUL_2x4_LOOP;
-#endif
+    for (uint32_t i = 0; i < shift_id; i += 4) {
+      CMATMUL_CDOTP_4x4_LOOP
     }
   }
-#else
-  uint32_t shift_id = 2 * (core_id % NUM_CORES_PER_TILE);
-  for (k = core_id * 4; k < P; k += 4 * numThreads) {
-    for (i = shift_id; i < M; i += 2) {
-      // CMATMUL_2x4_LOOP;
-      CMATMUL_CDOTP_2x4_LOOP;
-    }
-    for (i = 0; i < shift_id; i += 2) {
-#ifdef __CDOTP
-      CMATMUL_CDOTP_2x4_LOOP;
-#else
-      CMATMUL_2x4_LOOP;
-#endif
+  mempool_log_partial_barrier(2, id, numThreads);
+  return;
+}
+
+void cmatmul_4x4_folded_f16p(int32_t *A_l2, int32_t *A_folded, int32_t *const B,
+                             int32_t *C, uint32_t M, uint32_t N, uint32_t P,
+                             uint32_t core_id, uint32_t numThreads) {
+
+  // Copy multiple A matrices in memory
+  if (core_id == 0) {
+    for (uint32_t idx_copy = 0; idx_copy < (BANKING_FACTOR * NUM_CORES);
+         idx_copy += (M * N)) {
+      dma_memcpy_blocking(&A_folded[idx_copy], A_l2, M * N * sizeof(int32_t));
     }
   }
-#endif
+  // Cores only fetch from local A
+  int32_t *A_shifted = A_folded;
+  A_shifted += (N * M) * ((core_id * BANKING_FACTOR) / (N * M));
   mempool_log_partial_barrier(2, core_id, numThreads);
+  cmatmul_4x4_f16p(A_shifted, B, C, M, N, P, core_id, numThreads);
   return;
 }
diff --git a/software/runtime/kernel/mempool_cmatmul_q16.h b/software/kernels/baremetal/mempool_cmatmul_q16.h
similarity index 99%
rename from software/runtime/kernel/mempool_cmatmul_q16.h
rename to software/kernels/baremetal/mempool_cmatmul_q16.h
index fc020619d..53b84d80c 100644
--- a/software/runtime/kernel/mempool_cmatmul_q16.h
+++ b/software/kernels/baremetal/mempool_cmatmul_q16.h
@@ -12,7 +12,7 @@
  */
 
 #pragma once
-#include "xpulp/builtins_v2.h"
+#include "builtins_v2.h"
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 #define CMATMUL_1x1_LOOP                                                       \
diff --git a/software/kernels/baremetal/mempool_linearsolver_f32s.h b/software/kernels/baremetal/mempool_linearsolver_f32s.h
index 18e4ec94e..c3f3b6ce1 100644
--- a/software/kernels/baremetal/mempool_linearsolver_f32s.h
+++ b/software/kernels/baremetal/mempool_linearsolver_f32s.h
@@ -222,4 +222,8 @@ void mempool_Lttrisol_folded_f32s(float *pL, float *in, float *x,
   return;
 }
 
+#else
+
+#error "ERROR: f32 MMSE functions available only for __XDIVSQRT."
+
 #endif
diff --git a/software/runtime/kernel/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h
similarity index 100%
rename from software/runtime/kernel/mempool_linearsolver_q16s.h
rename to software/kernels/baremetal/mempool_linearsolver_q16s.h
diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
index e639918ce..e47ff133a 100644
--- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
+++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h
@@ -6,6 +6,7 @@
 // Author: Aofeng Aoshen, ETH Zurich
 
 #pragma once
+#include "builtins_v2.h"
 #define N_BANKS (NUM_CORES * BANKING_FACTOR)
 
 /**
diff --git a/software/runtime/kernel/mempool_mimo_mmse_q16s.h b/software/kernels/baremetal/mempool_mimo_mmse_q16s.h
similarity index 100%
rename from software/runtime/kernel/mempool_mimo_mmse_q16s.h
rename to software/kernels/baremetal/mempool_mimo_mmse_q16s.h
diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
similarity index 99%
rename from software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h
rename to software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
index 7c305b222..edf7ea735 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
@@ -5,7 +5,7 @@
 // Author: Marco Bertuletti, ETH Zurich
 
 #pragma once
-#include "xpulp/builtins_v2.h"
+#include "builtins_v2.h"
 
 /**
   @brief         First butterfly stage.
diff --git a/software/runtime/kernel/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
similarity index 72%
rename from software/runtime/kernel/mempool_radix4_cfft_f16p.h
rename to software/kernels/baremetal/mempool_radix4_cfft_f16p.h
index 5699480fb..c82684995 100644
--- a/software/runtime/kernel/mempool_radix4_cfft_f16p.h
+++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h
@@ -6,7 +6,7 @@
 
 #pragma once
 #define BITREVERSETABLE
-#include "xpulp/builtins_v2.h"
+#include "builtins_v2.h"
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
 // CoSi: (Si, Co) -> C: (Co, -Si)
@@ -215,6 +215,10 @@ void mempool_radix4_cfft_f16p_scheduler(
   __fp16 t0, t1, t2, t3, t4, t5;
   v2h CoSi1, CoSi2, CoSi3;
   v2h C1, C2, C3;
+  __fp16 *pIn;
+  __fp16 *pOut;
+  __fp16 *pTmp;
+
 #ifdef FOLDED_TWIDDLES
   uint32_t n1, n2, n2_store;
   uint32_t i0, k, ic, ic_store;
@@ -223,7 +227,6 @@ void mempool_radix4_cfft_f16p_scheduler(
   uint32_t i0, k, ic;
   uint32_t twidCoefModifier = 1U;
 #endif
-  __fp16 *pTmp;
 
   /* FIRST STAGE */
   n1 = fftLen;
@@ -237,9 +240,8 @@ void mempool_radix4_cfft_f16p_scheduler(
     LOAD_STORE_TWIDDLEFACT;
     SHUFFLE_TWIDDLEFACT;
     for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-      __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
-      __fp16 *pOut =
-          pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
+      pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
       radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
                              C3);
     }
@@ -268,10 +270,8 @@ void mempool_radix4_cfft_f16p_scheduler(
       SHUFFLE_TWIDDLEFACT;
 
       for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        __fp16 *pIn =
-            pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
-        __fp16 *pOut =
-            pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
         radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
                                 C3);
       }
@@ -288,10 +288,15 @@ void mempool_radix4_cfft_f16p_scheduler(
   /*  LAST STAGE */
   for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
     for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-      __fp16 *pIn =
-          pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
-      __fp16 *pOut =
-          pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+
+#if defined(BITREVERSETABLE)
+      uint32_t col_shift = fftLen;
+#else
+      uint32_t col_shift = fftLen / 4;
+#endif
+
+      pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift;
       radix4_butterfly_last(pIn, pOut, i0);
     }
   }
@@ -300,22 +305,19 @@ void mempool_radix4_cfft_f16p_scheduler(
   pDst16 = pTmp;
   mempool_log_partial_barrier(2, absolute_core_id, n_FFTs_COL * nPE);
   mempool_stop_benchmark();
-  mempool_start_benchmark();
-  /* BITREVERSAL */
-  // Bitreversal stage stores in the sequential addresses
+
   if (bitReverseFlag) {
 #ifdef BITREVERSETABLE
-    pSrc16 = pSrc16 + 2 * col_id * (fftLen / 4);
-    pDst16 = pDst16 + 2 * col_id * fftLen;
+    /* BITREVERSAL */
+    mempool_start_benchmark();
+    pIn = pSrc16 + 2 * col_id * fftLen;
+    uint32_t addr1, addr2, addr3, addr4;
+    uint32_t s2 = 0x00020002;
+    uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+    uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
+    int32_t a1, a2, a3, a4;
+    int32_t b1, b2, b3, b4;
     for (ic = 8 * core_id; ic < bitReverseLen; ic += 8 * nPE) {
-      uint32_t addr1, addr2, addr3, addr4;
-      uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
-      uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
-      uint32_t a1, a2, a3, a4;
-      uint32_t b1, b2, b3, b4;
-      uint32_t a1_load, a2_load, a3_load, a4_load;
-      uint32_t b1_load, b2_load, b3_load, b4_load;
-      uint32_t s2 = 0x00020002;
       addr1 = *(uint32_t *)&pBitRevTable[ic];
       addr2 = *(uint32_t *)&pBitRevTable[ic + 2];
       addr3 = *(uint32_t *)&pBitRevTable[ic + 4];
@@ -324,67 +326,59 @@ void mempool_radix4_cfft_f16p_scheduler(
                    "pv.sra.h  %[addr2],%[addr2],%[s2];"
                    "pv.sra.h  %[addr3],%[addr3],%[s2];"
                    "pv.sra.h  %[addr4],%[addr4],%[s2];"
-                   "pv.extract.h  %[a1],%[addr1],0;"
-                   "pv.extract.h  %[a2],%[addr2],0;"
-                   "pv.extract.h  %[a3],%[addr3],0;"
-                   "pv.extract.h  %[a4],%[addr4],0;"
-                   "pv.extract.h  %[b1],%[addr1],1;"
-                   "pv.extract.h  %[b2],%[addr2],1;"
-                   "pv.extract.h  %[b3],%[addr3],1;"
-                   "pv.extract.h  %[b4],%[addr4],1;"
+                   "pv.extract.h  %[a1],%[addr1],1;"
+                   "pv.extract.h  %[a2],%[addr2],1;"
+                   "pv.extract.h  %[a3],%[addr3],1;"
+                   "pv.extract.h  %[a4],%[addr4],1;"
+                   "pv.extract.h  %[b1],%[addr1],0;"
+                   "pv.extract.h  %[b2],%[addr2],0;"
+                   "pv.extract.h  %[b3],%[addr3],0;"
+                   "pv.extract.h  %[b4],%[addr4],0;"
                    : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3), [a4] "=r"(a4),
                      [b1] "=r"(b1), [b2] "=r"(b2), [b3] "=r"(b3), [b4] "=r"(b4),
                      [addr1] "+&r"(addr1), [addr2] "+&r"(addr2),
                      [addr3] "+&r"(addr3), [addr4] "+&r"(addr4)
                    : [s2] "r"(s2)
                    :);
-      // Compute the local addresses from the natural order ones
-      a1_load = (a1 % 4) * 2 * N_BANKS + 2 * (a1 / 4);
-      a2_load = (a2 % 4) * 2 * N_BANKS + 2 * (a2 / 4);
-      a3_load = (a3 % 4) * 2 * N_BANKS + 2 * (a3 / 4);
-      a4_load = (a4 % 4) * 2 * N_BANKS + 2 * (a4 / 4);
-      b1_load = (b1 % 4) * 2 * N_BANKS + 2 * (b1 / 4);
-      b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4);
-      b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4);
-      b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4);
-      for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8));
-        uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8));
+      for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
+        uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8));
         // Load at address a
-        tmpa1 = *(uint32_t *)&ptr1[a1_load];
-        tmpa2 = *(uint32_t *)&ptr1[a2_load];
-        tmpa3 = *(uint32_t *)&ptr1[a3_load];
-        tmpa4 = *(uint32_t *)&ptr1[a4_load];
+        tmpa1 = *(uint32_t *)&ptr[a1];
+        tmpa2 = *(uint32_t *)&ptr[a2];
+        tmpa3 = *(uint32_t *)&ptr[a3];
+        tmpa4 = *(uint32_t *)&ptr[a4];
         // Load at address b
-        tmpb1 = *(uint32_t *)&ptr1[b1_load];
-        tmpb2 = *(uint32_t *)&ptr1[b2_load];
-        tmpb3 = *(uint32_t *)&ptr1[b3_load];
-        tmpb4 = *(uint32_t *)&ptr1[b4_load];
+        tmpb1 = *(uint32_t *)&ptr[b1];
+        tmpb2 = *(uint32_t *)&ptr[b2];
+        tmpb3 = *(uint32_t *)&ptr[b3];
+        tmpb4 = *(uint32_t *)&ptr[b4];
         // Swap a with b
-        *((uint32_t *)&ptr2[b1]) = tmpa1;
-        *((uint32_t *)&ptr2[b2]) = tmpa2;
-        *((uint32_t *)&ptr2[b3]) = tmpa3;
-        *((uint32_t *)&ptr2[b4]) = tmpa4;
+        *((uint32_t *)&ptr[b1]) = tmpa1;
+        *((uint32_t *)&ptr[b2]) = tmpa2;
+        *((uint32_t *)&ptr[b3]) = tmpa3;
+        *((uint32_t *)&ptr[b4]) = tmpa4;
         // Swap b with a
-        *((uint32_t *)&ptr2[a1]) = tmpb1;
-        *((uint32_t *)&ptr2[a2]) = tmpb2;
-        *((uint32_t *)&ptr2[a3]) = tmpb3;
-        *((uint32_t *)&ptr2[a4]) = tmpb4;
+        *((uint32_t *)&ptr[a1]) = tmpb1;
+        *((uint32_t *)&ptr[a2]) = tmpb2;
+        *((uint32_t *)&ptr[a3]) = tmpb3;
+        *((uint32_t *)&ptr[a4]) = tmpb4;
       }
     }
 #else
-    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4));
-    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen);
-    for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U);
-         ic += 4) {
-      uint32_t idx0 = ic;
-      uint32_t idx1 = ic + 1;
-      uint32_t idx2 = ic + 2;
-      uint32_t idx3 = ic + 3;
-      uint32_t idx_result0 = 0;
-      uint32_t idx_result1 = 0;
-      uint32_t idx_result2 = 0;
-      uint32_t idx_result3 = 0;
+    mempool_start_benchmark();
+    __fp16 *ptr1; // assigned from pSrc16 inside the row loop below
+    __fp16 *ptr2; // assigned from pDst16 inside the row loop below
+    uint32_t idx0, idx1, idx2, idx3;
+    uint32_t idx_result0, idx_result1, idx_result2, idx_result3;
+    for (ic = core_id * 4; ic < fftLen; ic += nPE * 4) {
+      idx_result0 = 0;
+      idx_result1 = 0;
+      idx_result2 = 0;
+      idx_result3 = 0;
+      idx0 = ic;
+      idx1 = ic + 1;
+      idx2 = ic + 2;
+      idx3 = ic + 3;
       for (k = 0; k < LOG2; k++) {
         idx_result0 = (idx_result0 << 1U) | (idx0 & 1U);
         idx_result1 = (idx_result1 << 1U) | (idx1 & 1U);
@@ -395,29 +389,20 @@ void mempool_radix4_cfft_f16p_scheduler(
         idx2 = idx2 >> 1U;
         idx3 = idx3 >> 1U;
       }
+      idx0 = ic / 4;
+      idx1 = ic / 4 + N_BANKS;
+      idx2 = ic / 4 + 2 * N_BANKS;
+      idx3 = ic / 4 + 3 * N_BANKS;
       for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) {
-        uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
-        uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
-        uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;
-        uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS;
-        uint32_t addr_dst0 = idx_result0;
-        uint32_t addr_dst1 = idx_result1;
-        uint32_t addr_dst2 = idx_result2;
-        uint32_t addr_dst3 = idx_result3;
-        addr_src0 += idx_row * (N_BANKS * 8);
-        addr_src1 += idx_row * (N_BANKS * 8);
-        addr_src2 += idx_row * (N_BANKS * 8);
-        addr_src3 += idx_row * (N_BANKS * 8);
-        addr_dst0 += idx_row * (N_BANKS * 8);
-        addr_dst1 += idx_row * (N_BANKS * 8);
-        addr_dst2 += idx_row * (N_BANKS * 8);
-        addr_dst3 += idx_row * (N_BANKS * 8);
-        *((uint32_t *)&ptr2[addr_dst0]) = (uint32_t)ptr1[addr_src0];
-        *((uint32_t *)&ptr2[addr_dst1]) = (uint32_t)ptr1[addr_src1];
-        *((uint32_t *)&ptr2[addr_dst2]) = (uint32_t)ptr1[addr_src2];
-        *((uint32_t *)&ptr2[addr_dst3]) = (uint32_t)ptr1[addr_src3];
+        ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8);
+        ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8);
+        *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]);
+        *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]);
+        *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]);
+        *((uint32_t *)&ptr2[2 * idx_result3]) = *((uint32_t *)&ptr1[2 * idx3]);
       }
     }
+    mempool_stop_benchmark();
 #endif
   }
   mempool_log_partial_barrier(2, absolute_core_id, nPE);
diff --git a/software/runtime/data/data_cfft_radix4_f16.h.tpl b/software/runtime/data/data_cfft_radix4_f16.h.tpl
deleted file mode 100644
index 8b1378917..000000000
--- a/software/runtime/data/data_cfft_radix4_f16.h.tpl
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/software/runtime/data/data_cfft_radix4_q16.h.tpl b/software/runtime/data/data_cfft_radix4_q16.h.tpl
deleted file mode 100644
index 8b1378917..000000000
--- a/software/runtime/data/data_cfft_radix4_q16.h.tpl
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/software/runtime/data/data_ofdm.py b/software/runtime/data/data_ofdm.py
deleted file mode 100644
index 8b1378917..000000000
--- a/software/runtime/data/data_ofdm.py
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/software/runtime/kernel/mempool_checks.h b/software/runtime/kernel/mempool_checks.h
deleted file mode 100644
index e69de29bb..000000000
diff --git a/software/runtime/kernel/mempool_chest_f16.h b/software/runtime/kernel/mempool_chest_f16.h
deleted file mode 100644
index ba99a9e3f..000000000
--- a/software/runtime/kernel/mempool_chest_f16.h
+++ /dev/null
@@ -1,372 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#pragma once
-#define __CDOTP
-#define __MUL
-
-/* a[i] = ar[i] + i * ai[j]
-
-   out[i][j] = a[i] / c[j]
-   out[i][j + 1] = a[i] / c[j + 1h
-   out[i][j + 2] = a[i] / c[j + 2]
-   out[i][j + 3] = a[i] / c[j + 3]*/
-
-#ifdef __XDIVSQRT
-#define DIV_LOOP(ab, ab_n, i)                                                  \
-  re0 = 0;                                                                     \
-  re1 = 0;                                                                     \
-  re2 = 0;                                                                     \
-  re3 = 0;                                                                     \
-  im0 = 0;                                                                     \
-  im1 = 0;                                                                     \
-  im2 = 0;                                                                     \
-  im3 = 0;                                                                     \
-  D0 = 0;                                                                      \
-  D1 = 0;                                                                      \
-  D2 = 0;                                                                      \
-  D3 = 0;                                                                      \
-  cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                    \
-  cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                              \
-  cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                              \
-  cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                              \
-  asm volatile("vfdotpex.s.h   %[D0],  %[cd0], %[cd0];"                        \
-               "vfdotpex.s.h   %[D1],  %[cd1], %[cd1];"                        \
-               "vfdotpex.s.h   %[D2],  %[cd2], %[cd2];"                        \
-               "vfdotpex.s.h   %[D3],  %[cd3], %[cd3];"                        \
-               "vfdotpex.s.h   %[re0], %[x],   %[cd0];"                        \
-               "vfdotpex.s.h   %[re1], %[x],   %[cd1];"                        \
-               "vfdotpex.s.h   %[re2], %[x],   %[cd2];"                        \
-               "vfdotpex.s.h   %[re3], %[x],   %[cd3];"                        \
-               "vfdotpex.s.h   %[im0], %[y],   %[cd0];"                        \
-               "vfdotpex.s.h   %[im1], %[y],   %[cd1];"                        \
-               "vfdotpex.s.h   %[im2], %[y],   %[cd2];"                        \
-               "vfdotpex.s.h   %[im3], %[y],   %[cd3];"                        \
-               "fdiv.s         %[re0], %[re0], %[D0];"                         \
-               "fdiv.s         %[re1], %[re1], %[D1];"                         \
-               "fdiv.s         %[re2], %[re2], %[D2];"                         \
-               "fdiv.s         %[re3], %[re3], %[D3];"                         \
-               "fdiv.s         %[im0], %[im0], %[D0];"                         \
-               "fdiv.s         %[im1], %[im1], %[D1];"                         \
-               "fdiv.s         %[im2], %[im2], %[D2];"                         \
-               "fdiv.s         %[im3], %[im3], %[D3];"                         \
-               "vfcpka.h.s     %[re0], %[re0], %[im0];"                        \
-               "vfcpka.h.s     %[re1], %[re1], %[im1];"                        \
-               "vfcpka.h.s     %[re2], %[re2], %[im2];"                        \
-               "vfcpka.h.s     %[re3], %[re3], %[im3];"                        \
-               : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2),               \
-                 [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1),           \
-                 [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0),         \
-                 [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)          \
-               : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),               \
-                 [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                    \
-               :);                                                             \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0;                             \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1;                         \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2;                         \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3;
-#else
-#define DIV_LOOP(ab, ab_n, i)                                                  \
-  re0 = 0;                                                                     \
-  re1 = 0;                                                                     \
-  re2 = 0;                                                                     \
-  re3 = 0;                                                                     \
-  im0 = 0;                                                                     \
-  im1 = 0;                                                                     \
-  im2 = 0;                                                                     \
-  im3 = 0;                                                                     \
-  D0 = 0;                                                                      \
-  D1 = 0;                                                                      \
-  D2 = 0;                                                                      \
-  D3 = 0;                                                                      \
-  cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                    \
-  cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                              \
-  cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                              \
-  cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                              \
-  asm volatile("vfdotpex.s.h   %[D0],  %[cd0], %[cd0];"                        \
-               "vfdotpex.s.h   %[D1],  %[cd1], %[cd1];"                        \
-               "vfdotpex.s.h   %[D2],  %[cd2], %[cd2];"                        \
-               "vfdotpex.s.h   %[D3],  %[cd3], %[cd3];"                        \
-               "vfdotpex.s.h   %[re0], %[x],   %[cd0];"                        \
-               "vfdotpex.s.h   %[re1], %[x],   %[cd1];"                        \
-               "vfdotpex.s.h   %[re2], %[x],   %[cd2];"                        \
-               "vfdotpex.s.h   %[re3], %[x],   %[cd3];"                        \
-               "vfdotpex.s.h   %[im0], %[y],   %[cd0];"                        \
-               "vfdotpex.s.h   %[im1], %[y],   %[cd1];"                        \
-               "vfdotpex.s.h   %[im2], %[y],   %[cd2];"                        \
-               "vfdotpex.s.h   %[im3], %[y],   %[cd3];"                        \
-               : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2),               \
-                 [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1),           \
-                 [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0),         \
-                 [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3)          \
-               : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),               \
-                 [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                    \
-               :);                                                             \
-  re0 = re0 / D0;                                                              \
-  re1 = re1 / D1;                                                              \
-  re2 = re2 / D2;                                                              \
-  re3 = re3 / D3;                                                              \
-  im0 = im0 / D0;                                                              \
-  im1 = im1 / D1;                                                              \
-  im2 = im2 / D2;                                                              \
-  im3 = im3 / D3;                                                              \
-  asm volatile("vfcpka.h.s %[re0], %[re0], %[im0];"                            \
-               "vfcpka.h.s %[re1], %[re1], %[im1];"                            \
-               "vfcpka.h.s %[re2], %[re2], %[im2];"                            \
-               "vfcpka.h.s %[re3], %[re3], %[im3];"                            \
-               : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2),         \
-                 [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1),         \
-                 [im2] "+&r"(im2), [im3] "+&r"(im3)                            \
-               : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),               \
-                 [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                    \
-               :);                                                             \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0;                             \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1;                         \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2;                         \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3;
-#endif
-
-/* a[i] = ar[i] + i * ai[j]
-
-   out[i][j] = a[i] * c[j]
-   out[i][j + 1] = a[i] * c[j + 1]
-   out[i][j + 2] = a[i] * c[j + 2]
-   out[i][j + 3] = a[i] * c[j + 3]*/
-
-#define MUL_LOOP(ab, ab_n, i)                                                  \
-  re0 = 0;                                                                     \
-  re1 = 0;                                                                     \
-  re2 = 0;                                                                     \
-  re3 = 0;                                                                     \
-  im0 = 0;                                                                     \
-  im1 = 0;                                                                     \
-  im2 = 0;                                                                     \
-  im3 = 0;                                                                     \
-  cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                    \
-  cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                              \
-  cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                              \
-  cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                              \
-  asm volatile("vfdotpex.s.h   %[re0], %[x], %[cd0];"                          \
-               "vfdotpex.s.h   %[re1], %[x], %[cd1];"                          \
-               "vfdotpex.s.h   %[re2], %[x], %[cd2];"                          \
-               "vfdotpex.s.h   %[re3], %[x], %[cd3];"                          \
-               "vfdotpex.s.h   %[im0], %[y], %[cd0];"                          \
-               "vfdotpex.s.h   %[im1], %[y], %[cd1];"                          \
-               "vfdotpex.s.h   %[im2], %[y], %[cd2];"                          \
-               "vfdotpex.s.h   %[im3], %[y], %[cd3];"                          \
-               : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2),         \
-                 [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1),         \
-                 [im2] "+&r"(im2), [im3] "+&r"(im3)                            \
-               : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),               \
-                 [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n)                    \
-               :);                                                             \
-  asm volatile(                                                                \
-      "vfcpka.h.s       %[re0], %[re0], %[im0];"                               \
-      "vfcpka.h.s       %[re1], %[re1], %[im1];"                               \
-      "vfcpka.h.s       %[re2], %[re2], %[im2];"                               \
-      "vfcpka.h.s       %[re3], %[re3], %[im3];"                               \
-      : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2),                  \
-        [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1),                  \
-        [im2] "+&r"(im2), [im3] "+&r"(im3)                                     \
-      : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3)         \
-      :);                                                                      \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0;                             \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1;                         \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2;                         \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3;
-
-#define CMUL_LOOP(ab, i)                                                       \
-  sum0 = 0;                                                                    \
-  sum1 = 0;                                                                    \
-  sum2 = 0;                                                                    \
-  sum3 = 0;                                                                    \
-  cd0 = *(uint32_t *)&pPilotTX_itr[2U * j];                                    \
-  cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)];                              \
-  cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)];                              \
-  cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)];                              \
-  asm volatile("fcdotpex.s.h   %[sum0], %[x], %[cd0];"                         \
-               "fcdotpex.s.h   %[sum1], %[x], %[cd1];"                         \
-               "fcdotpex.s.h   %[sum2], %[x], %[cd2];"                         \
-               "fcdotpex.s.h   %[sum3], %[x], %[cd3];"                         \
-               : [sum0] "+&r"(sum0), [sum1] "+&r"(sum1), [sum2] "+&r"(sum2),   \
-                 [sum3] "+&r"(sum3)                                            \
-               : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2),               \
-                 [cd3] "r"(cd3), [x] "r"(ab)                                   \
-               :);                                                             \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = sum0;                            \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = sum1;                        \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = sum2;                        \
-  *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = sum3;
-
-#define SHUFFLE_A                                                              \
-  asm volatile(                                                                \
-      "xor           %[ab_n0], %[ab0],   %[neg_mask];"                         \
-      "xor           %[ab_n1], %[ab1],   %[neg_mask];"                         \
-      "xor           %[ab_n2], %[ab2],   %[neg_mask];"                         \
-      "xor           %[ab_n3], %[ab3],   %[neg_mask];"                         \
-      "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];"                             \
-      "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];"                             \
-      "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];"                             \
-      "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];"                             \
-      : [ab_n0] "+&r"(ab_n0), [ab_n1] "+&r"(ab_n1), [ab_n2] "+&r"(ab_n2),      \
-        [ab_n3] "+&r"(ab_n3)                                                   \
-      : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3),        \
-        [neg_mask] "r"(0x00008000), [mask] "r"(0x00020003)                     \
-      :);                                                                      \
-/**                                                                            \
-  @brief         Block-type channel estimation.                                \
-  @param[in]     pH  points to output channel                                  \
-  @param[in]     pPilotRX points to received symbol                            \
-  @param[in]     pPilotTX points to sent pilot                                 \
-  @param[in]     nTX Number of transmitters                                    \
-  @param[in]     nRX Number of receivers                                       \
-  @param[in]     nSc Number of Subcarriers                                     \
-  @return        none                                                          \
-*/
-void mempool_chest_f16s_unrolled4(__fp16 *pH, __fp16 *pPilotRX,
-                                  __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX,
-                                  uint32_t nSc) {
-
-  uint32_t ab0, ab1, ab2, ab3;
-  uint32_t cd0, cd1, cd2, cd3;
-  uint32_t re0, re1, re2, re3;
-  uint32_t im0, im1, im2, im3;
-  uint32_t D0, D1, D2, D3;
-  uint32_t ab_n0, ab_n1, ab_n2, ab_n3;
-  __fp16 *pPilotTX_itr;
-  __fp16 *pPilotRX_itr;
-  __fp16 *pH_itr;
-
-  for (uint32_t k = 0; k < nSc; k++) {
-    pPilotTX_itr = pPilotTX + k * (2 * nTX);
-    pPilotRX_itr = pPilotRX + k * (2 * nRX);
-    pH_itr = pH + k * 2 * (nTX * nRX);
-    for (uint32_t i = 0; i < nRX; i++) {
-      ab0 = *(uint32_t *)&pPilotRX_itr[2U * i];
-      ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)];
-      ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)];
-      ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)];
-      SHUFFLE_A;
-      for (uint32_t j = 0; j < nTX; j += 4) {
-        DIV_LOOP(ab0, ab_n0, i);
-        DIV_LOOP(ab1, ab_n1, i + 1);
-        DIV_LOOP(ab2, ab_n2, i + 2);
-        DIV_LOOP(ab3, ab_n3, i + 3);
-      }
-    }
-  }
-  return;
-}
-
-/**
-  @brief         Block-type channel estimation.
-  @param[in]     pH  points to output channel
-  @param[in]     pPilotRX points to received symbol
-  @param[in]     pPilotTX points to sent pilot
-  @param[in]     nTX Number of transmitters
-  @param[in]     nRX Number of receivers
-  @param[in]     nSc Number of Subcarriers
-  @param[in]     core_id ID of the PE
-  @param[in]     nPE Number of PEs
-  @return        none
-*/
-void mempool_chest_f16p_unrolled4(__fp16 *pH, __fp16 *pPilotRX,
-                                  __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX,
-                                  uint32_t nSc, uint32_t core_id,
-                                  uint32_t nPE) {
-  uint32_t ab0, ab1, ab2, ab3;
-  uint32_t cd0, cd1, cd2, cd3;
-#ifndef __CDOTP
-  uint32_t ab_n0, ab_n1, ab_n2, ab_n3;
-  uint32_t re0, re1, re2, re3;
-  uint32_t im0, im1, im2, im3;
-#else
-  uint32_t sum0, sum1, sum2, sum3;
-#endif
-
-#ifndef __MUL
-  uint32_t D0, D1, D2, D3;
-#endif
-
-  __fp16 *pPilotTX_itr;
-  __fp16 *pPilotRX_itr;
-  __fp16 *pH_itr;
-
-  for (uint32_t k = core_id; k < nSc; k += nPE) {
-    pPilotTX_itr = pPilotTX + k * (2 * nTX);
-    pPilotRX_itr = pPilotRX + k * (2 * nRX);
-    pH_itr = pH + k * 2 * (nTX * nRX);
-    for (uint32_t i = 0; i < nRX; i += 4) {
-      ab0 = *(uint32_t *)&pPilotRX_itr[2U * i];
-      ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)];
-      ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)];
-      ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)];
-#ifndef __CDOTP
-      SHUFFLE_A;
-#endif
-
-      for (uint32_t j = 0; j < nTX; j += 4) {
-#if (defined(__CDOTP) && defined(__MUL))
-        CMUL_LOOP(ab0, i);
-        CMUL_LOOP(ab1, i + 1);
-        CMUL_LOOP(ab2, i + 2);
-        CMUL_LOOP(ab3, i + 3);
-#elif (!defined(__CDOTP) && defined(__MUL))
-        MUL_LOOP(ab0, ab_n0, i);
-        MUL_LOOP(ab1, ab_n1, i + 1);
-        MUL_LOOP(ab2, ab_n2, i + 2);
-        MUL_LOOP(ab3, ab_n3, i + 3);
-#else
-        DIV_LOOP(ab0, ab_n0, i)
-        DIV_LOOP(ab1, ab_n1, i + 1)
-        DIV_LOOP(ab2, ab_n2, i + 2)
-        DIV_LOOP(ab3, ab_n3, i + 3)
-#endif
-      }
-    }
-  }
-  mempool_barrier(nPE);
-  return;
-}
-
-void mempool_chest_f16p_unrolled4_local(__fp16 *volatile pH,
-                                        __fp16 *volatile pPilotRX,
-                                        __fp16 *volatile pPilotTX, uint32_t nRX,
-                                        uint32_t nTX, uint32_t nSc,
-                                        uint32_t core_id, uint32_t nPE) {
-  uint32_t ab0, ab1, ab2, ab3;
-  uint32_t cd0, cd1, cd2, cd3;
-  uint32_t sum0, sum1, sum2, sum3;
-  __fp16 *pPilotTX_itr;
-  __fp16 *pPilotRX_itr;
-  __fp16 *pH_itr;
-  uint32_t itr, i, j;
-
-  // Cores Loop over the received pilots vector
-  for (itr = core_id * 4; itr < (nSc * nRX);
-       itr += (BANKING_FACTOR * NUM_CORES)) {
-    // Received pilots are aligned to cores
-    uint32_t sc_RX = itr / nRX;
-    pPilotTX_itr = pPilotTX + sc_RX * (2 * nTX);
-    pPilotRX_itr = pPilotRX + sc_RX * (2 * nRX);
-    pH_itr = pH + sc_RX * 2 * (nTX * nRX);
-
-    // Load received pilots
-    i = itr % nRX;
-    ab0 = *(uint32_t *)&pPilotRX_itr[2U * i];
-    ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)];
-    ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)];
-    ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)];
-    for (j = 0; j < nTX; j += 4) {
-      CMUL_LOOP(ab0, i);
-      CMUL_LOOP(ab1, i + 1);
-      CMUL_LOOP(ab2, i + 2);
-      CMUL_LOOP(ab3, i + 3);
-    }
-  }
-  mempool_barrier(nPE);
-  return;
-}
diff --git a/software/runtime/kernel/mempool_chest_q16.h b/software/runtime/kernel/mempool_chest_q16.h
deleted file mode 100644
index c66aa0537..000000000
--- a/software/runtime/kernel/mempool_chest_q16.h
+++ /dev/null
@@ -1,245 +0,0 @@
-// Copyright 2022 ETH Zurich and University of Bologna.
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-
-// Author: Marco Bertuletti, ETH Zurich
-
-#pragma once
-#include "xpulp/builtins_v2.h"
-#define __MUL
-
-/* a[i] = ar[i] + i * ai[j]
-   out[i][j] = a[i] / c[j]
-   out[i][j + 1] = a[i] / c[j + 1]
-   out[i][j + 2] = a[i] / c[j + 2]
-   out[i][j + 3] = a[i] / c[j + 3]*/
-
-#define DIV_LOOP(ab, ab_n, i)                                                  \
-  cd0 = *(v2s *)&pPilotTX_itr[2U * j];                                         \
-  cd1 = *(v2s *)&pPilotTX_itr[2U * (j + 1)];                                   \
-  cd2 = *(v2s *)&pPilotTX_itr[2U * (j + 2)];                                   \
-  cd3 = *(v2s *)&pPilotTX_itr[2U * (j + 3)];                                   \
-  D0 = (1 << 16U) / __DOTP2(cd0, cd0);                                         \
-  D1 = (1 << 16U) / __DOTP2(cd1, cd1);                                         \
-  D2 = (1 << 16U) / __DOTP2(cd2, cd2);                                         \
-  D3 = (1 << 16U) / __DOTP2(cd3, cd3);                                         \
-  re0 = __DOTP2(ab, cd0);                                                      \
-  re1 = __DOTP2(ab, cd1);                                                      \
-  re2 = __DOTP2(ab, cd2);                                                      \
-  re3 = __DOTP2(ab, cd3);                                                      \
-  im0 = __DOTP2(ab_n, cd0);                                                    \
-  im1 = __DOTP2(ab_n, cd1);                                                    \
-  im2 = __DOTP2(ab_n, cd2);                                                    \
-  im3 = __DOTP2(ab_n, cd3);                                                    \
-  re0 = __CLIP((re0 * D0) >> 8, 16);                                           \
-  re1 = __CLIP((re1 * D1) >> 8, 16);                                           \
-  re2 = __CLIP((re2 * D2) >> 8, 16);                                           \
-  re3 = __CLIP((re3 * D3) >> 8, 16);                                           \
-  im0 = __CLIP((im0 * D0) >> 8, 16);                                           \
-  im1 = __CLIP((im1 * D1) >> 8, 16);                                           \
-  im2 = __CLIP((im2 * D2) >> 8, 16);                                           \
-  im3 = __CLIP((im3 * D3) >> 8, 16);                                           \
-  re0 = (int32_t)(__PACK2(re0, im0));                                          \
-  re1 = (int32_t)(__PACK2(re1, im1));                                          \
-  re2 = (int32_t)(__PACK2(re2, im2));                                          \
-  re3 = (int32_t)(__PACK2(re3, im3));                                          \
-  *((v2s *)&pH_itr[2 * (i * nTX + j)]) = (v2s)re0;                             \
-  *((v2s *)&pH_itr[2 * (i * nTX + j + 1)]) = (v2s)re1;                         \
-  *((v2s *)&pH_itr[2 * (i * nTX + j + 2)]) = (v2s)re2;                         \
-  *((v2s *)&pH_itr[2 * (i * nTX + j + 3)]) = (v2s)re3;
-
-/* a[i] = ar[i] + i * ai[j]
-   out[i][j] = a[i] * c[j]
-   out[i][j + 1] = a[i] * c[j + 1]
-   out[i][j + 2] = a[i] * c[j + 2]
-   out[i][j + 3] = a[i] * c[j + 3]*/
-
-#define MUL_LOOP(ab, ab_n, i)                                                  \
-  cd0 = *(v2s *)&pPilotTX_itr[2U * j];                                         \
-  cd1 = *(v2s *)&pPilotTX_itr[2U * (j + 1)];                                   \
-  cd2 = *(v2s *)&pPilotTX_itr[2U * (j + 2)];                                   \
-  cd3 = *(v2s *)&pPilotTX_itr[2U * (j + 3)];                                   \
-  re0 = __DOTP2(ab, cd0);                                                      \
-  re1 = __DOTP2(ab, cd1);                                                      \
-  re2 = __DOTP2(ab, cd2);                                                      \
-  re3 = __DOTP2(ab, cd3);                                                      \
-  im0 = __DOTP2(ab_n, cd0);                                                    \
-  im1 = __DOTP2(ab_n, cd1);                                                    \
-  im2 = __DOTP2(ab_n, cd2);                                                    \
-  im3 = __DOTP2(ab_n, cd3);                                                    \
-  re0 = __CLIP(re0 >> 8, 16);                                                  \
-  re1 = __CLIP(re1 >> 8, 16);                                                  \
-  re2 = __CLIP(re2 >> 8, 16);                                                  \
-  re3 = __CLIP(re3 >> 8, 16);                                                  \
-  im0 = __CLIP(im0 >> 8, 16);                                                  \
-  im1 = __CLIP(im1 >> 8, 16);                                                  \
-  im2 = __CLIP(im2 >> 8, 16);                                                  \
-  im3 = __CLIP(im3 >> 8, 16);                                                  \
-  re0 = (int32_t)(__PACK2(re0, im0));                                          \
-  re1 = (int32_t)(__PACK2(re1, im1));                                          \
-  re2 = (int32_t)(__PACK2(re2, im2));                                          \
-  re3 = (int32_t)(__PACK2(re3, im3));                                          \
-  *((v2s *)&pH_itr[2 * (i * nTX + j)]) = (v2s)re0;                             \
-  *((v2s *)&pH_itr[2 * (i * nTX + j + 1)]) = (v2s)re1;                         \
-  *((v2s *)&pH_itr[2 * (i * nTX + j + 2)]) = (v2s)re2;                         \
-  *((v2s *)&pH_itr[2 * (i * nTX + j + 3)]) = (v2s)re3;
-
-#define SHUFFLE_A                                                              \
-  asm volatile(                                                                \
-      "pv.sub.h      %[ab_n0], %[zero], %[ab0];"                               \
-      "pv.sub.h      %[ab_n1], %[zero], %[ab1];"                               \
-      "pv.sub.h      %[ab_n2], %[zero], %[ab2];"                               \
-      "pv.sub.h      %[ab_n3], %[zero], %[ab3];"                               \
-      "pv.shuffle2.h %[ab_n0], %[ab_n0],  %[mask];"                            \
-      "pv.shuffle2.h %[ab_n1], %[ab_n1],  %[mask];"                            \
-      "pv.shuffle2.h %[ab_n2], %[ab_n2],  %[mask];"                            \
-      "pv.shuffle2.h %[ab_n3], %[ab_n3],  %[mask];"                            \
-      : [ab_n0] "=&r"(ab_n0), [ab_n1] "=&r"(ab_n1), [ab_n2] "=&r"(ab_n2),      \
-        [ab_n3] "=&r"(ab_n3)                                                   \
-      : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3),        \
-        [zero] "r"(0x00000000), [mask] "r"(0x00020001)                         \
-      :);
-
-/**
-  @brief         Block-type channel estimation.
-  @param[in]     pH  points to output channel
-  @param[in]     pPilotRX points to received symbol
-  @param[in]     pPilotTX points to sent pilot
-  @param[in]     nTX Number of transmitters
-  @param[in]     nRX Number of receivers
-  @param[in]     nSc Number of Subcarriers
-  @return        none
-*/
-void mempool_chest_q16s_unrolled4(int16_t *pH, int16_t *pPilotRX,
-                                  int16_t *pPilotTX, uint32_t nRX, uint32_t nTX,
-                                  uint32_t nSc) {
-
-  v2s ab0, ab1, ab2, ab3;
-  v2s ab_n0, ab_n1, ab_n2, ab_n3;
-  v2s cd0, cd1, cd2, cd3;
-  int32_t re0, re1, re2, re3;
-  int32_t im0, im1, im2, im3;
-  int32_t D0, D1, D2, D3;
-
-  int16_t *pPilotTX_itr;
-  int16_t *pPilotRX_itr;
-  int16_t *pH_itr;
-  for (uint32_t k = 0; k < nSc; k++) {
-    pPilotTX_itr = pPilotTX + k * (2 * nTX);
-    pPilotRX_itr = pPilotRX + k * (2 * nRX);
-    pH_itr = pH + k * 2 * (nTX * nRX);
-    for (uint32_t i = 0; i < nRX; i += 4) {
-      ab0 = *(v2s *)&pPilotRX_itr[2U * i];
-      ab1 = *(v2s *)&pPilotRX_itr[2U * (i + 1)];
-      ab2 = *(v2s *)&pPilotRX_itr[2U * (i + 2)];
-      ab3 = *(v2s *)&pPilotRX_itr[2U * (i + 3)];
-      SHUFFLE_A;
-      for (uint32_t j = 0; j < nTX; j += 4) {
-        DIV_LOOP(ab0, ab_n0, i);
-        DIV_LOOP(ab1, ab_n1, i + 1);
-        DIV_LOOP(ab2, ab_n2, i + 2);
-        DIV_LOOP(ab3, ab_n3, i + 3);
-      }
-    }
-  }
-  return;
-}
-
-/**
-  @brief         Block-type channel estimation.
-  @param[in]     pH  points to output channel
-  @param[in]     pPilotRX points to received symbol
-  @param[in]     pPilotTX points to sent pilot
-  @param[in]     nTX Number of transmitters
-  @param[in]     nRX Number of receivers
-  @param[in]     nSc Number of Subcarriers
-  @return        none
-*/
-void mempool_chest_q16p_unrolled4(int16_t *volatile pH,
-                                  int16_t *volatile pPilotRX,
-                                  int16_t *volatile pPilotTX, uint32_t nRX,
-                                  uint32_t nTX, uint32_t nSc, uint32_t core_id,
-                                  uint32_t nPE) {
-
-  v2s ab0, ab1, ab2, ab3;
-  v2s ab_n0, ab_n1, ab_n2, ab_n3;
-  v2s cd0, cd1, cd2, cd3;
-  int32_t re0, re1, re2, re3;
-  int32_t im0, im1, im2, im3;
-#ifndef __MUL
-  int32_t D0, D1, D2, D3;
-#endif
-
-  int16_t *pPilotTX_itr;
-  int16_t *pPilotRX_itr;
-  int16_t *pH_itr;
-  for (uint32_t k = core_id; k < nSc; k += nPE) {
-    pPilotTX_itr = pPilotTX + k * (2 * nTX);
-    pPilotRX_itr = pPilotRX + k * (2 * nRX);
-    pH_itr = pH + k * 2 * (nTX * nRX);
-    for (uint32_t i = 0; i < nRX; i += 4) {
-      ab0 = *(v2s *)&pPilotRX_itr[2U * i];
-      ab1 = *(v2s *)&pPilotRX_itr[2U * (i + 1)];
-      ab2 = *(v2s *)&pPilotRX_itr[2U * (i + 2)];
-      ab3 = *(v2s *)&pPilotRX_itr[2U * (i + 3)];
-      SHUFFLE_A;
-      for (uint32_t j = 0; j < nTX; j += 4) {
-#ifdef __MUL
-        MUL_LOOP(ab0, ab_n0, i);
-        MUL_LOOP(ab1, ab_n1, i + 1);
-        MUL_LOOP(ab2, ab_n2, i + 2);
-        MUL_LOOP(ab3, ab_n3, i + 3);
-#else
-        DIV_LOOP(ab0, ab_n0, i);
-        DIV_LOOP(ab1, ab_n1, i + 1);
-        DIV_LOOP(ab2, ab_n2, i + 2);
-        DIV_LOOP(ab3, ab_n3, i + 3);
-#endif
-      }
-    }
-  }
-  mempool_barrier(nPE);
-  return;
-}
-
-void mempool_chest_q16p_unrolled4_local(int16_t *volatile pH,
-                                        int16_t *volatile pPilotRX,
-                                        int16_t *volatile pPilotTX,
-                                        uint32_t nRX, uint32_t nTX,
-                                        uint32_t nSc, uint32_t core_id,
-                                        uint32_t nPE) {
-  v2s ab0, ab1, ab2, ab3;
-  v2s ab_n0, ab_n1, ab_n2, ab_n3;
-  v2s cd0, cd1, cd2, cd3;
-  int32_t re0, re1, re2, re3;
-  int32_t im0, im1, im2, im3;
-  int16_t *pPilotTX_itr;
-  int16_t *pPilotRX_itr;
-  int16_t *pH_itr;
-  uint32_t itr, i, j;
-
-  // Cores Loop over the received pilots vector
-  for (itr = core_id * 4; itr < (nSc * nRX);
-       itr += (BANKING_FACTOR * NUM_CORES)) {
-    // Received pilots are aligned to cores
-    uint32_t sc_RX = itr / nRX;
-    pPilotTX_itr = pPilotTX + sc_RX * (2 * nTX);
-    pPilotRX_itr = pPilotRX + sc_RX * (2 * nRX);
-    pH_itr = pH + sc_RX * 2 * (nTX * nRX);
-    // Load received pilots
-    i = itr % nRX;
-    ab0 = *(v2s *)&pPilotRX_itr[2U * i];
-    ab1 = *(v2s *)&pPilotRX_itr[2U * (i + 1)];
-    ab2 = *(v2s *)&pPilotRX_itr[2U * (i + 2)];
-    ab3 = *(v2s *)&pPilotRX_itr[2U * (i + 3)];
-    SHUFFLE_A;
-    for (j = 0; j < nTX; j += 4) {
-      MUL_LOOP(ab0, ab_n0, i);
-      MUL_LOOP(ab1, ab_n1, i + 1);
-      MUL_LOOP(ab2, ab_n2, i + 2);
-      MUL_LOOP(ab3, ab_n3, i + 3);
-    }
-  }
-  mempool_barrier(nPE);
-  return;
-}
diff --git a/software/runtime/kernel/mempool_chest_q16p.h b/software/runtime/kernel/mempool_chest_q16p.h
deleted file mode 100644
index 8b1378917..000000000
--- a/software/runtime/kernel/mempool_chest_q16p.h
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/software/runtime/kernel/mempool_chest_q16s.h b/software/runtime/kernel/mempool_chest_q16s.h
deleted file mode 100644
index 8b1378917..000000000
--- a/software/runtime/kernel/mempool_chest_q16s.h
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16s.h b/software/runtime/kernel/mempool_radix2_cfft_q16s.h
deleted file mode 100644
index e69de29bb..000000000