From 74e7ca367a114fea0e6eb47ee7e88a5220dd8ff2 Mon Sep 17 00:00:00 2001 From: nicomar0 Date: Sun, 19 Nov 2023 17:54:53 +0100 Subject: [PATCH] Merge pull request #84 from pulp-platform/cfft Add cfft radix-4 and radix-2 kernels --- python-requirements.txt | 1 + software/.gitignore | 2 +- software/apps/Makefile | 2 +- software/apps/cfft_radix2_q16/main.c | 81 +++ software/apps/cfft_radix4_q16/main.c | 195 ++++++ .../runtime/data/data_cfft_radix2_q16.h.tpl | 56 ++ software/runtime/data/data_cfft_radix2_q16.py | 200 ++++++ .../runtime/data/data_cfft_radix4_q16.h.tpl | 58 ++ software/runtime/data/data_cfft_radix4_q16.py | 200 ++++++ .../runtime/kernel/mempool_radix2_cfft_q16p.h | 203 ++++++ .../runtime/kernel/mempool_radix2_cfft_q16s.h | 141 ++++ .../mempool_radix4_cfft_butterfly_q16.h | 488 +++++++++++++ .../mempool_radix4_cfft_q16_bitreversal.h | 147 ++++ .../runtime/kernel/mempool_radix4_cfft_q16p.h | 640 ++++++++++++++++++ .../runtime/kernel/mempool_radix4_cfft_q16s.h | 572 ++++++++++++++++ 15 files changed, 2984 insertions(+), 2 deletions(-) create mode 100644 software/apps/cfft_radix2_q16/main.c create mode 100644 software/apps/cfft_radix4_q16/main.c create mode 100644 software/runtime/data/data_cfft_radix2_q16.h.tpl create mode 100644 software/runtime/data/data_cfft_radix2_q16.py create mode 100644 software/runtime/data/data_cfft_radix4_q16.h.tpl create mode 100755 software/runtime/data/data_cfft_radix4_q16.py create mode 100644 software/runtime/kernel/mempool_radix2_cfft_q16p.h create mode 100644 software/runtime/kernel/mempool_radix2_cfft_q16s.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_q16p.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_q16s.h diff --git a/python-requirements.txt b/python-requirements.txt index b459939ae..09e19ccd7 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -13,3 +13,4 @@ numpy pandas progressbar2 tabulate +sympy diff --git a/software/.gitignore b/software/.gitignore index 127f990fd..d738641d4 100644 --- a/software/.gitignore +++ b/software/.gitignore @@ -26,4 +26,4 @@ runtime/arch.ld # Generated data files data.h -runtime/data/*.h +runtime/data/data*.h diff --git a/software/apps/Makefile b/software/apps/Makefile index cb7e7ad02..03ea07e75 100644 --- a/software/apps/Makefile +++ b/software/apps/Makefile @@ -25,7 +25,7 @@ else ALL := $(filter-out systolic/%,$(APPS)) endif -ALL_LLVM := $(filter-out chest_q16, $(ALL)) +ALL_LLVM := $(filter-out chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL)) # Make all applications all: $(ALL) diff --git a/software/apps/cfft_radix2_q16/main.c b/software/apps/cfft_radix2_q16/main.c new file mode 100644 index 000000000..a7dfa2c7b --- /dev/null +++ b/software/apps/cfft_radix2_q16/main.c @@ -0,0 +1,81 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include +#include +#include +#include +#include + +/* Mempool runtime libraries */ +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "xpulp/builtins_v2.h" + +/* CFFT mempool libraries */ +#include "data/data_cfft_radix2_q16.h" +#include "kernel/mempool_radix2_cfft_q16p.h" +#include "kernel/mempool_radix2_cfft_q16s.h" + +#define PARALLEL +#define SINGLE + +/* CFFT mempool data */ +int16_t l1_pSrc[N_RSAMPLES] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +int16_t l1_twiddleCoef_q16[6 * N_CSAMPLES / 4] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + +int main() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + if (core_id == 0) { + dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t)); + dma_memcpy_blocking(l1_twiddleCoef_q16, l2_twiddleCoef_q16, + (3 * N_CSAMPLES / 4) * sizeof(int32_t)); + dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, + BITREVINDEXTABLE_LENGTH * sizeof(int32_t)); + } + mempool_barrier(num_cores); + + /* SINGLE-CORE */ +#ifdef SINGLE + if (core_id == 0) { + mempool_start_benchmark(); + mempool_radix2_cfft_q16s((uint16_t)16, l1_twiddleCoef_q16, + l1_BitRevIndexTable, l1_pSrc, + BITREVINDEXTABLE_LENGTH, 0, 0); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); +#endif + + /* PARALLEL-CORE */ +#ifdef PARALLEL + mempool_start_benchmark(); + mempool_radix2_cfft_q16p((uint16_t)16, l1_twiddleCoef_q16, + l1_BitRevIndexTable, l1_pSrc, + BITREVINDEXTABLE_LENGTH, 0, 0, num_cores); + mempool_stop_benchmark(); +#endif + + if (core_id == 0) { + for (uint32_t i = 0; i < N_RSAMPLES; i += 2) { + printf("{%6d;%6d } \n", l1_pSrc[i], l1_pSrc[i + 1]); + } + printf("Done!\n"); + } + mempool_barrier(num_cores); + return 0; +} diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c new file mode 100644 index 000000000..ca9cb29c3 --- /dev/null +++ b/software/apps/cfft_radix4_q16/main.c @@ -0,0 +1,195 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include
+#include
+#include
+#include
+
+/* Mempool runtime libraries */
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/builtins_v2.h"
+
+/* CFFT data libraries */
+#include "data/data_cfft_radix4_q16.h"
+
+/*
+  CHOOSE ONE
+  - SINGLE: Single-core FFT
+  - PARALLEL: Parallel FFT, without the "memory-aware" load/store scheme
+  - FOLDED: Parallel FFT with "memory-aware" load/store scheme
+  - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
+  load/store scheme
+  - N_FFTs_COL: Independent FFTs scheduled on one row (default 1)
+  - N_FFTs_ROW: Independent FFTs scheduled on columns (default 1)
+  (OPTIONALLY ENABLE)
+  - FOLDED_TWIDDLES: The twiddle factors also use "memory-aware" load/stores
+  - BITREVERSETABLE: The bit-reversal indices are loaded from a table
+  - ASM: Use asm volatile statements
+*/
+
+#define SCHEDULED
+#define FOLDED_TWIDDLES
+#define BITREVERSETABLE
+#define ASM // Use asm volatile statements
+
+#if !(defined(N_FFTs_ROW) && defined(N_FFTs_COL))
+#define N_FFTs_ROW 2
+#define N_FFTs_COL 2
+#endif
+
+#define ABS(x) (((x) < 0) ? (-(x)) : (x))
+#include "kernel/mempool_radix4_cfft_butterfly_q16.h"
+#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
+#include "kernel/mempool_radix4_cfft_q16p.h"
+#include "kernel/mempool_radix4_cfft_q16s.h"
+
+int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MAIN */
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  /* INITIALIZATION */
+  if (core_id == 0) {
+    // Each FFT is folded over 4 memory rows
+    // Each memory row is 2 * N_BANKS samples
+    for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
+      dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
+                          (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t));
+    }
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
+    dma_memcpy_blocking(l1_twiddleCoef_q16_src, l2_twiddleCoef_q16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+  }
+  // Initialize the folded twiddles
+#ifdef FOLDED_TWIDDLES
+  for (uint32_t j = 0; j < N_FFTs_COL; j++) {
+    uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
+    for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4))] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * i];
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
+                                            1 * N_BANKS)] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 2U)];
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
+                                            2 * N_BANKS)] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 3U)];
+    }
+  }
+#endif
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    printf("On the run...\n");
+  }
+  mempool_barrier(num_cores);
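+
+  /* Illustrative sizing for the SCHEDULED configuration below (a sketch,
+     assuming the generator default N_CSAMPLES = 64): each FFT is folded
+     over 4 memory rows of 2 * N_BANKS int16_t samples and is computed by
+     N_CSAMPLES / 16 = 4 cores, so the N_FFTs_COL = 2 columns occupy
+     N_FFTs_COL * (N_CSAMPLES >> 4U) = 8 cores in total; the remaining
+     cores only take part in the barriers. */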
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* SINGLE-CORE */
+#ifdef SINGLE
+  int16_t *pRes; // Result pointer
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_radix4_cfft_q16s_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
+                                      l1_twiddleCoef_q16_src, 1);
+    mempool_bitrevtable_q16s_xpulpimg(
+        (uint16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable);
+    pRes = l1_pSrc;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE */
+#ifdef PARALLEL
+  int16_t *pRes; // Result pointer
+  mempool_start_benchmark();
+  mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src, 1, num_cores);
+  mempool_bitrevtable_q16p_xpulpimg((uint16_t *)l1_pSrc,
+                                    BITREVINDEXTABLE_LENGTH,
+                                    l1_BitRevIndexTable, num_cores);
+  pRes = l1_pSrc;
+  mempool_stop_benchmark();
+#endif
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE FOLDED */
+#ifdef FOLDED
+  int16_t *pRes; // Result pointer
+  if (core_id < (N_CSAMPLES / 16)) {
+    mempool_start_benchmark();
+#ifdef FOLDED_TWIDDLES
+    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src,
+                                    l1_twiddleCoef_q16_dst, (N_CSAMPLES / 16));
+#else
+    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src, (N_CSAMPLES / 16));
+#endif
+    pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+    mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
+                                      l1_BitRevIndexTable, (N_CSAMPLES / 16));
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE SCHEDULED */
+#ifdef SCHEDULED
+  if (core_id < N_FFTs_COL * (N_CSAMPLES >> 4U)) {
+    mempool_start_benchmark();
+    uint32_t col_fftLen = N_CSAMPLES >> 2U;
+    uint32_t col_id = core_id / (N_CSAMPLES >> 4U);
+    mempool_radix4_cfft_q16p_scheduler(
+        l1_pSrc, l1_pDst, N_CSAMPLES,
+        l1_twiddleCoef_q16_src + 2 * col_id * col_fftLen,
+        l1_twiddleCoef_q16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, N_CSAMPLES >> 4U);
+    mempool_log_partial_barrier(2, core_id, N_CSAMPLES >> 4U);
+    mempool_stop_benchmark();
+  }
+#endif
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* CHECK */
+#if defined(SINGLE) || defined(PARALLEL) || defined(FOLDED)
+  if (core_id == 0) {
+    printf("Done!\n");
+    for (uint32_t i = 0; i < N_RSAMPLES; i++) {
+      if (ABS(((int32_t)pRes[i] - (int32_t)l2_pRes[i])) > TOLERANCE)
+        printf("ERROR!!! Result[%d]: %6d Expected[%d]: %6d\n", i, pRes[i], i,
+               l2_pRes[i]);
+    }
+  }
+  mempool_barrier(num_cores);
+#endif
+
+  return 0;
+}
diff --git a/software/runtime/data/data_cfft_radix2_q16.h.tpl b/software/runtime/data/data_cfft_radix2_q16.h.tpl
new file mode 100644
index 000000000..35ae84005
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix2_q16.h.tpl
@@ -0,0 +1,56 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Automatically generated by:
+// data/data_cfft_radix2_q16.py
+
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+<% def array_to_str(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LOG2 (${Log2Len})
+#define N_CSAMPLES (${Len})
+#define N_RSAMPLES (2 * N_CSAMPLES)
+#define N_TWIDDLES (3 * N_CSAMPLES / 4)
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
+
+// Tolerance for correctness check
+#define TOLERANCE (${tolerance})
+
+% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
+
+// Data array for ${m_str}
+int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};
+
+% endfor \
+
+// Twiddles
+int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
+
+// Bitreversal
+uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/runtime/data/data_cfft_radix2_q16.py b/software/runtime/data/data_cfft_radix2_q16.py
new file mode 100644
index 000000000..e1615e53e
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix2_q16.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data for the cfft kernel.
+# Author: Marco Bertuletti
+
+import numpy as np
+import math as M
+import argparse
+import pathlib
+from mako.template import Template
+from sympy.combinatorics import Permutation
+
+
+##################
+# compute_result #
+##################
+
+
+def compute_result(inp, len):
+    """
+    Function to generate the expected result of the testcase.
+
+    Arguments
+    ---------
+    inp: numpy array of inputs
+    len: Length of the input transform.
+ """ + + # Q16: + # len=16: Q1.15 -> Q5.11 + # len=32: Q1.15 -> Q6.10 + # len=64: Q1.15 -> Q7.9 + # len=128: Q1.15 -> Q8.8 + # len=256: Q1.15 -> Q9.7 + # len=512: Q1.15 -> Q10.6 + # len=1024: Q1.15 -> Q11.5 + # len=2048: Q1.15 -> Q12.4 + # len=4096: Q1.15 -> Q13.3 + bit_shift_dict_q16 = { + 16: 11, + 32: 10, + 64: 9, + 128: 8, + 256: 7, + 512: 6, + 1024: 5, + 2048: 4, + 4096: 3} + my_type = np.int16 + my_fixpoint = 15 + bit_shift_dict = bit_shift_dict_q16 + a = inp.astype(my_type) + result = np.zeros(a.size, dtype=my_type) + complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) + complex_result = np.zeros(a.size >> 1, dtype=np.csingle) + for i in range(a.size >> 1): + complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( + a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j + complex_result = np.fft.fft(complex_a) + for i in range(int(a.size / 2)): + result[2 * i] = (np.real(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + result[2 * i + 1] = (np.imag(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + + return result + + +def compute_twiddles(length): + PI = 3.14159265358979 + N = length + twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) + for i in range(0, (int)(3 * N / 4)): + twiddleCoefq15_cos = M.cos(i * 2 * PI / N) + twiddleCoefq15_sin = M.sin(i * 2 * PI / N) + twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) + twiddleCoefq15[2 * i + + 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) + return twiddleCoefq15 + + +def compute_bitreversal(N, R): + + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + + return tps + + +def gen_data_header_file( + outdir: pathlib.Path.cwd(), + tpl: pathlib.Path.cwd(), + **kwargs): + + file = outdir / f"{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Select out directory of generated data files' + ) + parser.add_argument( + "-t", + "--tpl", + type=pathlib.Path, + required=False, + default=pathlib.Path(__file__).parent.absolute() / + "data_cfft_radix2_q16.h.tpl", + help='Path to mako template') + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + parser.add_argument( + "-d", + "--dimension", + type=int, + required=False, + default=64, + help='Input dimension' + ) + + args = parser.parse_args() + + # Create sparse matrix + Len = args.dimension + Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) + Result = compute_result(Input, Len) + Twiddles = compute_twiddles(Len) + Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) + + tolerance = { + 16: 16, + 32: 20, + 64: 24, + 128: 28, + 256: 32, + 512: 48, + 1024: 64, + 2048: 96, + 4096: 
128}
+
+    kwargs = {'name': 'data_cfft_radix2_q16',
+              'vector_inp': Input,
+              'vector_res': Result,
+              'vector_twi': Twiddles,
+              'vector_bitrev': Bitreversal,
+              'Len': Len,
+              'Log2Len': int(np.log2(Len)),
+              'BitrevLen': int(2 * len(Bitreversal)),
+              'tolerance': tolerance[int(Len)]}
+
+    gen_data_header_file(args.outdir, args.tpl, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/software/runtime/data/data_cfft_radix4_q16.h.tpl b/software/runtime/data/data_cfft_radix4_q16.h.tpl
new file mode 100644
index 000000000..1a147a30c
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix4_q16.h.tpl
@@ -0,0 +1,58 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Automatically generated by:
+// data/data_cfft_radix4_q16.py
+
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+<% def array_to_str(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LOG2 (${Log2Len})
+#define N_CSAMPLES (${Len})
+#define N_RSAMPLES (2 * N_CSAMPLES)
+#define N_TWIDDLES (3 * N_CSAMPLES / 4)
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
+
+// Maximum number of independent FFT columns allowed
+#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+// Tolerance for correctness check
+#define TOLERANCE (${tolerance})
+
+% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
+
+// Data array for ${m_str}
+int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};
+
+% endfor \
+
+// Twiddles
+int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
+
+// Bitreversal
+uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/runtime/data/data_cfft_radix4_q16.py b/software/runtime/data/data_cfft_radix4_q16.py
new file mode 100755
index 000000000..b394a2884
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix4_q16.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data for the cfft kernel.
+# Author: Marco Bertuletti
+
+import numpy as np
+import math as M
+import argparse
+import pathlib
+from mako.template import Template
+from sympy.combinatorics import Permutation
+
+
+##################
+# compute_result #
+##################
+
+
+def compute_result(inp, len):
+    """
+    Function to generate the expected result of the testcase.
+
+    Arguments
+    ---------
+    inp: numpy array of inputs
+    len: Length of the input transform.
+ """ + + # Q16: + # len=16: Q1.15 -> Q5.11 + # len=32: Q1.15 -> Q6.10 + # len=64: Q1.15 -> Q7.9 + # len=128: Q1.15 -> Q8.8 + # len=256: Q1.15 -> Q9.7 + # len=512: Q1.15 -> Q10.6 + # len=1024: Q1.15 -> Q11.5 + # len=2048: Q1.15 -> Q12.4 + # len=4096: Q1.15 -> Q13.3 + bit_shift_dict_q16 = { + 16: 11, + 32: 10, + 64: 9, + 128: 8, + 256: 7, + 512: 6, + 1024: 5, + 2048: 4, + 4096: 3} + my_type = np.int16 + my_fixpoint = 15 + bit_shift_dict = bit_shift_dict_q16 + a = inp.astype(my_type) + result = np.zeros(a.size, dtype=my_type) + complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) + complex_result = np.zeros(a.size >> 1, dtype=np.csingle) + for i in range(a.size >> 1): + complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( + a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j + complex_result = np.fft.fft(complex_a) + for i in range(int(a.size / 2)): + result[2 * i] = (np.real(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + result[2 * i + 1] = (np.imag(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + + return result + + +def compute_twiddles(length): + PI = 3.14159265358979 + N = length + twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) + for i in range(0, (int)(3 * N / 4)): + twiddleCoefq15_cos = M.cos(i * 2 * PI / N) + twiddleCoefq15_sin = M.sin(i * 2 * PI / N) + twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) + twiddleCoefq15[2 * i + + 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) + return twiddleCoefq15 + + +def compute_bitreversal(N, R): + + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + + return tps + + +def gen_data_header_file( + outdir: pathlib.Path.cwd(), + tpl: pathlib.Path.cwd(), + **kwargs): + + file = outdir / f"{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Select out directory of generated data files' + ) + parser.add_argument( + "-t", + "--tpl", + type=pathlib.Path, + required=False, + default=pathlib.Path(__file__).parent.absolute() / + "data_cfft_radix4_q16.h.tpl", + help='Path to mako template') + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + parser.add_argument( + "-d", + "--dimension", + type=int, + required=False, + default=64, + help='Input dimension' + ) + + args = parser.parse_args() + + # Create sparse matrix + Len = args.dimension + Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) + Result = compute_result(Input, Len) + Twiddles = compute_twiddles(Len) + Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) + + tolerance = { + 16: 16, + 32: 20, + 64: 24, + 128: 28, + 256: 32, + 512: 48, + 1024: 64, + 2048: 96, + 4096: 
128} + + kwargs = {'name': 'data_cfft_radix4_q16', + 'vector_inp': Input, + 'vector_res': Result, + 'vector_twi': Twiddles, + 'vector_bitrev': Bitreversal, + 'Len': Len, + 'Log2Len': int(np.log2(Len)), + 'BitrevLen': len(Bitreversal), + 'tolerance': tolerance[int(Len)]} + + gen_data_header_file(args.outdir, args.tpl, **kwargs) + + +if __name__ == "__main__": + main() diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16p.h b/software/runtime/kernel/mempool_radix2_cfft_q16p.h new file mode 100644 index 000000000..52a693773 --- /dev/null +++ b/software/runtime/kernel/mempool_radix2_cfft_q16p.h @@ -0,0 +1,203 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#include "xpulp/builtins_v2.h" + +void mempool_radix2_cfft_q16p(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag, uint32_t nPE); + +void mempool_radix2_butterfly_q16p(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, uint32_t twidCoefModifier, + uint32_t nPE); + +void mempool_radix2_bitreversal_q16p(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab, uint32_t nPE); + +void mempool_radix2_cfft_q16p(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag, uint32_t nPE) { + if (ifftFlag == 0) { + mempool_radix2_butterfly_q16p(pSrc, fftLen, pTwiddle, 1U, nPE); + } + + if (bitReverseFlag) { + mempool_radix2_bitreversal_q16p((uint16_t *)pSrc, bitReverseLen, + pBitRevTable, nPE); + } +} + +void mempool_radix2_butterfly_q16p(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, uint32_t twidCoefModifier, + uint32_t nPE) { + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + uint32_t core_id = mempool_get_core_id(); + uint32_t step, steps, butt_id, offset; + v2s T, S, R; + v2s coeff; + int16_t out1, out2; + + n1 = fftLen; + n2 = n1 >> 1; + step = (n2 + nPE - 1) / nPE; + + // loop for groups + for (i = core_id * step; i < MIN(core_id * step + step, n2); i++) { + + coeff = *(v2s *)&pCoef16[i * 2U]; + + /* Reading i and i+fftLen/2 inputs */ + /* Input is downscaled by 1 to avoid overflow */ + l = i + n2; + /* Read ya (real), xa (imag) input */ + T = __SRA2(*(v2s *)&pSrc16[i * 2U], ((v2s){1, 1})); + /* Read yb (real), xb (imag) input */ + S = __SRA2(*(v2s *)&pSrc16[l * 2U], ((v2s){1, 1})); + + /* R0 = (ya - yb) */ + /* R1 = (xa - xb) */ + R = __SUB2(T, S); + + /* writing the butterfly processed i sample */ + /* ya' = ya + yb */ + /* xa' = xa + xb */ + *((v2s *)&pSrc16[i * 2U]) = __SRA2(__ADD2(T, S), ((v2s){1, 1})); + + /* out1 = (ya - yb)*cos + (xa - xb)*sin */ + /* out2 = (ya - yb)*cos - (xa - xb)*sin */ + out1 = (int16_t)(__DOTP2(R, coeff) >> 16U); + out2 = (int16_t)(__DOTP2(R, __PACK2(coeff[0], -coeff[1])) >> 16U); + *((v2s *)&pSrc16[l * 2U]) = __PACK2(out2, out1); + + } /* groups loop end */ + + // if(core_id==0) { + // for(uint32_t i=0; i 2; k = k >> 1) { + + n1 = n2; + n2 = n2 >> 1; + + step = (n2 + nPE - 1) / nPE; + butt_id = core_id % n2; + offset = (core_id / n2) * n1; + ia = butt_id * step; + + /* loop for groups */ + for (j = butt_id * step; j < MIN(butt_id * step + step, n2); j++) { + // for (j = core_id * step; j < MIN(core_id*step + step, n2); j++) { + + coeff = *(v2s *)&pCoef16[ia * 2U]; + ia = ia + 
twidCoefModifier; + + /* loop for butterfly */ + for (i = offset + j; i < fftLen; i += ((nPE / n2) + 1) * n1) + // for (i = j; i < fftLen; i += n1) + { + + /* Reading i and i+fftLen/2 inputs */ + /* Input is downscaled by 1 to avoid overflow */ + l = i + n2; + /* Read ya (real), xa (imag) input */ + T = __SRA2(*(v2s *)&pSrc16[i * 2U], ((v2s){1, 1})); + /* Read yb (real), xb (imag) input */ + S = __SRA2(*(v2s *)&pSrc16[l * 2U], ((v2s){1, 1})); + /* R0 = (ya - yb) */ + /* R1 = (xa - xb) */ + R = __SUB2(T, S); + /* writing the butterfly processed i sample */ + /* ya' = ya + yb */ + /* xa' = xa + xb */ + *((v2s *)&pSrc16[i * 2U]) = __SRA2(__ADD2(T, S), ((v2s){1, 1})); + /* out1 = (ya - yb)*cos + (xa - xb)*sin */ + /* out2 = (ya - yb)*cos - (xa - xb)*sin */ + out1 = (int16_t)(__DOTP2(R, coeff) >> 16U); + out2 = (int16_t)(__DOTP2(R, __PACK2(coeff[0], -coeff[1])) >> 16U); + *((v2s *)&pSrc16[l * 2U]) = __PACK2(out2, out1); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + mempool_barrier(nPE); + } /* stages loop end */ + + // if(core_id==0) { + // for(uint32_t i=0; i> 1; + steps = fftLen / n1; + step = (steps + nPE - 1) / nPE; + /* loop for butterfly */ + + for (i = core_id * step * n1; i < MIN((core_id * step + step) * n1, fftLen); + i += n1) { + + l = i + n2; + /* Read ya (real), xa (imag) input */ + T = __SRA2(*(v2s *)&pSrc16[i * 2U], ((v2s){1, 1})); + /* Read yb (real), xb (imag) input */ + S = __SRA2(*(v2s *)&pSrc16[l * 2U], ((v2s){1, 1})); + /* R0 = (ya - yb) */ + /* R1 = (xa - xb) */ + R = __SUB2(T, S); + /* ya' = ya + yb */ + /* xa' = xa + xb */ + *((v2s *)&pSrc16[i * 2U]) = __ADD2(T, S); + /* yb' = ya - yb */ + /* xb' = xa - xb */ + *((v2s *)&pSrc16[l * 2U]) = R; + + } /* groups loop end */ + + // if(core_id==0) { + // for(uint32_t i=0; i> 2; + tmpa = *(v2s *)&pSrc[addr[0]]; + tmpb = *(v2s *)&pSrc[addr[1]]; + *((v2s *)&pSrc[addr[0]]) = tmpb; + *((v2s *)&pSrc[addr[1]]) = tmpa; + } + mempool_barrier(num_cores); +} diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16s.h b/software/runtime/kernel/mempool_radix2_cfft_q16s.h new file mode 100644 index 000000000..db4253b96 --- /dev/null +++ b/software/runtime/kernel/mempool_radix2_cfft_q16s.h @@ -0,0 +1,141 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#include "xpulp/builtins_v2.h" + +void mempool_radix2_cfft_q16s(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag); + +void mempool_radix2_butterfly_q16s(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, uint32_t twidCoefModifier); + +void mempool_radix2_bitreversal_q16s(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab); + +void mempool_radix2_cfft_q16s(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag) { + if (ifftFlag == 0) { + mempool_radix2_butterfly_q16s(pSrc, fftLen, pTwiddle, 1U); + } + + if (bitReverseFlag) { + mempool_radix2_bitreversal_q16s((uint16_t *)pSrc, bitReverseLen, + pBitRevTable); + } +} + +void mempool_radix2_butterfly_q16s(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, + uint32_t twidCoefModifier) { + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + int16_t xt, yt, cosVal, sinVal; + + // N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + for (i = 0; i < n2; i++) { + + cosVal = pCoef16[i * 2]; + sinVal = pCoef16[(i * 2) + 1]; + l = i + n2; + + xt = (int16_t)((pSrc16[2 * i] >> 1U) - (pSrc16[2 * l] >> 1U)); + pSrc16[2 * i] = + (int16_t)(((pSrc16[2 * i] >> 1U) + (pSrc16[2 * l] >> 1U)) >> 1U); + yt = (int16_t)((pSrc16[2 * i + 1] >> 1U) - (pSrc16[2 * l + 1] >> 1U)); + pSrc16[2 * i + 1] = + (int16_t)(((pSrc16[2 * l + 1] >> 1U) + (pSrc16[2 * i + 1] >> 1U)) >> + 1U); + pSrc16[2U * l] = (int16_t)(((int16_t)(((int32_t)xt * cosVal) >> 16)) + + ((int16_t)(((int32_t)yt * sinVal) >> 16))); + pSrc16[2U * l + 1] = (int16_t)(((int16_t)(((int32_t)yt * cosVal) >> 16)) - + ((int16_t)(((int32_t)xt * sinVal) >> 16))); + } + twidCoefModifier = twidCoefModifier << 1U; + + /* loop for stage */ + for (k = fftLen / 2; k > 2; k = k >> 1) { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) { + cosVal = pCoef16[ia * 2]; + sinVal = pCoef16[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) { + l = i + n2; + + xt = (int16_t)((pSrc16[2 * i] >> 1U) - (pSrc16[2 * l] >> 1U)); + pSrc16[2 * i] = + (int16_t)(((pSrc16[2 * i] >> 1U) + (pSrc16[2 * l] >> 1U)) >> 1U); + yt = (int16_t)((pSrc16[2 * i + 1] >> 1U) - (pSrc16[2 * l + 1] >> 1U)); + pSrc16[2 * i + 1] = + (int16_t)(((pSrc16[2 * l + 1] >> 1U) + (pSrc16[2 * i + 1] >> 1U)) >> + 1U); + pSrc16[2U * l] = (int16_t)(((int16_t)(((int32_t)xt * cosVal) >> 16)) + + ((int16_t)(((int32_t)yt * sinVal) >> 16))); + pSrc16[2U * l + 1] = + (int16_t)(((int16_t)(((int32_t)yt * cosVal) >> 16)) - + ((int16_t)(((int32_t)xt * sinVal) >> 16))); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + } /* stages loop end */ + + n1 = n2; + n2 = n2 >> 1; + /* loop for groups */ + for (i = 0; i < fftLen; i += n1) { + l = i + n2; + xt = (int16_t)((pSrc16[2 * i] >> 1U) - (pSrc16[2 * l] >> 1U)); + pSrc16[2 * i] = (int16_t)(((pSrc16[2 * i] >> 1U) + (pSrc16[2 * l] >> 1U))); + yt = (int16_t)((pSrc16[2 * i + 1] >> 1U) - (pSrc16[2 * l + 1] >> 1U)); + pSrc16[2 * i + 1] = + (int16_t)(((pSrc16[2 * l + 1] >> 1U) + (pSrc16[2 * i + 1] >> 1U))); + pSrc16[2U * l] = xt; + pSrc16[2U * l + 1] = yt; + } /* groups loop end */ +} + +void mempool_radix2_bitreversal_q16s(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab) { + uint16_t a, b, tmp; + + for (uint32_t i = 0; i < bitRevLen; i += 
2) { + a = pBitRevTab[i] >> 2; + b = pBitRevTab[i + 1] >> 2; + + // real + tmp = pSrc[a]; + pSrc[a] = pSrc[b]; + pSrc[b] = tmp; + + // complex + tmp = pSrc[a + 1]; + pSrc[a + 1] = pSrc[b + 1]; + pSrc[b + 1] = tmp; + + // i += 2; + } +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h b/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h new file mode 100644 index 000000000..725908b71 --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h @@ -0,0 +1,488 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" + +/** + @brief First butterfly stage. + @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, + uint32_t i0, uint32_t n2, v2s CoSi1, + v2s CoSi2, v2s CoSi3, v2s C1, v2s C2, + v2s C3) { + int16_t t0, t1, t2, t3, t4, t5; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2s A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + +#ifndef ASM + v2s s1 = {1, 1}; + v2s s2 = {2, 2}; + /* Read yb (real), xb(imag) input */ + B = __SRA2(*(v2s *)&pIn[i1 * 2U], s2); + /* Read yd (real), xd(imag) input */ + D = __SRA2(*(v2s *)&pIn[i3 * 2U], s2); + /* Read ya (real), xa (imag) input */ + A = __SRA2(*(v2s *)&pIn[i0 * 2U], s2); + /* Read yc (real), xc(imag) input */ + C = __SRA2(*(v2s *)&pIn[i2 * 2U], s2); + /* G0 = (yb + yd), G1 = (xb + xd) */ + G = __ADD2(B, D); + /* H0 = (yb - yd), H1 = (xb - xd) */ + H = __SUB2(B, D); + /* E0 = (ya + yc), E1 = (xa + xc) */ + E = __ADD2(A, C); + /* F0 = (ya - yc), F1 = (xa - xc) */ + F = __SUB2(A, C); + t0 = (int16_t)H[0]; + t1 = (int16_t)H[1]; + A = __SRA2(E, s1); + B = __SRA2(G, s1); + /* C0 = (xb - xd), C1 = (yd - yb) */ + C = __PACK2(t0, -t1); + /* D0 = (xd - xb), D1 = (yb - yd) */ + D = __PACK2(-t0, t1); + /* E0 = (ya+yc) - (yb+yd), E1 = (xa+xc) - (xb+xd) */ + E = __SUB2(E, G); + /* G1 = (ya-yc) + (xb-xd), G0 = (xa-xc) - (yb-yd) */ + G = __ADD2(F, C); + /* H1 = (ya-yc) - (xb-xd), H0 = (xa-xc) + (yb-yd) */ + H = 
__ADD2(F, D); + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + t0 = (int16_t)(__DOTP2(CoSi2, E) >> 16U); + t1 = (int16_t)(__DOTP2(C2, E) >> 16U); + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + t2 = (int16_t)(__DOTP2(CoSi1, H) >> 16U); + t3 = (int16_t)(__DOTP2(C1, H) >> 16U); + /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ + t4 = (int16_t)(__DOTP2(CoSi3, G) >> 16U); + t5 = (int16_t)(__DOTP2(C3, G) >> 16U); + /* ya' = ya + yb + yc + yd */ + /* xa' = xa + xb + xc + xd */ + A = __ADD2(A, B); + E = __PACK2(t1, t0); + F = __PACK2(t3, t2); + G = __PACK2(t5, t4); + *((v2s *)&pOut[i0_store * 2U]) = A; + *((v2s *)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = F; + *((v2s *)&pOut[i3_store * 2U]) = G; +#else + v2s s1, s2; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa (imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + asm volatile("addi %[s1], zero, 0x01;" + "slli %[s1], %[s1], 0x10;" + "addi %[s1], %[s1], 0x01;" + "addi %[s2], zero, 0x02;" + "slli %[s2], %[s2], 0x10;" + "addi %[s2], %[s2], 0x02;" + "pv.sra.h %[B],%[B],%[s2];" + "pv.sra.h %[D],%[D],%[s2];" + "pv.sra.h %[A],%[A],%[s2];" + "pv.sra.h %[C],%[C],%[s2];" + "pv.add.h %[G],%[B],%[D];" + "pv.sub.h %[H],%[B],%[D];" + "pv.add.h %[E],%[A],%[C];" + "pv.sub.h %[F],%[A],%[C];" + "pv.extract.h %[t0],%[H],0;" + "pv.extract.h %[t1],%[H],1;" + "pv.sra.h %[A],%[E],%[s1];" + "pv.sra.h %[B],%[G],%[s1];" + "sub %[t3],zero,%[t1];" + "sub %[t4],zero,%[t0];" + "pv.pack %[C],%[t0],%[t3];" + "pv.pack %[D],%[t4],%[t1];" + "pv.sub.h %[E],%[E],%[G];" + "pv.add.h %[G],%[F],%[C];" + "pv.add.h %[H],%[F],%[D];" + "pv.dotsp.h %[C],%[CoSi2],%[E];" + "pv.dotsp.h %[D],%[C2],%[E];" + "pv.dotsp.h %[E],%[CoSi1],%[H];" + "pv.dotsp.h %[F],%[C1],%[H];" + "srai %[t0],%[C],0x10;" + "srai %[t1],%[D],0x10;" + "pv.dotsp.h %[C],%[CoSi3],%[G];" + "pv.dotsp.h %[D],%[C3],%[G];" + "srai %[t2],%[E],0x10;" + "srai %[t3],%[F],0x10;" + "srai %[t4],%[C],0x10;" + "srai %[t5],%[D],0x10;" + "pv.add.h %[A],%[A],%[B];" + "pv.pack %[E],%[t1],%[t0];" + "pv.pack %[F],%[t3],%[t2];" + "pv.pack %[G],%[t5],%[t4];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), + [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), + [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), + [t4] "=&r"(t4), [t5] "=&r"(t5), [s1] "=&r"(s1), [s2] "=&r"(s2) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) + :); + *((v2s *)&pOut[i0_store * 2U]) = A; + *((v2s *)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = F; + *((v2s *)&pOut[i3_store * 2U]) = G; +#endif +} + +/** + @brief Middle butterfly stage. 
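+           (A sketch of the arithmetic, mirroring the in-code comments:
+           for four inputs A, B, C, D spaced n2 apart, the stage computes
+           (A+B+C+D), (A-B+C-D)*conj(W2), (A-jB-C+jD)*conj(W1) and
+           (A+jB-C-jD)*conj(W3), where conj(Wk) is the conjugate of the
+           k-th Q15 twiddle CoSik; the intermediate right-shifts guard
+           against overflow.)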
+ @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, + uint32_t i0, uint32_t n2, v2s CoSi1, + v2s CoSi2, v2s CoSi3, v2s C1, v2s C2, + v2s C3) { + int16_t t0, t1, t2, t3, t4, t5; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2s A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + +#ifndef ASM + v2s s1 = {1, 1}; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + /* G0 = (yb + yd), G1 = (xb + xd) */ + G = __ADD2(B, D); + /* H0 = (yb - yd), H1 = (xb - xd) */ + H = __SUB2(B, D); + /* E0 = (ya + yc), E1 = (xa + xc) */ + E = __ADD2(A, C); + /* F0 = (ya - yc), F1 =(xa - xc) */ + F = __SUB2(A, C); + G = __SRA2(G, s1); + H = __SRA2(H, s1); + E = __SRA2(E, s1); + F = __SRA2(F, s1); + t0 = (int16_t)H[0]; + t1 = (int16_t)H[1]; + /* C0 = (ya+yc) - (yb+yd), C1 = (xa+xc) - (xb+xd) */ + C = __SUB2(E, G); + /* D0 = (ya+yc) + (yb+yd), D1 = (xa+xc) + (xb+xd) */ + D = __ADD2(E, G); + /* A0 = (xb-xd), A1 = (yd-yb) */ + A = __PACK2(t0, -t1); + /* B0 = (xd-xb), B1 = (yb-yd) */ + B = __PACK2(-t0, t1); + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + D = __SRA2(D, s1); + /* E1 = (ya-yc) + (xb-xd), E0 = (xa-xc) - (yb-yd)) */ + E = __ADD2(F, A); + /* F1 = (ya-yc) - (xb-xd), F0 = (xa-xc) + (yb-yd)) */ + F = __ADD2(F, B); + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + t0 = (int16_t)(__DOTP2(CoSi2, C) >> 16U); + t1 = (int16_t)(__DOTP2(C2, C) >> 16U); + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + t2 = (int16_t)(__DOTP2(CoSi1, F) >> 16U); + t3 = (int16_t)(__DOTP2(C1, F) >> 16U); + /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ + t4 = (int16_t)(__DOTP2(CoSi3, E) >> 16U); + t5 = (int16_t)(__DOTP2(C3, E) >> 16U); + A = __PACK2(t1, t0); + 
B = __PACK2(t3, t2); + C = __PACK2(t5, t4); + *((v2s *)&pOut[i0_store * 2U]) = D; + *((v2s *)&pOut[i1_store * 2U]) = A; + *((v2s *)&pOut[i2_store * 2U]) = B; + *((v2s *)&pOut[i3_store * 2U]) = C; +#else + v2s s1; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + asm volatile("addi %[s1], zero, 0x01;" + "slli %[s1], %[s1], 0x10;" + "addi %[s1], %[s1], 0x01;" + "pv.add.h %[G],%[B],%[D];" + "pv.sub.h %[H],%[B],%[D];" + "pv.add.h %[E],%[A],%[C];" + "pv.sub.h %[F],%[A],%[C];" + "pv.sra.h %[G],%[G],%[s1];" + "pv.sra.h %[H],%[H],%[s1];" + "pv.sra.h %[E],%[E],%[s1];" + "pv.sra.h %[F],%[F],%[s1];" + "pv.extract.h %[t0],%[H],0;" + "pv.extract.h %[t1],%[H],1;" + "pv.sub.h %[C],%[E],%[G];" + "pv.add.h %[D],%[E],%[G];" + "sub %[t4],zero,%[t1];" + "sub %[t3],zero,%[t0];" + "pv.pack %[A],%[t0],%[t4];" + "pv.pack %[B],%[t3],%[t1];" + "pv.sra.h %[D],%[D],%[s1];" + "pv.add.h %[E],%[F],%[A];" + "pv.add.h %[F],%[F],%[B];" + "pv.dotsp.h %[G],%[CoSi2],%[C];" + "pv.dotsp.h %[H],%[C2],%[C];" + "pv.dotsp.h %[A],%[CoSi1],%[F];" + "pv.dotsp.h %[B],%[C1],%[F];" + "srai %[t0],%[G],0x10;" + "srai %[t1],%[H],0x10;" + "pv.dotsp.h %[G],%[CoSi3],%[E];" + "pv.dotsp.h %[H],%[C3],%[E];" + "srai %[t2],%[A],0x10;" + "srai %[t3],%[B],0x10;" + "srai %[t4],%[G],0x10;" + "srai %[t5],%[H],0x10;" + "pv.pack %[A],%[t1],%[t0];" + "pv.pack %[B],%[t3],%[t2];" + "pv.pack %[C],%[t5],%[t4];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), + [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), + [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), + [t4] "=&r"(t4), [t5] "=&r"(t5), [s1] "=&r"(s1) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) + :); + *((v2s *)&pOut[i0_store * 2U]) = D; + *((v2s *)&pOut[i1_store * 2U]) = A; + *((v2s *)&pOut[i2_store * 2U]) = B; + *((v2s *)&pOut[i3_store * 2U]) = C; +#endif +} + +/** + @brief Last butterfly stage. 
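+           (In this final stage all twiddles are 1, so, as the in-code
+           comments show, only the sums and differences (A+B+C+D),
+           (A-B+C-D), (A-jB-C+jD) and (A+jB-C-jD) are computed, on inputs
+           pre-shifted right by one to avoid overflow.)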
+ @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @return none +*/ +static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, + uint32_t i0) { + int16_t t0, t1; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2s A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + 1U; + i2 = i1 + 1U; + i3 = i2 + 1U; +#endif +// STORE INDEXES +#if defined(FOLDED) + i0_store = i0 * 4; + i1_store = i0_store + 1; + i2_store = i1_store + 1; + i3_store = i2_store + 1; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + +#ifndef ASM + v2s s1 = {1, 1}; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + /* H0 = (yb-yd), H1 = (xb-xd) */ + H = __SUB2(B, D); + /* G0 = (yb+yd), G1 = (xb+xd) */ + G = __ADD2(B, D); + /* E0 = (ya+yc), E1 = (xa+xc) */ + E = __ADD2(A, C); + /* F0 = (ya-yc), F1 = (xa-xc) */ + F = __SUB2(A, C); + H = __SRA2(H, s1); + G = __SRA2(G, s1); + E = __SRA2(E, s1); + t0 = (int16_t)H[0]; + t1 = (int16_t)H[1]; + F = __SRA2(F, s1); + /* xa' = (xa+xb+xc+xd) */ + /* ya' = (ya+yb+yc+yd) */ + *((v2s *)&pOut[i0_store * 2U]) = __ADD2(E, G); + /* A0 = (xb-xd), A1 = (yd-yb) */ + A = __PACK2(-t0, t1); + /* B0 = (xd-xb), B1 = (yb-yd) */ + B = __PACK2(t0, -t1); + /* xc' = (xa-xb+xc-xd) */ + /* yc' = (ya-yb+yc-yd) */ + E = __SUB2(E, G); + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + A = __ADD2(F, A); + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + B = __ADD2(F, B); + *((v2s *)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = A; + *((v2s *)&pOut[i3_store * 2U]) = B; +#else + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + int16_t t2, t3; + v2s s1; + asm volatile( + "addi %[s1], zero, 0x01;" + "slli %[s1], %[s1], 0x10;" + "addi %[s1], %[s1], 0x01;" + "pv.sub.h %[H],%[B],%[D];" + "pv.add.h %[G],%[B],%[D];" + "pv.add.h %[E],%[A],%[C];" + "pv.sub.h %[F],%[A],%[C];" + "pv.sra.h %[H],%[H],%[s1];" + "pv.sra.h %[G],%[G],%[s1];" + "pv.sra.h %[E],%[E],%[s1];" + "pv.extract.h %[t0],%[H],0;" + "pv.extract.h %[t1],%[H],1;" + "pv.sra.h %[F],%[F],%[s1];" + "sub %[t2], zero, %[t0];" + "sub %[t3], zero, %[t1];" + "pv.pack %[A],%[t2],%[t1];" + "pv.pack %[B],%[t0],%[t3];" + "pv.add.h %[H],%[E],%[G];" + "pv.sub.h %[E],%[E],%[G];" + "pv.add.h %[A],%[F],%[A];" + "pv.add.h %[B],%[F],%[B];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E), + [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [t0] "=&r"(t0), + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s1] "=&r"(s1) + : + :); + *((v2s *)&pOut[i0_store * 2U]) = H; + *((v2s 
*)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = A; + *((v2s *)&pOut[i3_store * 2U]) = B; +#endif +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h new file mode 100644 index 000000000..56f4f478b --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h @@ -0,0 +1,147 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab) { + uint16_t addr1, addr2; + uint16_t tmpa, tmpb; + for (uint32_t i = 0; i < bitRevLen; i += 2) { + addr1 = pBitRevTab[i] >> 2U; + addr2 = pBitRevTab[i + 1] >> 2U; + + tmpa = pSrc[addr1]; + tmpb = pSrc[addr2]; + pSrc[addr1] = tmpb; + pSrc[addr2] = tmpa; + + tmpa = pSrc[addr1 + 1]; + tmpb = pSrc[addr2 + 1]; + pSrc[addr1 + 1] = tmpb; + pSrc[addr2 + 1] = tmpa; + } +} + +#ifndef ASM +#define SWAP_ITEMS \ + addr1 = *(v2s *)&pBitRevTab[i]; \ + addr2 = *(v2s *)&pBitRevTab[i + 2]; \ + addr3 = *(v2s *)&pBitRevTab[i + 4]; \ + addr4 = *(v2s *)&pBitRevTab[i + 6]; \ + addr1 = __SRA2(addr1, s2); \ + addr2 = __SRA2(addr2, s2); \ + addr3 = __SRA2(addr3, s2); \ + addr4 = __SRA2(addr4, s2); \ + a1 = addr1[1]; \ + a2 = addr2[1]; \ + a3 = addr3[1]; \ + a4 = addr4[1]; \ + b1 = addr1[0]; \ + b2 = addr2[0]; \ + b3 = addr3[0]; \ + b4 = addr4[0]; \ + tmpa1 = *(v2s *)&pSrc[a1]; \ + tmpa2 = *(v2s *)&pSrc[a2]; \ + tmpa3 = *(v2s *)&pSrc[a3]; \ + tmpa4 = *(v2s *)&pSrc[a4]; \ + tmpb1 = *(v2s *)&pSrc[b1]; \ + tmpb2 = *(v2s *)&pSrc[b2]; \ + tmpb3 = *(v2s *)&pSrc[b3]; \ + tmpb4 = *(v2s *)&pSrc[b4]; \ + *((v2s *)&pSrc[a1]) = tmpb1; \ + *((v2s *)&pSrc[a2]) = tmpb2; \ + *((v2s *)&pSrc[a3]) = tmpb3; \ + *((v2s *)&pSrc[a4]) = tmpb4; \ + *((v2s *)&pSrc[b1]) = tmpa1; \ + *((v2s *)&pSrc[b2]) = tmpa2; \ + *((v2s *)&pSrc[b3]) = tmpa3; \ + *((v2s *)&pSrc[b4]) = tmpa4; +#else +#define SWAP_ITEMS \ + addr1 = *(v2s *)&pBitRevTab[i]; \ + addr2 = *(v2s *)&pBitRevTab[i + 2]; \ + addr3 = *(v2s *)&pBitRevTab[i + 4]; \ + addr4 = *(v2s *)&pBitRevTab[i + 6]; \ + asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" \ + "pv.sra.h %[addr2],%[addr2],%[s2];" \ + "pv.sra.h %[addr3],%[addr3],%[s2];" \ + "pv.sra.h %[addr4],%[addr4],%[s2];" \ + "pv.extract.h %[a1],%[addr1],1;" \ + "pv.extract.h %[a2],%[addr2],1;" \ + "pv.extract.h %[a3],%[addr3],1;" \ + "pv.extract.h %[a4],%[addr4],1;" \ + "pv.extract.h %[b1],%[addr1],0;" \ + "pv.extract.h %[b2],%[addr2],0;" \ + "pv.extract.h %[b3],%[addr3],0;" \ + "pv.extract.h %[b4],%[addr4],0;" \ + : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3), [a4] "=r"(a4), \ + [b1] "=r"(b1), [b2] "=r"(b2), [b3] "=r"(b3), [b4] "=r"(b4), \ + [addr1] "+&r"(addr1), [addr2] "+&r"(addr2), \ + [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) \ + : [s2] "r"(s2) \ + :); \ + tmpa1 = *(v2s *)&pSrc[a1]; \ + tmpa2 = *(v2s *)&pSrc[a2]; \ + tmpa3 = *(v2s *)&pSrc[a3]; \ + tmpa4 = *(v2s *)&pSrc[a4]; \ + tmpb1 = *(v2s *)&pSrc[b1]; \ + tmpb2 = *(v2s *)&pSrc[b2]; \ + tmpb3 = *(v2s *)&pSrc[b3]; \ + tmpb4 = *(v2s *)&pSrc[b4]; \ + *((v2s *)&pSrc[a1]) = tmpb1; \ + *((v2s *)&pSrc[a2]) = tmpb2; \ + *((v2s *)&pSrc[a3]) = tmpb3; \ + *((v2s *)&pSrc[a4]) = tmpb4; \ + *((v2s *)&pSrc[b1]) = tmpa1; \ + *((v2s *)&pSrc[b2]) = tmpa2; \ + *((v2s *)&pSrc[b3]) = tmpa3; \ + *((v2s *)&pSrc[b4]) = tmpa4; +#endif + +void mempool_bitrevtable_q16s_xpulpimg(uint16_t 
*pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab) { + v2s addr1, addr2, addr3, addr4; + v2s s2 = (v2s){2, 2}; + v2s tmpa1, tmpa2, tmpa3, tmpa4; + v2s tmpb1, tmpb2, tmpb3, tmpb4; + int32_t a1, a2, a3, a4; + int32_t b1, b2, b3, b4; + for (uint32_t i = 0; i < bitRevLen; i += 8) { + SWAP_ITEMS; + } +} + +void mempool_bitrevtable_q16p_xpulpimg(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab, + const uint32_t nPE) { + uint32_t core_id = mempool_get_core_id(); + v2s addr1, addr2, addr3, addr4; + v2s s2 = (v2s){2, 2}; + v2s tmpa1, tmpa2, tmpa3, tmpa4; + v2s tmpb1, tmpb2, tmpb3, tmpb4; + int32_t a1, a2, a3, a4; + int32_t b1, b2, b3, b4; + for (uint32_t i = 8 * core_id; i < bitRevLen; i += (8 * nPE)) { + SWAP_ITEMS; + } + mempool_log_partial_barrier(2, core_id, nPE); +} + +void mempool_bitrev_q16p_xpulpimg(uint16_t *pSrc, uint16_t *pDst, + const uint16_t fftLen, const uint32_t nPE) { + uint32_t core_id = mempool_get_core_id(); + uint32_t idx_result, idx, i, j; + for (i = core_id; i < fftLen; i += nPE) { + idx_result = 0; + idx = i; + for (j = 0; j < LOG2; j++) { + idx_result = (idx_result << 1U) | (idx & 1U); + idx = idx >> 1U; + } + pDst[2 * idx_result] = pSrc[2 * i]; + pDst[2 * idx_result + 1] = pSrc[2 * i + 1]; + } + mempool_log_partial_barrier(2, core_id, nPE); +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16p.h b/software/runtime/kernel/mempool_radix4_cfft_q16p.h new file mode 100644 index 000000000..34f338ce8 --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_q16p.h @@ -0,0 +1,640 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +#ifndef ASM +#define SHUFFLE_TWIDDLEFACT \ + t1 = (int16_t)CoSi1[0]; \ + t3 = (int16_t)CoSi2[0]; \ + t5 = (int16_t)CoSi3[0]; \ + t0 = (int16_t)CoSi1[1]; \ + t2 = (int16_t)CoSi2[1]; \ + t4 = (int16_t)CoSi3[1]; \ + C1 = __PACK2(t1, -t0); \ + C2 = __PACK2(t3, -t2); \ + C3 = __PACK2(t5, -t4); +#else +#define SHUFFLE_TWIDDLEFACT \ + asm volatile("pv.extract.h %[t1],%[CoSi1],0;" \ + "pv.extract.h %[t3],%[CoSi2],0;" \ + "pv.extract.h %[t5],%[CoSi3],0;" \ + "pv.extract.h %[t0],%[CoSi1],1;" \ + "pv.extract.h %[t2],%[CoSi2],1;" \ + "pv.extract.h %[t4],%[CoSi3],1;" \ + "sub %[t0],zero,%[t0];" \ + "sub %[t2],zero,%[t2];" \ + "sub %[t4],zero,%[t4];" \ + "pv.pack %[C1],%[t1],%[t0];" \ + "pv.pack %[C2],%[t3],%[t2];" \ + "pv.pack %[C3],%[t5],%[t4];" \ + : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), \ + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), \ + [t4] "=&r"(t4), [t5] "=&r"(t5) \ + : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) \ + :); +#endif + +#ifdef FOLDED_TWIDDLES + +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + if (ic % 4 == 0) { \ + *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ + ic_store += N_BANKS; \ + *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ + ic_store += N_BANKS; \ + *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi3; \ + } + +#else +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic * 2U)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic * 3U)]; +#endif + +void mempool_radix4_cfft_q16p_xpulpimg(int16_t *pSrc16, uint32_t fftLen, + const int16_t *pCoef16, + uint32_t twidCoefModifier, + uint32_t nPE) { + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id % nPE; + v2s CoSi1, CoSi2, CoSi3; + v2s C1, C2, C3; + int16_t t0, t1, t2, t3, t4, t5; + uint32_t n1, n2, ic, i0, j, k; + uint32_t step, steps; + + /* START OF FIRST STAGE PROCESS */ + n1 = fftLen; + n2 = n1 >> 2U; + step = (n2 + nPE - 1) / nPE; + for (i0 = core_id * step; i0 < MIN(core_id * step + step, n2); i0++) { + + /* Twiddle coefficients index modifier */ + ic = i0 * twidCoefModifier; + /* co1 & si1 are read from Coefficient pointer */ + CoSi1 = *(v2s *)&pCoef16[ic * 2U]; + /* co2 & si2 are read from Coefficient pointer */ + CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)]; + /* co3 & si3 are read from Coefficient pointer */ + CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)]; + SHUFFLE_TWIDDLEFACT; + + radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + mempool_log_barrier(2, absolute_core_id); + /* END OF FIRST STAGE PROCESSING */ + + /* START OF MIDDLE STAGE PROCESSING */ + twidCoefModifier <<= 2U; + for (k = fftLen / 4U; k > 4U; k >>= 2U) { + + uint32_t offset, butt_id; + n1 = n2; + n2 >>= 2U; + step = (n2 + nPE - 1) / nPE; + butt_id = core_id % 
+
+void mempool_radix4_cfft_q16p_xpulpimg(int16_t *pSrc16, uint32_t fftLen,
+                                       const int16_t *pCoef16,
+                                       uint32_t twidCoefModifier,
+                                       uint32_t nPE) {
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % nPE;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+  int16_t t0, t1, t2, t3, t4, t5;
+  uint32_t n1, n2, ic, i0, j, k;
+  uint32_t step, steps;
+
+  /* START OF FIRST STAGE PROCESSING */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  step = (n2 + nPE - 1) / nPE;
+  for (i0 = core_id * step; i0 < MIN(core_id * step + step, n2); i0++) {
+
+    /* Twiddle coefficients index modifier */
+    ic = i0 * twidCoefModifier;
+    /* co1 & si1 are read from the coefficient pointer */
+    CoSi1 = *(v2s *)&pCoef16[ic * 2U];
+    /* co2 & si2 are read from the coefficient pointer */
+    CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
+    /* co3 & si3 are read from the coefficient pointer */
+    CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];
+    SHUFFLE_TWIDDLEFACT;
+
+    radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                           C2, C3);
+  }
+  mempool_log_barrier(2, absolute_core_id);
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+
+    uint32_t offset, butt_id;
+    n1 = n2;
+    n2 >>= 2U;
+    step = (n2 + nPE - 1) / nPE;
+    butt_id = core_id % n2;
+    offset = (core_id / n2) * n1;
+    for (j = butt_id * step; j < MIN(butt_id * step + step, n2); j++) {
+      /* Twiddle coefficients index modifier */
+      ic = twidCoefModifier * j;
+      /* co1 & si1 are read from the coefficient pointer */
+      CoSi1 = *(v2s *)&pCoef16[ic * 2U];
+      /* co2 & si2 are read from the coefficient pointer */
+      CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
+      /* co3 & si3 are read from the coefficient pointer */
+      CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];
+      SHUFFLE_TWIDDLEFACT;
+
+      /* Butterfly implementation */
+      for (i0 = offset + j; i0 < fftLen; i0 += ((nPE + n2 - 1) / n2) * n1) {
+        radix4_butterfly_middle(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3,
+                                C1, C2, C3);
+      }
+    }
+    twidCoefModifier <<= 2U;
+    mempool_log_barrier(2, absolute_core_id);
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  steps = fftLen / n1;
+  step = (steps + nPE - 1) / nPE;
+  /* Butterfly implementation */
+  for (i0 = core_id * step * n1;
+       i0 < MIN((core_id * step + step) * n1, fftLen); i0 += n1) {
+    radix4_butterfly_last(pSrc16, pSrc16, i0);
+  }
+  mempool_log_barrier(2, absolute_core_id);
+  /* END OF LAST STAGE PROCESSING */
+  return;
+}
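
All nPE cores are expected to enter this kernel together; the work is split internally by core_id and the stages are separated by the log-barriers. A hedged launch sketch (l1_pSrc, l1_twiddleCoef and FFT_LEN are placeholder names for data already placed in L1 memory):

  #include <stdint.h>

  extern int16_t l1_pSrc[];        // interleaved Re/Im samples in L1
  extern int16_t l1_twiddleCoef[]; // Q15 twiddle table
  #define FFT_LEN 256U             // assumed power of four

  void fft_task_q16p(uint32_t nPE) {
    // Same call on every core; indices are partitioned inside the kernel.
    mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, FFT_LEN, l1_twiddleCoef, 1U,
                                      nPE);
  }
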
+
+void mempool_radix4by2_cfft_q16p(int16_t *pSrc, uint32_t fftLen,
+                                 const int16_t *pCoef, uint32_t nPE) {
+
+  uint32_t i;
+  uint32_t n2, step;
+  v2s pa, pb;
+
+  uint32_t l;
+  v2s CoSi;
+  v2s a, b, t;
+  int16_t testa, testb;
+  uint32_t core_id = mempool_get_core_id();
+
+  n2 = fftLen >> 1;
+  step = (n2 + nPE - 1) / nPE;
+  for (i = core_id * step; i < MIN(core_id * step + step, n2); i++) {
+
+    CoSi = *(v2s *)&pCoef[i * 2];
+    l = i + n2;
+    a = __SRA2(*(v2s *)&pSrc[2 * i], ((v2s){1, 1}));
+    b = __SRA2(*(v2s *)&pSrc[2 * l], ((v2s){1, 1}));
+    t = __SUB2(a, b);
+    *((v2s *)&pSrc[i * 2]) = __SRA2(__ADD2(a, b), ((v2s){1, 1}));
+
+    testa = (int16_t)(__DOTP2(t, CoSi) >> 16);
+    testb = (int16_t)(__DOTP2(t, __PACK2(-CoSi[1], CoSi[0])) >> 16);
+    *((v2s *)&pSrc[l * 2]) = __PACK2(testa, testb);
+  }
+  mempool_log_barrier(2, core_id);
+
+  if (nPE > 1) {
+    if (core_id < nPE / 2) {
+      // first col
+      mempool_radix4_cfft_q16p_xpulpimg(pSrc, n2, (int16_t *)pCoef, 2U,
+                                        nPE / 2);
+    } else {
+      // second col
+      mempool_radix4_cfft_q16p_xpulpimg(pSrc + fftLen, n2, (int16_t *)pCoef,
+                                        2U, nPE - nPE / 2);
+    }
+  } else {
+    // first col
+    mempool_radix4_cfft_q16p_xpulpimg(pSrc, n2, (int16_t *)pCoef, 2U, nPE);
+    // second col
+    mempool_radix4_cfft_q16p_xpulpimg(pSrc + fftLen, n2, (int16_t *)pCoef, 2U,
+                                      nPE);
+  }
+
+  for (i = core_id * step; i < MIN(core_id * step + step, n2); i++) {
+
+    pa = *(v2s *)&pSrc[4 * i];
+    pb = *(v2s *)&pSrc[4 * i + 2];
+
+    pa = __SLL2(pa, ((v2s){1, 1}));
+    pb = __SLL2(pb, ((v2s){1, 1}));
+
+    *((v2s *)&pSrc[4 * i]) = pa;
+    *((v2s *)&pSrc[4 * i + 2]) = pb;
+  }
+  mempool_log_barrier(2, core_id);
+  return;
+}
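
The radix-4-by-2 entry point serves lengths that are twice a power of four (128, 512, ...): the first loop is a radix-2 stage that combines pairs (i, i + fftLen/2), rotating the difference by the twiddle; the two half-length outputs are then each transformed with the radix-4 kernel, and the final loop shifts one bit of amplitude back in. A scalar sketch of the radix-2 split for one pair, with the same scaling as the vectorized code above (hypothetical helper):

  #include <stdint.h>

  // a = x[i], b = x[i + N/2], each as {Re, Im}. Stores (a+b)/4 at i and
  // ((a-b)/2) * (cos - j*sin) at i + N/2, as in the loop above.
  static void radix2_split_q15(const int16_t a[2], const int16_t b[2],
                               int16_t co, int16_t si,
                               int16_t sum[2], int16_t twid[2]) {
    int16_t ar = (int16_t)(a[0] >> 1), ai = (int16_t)(a[1] >> 1);
    int16_t br = (int16_t)(b[0] >> 1), bi = (int16_t)(b[1] >> 1);
    sum[0] = (int16_t)((ar + br) >> 1); // (a + b) / 4
    sum[1] = (int16_t)((ai + bi) >> 1);
    int16_t tr = (int16_t)(ar - br);    // (a - b) / 2
    int16_t ti = (int16_t)(ai - bi);
    twid[0] = (int16_t)(((int32_t)tr * co + (int32_t)ti * si) >> 16);
    twid[1] = (int16_t)(((int32_t)ti * co - (int32_t)tr * si) >> 16);
  }
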
+
+/**
+  @brief     Folding in local memory function
+  @param[in] pSrc16  points to input buffer of 16b data, Re and Im parts are
+                     interleaved
+  @param[in] fftLen  Length of the complex input vector
+  @param[in] nPE     Number of PE
+  @return    none
+*/
+static inline void fold_radix4(int16_t *pSrc16, uint32_t fftLen,
+                               uint32_t nPE) {
+  uint32_t n2, i0, i1, i2, i3;
+  uint32_t i1_store, i2_store, i3_store;
+  volatile v2s A, B, C;
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % 4U;
+  n2 = fftLen >> 2U;
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+    i1_store = i0 + N_BANKS;
+    i2_store = i1_store + N_BANKS;
+    i3_store = i2_store + N_BANKS;
+    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * N_BANKS)];
+      B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * N_BANKS)];
+      C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * N_BANKS)];
+      *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * N_BANKS)] = A;
+      *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * N_BANKS)] = B;
+      *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * N_BANKS)] = C;
+    }
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  return;
+}
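
fold_radix4 copies the upper three quarters of the vector onto successive N_BANKS-strided rows, so the four operands of one radix-4 butterfly live on four different rows of the banked L1 memory. The addressing it realizes, as a hedged plain-C helper (folded_offset is a hypothetical name):

  #include <stdint.h>

  // Halfword offset of element `i` of quarter `q` (q = 0..3) in the folded
  // layout produced above: quarter q sits on row q, i.e. q * n_banks further.
  static inline uint32_t folded_offset(uint32_t i, uint32_t q,
                                       uint32_t n_banks) {
    return 2U * (i + q * n_banks);
  }
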
+
+#ifdef FOLDED_TWIDDLES
+/**
+  @brief      Full FFT butterfly
+  @param[in]  pSrc16     points to input buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[out] pDst16     points to output buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[in]  fftLen     Length of the complex input vector
+  @param[in]  pCoef_src  Twiddle coefficients vector
+  @param[in]  pCoef_dst  Auxiliary twiddle coefficients vector
+  @param[in]  nPE        Number of PE
+  @return     none
+*/
+void mempool_radix4_cfft_q16p_folded(int16_t *pSrc16, int16_t *pDst16,
+                                     uint32_t fftLen, int16_t *pCoef_src,
+                                     int16_t *pCoef_dst, uint32_t nPE)
+#else
+/**
+  @brief      Full FFT butterfly (twiddles are not folded in memory)
+  @param[in]  pSrc16     points to input buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[out] pDst16     points to output buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[in]  fftLen     Length of the complex input vector
+  @param[in]  pCoef_src  Twiddle coefficients vector
+  @param[in]  nPE        Number of PE
+  @return     none
+*/
+void mempool_radix4_cfft_q16p_folded(int16_t *pSrc16, int16_t *pDst16,
+                                     uint32_t fftLen, int16_t *pCoef_src,
+                                     uint32_t nPE)
+#endif
+{
+
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id;
+  int16_t t0, t1, t2, t3, t4, t5;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+#ifdef FOLDED_TWIDDLES
+  uint32_t n1, n2, n2_store;
+  uint32_t i0, k, ic, ic_store;
+  int16_t *pTmp;
+#else
+  uint32_t n1, n2;
+  uint32_t i0, k, ic;
+  int16_t *pTmp;
+  uint32_t twidCoefModifier = 1U;
+#endif
+
+  /* START OF FIRST STAGE PROCESSING */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
+#ifdef FOLDED_TWIDDLES
+    ic = i0;
+    ic_store = ic >> 2U;
+    n2_store = n2 >> 2U;
+#else
+    ic = i0;
+#endif
+    LOAD_STORE_TWIDDLEFACT;
+    SHUFFLE_TWIDDLEFACT;
+    radix4_butterfly_first(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                           C2, C3);
+  }
+  pTmp = pSrc16;
+  pSrc16 = pDst16;
+  pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+  pTmp = pCoef_src;
+  pCoef_src = pCoef_dst;
+  pCoef_dst = pTmp;
+#else
+  twidCoefModifier <<= 2U;
+#endif
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
+#ifdef FOLDED_TWIDDLES
+      ic = i0;
+      // (ic % n2) / 4: take only every 4th index in the wing
+      // (ic / n2) * n2: shift by the wing size
+      ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
+      n2_store = n2 >> 2U;
+#else
+      ic = (i0 % n2) * twidCoefModifier;
+#endif
+      LOAD_STORE_TWIDDLEFACT;
+      SHUFFLE_TWIDDLEFACT;
+      radix4_butterfly_middle(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3,
+                              C1, C2, C3);
+    }
+    pTmp = pSrc16;
+    pSrc16 = pDst16;
+    pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+    pTmp = pCoef_src;
+    pCoef_src = pCoef_dst;
+    pCoef_dst = pTmp;
+#else
+    twidCoefModifier <<= 2U;
+#endif
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
+    radix4_butterfly_last(pSrc16, pDst16, i0);
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  /* END OF LAST STAGE PROCESSING */
+
+  return;
+}
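
Every stage writes into the other buffer and the pointers (and, under FOLDED_TWIDDLES, the twiddle buffers) are swapped in between, so whether the result ends up in pSrc16 or pDst16 depends on the number of stages. A hedged call sketch under FOLDED_TWIDDLES (buffer names are placeholders; each buffer must hold fftLen complex samples folded over four bank rows):

  #include <stdint.h>

  extern int16_t l1_ping[], l1_pong[];      // folded working buffers in L1
  extern int16_t l1_twid_a[], l1_twid_b[];  // twiddle ping-pong buffers

  void fft_folded_task(uint32_t fftLen, uint32_t nPE) {
    // Same call on every participating core.
    mempool_radix4_cfft_q16p_folded(l1_ping, l1_pong, fftLen, l1_twid_a,
                                    l1_twid_b, nPE);
  }
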
+
+/**
+  SCHEDULER OF MULTIPLE FOLDED FFTS
+  Memory:
+
+  1st row of FFTS
+
+  col_idx1     col_idx2     col_idx3
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+
+  2nd row of FFTS
+
+  col_idx1     col_idx2     col_idx3
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+
+  ...
+
+  @brief      Scheduler of folded FFTs (the column index of the current FFT
+              is derived from the core id)
+  @param[in]  pSrc16          input buffer of 16b data, Re and Im are
+                              interleaved
+  @param[out] pDst16          output buffer of 16b data, Re and Im are
+                              interleaved
+  @param[in]  fftLen          Length of the complex input vector
+  @param[in]  pCoef_src       Twiddle coefficients vector
+  @param[in]  pCoef_dst       Auxiliary twiddle coefficients vector
+  @param[in]  pBitRevTable    Bitreversal table
+  @param[in]  bitReverseLen   Length of the bitreversal table
+  @param[in]  bitReverseFlag  Flag for bitreversal
+  @param[in]  nPE             Number of PE
+  @return     none
+*/
+void mempool_radix4_cfft_q16p_scheduler(
+    int16_t *pSrc16, int16_t *pDst16, uint32_t fftLen, int16_t *pCoef_src,
+    __attribute__((unused)) int16_t *pCoef_dst,
+    __attribute__((unused)) uint16_t *pBitRevTable,
+    __attribute__((unused)) uint16_t bitReverseLen, uint8_t bitReverseFlag,
+    uint32_t nPE) {
+
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % (fftLen >> 4U);
+  uint32_t col_id = absolute_core_id / (fftLen >> 4U);
+
+  int16_t t0, t1, t2, t3, t4, t5;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+#ifdef FOLDED_TWIDDLES
+  uint32_t n1, n2, n2_store;
+  uint32_t i0, k, ic, ic_store;
+#else
+  uint32_t n1, n2;
+  uint32_t i0, k, ic;
+  uint32_t twidCoefModifier = 1U;
+#endif
+  int16_t *pTmp;
+
+  /* FIRST STAGE */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
+    ic = i0;
+#ifdef FOLDED_TWIDDLES
+    ic_store = ic >> 2U;
+    n2_store = n2 >> 2U;
+#endif
+    LOAD_STORE_TWIDDLEFACT;
+    SHUFFLE_TWIDDLEFACT;
+    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      int16_t *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
+      int16_t *pOut =
+          pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
+                             C3);
+    }
+  }
+  pTmp = pSrc16;
+  pSrc16 = pDst16;
+  pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+  pTmp = pCoef_src;
+  pCoef_src = pCoef_dst;
+  pCoef_dst = pTmp;
+#else
+  twidCoefModifier <<= 2U;
+#endif
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  /* MIDDLE STAGE */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
+#ifdef FOLDED_TWIDDLES
+      ic = i0;
+      ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
+      n2_store = n2 >> 2U;
+#else
+      ic = (i0 % n2) * twidCoefModifier;
+#endif
+      LOAD_STORE_TWIDDLEFACT;
+      SHUFFLE_TWIDDLEFACT;
+
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        int16_t *pIn =
+            pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        int16_t *pOut =
+            pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                                C2, C3);
+      }
+    }
+    pTmp = pSrc16;
+    pSrc16 = pDst16;
+    pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+    pTmp = pCoef_src;
+    pCoef_src = pCoef_dst;
+    pCoef_dst = pTmp;
+#else
+    twidCoefModifier <<= 2U;
+#endif
+    mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
+  }
+
+  /* LAST STAGE */
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
+    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      int16_t *pIn =
+          pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      int16_t *pOut =
+          pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      radix4_butterfly_last(pIn, pOut, i0);
+    }
+  }
+  pTmp = pSrc16;
+  pSrc16 = pDst16;
+  pDst16 = pTmp;
+  mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
+  mempool_stop_benchmark();
+  mempool_start_benchmark();
+
+  /* BITREVERSAL */
+  // The bitreversal stage stores to sequential addresses
+  if (bitReverseFlag) {
+#ifdef BITREVERSETABLE
+    pSrc16 = pSrc16 + 2 * col_id * (fftLen / 4);
+    pDst16 = pDst16 + 2 * col_id * fftLen;
+    for (ic = 8 * core_id; ic < bitReverseLen; ic += 8 * nPE) {
+      uint32_t addr1, addr2, addr3, addr4;
+      uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+      uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
+      uint32_t a1, a2, a3, a4;
+      uint32_t b1, b2, b3, b4;
+      uint32_t a1_load, a2_load, a3_load, a4_load;
+      uint32_t b1_load, b2_load, b3_load, b4_load;
+      uint32_t s2 = 0x00020002;
+      addr1 = *(uint32_t *)&pBitRevTable[ic];
+      addr2 = *(uint32_t *)&pBitRevTable[ic + 2];
+      addr3 = *(uint32_t *)&pBitRevTable[ic + 4];
+      addr4 = *(uint32_t *)&pBitRevTable[ic + 6];
+      asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];"
+                   "pv.sra.h %[addr2],%[addr2],%[s2];"
+                   "pv.sra.h %[addr3],%[addr3],%[s2];"
+                   "pv.sra.h %[addr4],%[addr4],%[s2];"
+                   "pv.extract.h %[a1],%[addr1],0;"
+                   "pv.extract.h %[a2],%[addr2],0;"
+                   "pv.extract.h %[a3],%[addr3],0;"
+                   "pv.extract.h %[a4],%[addr4],0;"
+                   "pv.extract.h %[b1],%[addr1],1;"
+                   "pv.extract.h %[b2],%[addr2],1;"
+                   "pv.extract.h %[b3],%[addr3],1;"
+                   "pv.extract.h %[b4],%[addr4],1;"
+                   : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3),
+                     [a4] "=r"(a4), [b1] "=r"(b1), [b2] "=r"(b2),
+                     [b3] "=r"(b3), [b4] "=r"(b4), [addr1] "+&r"(addr1),
+                     [addr2] "+&r"(addr2), [addr3] "+&r"(addr3),
+                     [addr4] "+&r"(addr4)
+                   : [s2] "r"(s2)
+                   :);
+      // Compute the local addresses from the natural-order ones
+      a1_load = (a1 % 4) * 2 * N_BANKS + 2 * (a1 / 4);
+      a2_load = (a2 % 4) * 2 * N_BANKS + 2 * (a2 / 4);
+      a3_load = (a3 % 4) * 2 * N_BANKS + 2 * (a3 / 4);
+      a4_load = (a4 % 4) * 2 * N_BANKS + 2 * (a4 / 4);
+      b1_load = (b1 % 4) * 2 * N_BANKS + 2 * (b1 / 4);
+      b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4);
+      b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4);
+      b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4);
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8));
+        uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8));
+        // Load at address a
+        tmpa1 = *(uint32_t *)&ptr1[a1_load];
+        tmpa2 = *(uint32_t *)&ptr1[a2_load];
+        tmpa3 = *(uint32_t *)&ptr1[a3_load];
+        tmpa4 = *(uint32_t *)&ptr1[a4_load];
+        // Load at address b
+        tmpb1 = *(uint32_t *)&ptr1[b1_load];
+        tmpb2 = *(uint32_t *)&ptr1[b2_load];
+        tmpb3 = *(uint32_t *)&ptr1[b3_load];
+        tmpb4 = *(uint32_t *)&ptr1[b4_load];
+        // Swap a with b
+        *((uint32_t *)&ptr2[b1]) = tmpa1;
+        *((uint32_t *)&ptr2[b2]) = tmpa2;
+        *((uint32_t *)&ptr2[b3]) = tmpa3;
+        *((uint32_t *)&ptr2[b4]) = tmpa4;
+        // Swap b with a
+        *((uint32_t *)&ptr2[a1]) = tmpb1;
+        *((uint32_t *)&ptr2[a2]) = tmpb2;
+        *((uint32_t *)&ptr2[a3]) = tmpb3;
+        *((uint32_t *)&ptr2[a4]) = tmpb4;
+      }
+    }
+#else
+    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4));
+    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen);
+    for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U);
+         ic += 4) {
+      uint32_t idx0 = ic;
+      uint32_t idx1 = ic + 1;
+      uint32_t idx2 = ic + 2;
+      uint32_t idx3 = ic + 3;
+      uint32_t idx_result0 = 0;
+      uint32_t idx_result1 = 0;
+      uint32_t idx_result2 = 0;
+      uint32_t idx_result3 = 0;
+      // Compute the folded source addresses before the shifts below consume
+      // the natural-order indices
+      uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
+      uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
+      uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;
+      uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS;
+      for (k = 0; k < LOG2; k++) {
+        idx_result0 = (idx_result0 << 1U) | (idx0 & 1U);
+        idx_result1 = (idx_result1 << 1U) | (idx1 & 1U);
+        idx_result2 = (idx_result2 << 1U) | (idx2 & 1U);
+        idx_result3 = (idx_result3 << 1U) | (idx3 & 1U);
+        idx0 = idx0 >> 1U;
+        idx1 = idx1 >> 1U;
+        idx2 = idx2 >> 1U;
+        idx3 = idx3 >> 1U;
+      }
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint32_t src0 = addr_src0 + idx_row * (N_BANKS * 8);
+        uint32_t src1 = addr_src1 + idx_row * (N_BANKS * 8);
+        uint32_t src2 = addr_src2 + idx_row * (N_BANKS * 8);
+        uint32_t src3 = addr_src3 + idx_row * (N_BANKS * 8);
+        uint32_t dst0 = idx_result0 + idx_row * (N_BANKS * 8);
+        uint32_t dst1 = idx_result1 + idx_row * (N_BANKS * 8);
+        uint32_t dst2 = idx_result2 + idx_row * (N_BANKS * 8);
+        uint32_t dst3 = idx_result3 + idx_row * (N_BANKS * 8);
+        // Copy the full Re/Im pair to the bit-reversed location
+        *((uint32_t *)&ptr2[dst0]) = *(uint32_t *)&ptr1[src0];
+        *((uint32_t *)&ptr2[dst1]) = *(uint32_t *)&ptr1[src1];
+        *((uint32_t *)&ptr2[dst2]) = *(uint32_t *)&ptr1[src2];
+        *((uint32_t *)&ptr2[dst3]) = *(uint32_t *)&ptr1[src3];
+      }
+    }
+#endif
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  return;
+}
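
A hedged sketch of how the scheduler is meant to be entered, with all cores of all columns making the same call (the buffer and table names are placeholders following the memory layout sketched above):

  #include <stdint.h>

  extern int16_t l1_src[], l1_dst[];        // folded working buffers in L1
  extern int16_t l1_twid_a[], l1_twid_b[];  // twiddle ping-pong buffers
  extern uint16_t l1_bitrev[];              // bit-reversal table
  extern uint16_t l1_bitrev_len;            // its length

  void fft_sched_task(uint32_t fftLen, uint32_t nPE) {
    mempool_radix4_cfft_q16p_scheduler(l1_src, l1_dst, fftLen, l1_twid_a,
                                       l1_twid_b, l1_bitrev, l1_bitrev_len,
                                       1U, nPE);
  }
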
diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16s.h b/software/runtime/kernel/mempool_radix4_cfft_q16s.h
new file mode 100644
index 000000000..d795cc40c
--- /dev/null
+++ b/software/runtime/kernel/mempool_radix4_cfft_q16s.h
@@ -0,0 +1,572 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include "xpulp/builtins_v2.h"
+
+void mempool_radix4_cfft_q16s_riscv32(int16_t *pIn, uint32_t fftLen,
+                                      int16_t *pCoef16,
+                                      uint32_t twidCoefModifier) {
+
+  int16_t R0, R1, S0, S1, T0, T1, U0, U1;
+  int16_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+
+  /* START OF FIRST STAGE PROCESSING */
+  n2 = fftLen;
+  n1 = n2;
+  n2 >>= 2U;
+  ic = 0U;
+  for (i0 = 0; i0 < n2; i0++) {
+
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /* Reading i0, i0+fftLen/2 inputs */
+    /* The input is downscaled by 4 to avoid overflow */
+    /* Read ya (real), xa (imag) input */
+    T0 = pIn[i0 * 2U] >> 2U;
+    T1 = pIn[(i0 * 2U) + 1U] >> 2U;
+    /* Read yc (real), xc (imag) input */
+    S0 = pIn[i2 * 2U] >> 2U;
+    S1 = pIn[(i2 * 2U) + 1U] >> 2U;
+    /* R0 = (ya + yc) */
+    R0 = (int16_t)__CLIP(T0 + S0, 15);
+    /* R1 = (xa + xc) */
+    R1 = (int16_t)__CLIP(T1 + S1, 15);
+    /* S0 = (ya - yc) */
+    S0 = (int16_t)__CLIP(T0 - S0, 15);
+    /* S1 = (xa - xc) */
+    S1 = (int16_t)__CLIP(T1 - S1, 15);
+    /* Reading i0+fftLen/4, i0+3fftLen/4 inputs */
+    /* Read yb (real), xb (imag) input */
+    T0 = pIn[i1 * 2U] >> 2U;
+    T1 = pIn[(i1 * 2U) + 1U] >> 2U;
+    /* Read yd (real), xd (imag) input */
+    U0 = pIn[i3 * 2U] >> 2U;
+    U1 = pIn[(i3 * 2U) + 1U] >> 2U;
+    /* T0 = (yb + yd) */
+    T0 = (int16_t)__CLIP(T0 + U0, 15);
+    /* T1 = (xb + xd) */
+    T1 = (int16_t)__CLIP(T1 + U1, 15);
+    /* Writing the butterfly-processed i0 sample */
+    /* ya' = ya + yb + yc + yd */
+    /* xa' = xa + xb + xc + xd */
+    pIn[i0 * 2] = (int16_t)((R0 >> 1U) + (T0 >> 1U));
+    pIn[(i0 * 2) + 1] = (int16_t)((R1 >> 1U) + (T1 >> 1U));
+    /* R0 = (ya + yc) - (yb + yd) */
+    /* R1 = (xa + xc) - (xb + xd) */
+    R0 = (int16_t)__CLIP(R0 - T0, 15);
+    R1 = (int16_t)__CLIP(R1 - T1, 15);
+    /* co2 & si2 are read from the coefficient pointer */
+    Co2 = pCoef16[2U * ic * 2U];
+    Si2 = pCoef16[(2U * ic * 2U) + 1];
+    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+    out1 = (int16_t)((Co2 * R0 + Si2 * R1) >> 16U);
+    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+    out2 = (int16_t)((-Si2 * R0 + Co2 * R1) >> 16U);
+    /* Reading i0+fftLen/4 */
+    /* The input is downscaled by 4 to avoid overflow */
+    /* T0 = yb, T1 = xb */
+    T0 = pIn[i1 * 2U] >> 2;
+    T1 = pIn[(i1 * 2U) + 1] >> 2;
+    /* Writing the butterfly-processed i0 + fftLen/4 sample */
+    /* Writing output (xc', yc') in little-endian format */
+    pIn[i1 * 2U] = out1;
+    pIn[(i1 * 2U) + 1] = out2;
+    /* Butterfly calculations */
+    /* U0 = yd, U1 = xd */
+    U0 = pIn[i3 * 2U] >> 2;
+    U1 = pIn[(i3 * 2U) + 1] >> 2;
+    /* T0 = yb - yd */
+    T0 = (int16_t)__CLIP(T0 - U0, 15);
+    /* T1 = xb - xd */
+    T1 = (int16_t)__CLIP(T1 - U1, 15);
+    /* R0 = (ya-yc) - (xb-xd), R1 = (xa-xc) + (yb-yd) */
+    R0 = (int16_t)__CLIP((int32_t)(S0 - T1), 15);
+    R1 = (int16_t)__CLIP((int32_t)(S1 + T0), 15);
+    /* S0 = (ya-yc) + (xb-xd), S1 = (xa-xc) - (yb-yd) */
+    S0 = (int16_t)__CLIP(((int32_t)S0 + T1), 15);
+    S1 = (int16_t)__CLIP(((int32_t)S1 - T0), 15);
+    /* co1 & si1 are read from the coefficient pointer */
+    Co1 = pCoef16[ic * 2U];
+    Si1 = pCoef16[(ic * 2U) + 1];
+    /* Butterfly process for the i0+fftLen/2 sample */
+    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+    out1 = (int16_t)((Si1 * S1 + Co1 * S0) >> 16);
+    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+    out2 = (int16_t)((-Si1 * S0 + Co1 * S1) >> 16);
+    /* Writing output (xb', yb') in little-endian format */
+    pIn[i2 * 2U] = out1;
+    pIn[(i2 * 2U) + 1] = out2;
+    /* co3 & si3 are read from the coefficient pointer */
+    Co3 = pCoef16[3U * (ic * 2U)];
+    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
+    /* Butterfly process for the i0+3fftLen/4 sample */
+    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+    out1 = (int16_t)((Si3 * R1 + Co3 * R0) >> 16U);
+    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+    out2 = (int16_t)((-Si3 * R0 + Co3 * R1) >> 16U);
+    /* Writing output (xd', yd') in little-endian format */
+    pIn[i3 * 2U] = out1;
+    pIn[(i3 * 2U) + 1] = out2;
+
+    /* Twiddle coefficients index modifier */
+    ic = ic + twidCoefModifier;
+  }
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+
+    /* Initializations for the middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+    for (j = 0U; j <= (n2 - 1U); j++) {
+
+      /* Index calculation for the coefficients */
+      Co1 = pCoef16[ic * 2U];
+      Si1 = pCoef16[(ic * 2U) + 1U];
+      Co2 = pCoef16[2U * (ic * 2U)];
+      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
+      Co3 = pCoef16[3U * (ic * 2U)];
+      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
+      /* Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+
+      /* Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1) {
+
+        i1 = i0 + n2;
+        i2 = i1 + n2;
+        i3 = i2 + n2;
+
+        /* Reading i0, i0+fftLen/2 inputs */
+        /* Read ya (real), xa (imag) input */
+        T0 = pIn[i0 * 2U];
+        T1 = pIn[(i0 * 2U) + 1U];
+        /* Read yc (real), xc (imag) input */
+        S0 = pIn[i2 * 2U];
+        S1 = pIn[(i2 * 2U) + 1U];
+        /* R0 = (ya + yc), R1 = (xa + xc) */
+        R0 = (int16_t)__CLIP(T0 + S0, 15);
+        R1 = (int16_t)__CLIP(T1 + S1, 15);
+        /* S0 = (ya - yc), S1 = (xa - xc) */
+        S0 = (int16_t)__CLIP(T0 - S0, 15);
+        S1 = (int16_t)__CLIP(T1 - S1, 15);
+        /* Reading i0+fftLen/4, i0+3fftLen/4 inputs */
+        /* Read yb (real), xb (imag) input */
+        T0 = pIn[i1 * 2U];
+        T1 = pIn[(i1 * 2U) + 1U];
+        /* Read yd (real), xd (imag) input */
+        U0 = pIn[i3 * 2U];
+        U1 = pIn[(i3 * 2U) + 1U];
+        /* T0 = (yb + yd), T1 = (xb + xd) */
+        T0 = (int16_t)__CLIP(T0 + U0, 15);
+        T1 = (int16_t)__CLIP(T1 + U1, 15);
+        /* Writing the butterfly-processed i0 sample */
+        /* xa' = xa + xb + xc + xd */
+        /* ya' = ya + yb + yc + yd */
+        out1 = (int16_t)(((R0 >> 1U) + (T0 >> 1U)) >> 1U);
+        out2 = (int16_t)(((R1 >> 1U) + (T1 >> 1U)) >> 1U);
+        pIn[i0 * 2U] = out1;
+        pIn[(2U * i0) + 1U] = out2;
+        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+        R0 = (int16_t)((R0 >> 1U) - (T0 >> 1U));
+        R1 = (int16_t)((R1 >> 1U) - (T1 >> 1U));
+        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+        out1 = (int16_t)((Co2 * R0 + Si2 * R1) >> 16U);
+        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        out2 = (int16_t)((-Si2 * R0 + Co2 * R1) >> 16U);
+        /* Reading i0+fftLen/4 */
+        /* Read yb (real), xb (imag) input */
+        T0 = pIn[i1 * 2U];
+        T1 = pIn[(i1 * 2U) + 1U];
+        /* Writing the butterfly-processed i0 + fftLen/4 sample */
+        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        pIn[i1 * 2U] = out1;
+        pIn[(i1 * 2U) + 1U] = out2;
+        /* Butterfly calculations */
+        /* Read yd (real), xd (imag) input */
+        U0 = pIn[i3 * 2U];
+        U1 = pIn[(i3 * 2U) + 1U];
+        /* T0 = yb-yd, T1 = xb-xd */
+        T0 = (int16_t)__CLIP(T0 - U0, 15);
+        T1 = (int16_t)__CLIP(T1 - U1, 15);
+        /* R0 = (ya-yc) - (xb-xd), R1 = (xa-xc) + (yb-yd) */
+        R0 = (int16_t)((S0 >> 1U) - (T1 >> 1U));
+        R1 = (int16_t)((S1 >> 1U) + (T0 >> 1U));
+        /* S0 = (ya-yc) + (xb-xd), S1 = (xa-xc) - (yb-yd) */
+        S0 = (int16_t)((S0 >> 1U) + (T1 >> 1U));
+        S1 = (int16_t)((S1 >> 1U) - (T0 >> 1U));
+        /* Butterfly process for the i0+fftLen/2 sample */
+        out1 = (int16_t)((Co1 * S0 + Si1 * S1) >> 16U);
+        out2 = (int16_t)((-Si1 * S0 + Co1 * S1) >> 16U);
+        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+        pIn[i2 * 2U] = out1;
+        pIn[(i2 * 2U) + 1U] = out2;
+        /* Butterfly process for the i0+3fftLen/4 sample */
+        out1 = (int16_t)((Si3 * R1 + Co3 * R0) >> 16U);
+        out2 = (int16_t)((-Si3 * R0 + Co3 * R1) >> 16U);
+        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+        pIn[i3 * 2U] = out1;
+        pIn[(i3 * 2U) + 1U] = out2;
+      }
+    }
+    /* Twiddle coefficients index modifier */
+    twidCoefModifier <<= 2U;
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  for (i0 = 0U; i0 < fftLen; i0 += n1) {
+
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /* Reading i0, i0+fftLen/2 inputs */
+    /* Read ya (real), xa (imag) input */
+    T0 = pIn[i0 * 2U];
+    T1 = pIn[(i0 * 2U) + 1U];
+    /* Read yc (real), xc (imag) input */
+    S0 = pIn[i2 * 2U];
+    S1 = pIn[(i2 * 2U) + 1U];
+    /* R0 = (ya + yc), R1 = (xa + xc) */
+    R0 = (int16_t)__CLIP(T0 + S0, 15);
+    R1 = (int16_t)__CLIP(T1 + S1, 15);
+    /* S0 = (ya - yc), S1 = (xa - xc) */
+    S0 = (int16_t)__CLIP(T0 - S0, 15);
+    S1 = (int16_t)__CLIP(T1 - S1, 15);
+    /* Reading i0+fftLen/4, i0+3fftLen/4 inputs */
+    /* Read yb (real), xb (imag) input */
+    T0 = pIn[i1 * 2U];
+    T1 = pIn[(i1 * 2U) + 1U];
+    /* Read yd (real), xd (imag) input */
+    U0 = pIn[i3 * 2U];
+    U1 = pIn[(i3 * 2U) + 1U];
+    /* T0 = (yb + yd), T1 = (xb + xd) */
+    T0 = (int16_t)__CLIP(T0 + U0, 15);
+    T1 = (int16_t)__CLIP(T1 + U1, 15);
+    /* Writing the butterfly-processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    pIn[i0 * 2U] = (int16_t)((R0 >> 1U) + (T0 >> 1U));
+    pIn[(i0 * 2U) + 1U] = (int16_t)((R1 >> 1U) + (T1 >> 1U));
+    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+    R0 = (int16_t)((R0 >> 1U) - (T0 >> 1U));
+    R1 = (int16_t)((R1 >> 1U) - (T1 >> 1U));
+    /* Read yb (real), xb (imag) input */
+    T0 = pIn[i1 * 2U];
+    T1 = pIn[(i1 * 2U) + 1U];
+    /* Writing the butterfly-processed i0 + fftLen/4 sample */
+    /* xc' = (xa-xb+xc-xd) */
+    /* yc' = (ya-yb+yc-yd) */
+    pIn[i1 * 2U] = R0;
+    pIn[(i1 * 2U) + 1U] = R1;
+    /* Read yd (real), xd (imag) input */
+    U0 = pIn[i3 * 2U];
+    U1 = pIn[(i3 * 2U) + 1U];
+    /* T0 = (yb - yd), T1 = (xb - xd) */
+    T0 = (int16_t)__CLIP(T0 - U0, 15);
+    T1 = (int16_t)__CLIP(T1 - U1, 15);
+    /* Writing the butterfly-processed i0 + fftLen/2 sample */
+    /* xb' = (xa+yb-xc-yd) */
+    /* yb' = (ya-xb-yc+xd) */
+    pIn[i2 * 2U] = (int16_t)((S0 >> 1U) + (T1 >> 1U));
+    pIn[(i2 * 2U) + 1U] = (int16_t)((S1 >> 1U) - (T0 >> 1U));
+    /* Writing the butterfly-processed i0 + 3fftLen/4 sample */
+    /* xd' = (xa-yb-xc+yd) */
+    /* yd' = (ya+xb-yc-xd) */
+    pIn[i3 * 2U] = (int16_t)((S0 >> 1U) - (T1 >> 1U));
+    pIn[(i3 * 2U) + 1U] = (int16_t)((S1 >> 1U) + (T0 >> 1U));
+  }
+  /* END OF LAST STAGE PROCESSING */
+}
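
Throughout these kernels __CLIP(x, 15) saturates a 32-bit intermediate to the signed 16-bit range. A plain-C equivalent, useful when porting or unit-testing on a host (hypothetical helper):

  #include <stdint.h>

  static inline int16_t clip_q15(int32_t x) {
    if (x > 32767)   // 2^15 - 1
      return 32767;
    if (x < -32768)  // -2^15
      return -32768;
    return (int16_t)x;
  }
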
+
+void mempool_radix4_cfft_q16s_xpulpimg(int16_t *pSrc16, uint32_t fftLen,
+                                       int16_t *pCoef16,
+                                       uint32_t twidCoefModifier) {
+
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+  int16_t t0, t1, t2, t3, t4, t5;
+  uint32_t n1, n2, ic, i0, j, k;
+
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+
+  /* START OF FIRST STAGE PROCESSING */
+  for (i0 = 0; i0 < n2; i0++) {
+    CoSi1 = *(v2s *)&pCoef16[2U * i0];
+    CoSi2 = *(v2s *)&pCoef16[2U * i0 * 2U];
+    CoSi3 = *(v2s *)&pCoef16[2U * i0 * 3U];
+    SHUFFLE_TWIDDLEFACT;
+
+    radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                           C2, C3);
+  }
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+    for (j = 0U; j <= (n2 - 1U); j++) {
+      /* Index calculation for the coefficients */
+      CoSi1 = *(v2s *)&pCoef16[ic * 2U];
+      CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
+      CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];
+      SHUFFLE_TWIDDLEFACT;
+
+      /* Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+      /* Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1) {
+        radix4_butterfly_middle(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3,
+                                C1, C2, C3);
+      }
+    }
+    twidCoefModifier <<= 2U;
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  for (i0 = 0U; i0 < fftLen; i0 += n1) {
+    radix4_butterfly_last(pSrc16, pSrc16, i0);
+  }
+  /* END OF LAST STAGE PROCESSING */
+}
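
For validating any of these kernels up to their internal scaling, a naive double-precision DFT makes a convenient host-side reference; a minimal sketch (hypothetical helper):

  #include <math.h>
  #include <stdint.h>

  // O(N^2) reference DFT of n interleaved Re/Im int16 samples.
  static void dft_ref(const int16_t *in, double *out_re, double *out_im,
                      uint32_t n) {
    const double PI = 3.14159265358979323846;
    for (uint32_t k = 0; k < n; k++) {
      double sr = 0.0, si = 0.0;
      for (uint32_t i = 0; i < n; i++) {
        double ang = -2.0 * PI * (double)k * (double)i / (double)n;
        double re = (double)in[2 * i], im = (double)in[2 * i + 1];
        sr += re * cos(ang) - im * sin(ang);
        si += re * sin(ang) + im * cos(ang);
      }
      out_re[k] = sr;
      out_im[k] = si;
    }
  }
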
+
+void mempool_cfft_radix4by2_q16s_riscv32(int16_t *pSrc, uint32_t fftLen,
+                                         const int16_t *pCoef) {
+
+  uint32_t i;
+  uint32_t n2;
+  int16_t p0, p1, p2, p3;
+
+  uint32_t l;
+  int16_t xt, yt, cosVal, sinVal;
+
+  n2 = fftLen >> 1;
+
+  for (i = 0; i < n2; i++) {
+    cosVal = pCoef[i * 2];
+    sinVal = pCoef[(i * 2) + 1];
+
+    l = i + n2;
+
+    xt = (int16_t)((pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U));
+    pSrc[2 * i] = (int16_t)(((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U);
+
+    yt = (int16_t)((pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U));
+    pSrc[2 * i + 1] =
+        (int16_t)(((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U);
+
+    pSrc[2U * l] = (int16_t)(((int16_t)(((int32_t)xt * cosVal) >> 16)) +
+                             ((int16_t)(((int32_t)yt * sinVal) >> 16)));
+
+    pSrc[2U * l + 1U] = (int16_t)(((int16_t)(((int32_t)yt * cosVal) >> 16)) -
+                                  ((int16_t)(((int32_t)xt * sinVal) >> 16)));
+  }
+
+  // first col
+  mempool_radix4_cfft_q16s_riscv32(pSrc, n2, (int16_t *)pCoef, 2U);
+  // second col
+  mempool_radix4_cfft_q16s_riscv32(pSrc + fftLen, n2, (int16_t *)pCoef, 2U);
+
+  for (i = 0; i < (fftLen >> 1); i++) {
+    p0 = pSrc[4 * i + 0];
+    p1 = pSrc[4 * i + 1];
+    p2 = pSrc[4 * i + 2];
+    p3 = pSrc[4 * i + 3];
+
+    p0 = (int16_t)(p0 << 1U);
+    p1 = (int16_t)(p1 << 1U);
+    p2 = (int16_t)(p2 << 1U);
+    p3 = (int16_t)(p3 << 1U);
+
+    pSrc[4 * i + 0] = p0;
+    pSrc[4 * i + 1] = p1;
+    pSrc[4 * i + 2] = p2;
+    pSrc[4 * i + 3] = p3;
+  }
+}
+
+void mempool_cfft_radix4by2_q16s_xpulpimg(int16_t *pSrc, uint32_t fftLen,
+                                          const int16_t *pCoef) {
+
+  uint32_t i;
+  uint32_t n2;
+  v2s pa, pb;
+
+  uint32_t l;
+  v2s CoSi;
+  v2s a, b, t;
+  int16_t testa, testb;
+
+  n2 = fftLen >> 1;
+
+  for (i = 0; i < n2; i++) {
+    CoSi = *(v2s *)&pCoef[i * 2];
+    l = i + n2;
+    a = __SRA2(*(v2s *)&pSrc[2 * i], ((v2s){1, 1}));
+    b = __SRA2(*(v2s *)&pSrc[2 * l], ((v2s){1, 1}));
+    t = __SUB2(a, b);
+    *((v2s *)&pSrc[i * 2]) = __SRA2(__ADD2(a, b), ((v2s){1, 1}));
+
+    testa = (int16_t)(__DOTP2(t, CoSi) >> 16);
+    testb = (int16_t)(__DOTP2(t, __PACK2(-CoSi[1], CoSi[0])) >> 16);
+    *((v2s *)&pSrc[l * 2]) = __PACK2(testa, testb);
+  }
+
+  // first col
+  mempool_radix4_cfft_q16s_xpulpimg(pSrc, n2, (int16_t *)pCoef, 2U);
+  // second col
+  mempool_radix4_cfft_q16s_xpulpimg(pSrc + fftLen, n2, (int16_t *)pCoef, 2U);
+
+  for (i = 0; i < (fftLen >> 1); i++) {
+    pa = *(v2s *)&pSrc[4 * i];
+    pb = *(v2s *)&pSrc[4 * i + 2];
+
+    pa = __SLL2(pa, ((v2s){1, 1}));
+    pb = __SLL2(pb, ((v2s){1, 1}));
+
+    *((v2s *)&pSrc[4 * i]) = pa;
+    *((v2s *)&pSrc[4 * i + 2]) = pb;
+  }
+}
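
The riscv32 and xpulpimg radix-4-by-2 variants are drop-in equivalents. A hedged single-core call sketch (array names are placeholders; fftLen must be twice a power of four):

  #include <stdint.h>

  extern int16_t l1_buf[];         // 2 * fftLen halfwords, Re/Im interleaved
  extern const int16_t l1_twid[];  // Q15 twiddle table

  void fft_single(uint32_t fftLen) {
    mempool_cfft_radix4by2_q16s_xpulpimg(l1_buf, fftLen, l1_twid);
  }
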
+
+void mempool_radix4_cfft_q16s_scheduler(
+    int16_t *pSrc16, uint16_t fftLen, int16_t *pCoef16, uint16_t *pBitRevTable,
+    uint16_t bitReverseLen, uint8_t bitReverseFlag, uint32_t nFFTs) {
+
+  /* Initializations for the first stage */
+  int16_t t0, t1, t2, t3, t4, t5;
+  uint32_t n1, n2, i0, ic, j, k, twidCoefModifier;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+
+  /* FIRST STAGE */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  for (i0 = 0; i0 < n2; i0++) {
+    CoSi1 = *(v2s *)&pCoef16[2U * i0];
+    CoSi2 = *(v2s *)&pCoef16[2U * i0 * 2U];
+    CoSi3 = *(v2s *)&pCoef16[2U * i0 * 3U];
+    SHUFFLE_TWIDDLEFACT;
+
+    for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+      radix4_butterfly_first(pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                             pSrc16 + (int32_t)idx_fft * (2 * fftLen), i0, n2,
+                             CoSi1, CoSi2, CoSi3, C1, C2, C3);
+    }
+  }
+
+  /* MIDDLE STAGE */
+  twidCoefModifier = 4;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+    for (j = 0U; j <= (n2 - 1U); j++) {
+      CoSi1 = *(v2s *)&pCoef16[2U * ic];
+      CoSi2 = *(v2s *)&pCoef16[2U * ic * 2U];
+      CoSi3 = *(v2s *)&pCoef16[2U * ic * 3U];
+      SHUFFLE_TWIDDLEFACT;
+
+      ic = ic + twidCoefModifier;
+      for (i0 = j; i0 < fftLen; i0 += n1) {
+        for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+          radix4_butterfly_middle(pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                                  pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                                  i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3);
+        }
+      }
+    }
+    twidCoefModifier <<= 2U;
+  }
+
+  /* LAST STAGE */
+  n1 = n2;
+  n2 >>= 2U;
+  for (i0 = 0U; i0 < fftLen; i0 += n1) {
+    for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+      radix4_butterfly_last(pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                            pSrc16 + (int32_t)idx_fft * (2 * fftLen), i0);
+    }
+  }
+
+  /* BITREVERSAL */
+  if (bitReverseFlag) {
+    v2s addr1, addr2, addr3, addr4;
+    v2s s2 = (v2s){2, 2};
+    v2s tmpa1, tmpa2, tmpa3, tmpa4;
+    v2s tmpb1, tmpb2, tmpb3, tmpb4;
+    int32_t a1, a2, a3, a4;
+    int32_t b1, b2, b3, b4;
+    uint16_t *ptr;
+    for (uint32_t i = 0; i < bitReverseLen; i += 8) {
+      addr1 = *(v2s *)&pBitRevTable[i];
+      addr2 = *(v2s *)&pBitRevTable[i + 2];
+      addr3 = *(v2s *)&pBitRevTable[i + 4];
+      addr4 = *(v2s *)&pBitRevTable[i + 6];
+      asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];"
+                   "pv.sra.h %[addr2],%[addr2],%[s2];"
+                   "pv.sra.h %[addr3],%[addr3],%[s2];"
+                   "pv.sra.h %[addr4],%[addr4],%[s2];"
+                   "pv.extract.h %[a1],%[addr1],0;"
+                   "pv.extract.h %[a2],%[addr2],0;"
+                   "pv.extract.h %[a3],%[addr3],0;"
+                   "pv.extract.h %[a4],%[addr4],0;"
+                   "pv.extract.h %[b1],%[addr1],1;"
+                   "pv.extract.h %[b2],%[addr2],1;"
+                   "pv.extract.h %[b3],%[addr3],1;"
+                   "pv.extract.h %[b4],%[addr4],1;"
+                   : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3),
+                     [a4] "=r"(a4), [b1] "=r"(b1), [b2] "=r"(b2),
+                     [b3] "=r"(b3), [b4] "=r"(b4), [addr1] "+r"(addr1),
+                     [addr2] "+r"(addr2), [addr3] "+r"(addr3),
+                     [addr4] "+r"(addr4)
+                   : [s2] "r"(s2)
+                   :);
+      ptr = (uint16_t *)pSrc16;
+      for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+        tmpa1 = *(v2s *)&ptr[a1];
+        tmpa2 = *(v2s *)&ptr[a2];
+        tmpa3 = *(v2s *)&ptr[a3];
+        tmpa4 = *(v2s *)&ptr[a4];
+        tmpb1 = *(v2s *)&ptr[b1];
+        tmpb2 = *(v2s *)&ptr[b2];
+        tmpb3 = *(v2s *)&ptr[b3];
+        tmpb4 = *(v2s *)&ptr[b4];
+        *((v2s *)&ptr[a1]) = tmpb1;
+        *((v2s *)&ptr[a2]) = tmpb2;
+        *((v2s *)&ptr[a3]) = tmpb3;
+        *((v2s *)&ptr[a4]) = tmpb4;
+        *((v2s *)&ptr[b1]) = tmpa1;
+        *((v2s *)&ptr[b2]) = tmpa2;
+        *((v2s *)&ptr[b3]) = tmpa3;
+        *((v2s *)&ptr[b4]) = tmpa4;
+        ptr += (2 * fftLen);
+      }
+    }
+  }
+}
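
A hedged sketch of batching several transforms through this serial scheduler (names are placeholders; the nFFTs transforms are stored back to back, 2 * fftLen halfwords apart, as the kernel's pointer arithmetic assumes):

  #include <stdint.h>

  extern int16_t l1_batch[];       // nFFTs consecutive complex buffers
  extern int16_t l1_twid[];        // Q15 twiddle table
  extern uint16_t l1_bitrev[];     // bit-reversal table
  extern uint16_t l1_bitrev_len;   // its length

  void fft_batch(uint16_t fftLen, uint32_t nFFTs) {
    mempool_radix4_cfft_q16s_scheduler(l1_batch, fftLen, l1_twid, l1_bitrev,
                                       l1_bitrev_len, 1U, nFFTs);
  }
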