From 74e7ca367a114fea0e6eb47ee7e88a5220dd8ff2 Mon Sep 17 00:00:00 2001 From: nicomar0 Date: Sun, 19 Nov 2023 17:54:53 +0100 Subject: [PATCH] Merge pull request #84 from pulp-platform/cfft Add cfft radix-4 and radix-2 kernels --- python-requirements.txt | 1 + software/.gitignore | 2 +- software/apps/Makefile | 2 +- software/apps/cfft_radix2_q16/main.c | 81 +++ software/apps/cfft_radix4_q16/main.c | 195 ++++++ .../runtime/data/data_cfft_radix2_q16.h.tpl | 56 ++ software/runtime/data/data_cfft_radix2_q16.py | 200 ++++++ .../runtime/data/data_cfft_radix4_q16.h.tpl | 58 ++ software/runtime/data/data_cfft_radix4_q16.py | 200 ++++++ .../runtime/kernel/mempool_radix2_cfft_q16p.h | 203 ++++++ .../runtime/kernel/mempool_radix2_cfft_q16s.h | 141 ++++ .../mempool_radix4_cfft_butterfly_q16.h | 488 +++++++++++++ .../mempool_radix4_cfft_q16_bitreversal.h | 147 ++++ .../runtime/kernel/mempool_radix4_cfft_q16p.h | 640 ++++++++++++++++++ .../runtime/kernel/mempool_radix4_cfft_q16s.h | 572 ++++++++++++++++ 15 files changed, 2984 insertions(+), 2 deletions(-) create mode 100644 software/apps/cfft_radix2_q16/main.c create mode 100644 software/apps/cfft_radix4_q16/main.c create mode 100644 software/runtime/data/data_cfft_radix2_q16.h.tpl create mode 100644 software/runtime/data/data_cfft_radix2_q16.py create mode 100644 software/runtime/data/data_cfft_radix4_q16.h.tpl create mode 100755 software/runtime/data/data_cfft_radix4_q16.py create mode 100644 software/runtime/kernel/mempool_radix2_cfft_q16p.h create mode 100644 software/runtime/kernel/mempool_radix2_cfft_q16s.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_q16p.h create mode 100644 software/runtime/kernel/mempool_radix4_cfft_q16s.h diff --git a/python-requirements.txt b/python-requirements.txt index b459939ae..09e19ccd7 100644 --- a/python-requirements.txt +++ b/python-requirements.txt @@ -13,3 +13,4 @@ numpy pandas progressbar2 tabulate +sympy diff --git a/software/.gitignore b/software/.gitignore index 127f990fd..d738641d4 100644 --- a/software/.gitignore +++ b/software/.gitignore @@ -26,4 +26,4 @@ runtime/arch.ld # Generated data files data.h -runtime/data/*.h +runtime/data/data*.h diff --git a/software/apps/Makefile b/software/apps/Makefile index cb7e7ad02..03ea07e75 100644 --- a/software/apps/Makefile +++ b/software/apps/Makefile @@ -25,7 +25,7 @@ else ALL := $(filter-out systolic/%,$(APPS)) endif -ALL_LLVM := $(filter-out chest_q16, $(ALL)) +ALL_LLVM := $(filter-out chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL)) # Make all applications all: $(ALL) diff --git a/software/apps/cfft_radix2_q16/main.c b/software/apps/cfft_radix2_q16/main.c new file mode 100644 index 000000000..a7dfa2c7b --- /dev/null +++ b/software/apps/cfft_radix2_q16/main.c @@ -0,0 +1,81 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include +#include +#include +#include +#include + +/* Mempool runtime libraries */ +#include "dma.h" +#include "encoding.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "xpulp/builtins_v2.h" + +/* CFFT mempool libraries */ +#include "data/data_cfft_radix2_q16.h" +#include "kernel/mempool_radix2_cfft_q16p.h" +#include "kernel/mempool_radix2_cfft_q16s.h" + +#define PARALLEL +#define SINGLE + +/* CFFT mempool data */ +int16_t l1_pSrc[N_RSAMPLES] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +int16_t l1_twiddleCoef_q16[6 * N_CSAMPLES / 4] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] + __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); + +int main() { + + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + mempool_barrier_init(core_id); + + if (core_id == 0) { + dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t)); + dma_memcpy_blocking(l1_twiddleCoef_q16, l2_twiddleCoef_q16, + (3 * N_CSAMPLES / 4) * sizeof(int32_t)); + dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, + BITREVINDEXTABLE_LENGTH * sizeof(int32_t)); + } + mempool_barrier(num_cores); + + /* SINGLE-CORE */ +#ifdef SINGLE + if (core_id == 0) { + mempool_start_benchmark(); + mempool_radix2_cfft_q16s((uint16_t)16, l1_twiddleCoef_q16, + l1_BitRevIndexTable, l1_pSrc, + BITREVINDEXTABLE_LENGTH, 0, 0); + mempool_stop_benchmark(); + } + mempool_barrier(num_cores); +#endif + + /* PARALLEL-CORE */ +#ifdef PARALLEL + mempool_start_benchmark(); + mempool_radix2_cfft_q16p((uint16_t)16, l1_twiddleCoef_q16, + l1_BitRevIndexTable, l1_pSrc, + BITREVINDEXTABLE_LENGTH, 0, 0, num_cores); + mempool_stop_benchmark(); +#endif + + if (core_id == 0) { + for (uint32_t i = 0; i < N_RSAMPLES; i += 2) { + printf("{%6d;%6d } \n", l1_pSrc[i], l1_pSrc[i + 1]); + } + printf("Done!\n"); + } + mempool_barrier(num_cores); + return 0; +} diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c new file mode 100644 index 000000000..ca9cb29c3 --- /dev/null +++ b/software/apps/cfft_radix4_q16/main.c @@ -0,0 +1,195 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include
+#include
+#include
+#include
+
+/* Mempool runtime libraries */
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/builtins_v2.h"
+
+/* CFFT data libraries */
+#include "data/data_cfft_radix4_q16.h"
+
+/*
+  CHOOSE ONE
+  - SINGLE: Single-core FFT
+  - PARALLEL: Parallel FFT, without the "memory-aware" load/store scheme
+  - FOLDED: Parallel FFT with "memory-aware" load/store scheme
+  - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
+  load/store scheme
+  - N_FFTs_COL: Independent FFTs scheduled on one row (default 1)
+  - N_FFTs_ROW: Independent FFTs scheduled on columns (default 1)
+  (OPTIONALLY ENABLE)
+  - FOLDED_TWIDDLES: The twiddle factors also use "memory-aware" load/stores
+  - BITREVERSETABLE: The bit-reversal indices are loaded from a table
+  - ASM: Use asm volatile statements
+*/
+
+#define SCHEDULED
+#define FOLDED_TWIDDLES
+#define BITREVERSETABLE
+#define ASM // Use asm volatile statements
+
+#if !(defined(N_FFTs_ROW) && defined(N_FFTs_COL))
+#define N_FFTs_ROW 2
+#define N_FFTs_COL 2
+#endif
+
+#define ABS(x) (((x) < 0) ? (-(x)) : (x))
+#include "kernel/mempool_radix4_cfft_butterfly_q16.h"
+#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
+#include "kernel/mempool_radix4_cfft_q16p.h"
+#include "kernel/mempool_radix4_cfft_q16s.h"
+
+int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MAIN */
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  /* INITIALIZATION */
+  if (core_id == 0) {
+    // Each FFT is folded over 4 memory rows
+    // Each memory row is 2 * N_BANKS samples
+    for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
+      dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
+                          (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t));
+    }
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
+    dma_memcpy_blocking(l1_twiddleCoef_q16_src, l2_twiddleCoef_q16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+  }
+  // Initialize the folded twiddles
+#ifdef FOLDED_TWIDDLES
+  for (uint32_t j = 0; j < N_FFTs_COL; j++) {
+    uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
+    for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4))] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * i];
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
+                                            1 * N_BANKS)] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 2U)];
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
+                                            2 * N_BANKS)] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 3U)];
+    }
+  }
+#endif
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    printf("On the run...\n");
+  }
+  mempool_barrier(num_cores);
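+
+  /* Illustrative sizing for the SCHEDULED configuration below (a sketch,
+     assuming the generator default N_CSAMPLES = 64): each FFT is folded
+     over 4 memory rows of 2 * N_BANKS int16_t samples and is computed by
+     N_CSAMPLES / 16 = 4 cores, so the N_FFTs_COL = 2 columns occupy
+     N_FFTs_COL * (N_CSAMPLES >> 4U) = 8 cores in total; the remaining
+     cores only take part in the barriers. */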
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* SINGLE-CORE */
+#ifdef SINGLE
+  int16_t *pRes; // Result pointer
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_radix4_cfft_q16s_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
+                                      l1_twiddleCoef_q16_src, 1);
+    mempool_bitrevtable_q16s_xpulpimg(
+        (uint16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable);
+    pRes = l1_pSrc;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE */
+#ifdef PARALLEL
+  int16_t *pRes; // Result pointer
+  mempool_start_benchmark();
+  mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src, 1, num_cores);
+  mempool_bitrevtable_q16p_xpulpimg((uint16_t *)l1_pSrc,
+                                    BITREVINDEXTABLE_LENGTH,
+                                    l1_BitRevIndexTable, num_cores);
+  pRes = l1_pSrc;
+  mempool_stop_benchmark();
+#endif
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE FOLDED */
+#ifdef FOLDED
+  int16_t *pRes; // Result pointer
+  if (core_id < (N_CSAMPLES / 16)) {
+    mempool_start_benchmark();
+#ifdef FOLDED_TWIDDLES
+    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src,
+                                    l1_twiddleCoef_q16_dst, (N_CSAMPLES / 16));
+#else
+    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src, (N_CSAMPLES / 16));
+#endif
+    pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+    mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
+                                      l1_BitRevIndexTable, (N_CSAMPLES / 16));
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE SCHEDULED */
+#ifdef SCHEDULED
+  if (core_id < N_FFTs_COL * (N_CSAMPLES >> 4U)) {
+    mempool_start_benchmark();
+    uint32_t col_fftLen = N_CSAMPLES >> 2U;
+    uint32_t col_id = core_id / (N_CSAMPLES >> 4U);
+    mempool_radix4_cfft_q16p_scheduler(
+        l1_pSrc, l1_pDst, N_CSAMPLES,
+        l1_twiddleCoef_q16_src + 2 * col_id * col_fftLen,
+        l1_twiddleCoef_q16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, N_CSAMPLES >> 4U);
+    mempool_log_partial_barrier(2, core_id, N_CSAMPLES >> 4U);
+    mempool_stop_benchmark();
+  }
+#endif
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* CHECK */
+#if defined(SINGLE) || defined(PARALLEL) || defined(FOLDED)
+  if (core_id == 0) {
+    printf("Done!\n");
+    for (uint32_t i = 0; i < N_RSAMPLES; i++) {
+      if (ABS(((int32_t)pRes[i] - (int32_t)l2_pRes[i])) > TOLERANCE)
+        printf("ERROR!!! Result[%d]: %6d Expected[%d]: %6d\n", i, pRes[i], i,
+               l2_pRes[i]);
+    }
+  }
+  mempool_barrier(num_cores);
+#endif
+
+  return 0;
+}
diff --git a/software/runtime/data/data_cfft_radix2_q16.h.tpl b/software/runtime/data/data_cfft_radix2_q16.h.tpl
new file mode 100644
index 000000000..35ae84005
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix2_q16.h.tpl
@@ -0,0 +1,56 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Automatically generated by:
+// data/data_cfft_radix2_q16.py
+
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+<% def array_to_str(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LOG2 (${Log2Len})
+#define N_CSAMPLES (${Len})
+#define N_RSAMPLES (2 * N_CSAMPLES)
+#define N_TWIDDLES (3 * N_CSAMPLES / 4)
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
+
+// Tolerance for correctness check
+#define TOLERANCE (${tolerance})
+
+% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
+
+// Data array for ${m_str}
+int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};
+
+% endfor \
+
+// Twiddles
+int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
+
+// Bitreversal
+uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/runtime/data/data_cfft_radix2_q16.py b/software/runtime/data/data_cfft_radix2_q16.py
new file mode 100644
index 000000000..e1615e53e
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix2_q16.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data for the cfft kernel.
+# Author: Marco Bertuletti
+
+import numpy as np
+import math as M
+import argparse
+import pathlib
+from mako.template import Template
+from sympy.combinatorics import Permutation
+
+
+##################
+# compute_result #
+##################
+
+
+def compute_result(inp, len):
+    """
+    Function to generate the expected result of the testcase.
+
+    Arguments
+    ---------
+    inp: numpy array of inputs
+    len: Length of the input transform.
+ """ + + # Q16: + # len=16: Q1.15 -> Q5.11 + # len=32: Q1.15 -> Q6.10 + # len=64: Q1.15 -> Q7.9 + # len=128: Q1.15 -> Q8.8 + # len=256: Q1.15 -> Q9.7 + # len=512: Q1.15 -> Q10.6 + # len=1024: Q1.15 -> Q11.5 + # len=2048: Q1.15 -> Q12.4 + # len=4096: Q1.15 -> Q13.3 + bit_shift_dict_q16 = { + 16: 11, + 32: 10, + 64: 9, + 128: 8, + 256: 7, + 512: 6, + 1024: 5, + 2048: 4, + 4096: 3} + my_type = np.int16 + my_fixpoint = 15 + bit_shift_dict = bit_shift_dict_q16 + a = inp.astype(my_type) + result = np.zeros(a.size, dtype=my_type) + complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) + complex_result = np.zeros(a.size >> 1, dtype=np.csingle) + for i in range(a.size >> 1): + complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( + a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j + complex_result = np.fft.fft(complex_a) + for i in range(int(a.size / 2)): + result[2 * i] = (np.real(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + result[2 * i + 1] = (np.imag(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + + return result + + +def compute_twiddles(length): + PI = 3.14159265358979 + N = length + twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) + for i in range(0, (int)(3 * N / 4)): + twiddleCoefq15_cos = M.cos(i * 2 * PI / N) + twiddleCoefq15_sin = M.sin(i * 2 * PI / N) + twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) + twiddleCoefq15[2 * i + + 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) + return twiddleCoefq15 + + +def compute_bitreversal(N, R): + + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + + return tps + + +def gen_data_header_file( + outdir: pathlib.Path.cwd(), + tpl: pathlib.Path.cwd(), + **kwargs): + + file = outdir / f"{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Select out directory of generated data files' + ) + parser.add_argument( + "-t", + "--tpl", + type=pathlib.Path, + required=False, + default=pathlib.Path(__file__).parent.absolute() / + "data_cfft_radix2_q16.h.tpl", + help='Path to mako template') + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + parser.add_argument( + "-d", + "--dimension", + type=int, + required=False, + default=64, + help='Input dimension' + ) + + args = parser.parse_args() + + # Create sparse matrix + Len = args.dimension + Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) + Result = compute_result(Input, Len) + Twiddles = compute_twiddles(Len) + Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) + + tolerance = { + 16: 16, + 32: 20, + 64: 24, + 128: 28, + 256: 32, + 512: 48, + 1024: 64, + 2048: 96, + 4096: 
128}
+
+    kwargs = {'name': 'data_cfft_radix2_q16',
+              'vector_inp': Input,
+              'vector_res': Result,
+              'vector_twi': Twiddles,
+              'vector_bitrev': Bitreversal,
+              'Len': Len,
+              'Log2Len': int(np.log2(Len)),
+              'BitrevLen': int(2 * len(Bitreversal)),
+              'tolerance': tolerance[int(Len)]}
+
+    gen_data_header_file(args.outdir, args.tpl, **kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/software/runtime/data/data_cfft_radix4_q16.h.tpl b/software/runtime/data/data_cfft_radix4_q16.h.tpl
new file mode 100644
index 000000000..1a147a30c
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix4_q16.h.tpl
@@ -0,0 +1,58 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Automatically generated by:
+// data/data_cfft_radix4_q16.py
+
+\
+<% def array_to_cstr(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '(int16_t) 0X{:04X}, '.format(a&0xffff)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+<% def array_to_str(array):
+    out = '{'
+    i = 0
+    out += '\n'
+    for a in array:
+        out += '{}, '.format(a)
+        i += 1
+        if i % 16 == 0:
+            out += '\n'
+    out = out[:-2] + '}'
+    return out
+%> \
+
+#define LOG2 (${Log2Len})
+#define N_CSAMPLES (${Len})
+#define N_RSAMPLES (2 * N_CSAMPLES)
+#define N_TWIDDLES (3 * N_CSAMPLES / 4)
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define BITREVINDEXTABLE_LENGTH (${BitrevLen})
+
+// Maximum number of independent FFT columns allowed
+#define MAX_COL (N_BANKS / (N_CSAMPLES / 4))
+// Tolerance for correctness check
+#define TOLERANCE (${tolerance})
+
+% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
+
+// Data array for ${m_str}
+int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};
+
+% endfor \
+
+// Twiddles
+int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
+
+// Bitreversal
+uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
diff --git a/software/runtime/data/data_cfft_radix4_q16.py b/software/runtime/data/data_cfft_radix4_q16.py
new file mode 100755
index 000000000..b394a2884
--- /dev/null
+++ b/software/runtime/data/data_cfft_radix4_q16.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates data for the cfft kernel.
+# Author: Marco Bertuletti
+
+import numpy as np
+import math as M
+import argparse
+import pathlib
+from mako.template import Template
+from sympy.combinatorics import Permutation
+
+
+##################
+# compute_result #
+##################
+
+
+def compute_result(inp, len):
+    """
+    Function to generate the expected result of the testcase.
+
+    Arguments
+    ---------
+    inp: numpy array of inputs
+    len: Length of the input transform.
+ """ + + # Q16: + # len=16: Q1.15 -> Q5.11 + # len=32: Q1.15 -> Q6.10 + # len=64: Q1.15 -> Q7.9 + # len=128: Q1.15 -> Q8.8 + # len=256: Q1.15 -> Q9.7 + # len=512: Q1.15 -> Q10.6 + # len=1024: Q1.15 -> Q11.5 + # len=2048: Q1.15 -> Q12.4 + # len=4096: Q1.15 -> Q13.3 + bit_shift_dict_q16 = { + 16: 11, + 32: 10, + 64: 9, + 128: 8, + 256: 7, + 512: 6, + 1024: 5, + 2048: 4, + 4096: 3} + my_type = np.int16 + my_fixpoint = 15 + bit_shift_dict = bit_shift_dict_q16 + a = inp.astype(my_type) + result = np.zeros(a.size, dtype=my_type) + complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) + complex_result = np.zeros(a.size >> 1, dtype=np.csingle) + for i in range(a.size >> 1): + complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( + a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j + complex_result = np.fft.fft(complex_a) + for i in range(int(a.size / 2)): + result[2 * i] = (np.real(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + result[2 * i + 1] = (np.imag(complex_result[i]) * + (2**(bit_shift_dict[int(a.size / 2)])) + ).astype(my_type) + + return result + + +def compute_twiddles(length): + PI = 3.14159265358979 + N = length + twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) + for i in range(0, (int)(3 * N / 4)): + twiddleCoefq15_cos = M.cos(i * 2 * PI / N) + twiddleCoefq15_sin = M.sin(i * 2 * PI / N) + twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) + twiddleCoefq15[2 * i + + 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) + return twiddleCoefq15 + + +def compute_bitreversal(N, R): + + # Decompose + logR2 = [] + idx = N + while (idx >= R): + logR2.append(int(M.log2(R))) + idx = idx // R + if (idx > 1): + logR2.append(int(M.log2(idx))) + + # Bitreversal + indexes = [] + for x in range(N): + result = 0 + for bits in logR2: + mask = (0xffffffff >> (32 - bits)) + result = (result << bits) | (x & mask) + x = x >> bits + indexes.append(result) + + # Create transpositions table + tps = [] + for c in Permutation.from_sequence(indexes).cyclic_form: + for i in range(len(c) - 1): + tps.append([c[i] * 8, c[-1] * 8]) + + return tps + + +def gen_data_header_file( + outdir: pathlib.Path.cwd(), + tpl: pathlib.Path.cwd(), + **kwargs): + + file = outdir / f"{kwargs['name']}.h" + + print(tpl, outdir, kwargs['name']) + + template = Template(filename=str(tpl)) + with file.open('w') as f: + f.write(template.render(**kwargs)) + + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for kernels') + parser.add_argument( + "-o", + "--outdir", + type=pathlib.Path, + default=pathlib.Path(__file__).parent.absolute(), + required=False, + help='Select out directory of generated data files' + ) + parser.add_argument( + "-t", + "--tpl", + type=pathlib.Path, + required=False, + default=pathlib.Path(__file__).parent.absolute() / + "data_cfft_radix4_q16.h.tpl", + help='Path to mako template') + parser.add_argument( + "-v", + "--verbose", + action='store_true', + help='Set verbose' + ) + parser.add_argument( + "-d", + "--dimension", + type=int, + required=False, + default=64, + help='Input dimension' + ) + + args = parser.parse_args() + + # Create sparse matrix + Len = args.dimension + Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) + Result = compute_result(Input, Len) + Twiddles = compute_twiddles(Len) + Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) + + tolerance = { + 16: 16, + 32: 20, + 64: 24, + 128: 28, + 256: 32, + 512: 48, + 1024: 64, + 2048: 96, + 4096: 
128} + + kwargs = {'name': 'data_cfft_radix4_q16', + 'vector_inp': Input, + 'vector_res': Result, + 'vector_twi': Twiddles, + 'vector_bitrev': Bitreversal, + 'Len': Len, + 'Log2Len': int(np.log2(Len)), + 'BitrevLen': len(Bitreversal), + 'tolerance': tolerance[int(Len)]} + + gen_data_header_file(args.outdir, args.tpl, **kwargs) + + +if __name__ == "__main__": + main() diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16p.h b/software/runtime/kernel/mempool_radix2_cfft_q16p.h new file mode 100644 index 000000000..52a693773 --- /dev/null +++ b/software/runtime/kernel/mempool_radix2_cfft_q16p.h @@ -0,0 +1,203 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#include "xpulp/builtins_v2.h" + +void mempool_radix2_cfft_q16p(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag, uint32_t nPE); + +void mempool_radix2_butterfly_q16p(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, uint32_t twidCoefModifier, + uint32_t nPE); + +void mempool_radix2_bitreversal_q16p(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab, uint32_t nPE); + +void mempool_radix2_cfft_q16p(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag, uint32_t nPE) { + if (ifftFlag == 0) { + mempool_radix2_butterfly_q16p(pSrc, fftLen, pTwiddle, 1U, nPE); + } + + if (bitReverseFlag) { + mempool_radix2_bitreversal_q16p((uint16_t *)pSrc, bitReverseLen, + pBitRevTable, nPE); + } +} + +void mempool_radix2_butterfly_q16p(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, uint32_t twidCoefModifier, + uint32_t nPE) { + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + uint32_t core_id = mempool_get_core_id(); + uint32_t step, steps, butt_id, offset; + v2s T, S, R; + v2s coeff; + int16_t out1, out2; + + n1 = fftLen; + n2 = n1 >> 1; + step = (n2 + nPE - 1) / nPE; + + // loop for groups + for (i = core_id * step; i < MIN(core_id * step + step, n2); i++) { + + coeff = *(v2s *)&pCoef16[i * 2U]; + + /* Reading i and i+fftLen/2 inputs */ + /* Input is downscaled by 1 to avoid overflow */ + l = i + n2; + /* Read ya (real), xa (imag) input */ + T = __SRA2(*(v2s *)&pSrc16[i * 2U], ((v2s){1, 1})); + /* Read yb (real), xb (imag) input */ + S = __SRA2(*(v2s *)&pSrc16[l * 2U], ((v2s){1, 1})); + + /* R0 = (ya - yb) */ + /* R1 = (xa - xb) */ + R = __SUB2(T, S); + + /* writing the butterfly processed i sample */ + /* ya' = ya + yb */ + /* xa' = xa + xb */ + *((v2s *)&pSrc16[i * 2U]) = __SRA2(__ADD2(T, S), ((v2s){1, 1})); + + /* out1 = (ya - yb)*cos + (xa - xb)*sin */ + /* out2 = (ya - yb)*cos - (xa - xb)*sin */ + out1 = (int16_t)(__DOTP2(R, coeff) >> 16U); + out2 = (int16_t)(__DOTP2(R, __PACK2(coeff[0], -coeff[1])) >> 16U); + *((v2s *)&pSrc16[l * 2U]) = __PACK2(out2, out1); + + } /* groups loop end */ + + // if(core_id==0) { + // for(uint32_t i=0; i 2; k = k >> 1) { + + n1 = n2; + n2 = n2 >> 1; + + step = (n2 + nPE - 1) / nPE; + butt_id = core_id % n2; + offset = (core_id / n2) * n1; + ia = butt_id * step; + + /* loop for groups */ + for (j = butt_id * step; j < MIN(butt_id * step + step, n2); j++) { + // for (j = core_id * step; j < MIN(core_id*step + step, n2); j++) { + + coeff = *(v2s *)&pCoef16[ia * 2U]; + ia = ia + 
twidCoefModifier; + + /* loop for butterfly */ + for (i = offset + j; i < fftLen; i += ((nPE / n2) + 1) * n1) + // for (i = j; i < fftLen; i += n1) + { + + /* Reading i and i+fftLen/2 inputs */ + /* Input is downscaled by 1 to avoid overflow */ + l = i + n2; + /* Read ya (real), xa (imag) input */ + T = __SRA2(*(v2s *)&pSrc16[i * 2U], ((v2s){1, 1})); + /* Read yb (real), xb (imag) input */ + S = __SRA2(*(v2s *)&pSrc16[l * 2U], ((v2s){1, 1})); + /* R0 = (ya - yb) */ + /* R1 = (xa - xb) */ + R = __SUB2(T, S); + /* writing the butterfly processed i sample */ + /* ya' = ya + yb */ + /* xa' = xa + xb */ + *((v2s *)&pSrc16[i * 2U]) = __SRA2(__ADD2(T, S), ((v2s){1, 1})); + /* out1 = (ya - yb)*cos + (xa - xb)*sin */ + /* out2 = (ya - yb)*cos - (xa - xb)*sin */ + out1 = (int16_t)(__DOTP2(R, coeff) >> 16U); + out2 = (int16_t)(__DOTP2(R, __PACK2(coeff[0], -coeff[1])) >> 16U); + *((v2s *)&pSrc16[l * 2U]) = __PACK2(out2, out1); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + mempool_barrier(nPE); + } /* stages loop end */ + + // if(core_id==0) { + // for(uint32_t i=0; i> 1; + steps = fftLen / n1; + step = (steps + nPE - 1) / nPE; + /* loop for butterfly */ + + for (i = core_id * step * n1; i < MIN((core_id * step + step) * n1, fftLen); + i += n1) { + + l = i + n2; + /* Read ya (real), xa (imag) input */ + T = __SRA2(*(v2s *)&pSrc16[i * 2U], ((v2s){1, 1})); + /* Read yb (real), xb (imag) input */ + S = __SRA2(*(v2s *)&pSrc16[l * 2U], ((v2s){1, 1})); + /* R0 = (ya - yb) */ + /* R1 = (xa - xb) */ + R = __SUB2(T, S); + /* ya' = ya + yb */ + /* xa' = xa + xb */ + *((v2s *)&pSrc16[i * 2U]) = __ADD2(T, S); + /* yb' = ya - yb */ + /* xb' = xa - xb */ + *((v2s *)&pSrc16[l * 2U]) = R; + + } /* groups loop end */ + + // if(core_id==0) { + // for(uint32_t i=0; i> 2; + tmpa = *(v2s *)&pSrc[addr[0]]; + tmpb = *(v2s *)&pSrc[addr[1]]; + *((v2s *)&pSrc[addr[0]]) = tmpb; + *((v2s *)&pSrc[addr[1]]) = tmpa; + } + mempool_barrier(num_cores); +} diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16s.h b/software/runtime/kernel/mempool_radix2_cfft_q16s.h new file mode 100644 index 000000000..db4253b96 --- /dev/null +++ b/software/runtime/kernel/mempool_radix2_cfft_q16s.h @@ -0,0 +1,141 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#include "xpulp/builtins_v2.h" + +void mempool_radix2_cfft_q16s(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag); + +void mempool_radix2_butterfly_q16s(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, uint32_t twidCoefModifier); + +void mempool_radix2_bitreversal_q16s(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab); + +void mempool_radix2_cfft_q16s(uint16_t fftLen, int16_t *pTwiddle, + uint16_t *pBitRevTable, int16_t *pSrc, + uint16_t bitReverseLen, uint8_t ifftFlag, + uint8_t bitReverseFlag) { + if (ifftFlag == 0) { + mempool_radix2_butterfly_q16s(pSrc, fftLen, pTwiddle, 1U); + } + + if (bitReverseFlag) { + mempool_radix2_bitreversal_q16s((uint16_t *)pSrc, bitReverseLen, + pBitRevTable); + } +} + +void mempool_radix2_butterfly_q16s(int16_t *pSrc16, uint32_t fftLen, + int16_t *pCoef16, + uint32_t twidCoefModifier) { + + uint32_t i, j, k, l; + uint32_t n1, n2, ia; + int16_t xt, yt, cosVal, sinVal; + + // N = fftLen; + n2 = fftLen; + + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + for (i = 0; i < n2; i++) { + + cosVal = pCoef16[i * 2]; + sinVal = pCoef16[(i * 2) + 1]; + l = i + n2; + + xt = (int16_t)((pSrc16[2 * i] >> 1U) - (pSrc16[2 * l] >> 1U)); + pSrc16[2 * i] = + (int16_t)(((pSrc16[2 * i] >> 1U) + (pSrc16[2 * l] >> 1U)) >> 1U); + yt = (int16_t)((pSrc16[2 * i + 1] >> 1U) - (pSrc16[2 * l + 1] >> 1U)); + pSrc16[2 * i + 1] = + (int16_t)(((pSrc16[2 * l + 1] >> 1U) + (pSrc16[2 * i + 1] >> 1U)) >> + 1U); + pSrc16[2U * l] = (int16_t)(((int16_t)(((int32_t)xt * cosVal) >> 16)) + + ((int16_t)(((int32_t)yt * sinVal) >> 16))); + pSrc16[2U * l + 1] = (int16_t)(((int16_t)(((int32_t)yt * cosVal) >> 16)) - + ((int16_t)(((int32_t)xt * sinVal) >> 16))); + } + twidCoefModifier = twidCoefModifier << 1U; + + /* loop for stage */ + for (k = fftLen / 2; k > 2; k = k >> 1) { + n1 = n2; + n2 = n2 >> 1; + ia = 0; + + /* loop for groups */ + for (j = 0; j < n2; j++) { + cosVal = pCoef16[ia * 2]; + sinVal = pCoef16[(ia * 2) + 1]; + ia = ia + twidCoefModifier; + + /* loop for butterfly */ + for (i = j; i < fftLen; i += n1) { + l = i + n2; + + xt = (int16_t)((pSrc16[2 * i] >> 1U) - (pSrc16[2 * l] >> 1U)); + pSrc16[2 * i] = + (int16_t)(((pSrc16[2 * i] >> 1U) + (pSrc16[2 * l] >> 1U)) >> 1U); + yt = (int16_t)((pSrc16[2 * i + 1] >> 1U) - (pSrc16[2 * l + 1] >> 1U)); + pSrc16[2 * i + 1] = + (int16_t)(((pSrc16[2 * l + 1] >> 1U) + (pSrc16[2 * i + 1] >> 1U)) >> + 1U); + pSrc16[2U * l] = (int16_t)(((int16_t)(((int32_t)xt * cosVal) >> 16)) + + ((int16_t)(((int32_t)yt * sinVal) >> 16))); + pSrc16[2U * l + 1] = + (int16_t)(((int16_t)(((int32_t)yt * cosVal) >> 16)) - + ((int16_t)(((int32_t)xt * sinVal) >> 16))); + + } /* butterfly loop end */ + + } /* groups loop end */ + + twidCoefModifier = twidCoefModifier << 1U; + } /* stages loop end */ + + n1 = n2; + n2 = n2 >> 1; + /* loop for groups */ + for (i = 0; i < fftLen; i += n1) { + l = i + n2; + xt = (int16_t)((pSrc16[2 * i] >> 1U) - (pSrc16[2 * l] >> 1U)); + pSrc16[2 * i] = (int16_t)(((pSrc16[2 * i] >> 1U) + (pSrc16[2 * l] >> 1U))); + yt = (int16_t)((pSrc16[2 * i + 1] >> 1U) - (pSrc16[2 * l + 1] >> 1U)); + pSrc16[2 * i + 1] = + (int16_t)(((pSrc16[2 * l + 1] >> 1U) + (pSrc16[2 * i + 1] >> 1U))); + pSrc16[2U * l] = xt; + pSrc16[2U * l + 1] = yt; + } /* groups loop end */ +} + +void mempool_radix2_bitreversal_q16s(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab) { + uint16_t a, b, tmp; + + for (uint32_t i = 0; i < bitRevLen; i += 
2) { + a = pBitRevTab[i] >> 2; + b = pBitRevTab[i + 1] >> 2; + + // real + tmp = pSrc[a]; + pSrc[a] = pSrc[b]; + pSrc[b] = tmp; + + // complex + tmp = pSrc[a + 1]; + pSrc[a + 1] = pSrc[b + 1]; + pSrc[b + 1] = tmp; + + // i += 2; + } +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h b/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h new file mode 100644 index 000000000..725908b71 --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_butterfly_q16.h @@ -0,0 +1,488 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" + +/** + @brief First butterfly stage. + @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_first(int16_t *pIn, int16_t *pOut, + uint32_t i0, uint32_t n2, v2s CoSi1, + v2s CoSi2, v2s CoSi3, v2s C1, v2s C2, + v2s C3) { + int16_t t0, t1, t2, t3, t4, t5; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2s A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = (i0 % n2_store) + (i0 / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + +#ifndef ASM + v2s s1 = {1, 1}; + v2s s2 = {2, 2}; + /* Read yb (real), xb(imag) input */ + B = __SRA2(*(v2s *)&pIn[i1 * 2U], s2); + /* Read yd (real), xd(imag) input */ + D = __SRA2(*(v2s *)&pIn[i3 * 2U], s2); + /* Read ya (real), xa (imag) input */ + A = __SRA2(*(v2s *)&pIn[i0 * 2U], s2); + /* Read yc (real), xc(imag) input */ + C = __SRA2(*(v2s *)&pIn[i2 * 2U], s2); + /* G0 = (yb + yd), G1 = (xb + xd) */ + G = __ADD2(B, D); + /* H0 = (yb - yd), H1 = (xb - xd) */ + H = __SUB2(B, D); + /* E0 = (ya + yc), E1 = (xa + xc) */ + E = __ADD2(A, C); + /* F0 = (ya - yc), F1 = (xa - xc) */ + F = __SUB2(A, C); + t0 = (int16_t)H[0]; + t1 = (int16_t)H[1]; + A = __SRA2(E, s1); + B = __SRA2(G, s1); + /* C0 = (xb - xd), C1 = (yd - yb) */ + C = __PACK2(t0, -t1); + /* D0 = (xd - xb), D1 = (yb - yd) */ + D = __PACK2(-t0, t1); + /* E0 = (ya+yc) - (yb+yd), E1 = (xa+xc) - (xb+xd) */ + E = __SUB2(E, G); + /* G1 = (ya-yc) + (xb-xd), G0 = (xa-xc) - (yb-yd) */ + G = __ADD2(F, C); + /* H1 = (ya-yc) - (xb-xd), H0 = (xa-xc) + (yb-yd) */ + H = 
__ADD2(F, D); + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + t0 = (int16_t)(__DOTP2(CoSi2, E) >> 16U); + t1 = (int16_t)(__DOTP2(C2, E) >> 16U); + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + t2 = (int16_t)(__DOTP2(CoSi1, H) >> 16U); + t3 = (int16_t)(__DOTP2(C1, H) >> 16U); + /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ + t4 = (int16_t)(__DOTP2(CoSi3, G) >> 16U); + t5 = (int16_t)(__DOTP2(C3, G) >> 16U); + /* ya' = ya + yb + yc + yd */ + /* xa' = xa + xb + xc + xd */ + A = __ADD2(A, B); + E = __PACK2(t1, t0); + F = __PACK2(t3, t2); + G = __PACK2(t5, t4); + *((v2s *)&pOut[i0_store * 2U]) = A; + *((v2s *)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = F; + *((v2s *)&pOut[i3_store * 2U]) = G; +#else + v2s s1, s2; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa (imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + asm volatile("addi %[s1], zero, 0x01;" + "slli %[s1], %[s1], 0x10;" + "addi %[s1], %[s1], 0x01;" + "addi %[s2], zero, 0x02;" + "slli %[s2], %[s2], 0x10;" + "addi %[s2], %[s2], 0x02;" + "pv.sra.h %[B],%[B],%[s2];" + "pv.sra.h %[D],%[D],%[s2];" + "pv.sra.h %[A],%[A],%[s2];" + "pv.sra.h %[C],%[C],%[s2];" + "pv.add.h %[G],%[B],%[D];" + "pv.sub.h %[H],%[B],%[D];" + "pv.add.h %[E],%[A],%[C];" + "pv.sub.h %[F],%[A],%[C];" + "pv.extract.h %[t0],%[H],0;" + "pv.extract.h %[t1],%[H],1;" + "pv.sra.h %[A],%[E],%[s1];" + "pv.sra.h %[B],%[G],%[s1];" + "sub %[t3],zero,%[t1];" + "sub %[t4],zero,%[t0];" + "pv.pack %[C],%[t0],%[t3];" + "pv.pack %[D],%[t4],%[t1];" + "pv.sub.h %[E],%[E],%[G];" + "pv.add.h %[G],%[F],%[C];" + "pv.add.h %[H],%[F],%[D];" + "pv.dotsp.h %[C],%[CoSi2],%[E];" + "pv.dotsp.h %[D],%[C2],%[E];" + "pv.dotsp.h %[E],%[CoSi1],%[H];" + "pv.dotsp.h %[F],%[C1],%[H];" + "srai %[t0],%[C],0x10;" + "srai %[t1],%[D],0x10;" + "pv.dotsp.h %[C],%[CoSi3],%[G];" + "pv.dotsp.h %[D],%[C3],%[G];" + "srai %[t2],%[E],0x10;" + "srai %[t3],%[F],0x10;" + "srai %[t4],%[C],0x10;" + "srai %[t5],%[D],0x10;" + "pv.add.h %[A],%[A],%[B];" + "pv.pack %[E],%[t1],%[t0];" + "pv.pack %[F],%[t3],%[t2];" + "pv.pack %[G],%[t5],%[t4];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), + [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), + [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), + [t4] "=&r"(t4), [t5] "=&r"(t5), [s1] "=&r"(s1), [s2] "=&r"(s2) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) + :); + *((v2s *)&pOut[i0_store * 2U]) = A; + *((v2s *)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = F; + *((v2s *)&pOut[i3_store * 2U]) = G; +#endif +} + +/** + @brief Middle butterfly stage. 
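+           (A sketch of the arithmetic, mirroring the in-code comments:
+           for four inputs A, B, C, D spaced n2 apart, the stage computes
+           (A+B+C+D), (A-B+C-D)*conj(W2), (A-jB-C+jD)*conj(W1) and
+           (A+jB-C-jD)*conj(W3), where conj(Wk) is the conjugate of the
+           k-th Q15 twiddle CoSik; the intermediate right-shifts guard
+           against overflow.)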
+ @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @param[in] n2 number of elements in the first wing of the butterfly + @param[in] CoSi1 packed cosine and sine first twiddle + @param[in] CoSi2 packed cosine and sine second twiddle + @param[in] CoSi3 packed cosine and sine third twiddle + @param[in] C1 packed sine and cosine first twiddle + @param[in] C2 packed sine and cosine second twiddle + @param[in] C3 packed sine and cosine third twiddle + @return none +*/ +static inline void radix4_butterfly_middle(int16_t *pIn, int16_t *pOut, + uint32_t i0, uint32_t n2, v2s CoSi1, + v2s CoSi2, v2s CoSi3, v2s C1, v2s C2, + v2s C3) { + int16_t t0, t1, t2, t3, t4, t5; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2s A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + + * 3fftLen/4] */ + i1 = i0 + n2; + i2 = i1 + n2; + i3 = i2 + n2; +#endif +// STORE INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + uint32_t n2_store = n2 >> 2U; + i0_store = + (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; + i1_store = i0_store + n2_store; + i2_store = i1_store + n2_store; + i3_store = i2_store + n2_store; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + +#ifndef ASM + v2s s1 = {1, 1}; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + /* G0 = (yb + yd), G1 = (xb + xd) */ + G = __ADD2(B, D); + /* H0 = (yb - yd), H1 = (xb - xd) */ + H = __SUB2(B, D); + /* E0 = (ya + yc), E1 = (xa + xc) */ + E = __ADD2(A, C); + /* F0 = (ya - yc), F1 =(xa - xc) */ + F = __SUB2(A, C); + G = __SRA2(G, s1); + H = __SRA2(H, s1); + E = __SRA2(E, s1); + F = __SRA2(F, s1); + t0 = (int16_t)H[0]; + t1 = (int16_t)H[1]; + /* C0 = (ya+yc) - (yb+yd), C1 = (xa+xc) - (xb+xd) */ + C = __SUB2(E, G); + /* D0 = (ya+yc) + (yb+yd), D1 = (xa+xc) + (xb+xd) */ + D = __ADD2(E, G); + /* A0 = (xb-xd), A1 = (yd-yb) */ + A = __PACK2(t0, -t1); + /* B0 = (xd-xb), B1 = (yb-yd) */ + B = __PACK2(-t0, t1); + /* xa' = xa + xb + xc + xd */ + /* ya' = ya + yb + yc + yd */ + D = __SRA2(D, s1); + /* E1 = (ya-yc) + (xb-xd), E0 = (xa-xc) - (yb-yd)) */ + E = __ADD2(F, A); + /* F1 = (ya-yc) - (xb-xd), F0 = (xa-xc) + (yb-yd)) */ + F = __ADD2(F, B); + /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */ + /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */ + t0 = (int16_t)(__DOTP2(CoSi2, C) >> 16U); + t1 = (int16_t)(__DOTP2(C2, C) >> 16U); + /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */ + /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */ + t2 = (int16_t)(__DOTP2(CoSi1, F) >> 16U); + t3 = (int16_t)(__DOTP2(C1, F) >> 16U); + /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */ + /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */ + t4 = (int16_t)(__DOTP2(CoSi3, E) >> 16U); + t5 = (int16_t)(__DOTP2(C3, E) >> 16U); + A = __PACK2(t1, t0); + 
B = __PACK2(t3, t2); + C = __PACK2(t5, t4); + *((v2s *)&pOut[i0_store * 2U]) = D; + *((v2s *)&pOut[i1_store * 2U]) = A; + *((v2s *)&pOut[i2_store * 2U]) = B; + *((v2s *)&pOut[i3_store * 2U]) = C; +#else + v2s s1; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + asm volatile("addi %[s1], zero, 0x01;" + "slli %[s1], %[s1], 0x10;" + "addi %[s1], %[s1], 0x01;" + "pv.add.h %[G],%[B],%[D];" + "pv.sub.h %[H],%[B],%[D];" + "pv.add.h %[E],%[A],%[C];" + "pv.sub.h %[F],%[A],%[C];" + "pv.sra.h %[G],%[G],%[s1];" + "pv.sra.h %[H],%[H],%[s1];" + "pv.sra.h %[E],%[E],%[s1];" + "pv.sra.h %[F],%[F],%[s1];" + "pv.extract.h %[t0],%[H],0;" + "pv.extract.h %[t1],%[H],1;" + "pv.sub.h %[C],%[E],%[G];" + "pv.add.h %[D],%[E],%[G];" + "sub %[t4],zero,%[t1];" + "sub %[t3],zero,%[t0];" + "pv.pack %[A],%[t0],%[t4];" + "pv.pack %[B],%[t3],%[t1];" + "pv.sra.h %[D],%[D],%[s1];" + "pv.add.h %[E],%[F],%[A];" + "pv.add.h %[F],%[F],%[B];" + "pv.dotsp.h %[G],%[CoSi2],%[C];" + "pv.dotsp.h %[H],%[C2],%[C];" + "pv.dotsp.h %[A],%[CoSi1],%[F];" + "pv.dotsp.h %[B],%[C1],%[F];" + "srai %[t0],%[G],0x10;" + "srai %[t1],%[H],0x10;" + "pv.dotsp.h %[G],%[CoSi3],%[E];" + "pv.dotsp.h %[H],%[C3],%[E];" + "srai %[t2],%[A],0x10;" + "srai %[t3],%[B],0x10;" + "srai %[t4],%[G],0x10;" + "srai %[t5],%[H],0x10;" + "pv.pack %[A],%[t1],%[t0];" + "pv.pack %[B],%[t3],%[t2];" + "pv.pack %[C],%[t5],%[t4];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), + [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), + [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), + [t4] "=&r"(t4), [t5] "=&r"(t5), [s1] "=&r"(s1) + : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), + [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) + :); + *((v2s *)&pOut[i0_store * 2U]) = D; + *((v2s *)&pOut[i1_store * 2U]) = A; + *((v2s *)&pOut[i2_store * 2U]) = B; + *((v2s *)&pOut[i3_store * 2U]) = C; +#endif +} + +/** + @brief Last butterfly stage. 
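+           (In this final stage all twiddles are 1, so, as the in-code
+           comments show, only the sums and differences (A+B+C+D),
+           (A-B+C-D), (A-jB-C+jD) and (A+jB-C-jD) are computed, on inputs
+           pre-shifted right by one to avoid overflow.)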
+ @param[in] pIn points to input buffer of 16b data, Re and Im parts are + interleaved + @param[out] pOut points to output buffer of 16b data, Re and Im parts are + interleaved + @param[in] i0 points to the first element to be processed + @return none +*/ +static inline void radix4_butterfly_last(int16_t *pIn, int16_t *pOut, + uint32_t i0) { + int16_t t0, t1; + uint32_t i1, i2, i3; + uint32_t i0_store, i1_store, i2_store, i3_store; + v2s A, B, C, D, E, F, G, H; + +// LOAD INDEXES +#if defined(FOLDED) || defined(SCHEDULED) + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + N_BANKS; + i2 = i1 + N_BANKS; + i3 = i2 + N_BANKS; +#else + /* index calculation for the input as, */ + /* pIn[i0 + 0], pIn[i0 + fftLen/4], + pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ + i1 = i0 + 1U; + i2 = i1 + 1U; + i3 = i2 + 1U; +#endif +// STORE INDEXES +#if defined(FOLDED) + i0_store = i0 * 4; + i1_store = i0_store + 1; + i2_store = i1_store + 1; + i3_store = i2_store + 1; +#else + i0_store = i0; + i1_store = i1; + i2_store = i2; + i3_store = i3; +#endif + +#ifndef ASM + v2s s1 = {1, 1}; + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + /* H0 = (yb-yd), H1 = (xb-xd) */ + H = __SUB2(B, D); + /* G0 = (yb+yd), G1 = (xb+xd) */ + G = __ADD2(B, D); + /* E0 = (ya+yc), E1 = (xa+xc) */ + E = __ADD2(A, C); + /* F0 = (ya-yc), F1 = (xa-xc) */ + F = __SUB2(A, C); + H = __SRA2(H, s1); + G = __SRA2(G, s1); + E = __SRA2(E, s1); + t0 = (int16_t)H[0]; + t1 = (int16_t)H[1]; + F = __SRA2(F, s1); + /* xa' = (xa+xb+xc+xd) */ + /* ya' = (ya+yb+yc+yd) */ + *((v2s *)&pOut[i0_store * 2U]) = __ADD2(E, G); + /* A0 = (xb-xd), A1 = (yd-yb) */ + A = __PACK2(-t0, t1); + /* B0 = (xd-xb), B1 = (yb-yd) */ + B = __PACK2(t0, -t1); + /* xc' = (xa-xb+xc-xd) */ + /* yc' = (ya-yb+yc-yd) */ + E = __SUB2(E, G); + /* xb' = (xa+yb-xc-yd) */ + /* yb' = (ya-xb-yc+xd) */ + A = __ADD2(F, A); + /* xd' = (xa-yb-xc+yd) */ + /* yd' = (ya+xb-yc-xd) */ + B = __ADD2(F, B); + *((v2s *)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = A; + *((v2s *)&pOut[i3_store * 2U]) = B; +#else + /* Read yb (real), xb(imag) input */ + B = *(v2s *)&pIn[i1 * 2U]; + /* Read yd (real), xd(imag) input */ + D = *(v2s *)&pIn[i3 * 2U]; + /* Read ya (real), xa(imag) input */ + A = *(v2s *)&pIn[i0 * 2U]; + /* Read yc (real), xc(imag) input */ + C = *(v2s *)&pIn[i2 * 2U]; + int16_t t2, t3; + v2s s1; + asm volatile( + "addi %[s1], zero, 0x01;" + "slli %[s1], %[s1], 0x10;" + "addi %[s1], %[s1], 0x01;" + "pv.sub.h %[H],%[B],%[D];" + "pv.add.h %[G],%[B],%[D];" + "pv.add.h %[E],%[A],%[C];" + "pv.sub.h %[F],%[A],%[C];" + "pv.sra.h %[H],%[H],%[s1];" + "pv.sra.h %[G],%[G],%[s1];" + "pv.sra.h %[E],%[E],%[s1];" + "pv.extract.h %[t0],%[H],0;" + "pv.extract.h %[t1],%[H],1;" + "pv.sra.h %[F],%[F],%[s1];" + "sub %[t2], zero, %[t0];" + "sub %[t3], zero, %[t1];" + "pv.pack %[A],%[t2],%[t1];" + "pv.pack %[B],%[t0],%[t3];" + "pv.add.h %[H],%[E],%[G];" + "pv.sub.h %[E],%[E],%[G];" + "pv.add.h %[A],%[F],%[A];" + "pv.add.h %[B],%[F],%[B];" + : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E), + [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [t0] "=&r"(t0), + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [s1] "=&r"(s1) + : + :); + *((v2s *)&pOut[i0_store * 2U]) = H; + *((v2s 
*)&pOut[i1_store * 2U]) = E; + *((v2s *)&pOut[i2_store * 2U]) = A; + *((v2s *)&pOut[i3_store * 2U]) = B; +#endif +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h new file mode 100644 index 000000000..56f4f478b --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_q16_bitreversal.h @@ -0,0 +1,147 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +void mempool_bitrevtable_q16s_riscv32(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab) { + uint16_t addr1, addr2; + uint16_t tmpa, tmpb; + for (uint32_t i = 0; i < bitRevLen; i += 2) { + addr1 = pBitRevTab[i] >> 2U; + addr2 = pBitRevTab[i + 1] >> 2U; + + tmpa = pSrc[addr1]; + tmpb = pSrc[addr2]; + pSrc[addr1] = tmpb; + pSrc[addr2] = tmpa; + + tmpa = pSrc[addr1 + 1]; + tmpb = pSrc[addr2 + 1]; + pSrc[addr1 + 1] = tmpb; + pSrc[addr2 + 1] = tmpa; + } +} + +#ifndef ASM +#define SWAP_ITEMS \ + addr1 = *(v2s *)&pBitRevTab[i]; \ + addr2 = *(v2s *)&pBitRevTab[i + 2]; \ + addr3 = *(v2s *)&pBitRevTab[i + 4]; \ + addr4 = *(v2s *)&pBitRevTab[i + 6]; \ + addr1 = __SRA2(addr1, s2); \ + addr2 = __SRA2(addr2, s2); \ + addr3 = __SRA2(addr3, s2); \ + addr4 = __SRA2(addr4, s2); \ + a1 = addr1[1]; \ + a2 = addr2[1]; \ + a3 = addr3[1]; \ + a4 = addr4[1]; \ + b1 = addr1[0]; \ + b2 = addr2[0]; \ + b3 = addr3[0]; \ + b4 = addr4[0]; \ + tmpa1 = *(v2s *)&pSrc[a1]; \ + tmpa2 = *(v2s *)&pSrc[a2]; \ + tmpa3 = *(v2s *)&pSrc[a3]; \ + tmpa4 = *(v2s *)&pSrc[a4]; \ + tmpb1 = *(v2s *)&pSrc[b1]; \ + tmpb2 = *(v2s *)&pSrc[b2]; \ + tmpb3 = *(v2s *)&pSrc[b3]; \ + tmpb4 = *(v2s *)&pSrc[b4]; \ + *((v2s *)&pSrc[a1]) = tmpb1; \ + *((v2s *)&pSrc[a2]) = tmpb2; \ + *((v2s *)&pSrc[a3]) = tmpb3; \ + *((v2s *)&pSrc[a4]) = tmpb4; \ + *((v2s *)&pSrc[b1]) = tmpa1; \ + *((v2s *)&pSrc[b2]) = tmpa2; \ + *((v2s *)&pSrc[b3]) = tmpa3; \ + *((v2s *)&pSrc[b4]) = tmpa4; +#else +#define SWAP_ITEMS \ + addr1 = *(v2s *)&pBitRevTab[i]; \ + addr2 = *(v2s *)&pBitRevTab[i + 2]; \ + addr3 = *(v2s *)&pBitRevTab[i + 4]; \ + addr4 = *(v2s *)&pBitRevTab[i + 6]; \ + asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];" \ + "pv.sra.h %[addr2],%[addr2],%[s2];" \ + "pv.sra.h %[addr3],%[addr3],%[s2];" \ + "pv.sra.h %[addr4],%[addr4],%[s2];" \ + "pv.extract.h %[a1],%[addr1],1;" \ + "pv.extract.h %[a2],%[addr2],1;" \ + "pv.extract.h %[a3],%[addr3],1;" \ + "pv.extract.h %[a4],%[addr4],1;" \ + "pv.extract.h %[b1],%[addr1],0;" \ + "pv.extract.h %[b2],%[addr2],0;" \ + "pv.extract.h %[b3],%[addr3],0;" \ + "pv.extract.h %[b4],%[addr4],0;" \ + : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3), [a4] "=r"(a4), \ + [b1] "=r"(b1), [b2] "=r"(b2), [b3] "=r"(b3), [b4] "=r"(b4), \ + [addr1] "+&r"(addr1), [addr2] "+&r"(addr2), \ + [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) \ + : [s2] "r"(s2) \ + :); \ + tmpa1 = *(v2s *)&pSrc[a1]; \ + tmpa2 = *(v2s *)&pSrc[a2]; \ + tmpa3 = *(v2s *)&pSrc[a3]; \ + tmpa4 = *(v2s *)&pSrc[a4]; \ + tmpb1 = *(v2s *)&pSrc[b1]; \ + tmpb2 = *(v2s *)&pSrc[b2]; \ + tmpb3 = *(v2s *)&pSrc[b3]; \ + tmpb4 = *(v2s *)&pSrc[b4]; \ + *((v2s *)&pSrc[a1]) = tmpb1; \ + *((v2s *)&pSrc[a2]) = tmpb2; \ + *((v2s *)&pSrc[a3]) = tmpb3; \ + *((v2s *)&pSrc[a4]) = tmpb4; \ + *((v2s *)&pSrc[b1]) = tmpa1; \ + *((v2s *)&pSrc[b2]) = tmpa2; \ + *((v2s *)&pSrc[b3]) = tmpa3; \ + *((v2s *)&pSrc[b4]) = tmpa4; +#endif + +void mempool_bitrevtable_q16s_xpulpimg(uint16_t 
*pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab) { + v2s addr1, addr2, addr3, addr4; + v2s s2 = (v2s){2, 2}; + v2s tmpa1, tmpa2, tmpa3, tmpa4; + v2s tmpb1, tmpb2, tmpb3, tmpb4; + int32_t a1, a2, a3, a4; + int32_t b1, b2, b3, b4; + for (uint32_t i = 0; i < bitRevLen; i += 8) { + SWAP_ITEMS; + } +} + +void mempool_bitrevtable_q16p_xpulpimg(uint16_t *pSrc, const uint16_t bitRevLen, + const uint16_t *pBitRevTab, + const uint32_t nPE) { + uint32_t core_id = mempool_get_core_id(); + v2s addr1, addr2, addr3, addr4; + v2s s2 = (v2s){2, 2}; + v2s tmpa1, tmpa2, tmpa3, tmpa4; + v2s tmpb1, tmpb2, tmpb3, tmpb4; + int32_t a1, a2, a3, a4; + int32_t b1, b2, b3, b4; + for (uint32_t i = 8 * core_id; i < bitRevLen; i += (8 * nPE)) { + SWAP_ITEMS; + } + mempool_log_partial_barrier(2, core_id, nPE); +} + +void mempool_bitrev_q16p_xpulpimg(uint16_t *pSrc, uint16_t *pDst, + const uint16_t fftLen, const uint32_t nPE) { + uint32_t core_id = mempool_get_core_id(); + uint32_t idx_result, idx, i, j; + for (i = core_id; i < fftLen; i += nPE) { + idx_result = 0; + idx = i; + for (j = 0; j < LOG2; j++) { + idx_result = (idx_result << 1U) | (idx & 1U); + idx = idx >> 1U; + } + pDst[2 * idx_result] = pSrc[2 * i]; + pDst[2 * idx_result + 1] = pSrc[2 * i + 1]; + } + mempool_log_partial_barrier(2, core_id, nPE); +} diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16p.h b/software/runtime/kernel/mempool_radix4_cfft_q16p.h new file mode 100644 index 000000000..34f338ce8 --- /dev/null +++ b/software/runtime/kernel/mempool_radix4_cfft_q16p.h @@ -0,0 +1,640 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#include "xpulp/builtins_v2.h" +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +#ifndef ASM +#define SHUFFLE_TWIDDLEFACT \ + t1 = (int16_t)CoSi1[0]; \ + t3 = (int16_t)CoSi2[0]; \ + t5 = (int16_t)CoSi3[0]; \ + t0 = (int16_t)CoSi1[1]; \ + t2 = (int16_t)CoSi2[1]; \ + t4 = (int16_t)CoSi3[1]; \ + C1 = __PACK2(t1, -t0); \ + C2 = __PACK2(t3, -t2); \ + C3 = __PACK2(t5, -t4); +#else +#define SHUFFLE_TWIDDLEFACT \ + asm volatile("pv.extract.h %[t1],%[CoSi1],0;" \ + "pv.extract.h %[t3],%[CoSi2],0;" \ + "pv.extract.h %[t5],%[CoSi3],0;" \ + "pv.extract.h %[t0],%[CoSi1],1;" \ + "pv.extract.h %[t2],%[CoSi2],1;" \ + "pv.extract.h %[t4],%[CoSi3],1;" \ + "sub %[t0],zero,%[t0];" \ + "sub %[t2],zero,%[t2];" \ + "sub %[t4],zero,%[t4];" \ + "pv.pack %[C1],%[t1],%[t0];" \ + "pv.pack %[C2],%[t3],%[t2];" \ + "pv.pack %[C3],%[t5],%[t4];" \ + : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), \ + [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), \ + [t4] "=&r"(t4), [t5] "=&r"(t5) \ + : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) \ + :); +#endif + +#ifdef FOLDED_TWIDDLES + +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic + 1 * N_BANKS)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic + 2 * N_BANKS)]; \ + if (ic % 4 == 0) { \ + *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi1; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi1; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi1; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi1; \ + ic_store += N_BANKS; \ + *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi2; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi2; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi2; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi2; \ + ic_store += N_BANKS; \ + *((v2s *)&pCoef_dst[2U * (ic_store)]) = CoSi3; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 1 + ic_store)]) = CoSi3; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 2 + ic_store)]) = CoSi3; \ + *((v2s *)&pCoef_dst[2U * (n2_store * 3 + ic_store)]) = CoSi3; \ + } + +#else +#define LOAD_STORE_TWIDDLEFACT \ + CoSi1 = *(v2s *)&pCoef_src[2U * ic]; \ + CoSi2 = *(v2s *)&pCoef_src[2U * (ic * 2U)]; \ + CoSi3 = *(v2s *)&pCoef_src[2U * (ic * 3U)]; +#endif + +void mempool_radix4_cfft_q16p_xpulpimg(int16_t *pSrc16, uint32_t fftLen, + const int16_t *pCoef16, + uint32_t twidCoefModifier, + uint32_t nPE) { + uint32_t absolute_core_id = mempool_get_core_id(); + uint32_t core_id = absolute_core_id % nPE; + v2s CoSi1, CoSi2, CoSi3; + v2s C1, C2, C3; + int16_t t0, t1, t2, t3, t4, t5; + uint32_t n1, n2, ic, i0, j, k; + uint32_t step, steps; + + /* START OF FIRST STAGE PROCESS */ + n1 = fftLen; + n2 = n1 >> 2U; + step = (n2 + nPE - 1) / nPE; + for (i0 = core_id * step; i0 < MIN(core_id * step + step, n2); i0++) { + + /* Twiddle coefficients index modifier */ + ic = i0 * twidCoefModifier; + /* co1 & si1 are read from Coefficient pointer */ + CoSi1 = *(v2s *)&pCoef16[ic * 2U]; + /* co2 & si2 are read from Coefficient pointer */ + CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)]; + /* co3 & si3 are read from Coefficient pointer */ + CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)]; + SHUFFLE_TWIDDLEFACT; + + radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, + C3); + } + mempool_log_barrier(2, absolute_core_id); + /* END OF FIRST STAGE PROCESSING */ + + /* START OF MIDDLE STAGE PROCESSING */ + twidCoefModifier <<= 2U; + for (k = fftLen / 4U; k > 4U; k >>= 2U) { + + uint32_t offset, butt_id; + n1 = n2; + n2 >>= 2U; + step = (n2 + nPE - 1) / nPE; + butt_id = core_id % 
+
+void mempool_radix4_cfft_q16p_xpulpimg(int16_t *pSrc16, uint32_t fftLen,
+                                       const int16_t *pCoef16,
+                                       uint32_t twidCoefModifier,
+                                       uint32_t nPE) {
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % nPE;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+  int16_t t0, t1, t2, t3, t4, t5;
+  uint32_t n1, n2, ic, i0, j, k;
+  uint32_t step, steps;
+
+  /* START OF FIRST STAGE PROCESSING */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  step = (n2 + nPE - 1) / nPE;
+  for (i0 = core_id * step; i0 < MIN(core_id * step + step, n2); i0++) {
+
+    /* Twiddle coefficients index modifier */
+    ic = i0 * twidCoefModifier;
+    /* co1 & si1 are read from the coefficient pointer */
+    CoSi1 = *(v2s *)&pCoef16[ic * 2U];
+    /* co2 & si2 are read from the coefficient pointer */
+    CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
+    /* co3 & si3 are read from the coefficient pointer */
+    CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];
+    SHUFFLE_TWIDDLEFACT;
+
+    radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                           C2, C3);
+  }
+  mempool_log_barrier(2, absolute_core_id);
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+
+    uint32_t offset, butt_id;
+    n1 = n2;
+    n2 >>= 2U;
+    step = (n2 + nPE - 1) / nPE;
+    butt_id = core_id % n2;
+    offset = (core_id / n2) * n1;
+    for (j = butt_id * step; j < MIN(butt_id * step + step, n2); j++) {
+      /* Twiddle coefficients index modifier */
+      ic = twidCoefModifier * j;
+      /* co1 & si1 are read from the coefficient pointer */
+      CoSi1 = *(v2s *)&pCoef16[ic * 2U];
+      /* co2 & si2 are read from the coefficient pointer */
+      CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
+      /* co3 & si3 are read from the coefficient pointer */
+      CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];
+      SHUFFLE_TWIDDLEFACT;
+
+      /* Butterfly implementation */
+      for (i0 = offset + j; i0 < fftLen; i0 += ((nPE + n2 - 1) / n2) * n1) {
+        radix4_butterfly_middle(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3,
+                                C1, C2, C3);
+      }
+    }
+    twidCoefModifier <<= 2U;
+    mempool_log_barrier(2, absolute_core_id);
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  steps = fftLen / n1;
+  step = (steps + nPE - 1) / nPE;
+  /* Butterfly implementation */
+  for (i0 = core_id * step * n1;
+       i0 < MIN((core_id * step + step) * n1, fftLen); i0 += n1) {
+    radix4_butterfly_last(pSrc16, pSrc16, i0);
+  }
+  mempool_log_barrier(2, absolute_core_id);
+  /* END OF LAST STAGE PROCESSING */
+  return;
+}
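
All nPE cores are expected to enter this kernel together; the work is split internally by core_id and the stages are separated by the log-barriers. A hedged launch sketch (l1_pSrc, l1_twiddleCoef and FFT_LEN are placeholder names for data already placed in L1 memory):

  #include <stdint.h>

  extern int16_t l1_pSrc[];        // interleaved Re/Im samples in L1
  extern int16_t l1_twiddleCoef[]; // Q15 twiddle table
  #define FFT_LEN 256U             // assumed power of four

  void fft_task_q16p(uint32_t nPE) {
    // Same call on every core; indices are partitioned inside the kernel.
    mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, FFT_LEN, l1_twiddleCoef, 1U,
                                      nPE);
  }
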
+
+void mempool_radix4by2_cfft_q16p(int16_t *pSrc, uint32_t fftLen,
+                                 const int16_t *pCoef, uint32_t nPE) {
+
+  uint32_t i;
+  uint32_t n2, step;
+  v2s pa, pb;
+
+  uint32_t l;
+  v2s CoSi;
+  v2s a, b, t;
+  int16_t testa, testb;
+  uint32_t core_id = mempool_get_core_id();
+
+  n2 = fftLen >> 1;
+  step = (n2 + nPE - 1) / nPE;
+  for (i = core_id * step; i < MIN(core_id * step + step, n2); i++) {
+
+    CoSi = *(v2s *)&pCoef[i * 2];
+    l = i + n2;
+    a = __SRA2(*(v2s *)&pSrc[2 * i], ((v2s){1, 1}));
+    b = __SRA2(*(v2s *)&pSrc[2 * l], ((v2s){1, 1}));
+    t = __SUB2(a, b);
+    *((v2s *)&pSrc[i * 2]) = __SRA2(__ADD2(a, b), ((v2s){1, 1}));
+
+    testa = (int16_t)(__DOTP2(t, CoSi) >> 16);
+    testb = (int16_t)(__DOTP2(t, __PACK2(-CoSi[1], CoSi[0])) >> 16);
+    *((v2s *)&pSrc[l * 2]) = __PACK2(testa, testb);
+  }
+  mempool_log_barrier(2, core_id);
+
+  if (nPE > 1) {
+    if (core_id < nPE / 2) {
+      // first col
+      mempool_radix4_cfft_q16p_xpulpimg(pSrc, n2, (int16_t *)pCoef, 2U,
+                                        nPE / 2);
+    } else {
+      // second col
+      mempool_radix4_cfft_q16p_xpulpimg(pSrc + fftLen, n2, (int16_t *)pCoef,
+                                        2U, nPE - nPE / 2);
+    }
+  } else {
+    // first col
+    mempool_radix4_cfft_q16p_xpulpimg(pSrc, n2, (int16_t *)pCoef, 2U, nPE);
+    // second col
+    mempool_radix4_cfft_q16p_xpulpimg(pSrc + fftLen, n2, (int16_t *)pCoef, 2U,
+                                      nPE);
+  }
+
+  for (i = core_id * step; i < MIN(core_id * step + step, n2); i++) {
+
+    pa = *(v2s *)&pSrc[4 * i];
+    pb = *(v2s *)&pSrc[4 * i + 2];
+
+    pa = __SLL2(pa, ((v2s){1, 1}));
+    pb = __SLL2(pb, ((v2s){1, 1}));
+
+    *((v2s *)&pSrc[4 * i]) = pa;
+    *((v2s *)&pSrc[4 * i + 2]) = pb;
+  }
+  mempool_log_barrier(2, core_id);
+  return;
+}
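
The radix-4-by-2 entry point serves lengths that are twice a power of four (128, 512, ...): the first loop is a radix-2 stage that combines pairs (i, i + fftLen/2), rotating the difference by the twiddle; the two half-length outputs are then each transformed with the radix-4 kernel, and the final loop shifts one bit of amplitude back in. A scalar sketch of the radix-2 split for one pair, with the same scaling as the vectorized code above (hypothetical helper):

  #include <stdint.h>

  // a = x[i], b = x[i + N/2], each as {Re, Im}. Stores (a+b)/4 at i and
  // ((a-b)/2) * (cos - j*sin) at i + N/2, as in the loop above.
  static void radix2_split_q15(const int16_t a[2], const int16_t b[2],
                               int16_t co, int16_t si,
                               int16_t sum[2], int16_t twid[2]) {
    int16_t ar = (int16_t)(a[0] >> 1), ai = (int16_t)(a[1] >> 1);
    int16_t br = (int16_t)(b[0] >> 1), bi = (int16_t)(b[1] >> 1);
    sum[0] = (int16_t)((ar + br) >> 1); // (a + b) / 4
    sum[1] = (int16_t)((ai + bi) >> 1);
    int16_t tr = (int16_t)(ar - br);    // (a - b) / 2
    int16_t ti = (int16_t)(ai - bi);
    twid[0] = (int16_t)(((int32_t)tr * co + (int32_t)ti * si) >> 16);
    twid[1] = (int16_t)(((int32_t)ti * co - (int32_t)tr * si) >> 16);
  }
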
+
+/**
+  @brief     Folding in local memory function
+  @param[in] pSrc16  points to input buffer of 16b data, Re and Im parts are
+                     interleaved
+  @param[in] fftLen  Length of the complex input vector
+  @param[in] nPE     Number of PE
+  @return    none
+*/
+static inline void fold_radix4(int16_t *pSrc16, uint32_t fftLen,
+                               uint32_t nPE) {
+  uint32_t n2, i0, i1, i2, i3;
+  uint32_t i1_store, i2_store, i3_store;
+  volatile v2s A, B, C;
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % 4U;
+  n2 = fftLen >> 2U;
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+    i1_store = i0 + N_BANKS;
+    i2_store = i1_store + N_BANKS;
+    i3_store = i2_store + N_BANKS;
+    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      A = *(v2s *)&pSrc16[i1 * 2U + idx_row * (8 * N_BANKS)];
+      B = *(v2s *)&pSrc16[i2 * 2U + idx_row * (8 * N_BANKS)];
+      C = *(v2s *)&pSrc16[i3 * 2U + idx_row * (8 * N_BANKS)];
+      *(v2s *)&pSrc16[i1_store * 2U + idx_row * (8 * N_BANKS)] = A;
+      *(v2s *)&pSrc16[i2_store * 2U + idx_row * (8 * N_BANKS)] = B;
+      *(v2s *)&pSrc16[i3_store * 2U + idx_row * (8 * N_BANKS)] = C;
+    }
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  return;
+}
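
fold_radix4 copies the upper three quarters of the vector onto successive N_BANKS-strided rows, so the four operands of one radix-4 butterfly live on four different rows of the banked L1 memory. The addressing it realizes, as a hedged plain-C helper (folded_offset is a hypothetical name):

  #include <stdint.h>

  // Halfword offset of element `i` of quarter `q` (q = 0..3) in the folded
  // layout produced above: quarter q sits on row q, i.e. q * n_banks further.
  static inline uint32_t folded_offset(uint32_t i, uint32_t q,
                                       uint32_t n_banks) {
    return 2U * (i + q * n_banks);
  }
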
+
+#ifdef FOLDED_TWIDDLES
+/**
+  @brief      Full FFT butterfly
+  @param[in]  pSrc16     points to input buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[out] pDst16     points to output buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[in]  fftLen     Length of the complex input vector
+  @param[in]  pCoef_src  Twiddle coefficients vector
+  @param[in]  pCoef_dst  Auxiliary twiddle coefficients vector
+  @param[in]  nPE        Number of PE
+  @return     none
+*/
+void mempool_radix4_cfft_q16p_folded(int16_t *pSrc16, int16_t *pDst16,
+                                     uint32_t fftLen, int16_t *pCoef_src,
+                                     int16_t *pCoef_dst, uint32_t nPE)
+#else
+/**
+  @brief      Full FFT butterfly (twiddles are not folded in memory)
+  @param[in]  pSrc16     points to input buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[out] pDst16     points to output buffer of 16b data, Re and Im parts
+                         are interleaved
+  @param[in]  fftLen     Length of the complex input vector
+  @param[in]  pCoef_src  Twiddle coefficients vector
+  @param[in]  nPE        Number of PE
+  @return     none
+*/
+void mempool_radix4_cfft_q16p_folded(int16_t *pSrc16, int16_t *pDst16,
+                                     uint32_t fftLen, int16_t *pCoef_src,
+                                     uint32_t nPE)
+#endif
+{
+
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id;
+  int16_t t0, t1, t2, t3, t4, t5;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+#ifdef FOLDED_TWIDDLES
+  uint32_t n1, n2, n2_store;
+  uint32_t i0, k, ic, ic_store;
+  int16_t *pTmp;
+#else
+  uint32_t n1, n2;
+  uint32_t i0, k, ic;
+  int16_t *pTmp;
+  uint32_t twidCoefModifier = 1U;
+#endif
+
+  /* START OF FIRST STAGE PROCESSING */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
+#ifdef FOLDED_TWIDDLES
+    ic = i0;
+    ic_store = ic >> 2U;
+    n2_store = n2 >> 2U;
+#else
+    ic = i0;
+#endif
+    LOAD_STORE_TWIDDLEFACT;
+    SHUFFLE_TWIDDLEFACT;
+    radix4_butterfly_first(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                           C2, C3);
+  }
+  pTmp = pSrc16;
+  pSrc16 = pDst16;
+  pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+  pTmp = pCoef_src;
+  pCoef_src = pCoef_dst;
+  pCoef_dst = pTmp;
+#else
+  twidCoefModifier <<= 2U;
+#endif
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
+#ifdef FOLDED_TWIDDLES
+      ic = i0;
+      // (ic % n2) / 4: take only every 4th index in the wing
+      // (ic / n2) * n2: shift by the wing size
+      ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
+      n2_store = n2 >> 2U;
+#else
+      ic = (i0 % n2) * twidCoefModifier;
+#endif
+      LOAD_STORE_TWIDDLEFACT;
+      SHUFFLE_TWIDDLEFACT;
+      radix4_butterfly_middle(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3,
+                              C1, C2, C3);
+    }
+    pTmp = pSrc16;
+    pSrc16 = pDst16;
+    pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+    pTmp = pCoef_src;
+    pCoef_src = pCoef_dst;
+    pCoef_dst = pTmp;
+#else
+    twidCoefModifier <<= 2U;
+#endif
+    mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
+    radix4_butterfly_last(pSrc16, pDst16, i0);
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  /* END OF LAST STAGE PROCESSING */
+
+  return;
+}
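
Every stage writes into the other buffer and the pointers (and, under FOLDED_TWIDDLES, the twiddle buffers) are swapped in between, so whether the result ends up in pSrc16 or pDst16 depends on the number of stages. A hedged call sketch under FOLDED_TWIDDLES (buffer names are placeholders; each buffer must hold fftLen complex samples folded over four bank rows):

  #include <stdint.h>

  extern int16_t l1_ping[], l1_pong[];      // folded working buffers in L1
  extern int16_t l1_twid_a[], l1_twid_b[];  // twiddle ping-pong buffers

  void fft_folded_task(uint32_t fftLen, uint32_t nPE) {
    // Same call on every participating core.
    mempool_radix4_cfft_q16p_folded(l1_ping, l1_pong, fftLen, l1_twid_a,
                                    l1_twid_b, nPE);
  }
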
+
+/**
+  SCHEDULER OF MULTIPLE FOLDED FFTS
+  Memory:
+
+  1st row of FFTS
+
+  col_idx1     col_idx2     col_idx3
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+
+  2nd row of FFTS
+
+  col_idx1     col_idx2     col_idx3
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+  xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ...
+
+  ...
+
+  @brief      Scheduler of folded FFTs (the column index of the current FFT
+              is derived from the core id)
+  @param[in]  pSrc16          input buffer of 16b data, Re and Im are
+                              interleaved
+  @param[out] pDst16          output buffer of 16b data, Re and Im are
+                              interleaved
+  @param[in]  fftLen          Length of the complex input vector
+  @param[in]  pCoef_src       Twiddle coefficients vector
+  @param[in]  pCoef_dst       Auxiliary twiddle coefficients vector
+  @param[in]  pBitRevTable    Bitreversal table
+  @param[in]  bitReverseLen   Length of the bitreversal table
+  @param[in]  bitReverseFlag  Flag for bitreversal
+  @param[in]  nPE             Number of PE
+  @return     none
+*/
+void mempool_radix4_cfft_q16p_scheduler(
+    int16_t *pSrc16, int16_t *pDst16, uint32_t fftLen, int16_t *pCoef_src,
+    __attribute__((unused)) int16_t *pCoef_dst,
+    __attribute__((unused)) uint16_t *pBitRevTable,
+    __attribute__((unused)) uint16_t bitReverseLen, uint8_t bitReverseFlag,
+    uint32_t nPE) {
+
+  uint32_t absolute_core_id = mempool_get_core_id();
+  uint32_t core_id = absolute_core_id % (fftLen >> 4U);
+  uint32_t col_id = absolute_core_id / (fftLen >> 4U);
+
+  int16_t t0, t1, t2, t3, t4, t5;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+#ifdef FOLDED_TWIDDLES
+  uint32_t n1, n2, n2_store;
+  uint32_t i0, k, ic, ic_store;
+#else
+  uint32_t n1, n2;
+  uint32_t i0, k, ic;
+  uint32_t twidCoefModifier = 1U;
+#endif
+  int16_t *pTmp;
+
+  /* FIRST STAGE */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) {
+    ic = i0;
+#ifdef FOLDED_TWIDDLES
+    ic_store = ic >> 2U;
+    n2_store = n2 >> 2U;
+#endif
+    LOAD_STORE_TWIDDLEFACT;
+    SHUFFLE_TWIDDLEFACT;
+    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      int16_t *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen;
+      int16_t *pOut =
+          pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2,
+                             C3);
+    }
+  }
+  pTmp = pSrc16;
+  pSrc16 = pDst16;
+  pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+  pTmp = pCoef_src;
+  pCoef_src = pCoef_dst;
+  pCoef_dst = pTmp;
+#else
+  twidCoefModifier <<= 2U;
+#endif
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+
+  /* MIDDLE STAGE */
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    for (i0 = core_id * 4; i0 < core_id * 4 + 4; i0++) {
+#ifdef FOLDED_TWIDDLES
+      ic = i0;
+      ic_store = ((ic % n2) >> 2) + (ic / n2) * n2;
+      n2_store = n2 >> 2U;
+#else
+      ic = (i0 % n2) * twidCoefModifier;
+#endif
+      LOAD_STORE_TWIDDLEFACT;
+      SHUFFLE_TWIDDLEFACT;
+
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        int16_t *pIn =
+            pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        int16_t *pOut =
+            pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+        radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                                C2, C3);
+      }
+    }
+    pTmp = pSrc16;
+    pSrc16 = pDst16;
+    pDst16 = pTmp;
+#ifdef FOLDED_TWIDDLES
+    pTmp = pCoef_src;
+    pCoef_src = pCoef_dst;
+    pCoef_dst = pTmp;
+#else
+    twidCoefModifier <<= 2U;
+#endif
+    mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
+  }
+
+  /* LAST STAGE */
+  for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) {
+    for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+      int16_t *pIn =
+          pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      int16_t *pOut =
+          pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4);
+      radix4_butterfly_last(pIn, pOut, i0);
+    }
+  }
+  pTmp = pSrc16;
+  pSrc16 = pDst16;
+  pDst16 = pTmp;
+  mempool_log_partial_barrier(2, absolute_core_id, N_FFTs_COL * nPE);
+  mempool_stop_benchmark();
+  mempool_start_benchmark();
+
+  /* BITREVERSAL */
+  // The bitreversal stage stores to sequential addresses
+  if (bitReverseFlag) {
+#ifdef BITREVERSETABLE
+    pSrc16 = pSrc16 + 2 * col_id * (fftLen / 4);
+    pDst16 = pDst16 + 2 * col_id * fftLen;
+    for (ic = 8 * core_id; ic < bitReverseLen; ic += 8 * nPE) {
+      uint32_t addr1, addr2, addr3, addr4;
+      uint32_t tmpa1, tmpa2, tmpa3, tmpa4;
+      uint32_t tmpb1, tmpb2, tmpb3, tmpb4;
+      uint32_t a1, a2, a3, a4;
+      uint32_t b1, b2, b3, b4;
+      uint32_t a1_load, a2_load, a3_load, a4_load;
+      uint32_t b1_load, b2_load, b3_load, b4_load;
+      uint32_t s2 = 0x00020002;
+      addr1 = *(uint32_t *)&pBitRevTable[ic];
+      addr2 = *(uint32_t *)&pBitRevTable[ic + 2];
+      addr3 = *(uint32_t *)&pBitRevTable[ic + 4];
+      addr4 = *(uint32_t *)&pBitRevTable[ic + 6];
+      asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];"
+                   "pv.sra.h %[addr2],%[addr2],%[s2];"
+                   "pv.sra.h %[addr3],%[addr3],%[s2];"
+                   "pv.sra.h %[addr4],%[addr4],%[s2];"
+                   "pv.extract.h %[a1],%[addr1],0;"
+                   "pv.extract.h %[a2],%[addr2],0;"
+                   "pv.extract.h %[a3],%[addr3],0;"
+                   "pv.extract.h %[a4],%[addr4],0;"
+                   "pv.extract.h %[b1],%[addr1],1;"
+                   "pv.extract.h %[b2],%[addr2],1;"
+                   "pv.extract.h %[b3],%[addr3],1;"
+                   "pv.extract.h %[b4],%[addr4],1;"
+                   : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3),
+                     [a4] "=r"(a4), [b1] "=r"(b1), [b2] "=r"(b2),
+                     [b3] "=r"(b3), [b4] "=r"(b4), [addr1] "+&r"(addr1),
+                     [addr2] "+&r"(addr2), [addr3] "+&r"(addr3),
+                     [addr4] "+&r"(addr4)
+                   : [s2] "r"(s2)
+                   :);
+      // Compute the local addresses from the natural-order ones
+      a1_load = (a1 % 4) * 2 * N_BANKS + 2 * (a1 / 4);
+      a2_load = (a2 % 4) * 2 * N_BANKS + 2 * (a2 / 4);
+      a3_load = (a3 % 4) * 2 * N_BANKS + 2 * (a3 / 4);
+      a4_load = (a4 % 4) * 2 * N_BANKS + 2 * (a4 / 4);
+      b1_load = (b1 % 4) * 2 * N_BANKS + 2 * (b1 / 4);
+      b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4);
+      b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4);
+      b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4);
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8));
+        uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8));
+        // Load at address a
+        tmpa1 = *(uint32_t *)&ptr1[a1_load];
+        tmpa2 = *(uint32_t *)&ptr1[a2_load];
+        tmpa3 = *(uint32_t *)&ptr1[a3_load];
+        tmpa4 = *(uint32_t *)&ptr1[a4_load];
+        // Load at address b
+        tmpb1 = *(uint32_t *)&ptr1[b1_load];
+        tmpb2 = *(uint32_t *)&ptr1[b2_load];
+        tmpb3 = *(uint32_t *)&ptr1[b3_load];
+        tmpb4 = *(uint32_t *)&ptr1[b4_load];
+        // Swap a with b
+        *((uint32_t *)&ptr2[b1]) = tmpa1;
+        *((uint32_t *)&ptr2[b2]) = tmpa2;
+        *((uint32_t *)&ptr2[b3]) = tmpa3;
+        *((uint32_t *)&ptr2[b4]) = tmpa4;
+        // Swap b with a
+        *((uint32_t *)&ptr2[a1]) = tmpb1;
+        *((uint32_t *)&ptr2[a2]) = tmpb2;
+        *((uint32_t *)&ptr2[a3]) = tmpb3;
+        *((uint32_t *)&ptr2[a4]) = tmpb4;
+      }
+    }
+#else
+    uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4));
+    uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen);
+    for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U);
+         ic += 4) {
+      uint32_t idx0 = ic;
+      uint32_t idx1 = ic + 1;
+      uint32_t idx2 = ic + 2;
+      uint32_t idx3 = ic + 3;
+      uint32_t idx_result0 = 0;
+      uint32_t idx_result1 = 0;
+      uint32_t idx_result2 = 0;
+      uint32_t idx_result3 = 0;
+      // Compute the folded source addresses before the shifts below consume
+      // the natural-order indices
+      uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS;
+      uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS;
+      uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS;
+      uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS;
+      for (k = 0; k < LOG2; k++) {
+        idx_result0 = (idx_result0 << 1U) | (idx0 & 1U);
+        idx_result1 = (idx_result1 << 1U) | (idx1 & 1U);
+        idx_result2 = (idx_result2 << 1U) | (idx2 & 1U);
+        idx_result3 = (idx_result3 << 1U) | (idx3 & 1U);
+        idx0 = idx0 >> 1U;
+        idx1 = idx1 >> 1U;
+        idx2 = idx2 >> 1U;
+        idx3 = idx3 >> 1U;
+      }
+      for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) {
+        uint32_t src0 = addr_src0 + idx_row * (N_BANKS * 8);
+        uint32_t src1 = addr_src1 + idx_row * (N_BANKS * 8);
+        uint32_t src2 = addr_src2 + idx_row * (N_BANKS * 8);
+        uint32_t src3 = addr_src3 + idx_row * (N_BANKS * 8);
+        uint32_t dst0 = idx_result0 + idx_row * (N_BANKS * 8);
+        uint32_t dst1 = idx_result1 + idx_row * (N_BANKS * 8);
+        uint32_t dst2 = idx_result2 + idx_row * (N_BANKS * 8);
+        uint32_t dst3 = idx_result3 + idx_row * (N_BANKS * 8);
+        // Copy the full Re/Im pair to the bit-reversed location
+        *((uint32_t *)&ptr2[dst0]) = *(uint32_t *)&ptr1[src0];
+        *((uint32_t *)&ptr2[dst1]) = *(uint32_t *)&ptr1[src1];
+        *((uint32_t *)&ptr2[dst2]) = *(uint32_t *)&ptr1[src2];
+        *((uint32_t *)&ptr2[dst3]) = *(uint32_t *)&ptr1[src3];
+      }
+    }
+#endif
+  }
+  mempool_log_partial_barrier(2, absolute_core_id, nPE);
+  return;
+}
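
A hedged sketch of how the scheduler is meant to be entered, with all cores of all columns making the same call (the buffer and table names are placeholders following the memory layout sketched above):

  #include <stdint.h>

  extern int16_t l1_src[], l1_dst[];        // folded working buffers in L1
  extern int16_t l1_twid_a[], l1_twid_b[];  // twiddle ping-pong buffers
  extern uint16_t l1_bitrev[];              // bit-reversal table
  extern uint16_t l1_bitrev_len;            // its length

  void fft_sched_task(uint32_t fftLen, uint32_t nPE) {
    mempool_radix4_cfft_q16p_scheduler(l1_src, l1_dst, fftLen, l1_twid_a,
                                       l1_twid_b, l1_bitrev, l1_bitrev_len,
                                       1U, nPE);
  }
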
diff --git a/software/runtime/kernel/mempool_radix4_cfft_q16s.h b/software/runtime/kernel/mempool_radix4_cfft_q16s.h
new file mode 100644
index 000000000..d795cc40c
--- /dev/null
+++ b/software/runtime/kernel/mempool_radix4_cfft_q16s.h
@@ -0,0 +1,572 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include "xpulp/builtins_v2.h"
+
+void mempool_radix4_cfft_q16s_riscv32(int16_t *pIn, uint32_t fftLen,
+                                      int16_t *pCoef16,
+                                      uint32_t twidCoefModifier) {
+
+  int16_t R0, R1, S0, S1, T0, T1, U0, U1;
+  int16_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+
+  /* START OF FIRST STAGE PROCESSING */
+  n2 = fftLen;
+  n1 = n2;
+  n2 >>= 2U;
+  ic = 0U;
+  for (i0 = 0; i0 < n2; i0++) {
+
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /* Reading i0, i0+fftLen/2 inputs */
+    /* The input is downscaled by 4 to avoid overflow */
+    /* Read ya (real), xa (imag) input */
+    T0 = pIn[i0 * 2U] >> 2U;
+    T1 = pIn[(i0 * 2U) + 1U] >> 2U;
+    /* Read yc (real), xc (imag) input */
+    S0 = pIn[i2 * 2U] >> 2U;
+    S1 = pIn[(i2 * 2U) + 1U] >> 2U;
+    /* R0 = (ya + yc) */
+    R0 = (int16_t)__CLIP(T0 + S0, 15);
+    /* R1 = (xa + xc) */
+    R1 = (int16_t)__CLIP(T1 + S1, 15);
+    /* S0 = (ya - yc) */
+    S0 = (int16_t)__CLIP(T0 - S0, 15);
+    /* S1 = (xa - xc) */
+    S1 = (int16_t)__CLIP(T1 - S1, 15);
+    /* Reading i0+fftLen/4, i0+3fftLen/4 inputs */
+    /* Read yb (real), xb (imag) input */
+    T0 = pIn[i1 * 2U] >> 2U;
+    T1 = pIn[(i1 * 2U) + 1U] >> 2U;
+    /* Read yd (real), xd (imag) input */
+    U0 = pIn[i3 * 2U] >> 2U;
+    U1 = pIn[(i3 * 2U) + 1U] >> 2U;
+    /* T0 = (yb + yd) */
+    T0 = (int16_t)__CLIP(T0 + U0, 15);
+    /* T1 = (xb + xd) */
+    T1 = (int16_t)__CLIP(T1 + U1, 15);
+    /* Writing the butterfly-processed i0 sample */
+    /* ya' = ya + yb + yc + yd */
+    /* xa' = xa + xb + xc + xd */
+    pIn[i0 * 2] = (int16_t)((R0 >> 1U) + (T0 >> 1U));
+    pIn[(i0 * 2) + 1] = (int16_t)((R1 >> 1U) + (T1 >> 1U));
+    /* R0 = (ya + yc) - (yb + yd) */
+    /* R1 = (xa + xc) - (xb + xd) */
+    R0 = (int16_t)__CLIP(R0 - T0, 15);
+    R1 = (int16_t)__CLIP(R1 - T1, 15);
+    /* co2 & si2 are read from the coefficient pointer */
+    Co2 = pCoef16[2U * ic * 2U];
+    Si2 = pCoef16[(2U * ic * 2U) + 1];
+    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+    out1 = (int16_t)((Co2 * R0 + Si2 * R1) >> 16U);
+    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+    out2 = (int16_t)((-Si2 * R0 + Co2 * R1) >> 16U);
+    /* Reading i0+fftLen/4 */
+    /* The input is downscaled by 4 to avoid overflow */
+    /* T0 = yb, T1 = xb */
+    T0 = pIn[i1 * 2U] >> 2;
+    T1 = pIn[(i1 * 2U) + 1] >> 2;
+    /* Writing the butterfly-processed i0 + fftLen/4 sample */
+    /* Writing output (xc', yc') in little-endian format */
+    pIn[i1 * 2U] = out1;
+    pIn[(i1 * 2U) + 1] = out2;
+    /* Butterfly calculations */
+    /* U0 = yd, U1 = xd */
+    U0 = pIn[i3 * 2U] >> 2;
+    U1 = pIn[(i3 * 2U) + 1] >> 2;
+    /* T0 = yb - yd */
+    T0 = (int16_t)__CLIP(T0 - U0, 15);
+    /* T1 = xb - xd */
+    T1 = (int16_t)__CLIP(T1 - U1, 15);
+    /* R0 = (ya-yc) - (xb-xd), R1 = (xa-xc) + (yb-yd) */
+    R0 = (int16_t)__CLIP((int32_t)(S0 - T1), 15);
+    R1 = (int16_t)__CLIP((int32_t)(S1 + T0), 15);
+    /* S0 = (ya-yc) + (xb-xd), S1 = (xa-xc) - (yb-yd) */
+    S0 = (int16_t)__CLIP(((int32_t)S0 + T1), 15);
+    S1 = (int16_t)__CLIP(((int32_t)S1 - T0), 15);
+    /* co1 & si1 are read from the coefficient pointer */
+    Co1 = pCoef16[ic * 2U];
+    Si1 = pCoef16[(ic * 2U) + 1];
+    /* Butterfly process for the i0+fftLen/2 sample */
+    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+    out1 = (int16_t)((Si1 * S1 + Co1 * S0) >> 16);
+    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+    out2 = (int16_t)((-Si1 * S0 + Co1 * S1) >> 16);
+    /* Writing output (xb', yb') in little-endian format */
+    pIn[i2 * 2U] = out1;
+    pIn[(i2 * 2U) + 1] = out2;
+    /* co3 & si3 are read from the coefficient pointer */
+    Co3 = pCoef16[3U * (ic * 2U)];
+    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
+    /* Butterfly process for the i0+3fftLen/4 sample */
+    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+    out1 = (int16_t)((Si3 * R1 + Co3 * R0) >> 16U);
+    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+    out2 = (int16_t)((-Si3 * R0 + Co3 * R1) >> 16U);
+    /* Writing output (xd', yd') in little-endian format */
+    pIn[i3 * 2U] = out1;
+    pIn[(i3 * 2U) + 1] = out2;
+
+    /* Twiddle coefficients index modifier */
+    ic = ic + twidCoefModifier;
+  }
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+
+    /* Initializations for the middle stage */
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+    for (j = 0U; j <= (n2 - 1U); j++) {
+
+      /* Index calculation for the coefficients */
+      Co1 = pCoef16[ic * 2U];
+      Si1 = pCoef16[(ic * 2U) + 1U];
+      Co2 = pCoef16[2U * (ic * 2U)];
+      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
+      Co3 = pCoef16[3U * (ic * 2U)];
+      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
+      /* Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+
+      /* Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1) {
+
+        i1 = i0 + n2;
+        i2 = i1 + n2;
+        i3 = i2 + n2;
+
+        /* Reading i0, i0+fftLen/2 inputs */
+        /* Read ya (real), xa (imag) input */
+        T0 = pIn[i0 * 2U];
+        T1 = pIn[(i0 * 2U) + 1U];
+        /* Read yc (real), xc (imag) input */
+        S0 = pIn[i2 * 2U];
+        S1 = pIn[(i2 * 2U) + 1U];
+        /* R0 = (ya + yc), R1 = (xa + xc) */
+        R0 = (int16_t)__CLIP(T0 + S0, 15);
+        R1 = (int16_t)__CLIP(T1 + S1, 15);
+        /* S0 = (ya - yc), S1 = (xa - xc) */
+        S0 = (int16_t)__CLIP(T0 - S0, 15);
+        S1 = (int16_t)__CLIP(T1 - S1, 15);
+        /* Reading i0+fftLen/4, i0+3fftLen/4 inputs */
+        /* Read yb (real), xb (imag) input */
+        T0 = pIn[i1 * 2U];
+        T1 = pIn[(i1 * 2U) + 1U];
+        /* Read yd (real), xd (imag) input */
+        U0 = pIn[i3 * 2U];
+        U1 = pIn[(i3 * 2U) + 1U];
+        /* T0 = (yb + yd), T1 = (xb + xd) */
+        T0 = (int16_t)__CLIP(T0 + U0, 15);
+        T1 = (int16_t)__CLIP(T1 + U1, 15);
+        /* Writing the butterfly-processed i0 sample */
+        /* xa' = xa + xb + xc + xd */
+        /* ya' = ya + yb + yc + yd */
+        out1 = (int16_t)(((R0 >> 1U) + (T0 >> 1U)) >> 1U);
+        out2 = (int16_t)(((R1 >> 1U) + (T1 >> 1U)) >> 1U);
+        pIn[i0 * 2U] = out1;
+        pIn[(2U * i0) + 1U] = out2;
+        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+        R0 = (int16_t)((R0 >> 1U) - (T0 >> 1U));
+        R1 = (int16_t)((R1 >> 1U) - (T1 >> 1U));
+        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+        out1 = (int16_t)((Co2 * R0 + Si2 * R1) >> 16U);
+        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        out2 = (int16_t)((-Si2 * R0 + Co2 * R1) >> 16U);
+        /* Reading i0+fftLen/4 */
+        /* Read yb (real), xb (imag) input */
+        T0 = pIn[i1 * 2U];
+        T1 = pIn[(i1 * 2U) + 1U];
+        /* Writing the butterfly-processed i0 + fftLen/4 sample */
+        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+        pIn[i1 * 2U] = out1;
+        pIn[(i1 * 2U) + 1U] = out2;
+        /* Butterfly calculations */
+        /* Read yd (real), xd (imag) input */
+        U0 = pIn[i3 * 2U];
+        U1 = pIn[(i3 * 2U) + 1U];
+        /* T0 = yb-yd, T1 = xb-xd */
+        T0 = (int16_t)__CLIP(T0 - U0, 15);
+        T1 = (int16_t)__CLIP(T1 - U1, 15);
+        /* R0 = (ya-yc) - (xb-xd), R1 = (xa-xc) + (yb-yd) */
+        R0 = (int16_t)((S0 >> 1U) - (T1 >> 1U));
+        R1 = (int16_t)((S1 >> 1U) + (T0 >> 1U));
+        /* S0 = (ya-yc) + (xb-xd), S1 = (xa-xc) - (yb-yd) */
+        S0 = (int16_t)((S0 >> 1U) + (T1 >> 1U));
+        S1 = (int16_t)((S1 >> 1U) - (T0 >> 1U));
+        /* Butterfly process for the i0+fftLen/2 sample */
+        out1 = (int16_t)((Co1 * S0 + Si1 * S1) >> 16U);
+        out2 = (int16_t)((-Si1 * S0 + Co1 * S1) >> 16U);
+        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+        pIn[i2 * 2U] = out1;
+        pIn[(i2 * 2U) + 1U] = out2;
+        /* Butterfly process for the i0+3fftLen/4 sample */
+        out1 = (int16_t)((Si3 * R1 + Co3 * R0) >> 16U);
+        out2 = (int16_t)((-Si3 * R0 + Co3 * R1) >> 16U);
+        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+        pIn[i3 * 2U] = out1;
+        pIn[(i3 * 2U) + 1U] = out2;
+      }
+    }
+    /* Twiddle coefficients index modifier */
+    twidCoefModifier <<= 2U;
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  for (i0 = 0U; i0 < fftLen; i0 += n1) {
+
+    i1 = i0 + n2;
+    i2 = i1 + n2;
+    i3 = i2 + n2;
+
+    /* Reading i0, i0+fftLen/2 inputs */
+    /* Read ya (real), xa (imag) input */
+    T0 = pIn[i0 * 2U];
+    T1 = pIn[(i0 * 2U) + 1U];
+    /* Read yc (real), xc (imag) input */
+    S0 = pIn[i2 * 2U];
+    S1 = pIn[(i2 * 2U) + 1U];
+    /* R0 = (ya + yc), R1 = (xa + xc) */
+    R0 = (int16_t)__CLIP(T0 + S0, 15);
+    R1 = (int16_t)__CLIP(T1 + S1, 15);
+    /* S0 = (ya - yc), S1 = (xa - xc) */
+    S0 = (int16_t)__CLIP(T0 - S0, 15);
+    S1 = (int16_t)__CLIP(T1 - S1, 15);
+    /* Reading i0+fftLen/4, i0+3fftLen/4 inputs */
+    /* Read yb (real), xb (imag) input */
+    T0 = pIn[i1 * 2U];
+    T1 = pIn[(i1 * 2U) + 1U];
+    /* Read yd (real), xd (imag) input */
+    U0 = pIn[i3 * 2U];
+    U1 = pIn[(i3 * 2U) + 1U];
+    /* T0 = (yb + yd), T1 = (xb + xd) */
+    T0 = (int16_t)__CLIP(T0 + U0, 15);
+    T1 = (int16_t)__CLIP(T1 + U1, 15);
+    /* Writing the butterfly-processed i0 sample */
+    /* xa' = xa + xb + xc + xd */
+    /* ya' = ya + yb + yc + yd */
+    pIn[i0 * 2U] = (int16_t)((R0 >> 1U) + (T0 >> 1U));
+    pIn[(i0 * 2U) + 1U] = (int16_t)((R1 >> 1U) + (T1 >> 1U));
+    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+    R0 = (int16_t)((R0 >> 1U) - (T0 >> 1U));
+    R1 = (int16_t)((R1 >> 1U) - (T1 >> 1U));
+    /* Read yb (real), xb (imag) input */
+    T0 = pIn[i1 * 2U];
+    T1 = pIn[(i1 * 2U) + 1U];
+    /* Writing the butterfly-processed i0 + fftLen/4 sample */
+    /* xc' = (xa-xb+xc-xd) */
+    /* yc' = (ya-yb+yc-yd) */
+    pIn[i1 * 2U] = R0;
+    pIn[(i1 * 2U) + 1U] = R1;
+    /* Read yd (real), xd (imag) input */
+    U0 = pIn[i3 * 2U];
+    U1 = pIn[(i3 * 2U) + 1U];
+    /* T0 = (yb - yd), T1 = (xb - xd) */
+    T0 = (int16_t)__CLIP(T0 - U0, 15);
+    T1 = (int16_t)__CLIP(T1 - U1, 15);
+    /* Writing the butterfly-processed i0 + fftLen/2 sample */
+    /* xb' = (xa+yb-xc-yd) */
+    /* yb' = (ya-xb-yc+xd) */
+    pIn[i2 * 2U] = (int16_t)((S0 >> 1U) + (T1 >> 1U));
+    pIn[(i2 * 2U) + 1U] = (int16_t)((S1 >> 1U) - (T0 >> 1U));
+    /* Writing the butterfly-processed i0 + 3fftLen/4 sample */
+    /* xd' = (xa-yb-xc+yd) */
+    /* yd' = (ya+xb-yc-xd) */
+    pIn[i3 * 2U] = (int16_t)((S0 >> 1U) - (T1 >> 1U));
+    pIn[(i3 * 2U) + 1U] = (int16_t)((S1 >> 1U) + (T0 >> 1U));
+  }
+  /* END OF LAST STAGE PROCESSING */
+}
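
Throughout these kernels __CLIP(x, 15) saturates a 32-bit intermediate to the signed 16-bit range. A plain-C equivalent, useful when porting or unit-testing on a host (hypothetical helper):

  #include <stdint.h>

  static inline int16_t clip_q15(int32_t x) {
    if (x > 32767)   // 2^15 - 1
      return 32767;
    if (x < -32768)  // -2^15
      return -32768;
    return (int16_t)x;
  }
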
+
+void mempool_radix4_cfft_q16s_xpulpimg(int16_t *pSrc16, uint32_t fftLen,
+                                       int16_t *pCoef16,
+                                       uint32_t twidCoefModifier) {
+
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+  int16_t t0, t1, t2, t3, t4, t5;
+  uint32_t n1, n2, ic, i0, j, k;
+
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+
+  /* START OF FIRST STAGE PROCESSING */
+  for (i0 = 0; i0 < n2; i0++) {
+    CoSi1 = *(v2s *)&pCoef16[2U * i0];
+    CoSi2 = *(v2s *)&pCoef16[2U * i0 * 2U];
+    CoSi3 = *(v2s *)&pCoef16[2U * i0 * 3U];
+    SHUFFLE_TWIDDLEFACT;
+
+    radix4_butterfly_first(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3, C1,
+                           C2, C3);
+  }
+  /* END OF FIRST STAGE PROCESSING */
+
+  /* START OF MIDDLE STAGE PROCESSING */
+  twidCoefModifier <<= 2U;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+    for (j = 0U; j <= (n2 - 1U); j++) {
+      /* Index calculation for the coefficients */
+      CoSi1 = *(v2s *)&pCoef16[ic * 2U];
+      CoSi2 = *(v2s *)&pCoef16[2U * (ic * 2U)];
+      CoSi3 = *(v2s *)&pCoef16[3U * (ic * 2U)];
+      SHUFFLE_TWIDDLEFACT;
+
+      /* Twiddle coefficients index modifier */
+      ic = ic + twidCoefModifier;
+      /* Butterfly implementation */
+      for (i0 = j; i0 < fftLen; i0 += n1) {
+        radix4_butterfly_middle(pSrc16, pSrc16, i0, n2, CoSi1, CoSi2, CoSi3,
+                                C1, C2, C3);
+      }
+    }
+    twidCoefModifier <<= 2U;
+  }
+  /* END OF MIDDLE STAGE PROCESSING */
+
+  /* START OF LAST STAGE PROCESSING */
+  n1 = n2;
+  n2 >>= 2U;
+  for (i0 = 0U; i0 < fftLen; i0 += n1) {
+    radix4_butterfly_last(pSrc16, pSrc16, i0);
+  }
+  /* END OF LAST STAGE PROCESSING */
+}
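
For validating any of these kernels up to their internal scaling, a naive double-precision DFT makes a convenient host-side reference; a minimal sketch (hypothetical helper):

  #include <math.h>
  #include <stdint.h>

  // O(N^2) reference DFT of n interleaved Re/Im int16 samples.
  static void dft_ref(const int16_t *in, double *out_re, double *out_im,
                      uint32_t n) {
    const double PI = 3.14159265358979323846;
    for (uint32_t k = 0; k < n; k++) {
      double sr = 0.0, si = 0.0;
      for (uint32_t i = 0; i < n; i++) {
        double ang = -2.0 * PI * (double)k * (double)i / (double)n;
        double re = (double)in[2 * i], im = (double)in[2 * i + 1];
        sr += re * cos(ang) - im * sin(ang);
        si += re * sin(ang) + im * cos(ang);
      }
      out_re[k] = sr;
      out_im[k] = si;
    }
  }
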
+
+void mempool_cfft_radix4by2_q16s_riscv32(int16_t *pSrc, uint32_t fftLen,
+                                         const int16_t *pCoef) {
+
+  uint32_t i;
+  uint32_t n2;
+  int16_t p0, p1, p2, p3;
+
+  uint32_t l;
+  int16_t xt, yt, cosVal, sinVal;
+
+  n2 = fftLen >> 1;
+
+  for (i = 0; i < n2; i++) {
+    cosVal = pCoef[i * 2];
+    sinVal = pCoef[(i * 2) + 1];
+
+    l = i + n2;
+
+    xt = (int16_t)((pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U));
+    pSrc[2 * i] = (int16_t)(((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U);
+
+    yt = (int16_t)((pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U));
+    pSrc[2 * i + 1] =
+        (int16_t)(((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U);
+
+    pSrc[2U * l] = (int16_t)(((int16_t)(((int32_t)xt * cosVal) >> 16)) +
+                             ((int16_t)(((int32_t)yt * sinVal) >> 16)));
+
+    pSrc[2U * l + 1U] = (int16_t)(((int16_t)(((int32_t)yt * cosVal) >> 16)) -
+                                  ((int16_t)(((int32_t)xt * sinVal) >> 16)));
+  }
+
+  // first col
+  mempool_radix4_cfft_q16s_riscv32(pSrc, n2, (int16_t *)pCoef, 2U);
+  // second col
+  mempool_radix4_cfft_q16s_riscv32(pSrc + fftLen, n2, (int16_t *)pCoef, 2U);
+
+  for (i = 0; i < (fftLen >> 1); i++) {
+    p0 = pSrc[4 * i + 0];
+    p1 = pSrc[4 * i + 1];
+    p2 = pSrc[4 * i + 2];
+    p3 = pSrc[4 * i + 3];
+
+    p0 = (int16_t)(p0 << 1U);
+    p1 = (int16_t)(p1 << 1U);
+    p2 = (int16_t)(p2 << 1U);
+    p3 = (int16_t)(p3 << 1U);
+
+    pSrc[4 * i + 0] = p0;
+    pSrc[4 * i + 1] = p1;
+    pSrc[4 * i + 2] = p2;
+    pSrc[4 * i + 3] = p3;
+  }
+}
+
+void mempool_cfft_radix4by2_q16s_xpulpimg(int16_t *pSrc, uint32_t fftLen,
+                                          const int16_t *pCoef) {
+
+  uint32_t i;
+  uint32_t n2;
+  v2s pa, pb;
+
+  uint32_t l;
+  v2s CoSi;
+  v2s a, b, t;
+  int16_t testa, testb;
+
+  n2 = fftLen >> 1;
+
+  for (i = 0; i < n2; i++) {
+    CoSi = *(v2s *)&pCoef[i * 2];
+    l = i + n2;
+    a = __SRA2(*(v2s *)&pSrc[2 * i], ((v2s){1, 1}));
+    b = __SRA2(*(v2s *)&pSrc[2 * l], ((v2s){1, 1}));
+    t = __SUB2(a, b);
+    *((v2s *)&pSrc[i * 2]) = __SRA2(__ADD2(a, b), ((v2s){1, 1}));
+
+    testa = (int16_t)(__DOTP2(t, CoSi) >> 16);
+    testb = (int16_t)(__DOTP2(t, __PACK2(-CoSi[1], CoSi[0])) >> 16);
+    *((v2s *)&pSrc[l * 2]) = __PACK2(testa, testb);
+  }
+
+  // first col
+  mempool_radix4_cfft_q16s_xpulpimg(pSrc, n2, (int16_t *)pCoef, 2U);
+  // second col
+  mempool_radix4_cfft_q16s_xpulpimg(pSrc + fftLen, n2, (int16_t *)pCoef, 2U);
+
+  for (i = 0; i < (fftLen >> 1); i++) {
+    pa = *(v2s *)&pSrc[4 * i];
+    pb = *(v2s *)&pSrc[4 * i + 2];
+
+    pa = __SLL2(pa, ((v2s){1, 1}));
+    pb = __SLL2(pb, ((v2s){1, 1}));
+
+    *((v2s *)&pSrc[4 * i]) = pa;
+    *((v2s *)&pSrc[4 * i + 2]) = pb;
+  }
+}
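
The riscv32 and xpulpimg radix-4-by-2 variants are drop-in equivalents. A hedged single-core call sketch (array names are placeholders; fftLen must be twice a power of four):

  #include <stdint.h>

  extern int16_t l1_buf[];         // 2 * fftLen halfwords, Re/Im interleaved
  extern const int16_t l1_twid[];  // Q15 twiddle table

  void fft_single(uint32_t fftLen) {
    mempool_cfft_radix4by2_q16s_xpulpimg(l1_buf, fftLen, l1_twid);
  }
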
+
+void mempool_radix4_cfft_q16s_scheduler(
+    int16_t *pSrc16, uint16_t fftLen, int16_t *pCoef16, uint16_t *pBitRevTable,
+    uint16_t bitReverseLen, uint8_t bitReverseFlag, uint32_t nFFTs) {
+
+  /* Initializations for the first stage */
+  int16_t t0, t1, t2, t3, t4, t5;
+  uint32_t n1, n2, i0, ic, j, k, twidCoefModifier;
+  v2s CoSi1, CoSi2, CoSi3;
+  v2s C1, C2, C3;
+
+  /* FIRST STAGE */
+  n1 = fftLen;
+  n2 = n1 >> 2U;
+  for (i0 = 0; i0 < n2; i0++) {
+    CoSi1 = *(v2s *)&pCoef16[2U * i0];
+    CoSi2 = *(v2s *)&pCoef16[2U * i0 * 2U];
+    CoSi3 = *(v2s *)&pCoef16[2U * i0 * 3U];
+    SHUFFLE_TWIDDLEFACT;
+
+    for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+      radix4_butterfly_first(pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                             pSrc16 + (int32_t)idx_fft * (2 * fftLen), i0, n2,
+                             CoSi1, CoSi2, CoSi3, C1, C2, C3);
+    }
+  }
+
+  /* MIDDLE STAGE */
+  twidCoefModifier = 4;
+  for (k = fftLen / 4U; k > 4U; k >>= 2U) {
+    n1 = n2;
+    n2 >>= 2U;
+    ic = 0U;
+    for (j = 0U; j <= (n2 - 1U); j++) {
+      CoSi1 = *(v2s *)&pCoef16[2U * ic];
+      CoSi2 = *(v2s *)&pCoef16[2U * ic * 2U];
+      CoSi3 = *(v2s *)&pCoef16[2U * ic * 3U];
+      SHUFFLE_TWIDDLEFACT;
+
+      ic = ic + twidCoefModifier;
+      for (i0 = j; i0 < fftLen; i0 += n1) {
+        for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+          radix4_butterfly_middle(pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                                  pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                                  i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3);
+        }
+      }
+    }
+    twidCoefModifier <<= 2U;
+  }
+
+  /* LAST STAGE */
+  n1 = n2;
+  n2 >>= 2U;
+  for (i0 = 0U; i0 < fftLen; i0 += n1) {
+    for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+      radix4_butterfly_last(pSrc16 + (int32_t)idx_fft * (2 * fftLen),
+                            pSrc16 + (int32_t)idx_fft * (2 * fftLen), i0);
+    }
+  }
+
+  /* BITREVERSAL */
+  if (bitReverseFlag) {
+    v2s addr1, addr2, addr3, addr4;
+    v2s s2 = (v2s){2, 2};
+    v2s tmpa1, tmpa2, tmpa3, tmpa4;
+    v2s tmpb1, tmpb2, tmpb3, tmpb4;
+    int32_t a1, a2, a3, a4;
+    int32_t b1, b2, b3, b4;
+    uint16_t *ptr;
+    for (uint32_t i = 0; i < bitReverseLen; i += 8) {
+      addr1 = *(v2s *)&pBitRevTable[i];
+      addr2 = *(v2s *)&pBitRevTable[i + 2];
+      addr3 = *(v2s *)&pBitRevTable[i + 4];
+      addr4 = *(v2s *)&pBitRevTable[i + 6];
+      asm volatile("pv.sra.h %[addr1],%[addr1],%[s2];"
+                   "pv.sra.h %[addr2],%[addr2],%[s2];"
+                   "pv.sra.h %[addr3],%[addr3],%[s2];"
+                   "pv.sra.h %[addr4],%[addr4],%[s2];"
+                   "pv.extract.h %[a1],%[addr1],0;"
+                   "pv.extract.h %[a2],%[addr2],0;"
+                   "pv.extract.h %[a3],%[addr3],0;"
+                   "pv.extract.h %[a4],%[addr4],0;"
+                   "pv.extract.h %[b1],%[addr1],1;"
+                   "pv.extract.h %[b2],%[addr2],1;"
+                   "pv.extract.h %[b3],%[addr3],1;"
+                   "pv.extract.h %[b4],%[addr4],1;"
+                   : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3),
+                     [a4] "=r"(a4), [b1] "=r"(b1), [b2] "=r"(b2),
+                     [b3] "=r"(b3), [b4] "=r"(b4), [addr1] "+r"(addr1),
+                     [addr2] "+r"(addr2), [addr3] "+r"(addr3),
+                     [addr4] "+r"(addr4)
+                   : [s2] "r"(s2)
+                   :);
+      ptr = (uint16_t *)pSrc16;
+      for (uint32_t idx_fft = 0; idx_fft < nFFTs; idx_fft++) {
+        tmpa1 = *(v2s *)&ptr[a1];
+        tmpa2 = *(v2s *)&ptr[a2];
+        tmpa3 = *(v2s *)&ptr[a3];
+        tmpa4 = *(v2s *)&ptr[a4];
+        tmpb1 = *(v2s *)&ptr[b1];
+        tmpb2 = *(v2s *)&ptr[b2];
+        tmpb3 = *(v2s *)&ptr[b3];
+        tmpb4 = *(v2s *)&ptr[b4];
+        *((v2s *)&ptr[a1]) = tmpb1;
+        *((v2s *)&ptr[a2]) = tmpb2;
+        *((v2s *)&ptr[a3]) = tmpb3;
+        *((v2s *)&ptr[a4]) = tmpb4;
+        *((v2s *)&ptr[b1]) = tmpa1;
+        *((v2s *)&ptr[b2]) = tmpa2;
+        *((v2s *)&ptr[b3]) = tmpa3;
+        *((v2s *)&ptr[b4]) = tmpa4;
+        ptr += (2 * fftLen);
+      }
+    }
+  }
+}
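
A hedged sketch of batching several transforms through this serial scheduler (names are placeholders; the nFFTs transforms are stored back to back, 2 * fftLen halfwords apart, as the kernel's pointer arithmetic assumes):

  #include <stdint.h>

  extern int16_t l1_batch[];       // nFFTs consecutive complex buffers
  extern int16_t l1_twid[];        // Q15 twiddle table
  extern uint16_t l1_bitrev[];     // bit-reversal table
  extern uint16_t l1_bitrev_len;   // its length

  void fft_batch(uint16_t fftLen, uint32_t nFFTs) {
    mempool_radix4_cfft_q16s_scheduler(l1_batch, fftLen, l1_twid, l1_bitrev,
                                       l1_bitrev_len, 1U, nFFTs);
  }
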