From c27e56cc737a3d5e99bd8ef898196dbdfb4ba5de Mon Sep 17 00:00:00 2001 From: mbertuletti Date: Thu, 25 Apr 2024 09:13:29 +0200 Subject: [PATCH] [software] Adapt to new folder structure --- Makefile | 2 +- software/apps/baremetal/Makefile | 4 +- .../apps/baremetal/cfft_radix4_f16/main.c | 117 +-- software/apps/baremetal/chest_f16/main.c | 7 +- software/apps/baremetal/chest_q16/main.c | 4 +- software/apps/baremetal/cholesky_f16/main.c | 1 - software/apps/baremetal/cholesky_q16/main.c | 5 +- software/apps/baremetal/cmatmul_f16/main.c | 7 +- .../apps/{ => baremetal}/cmatmul_q16/main.c | 6 +- software/apps/baremetal/mimo_mmse_f16/main.c | 29 +- software/apps/baremetal/mimo_mmse_f32/main.c | 18 +- software/apps/baremetal/mimo_mmse_q16/main.c | 10 +- software/apps/{ => baremetal}/ofdm/main.c | 53 +- software/apps/cfft_radix4_q16/main.c | 0 software/apps/chest_q16/main.c | 0 .../{runtime => }/data/data_cfft_f16.h.tpl | 0 .../{runtime => }/data/data_cfft_q16.h.tpl | 0 software/data/data_cfft_radix2_q16.h.tpl | 55 -- software/data/data_cfft_radix2_q16.py | 200 ----- software/data/data_cfft_radix4_f16.h.tpl | 45 - software/data/data_cfft_radix4_f16.py | 121 --- software/data/data_cfft_radix4_q16.h.tpl | 57 -- software/data/data_cfft_radix4_q16.py | 200 ----- software/data/data_chest_f16.py | 132 --- software/data/data_chest_q16.py | 160 ---- software/data/data_cholesky_f16.py | 108 --- software/data/data_cholesky_q16.py | 117 --- software/data/data_cholesky_q32.py | 106 --- software/data/data_cmatmul_f16.py | 117 --- .../{runtime => }/data/data_cmatmul_q16.h.tpl | 0 software/data/data_matmulf16.py | 111 --- software/data/data_matmulf32.py | 112 --- software/data/data_mimo_mmse_f16.py | 185 ---- software/data/data_mimo_mmse_f32.py | 154 ---- software/data/data_mimo_mmse_q16.py | 166 ---- software/data/data_ofdm.py | 137 --- software/{runtime => }/data/generate_cfft.py | 0 software/{runtime => }/data/generate_chest.py | 4 +- .../{runtime => }/data/generate_cholesky.py | 9 +- .../{runtime => }/data/generate_matmul.py | 0 .../{runtime => }/data/generate_mimo_mmse.py | 25 +- software/{runtime => }/data/generate_ofdm.py | 0 .../mempool_cfft_radix4_butterfly_f16.h | 199 ----- .../baremetal/mempool_cfft_radix4_f16p.h | 526 ----------- .../mempool_cfft_radix4_q16_bitreversal.h | 23 - .../kernels/baremetal/mempool_chest_f16.h | 382 ++++++++ .../kernels/baremetal/mempool_chest_f16p.h | 62 -- .../kernels/baremetal/mempool_chest_f16s.h | 194 ----- .../kernels/baremetal/mempool_chest_q16.h | 2 +- .../kernels/baremetal/mempool_cholesky_f16s.h | 1 + .../kernels/baremetal/mempool_cholesky_f32s.h | 4 + .../baremetal}/mempool_cholesky_q16s.h | 3 +- .../kernels/baremetal/mempool_cmatmul_f16.h | 814 ++++++++++-------- .../baremetal}/mempool_cmatmul_q16.h | 2 +- .../baremetal/mempool_linearsolver_f32s.h | 4 + .../baremetal}/mempool_linearsolver_q16s.h | 0 .../baremetal/mempool_mimo_mmse_f16s.h | 1 + .../baremetal}/mempool_mimo_mmse_q16s.h | 0 .../mempool_radix4_cfft_butterfly_f16.h | 2 +- .../baremetal}/mempool_radix4_cfft_f16p.h | 173 ++-- .../runtime/data/data_cfft_radix4_f16.h.tpl | 1 - .../runtime/data/data_cfft_radix4_q16.h.tpl | 1 - software/runtime/data/data_ofdm.py | 1 - software/runtime/kernel/mempool_checks.h | 0 software/runtime/kernel/mempool_chest_f16.h | 372 -------- software/runtime/kernel/mempool_chest_q16.h | 245 ------ software/runtime/kernel/mempool_chest_q16p.h | 1 - software/runtime/kernel/mempool_chest_q16s.h | 1 - .../runtime/kernel/mempool_radix2_cfft_q16s.h | 0 69 files changed, 1102 insertions(+), 4496 deletions(-) rename software/apps/{ => baremetal}/cmatmul_q16/main.c (93%) rename software/apps/{ => baremetal}/ofdm/main.c (74%) delete mode 100644 software/apps/cfft_radix4_q16/main.c delete mode 100644 software/apps/chest_q16/main.c rename software/{runtime => }/data/data_cfft_f16.h.tpl (100%) rename software/{runtime => }/data/data_cfft_q16.h.tpl (100%) delete mode 100644 software/data/data_cfft_radix2_q16.h.tpl delete mode 100644 software/data/data_cfft_radix2_q16.py delete mode 100644 software/data/data_cfft_radix4_f16.h.tpl delete mode 100644 software/data/data_cfft_radix4_f16.py delete mode 100644 software/data/data_cfft_radix4_q16.h.tpl delete mode 100755 software/data/data_cfft_radix4_q16.py delete mode 100644 software/data/data_chest_f16.py delete mode 100755 software/data/data_chest_q16.py delete mode 100644 software/data/data_cholesky_f16.py delete mode 100644 software/data/data_cholesky_q16.py delete mode 100644 software/data/data_cholesky_q32.py delete mode 100644 software/data/data_cmatmul_f16.py rename software/{runtime => }/data/data_cmatmul_q16.h.tpl (100%) delete mode 100644 software/data/data_matmulf16.py delete mode 100644 software/data/data_matmulf32.py delete mode 100644 software/data/data_mimo_mmse_f16.py delete mode 100644 software/data/data_mimo_mmse_f32.py delete mode 100644 software/data/data_mimo_mmse_q16.py delete mode 100644 software/data/data_ofdm.py rename software/{runtime => }/data/generate_cfft.py (100%) rename software/{runtime => }/data/generate_chest.py (98%) rename software/{runtime => }/data/generate_cholesky.py (95%) rename software/{runtime => }/data/generate_matmul.py (100%) rename software/{runtime => }/data/generate_mimo_mmse.py (93%) rename software/{runtime => }/data/generate_ofdm.py (100%) delete mode 100644 software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h delete mode 100644 software/kernels/baremetal/mempool_cfft_radix4_f16p.h delete mode 100644 software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h create mode 100644 software/kernels/baremetal/mempool_chest_f16.h delete mode 100644 software/kernels/baremetal/mempool_chest_f16p.h delete mode 100644 software/kernels/baremetal/mempool_chest_f16s.h rename software/{runtime/kernel => kernels/baremetal}/mempool_cholesky_q16s.h (97%) rename software/{runtime/kernel => kernels/baremetal}/mempool_cmatmul_q16.h (99%) rename software/{runtime/kernel => kernels/baremetal}/mempool_linearsolver_q16s.h (100%) rename software/{runtime/kernel => kernels/baremetal}/mempool_mimo_mmse_q16s.h (100%) rename software/{runtime/kernel => kernels/baremetal}/mempool_radix4_cfft_butterfly_f16.h (99%) rename software/{runtime/kernel => kernels/baremetal}/mempool_radix4_cfft_f16p.h (72%) delete mode 100644 software/runtime/data/data_cfft_radix4_f16.h.tpl delete mode 100644 software/runtime/data/data_cfft_radix4_q16.h.tpl delete mode 100644 software/runtime/data/data_ofdm.py delete mode 100644 software/runtime/kernel/mempool_checks.h delete mode 100644 software/runtime/kernel/mempool_chest_f16.h delete mode 100644 software/runtime/kernel/mempool_chest_q16.h delete mode 100644 software/runtime/kernel/mempool_chest_q16p.h delete mode 100644 software/runtime/kernel/mempool_chest_q16s.h delete mode 100644 software/runtime/kernel/mempool_radix2_cfft_q16s.h diff --git a/Makefile b/Makefile index 6afa3eaba..9e5ff6e60 100644 --- a/Makefile +++ b/Makefile @@ -183,7 +183,7 @@ toolchain/riscv-opcodes/*: format: $(ROOT_DIR)/scripts/run_clang_format.py --clang-format-executable=$(LLVM_INSTALL_DIR)/bin/clang-format -i -r $(ROOT_DIR) - find ./software/runtime/data -name '*.py' -exec autopep8 --in-place --aggressive {} + + find ./software/data -name '*.py' -exec autopep8 --in-place --aggressive {} + clean: clean-riscv-tests rm -rf $(INSTALL_DIR) diff --git a/software/apps/baremetal/Makefile b/software/apps/baremetal/Makefile index cc9e2db7a..b4b2ee496 100644 --- a/software/apps/baremetal/Makefile +++ b/software/apps/baremetal/Makefile @@ -22,8 +22,8 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py)) BINARIES := $(addprefix $(BIN_DIR)/,$(APPS)) ALL := $(APPS) -ALL_GCC := $(filter-out matmul_f16 matmul_f32, $(ALL)) -ALL_LLVM := $(filter-out synth_i32 chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL)) +ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL)) +ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL)) # Make all applications all: $(ALL_GCC) diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c index 2f4270c80..c459d062a 100644 --- a/software/apps/baremetal/cfft_radix4_f16/main.c +++ b/software/apps/baremetal/cfft_radix4_f16/main.c @@ -10,35 +10,53 @@ #include /* Mempool runtime libraries */ +#include "builtins_v2.h" #include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "xpulp/builtins_v2.h" /* CFFT data libraries */ -#include "data/data_cfft_radix4_f16.h" - -/* - - FOLDED: Parallel FFT with "memory-aware" load/store scheme - - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware" - load/store scheme - - N_FFTs_COL: Independent FFTs scheduled on one row (default 1) - - N_FFTs_ROW: Independent FFTs scheduled on columns (default 1) - - FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores -*/ - -#define FOLDED +#include "data_cfft_radix4_f16.h" + +/* CHOOSE ONE */ +//#define SINGLE // Single core FFT. +//#define PARALLEL // Parallel FFT not "memory-aware". +//#define FOLDED // Parallel FFT with "memory-aware" load/store. +#define SCHEDULED // Folded FFTs arranged in rows and cols.''' + +// Bitreversal index from table. +#define BITREVERSETABLE +// Independent FFTs scheduled on one row (default 1). +#define N_FFTs_ROW 2 +// Independent FFTs scheduled on columns (default 1). +#define N_FFTs_COL 2 +#if (N_FFTs_COL > MAX_COL) +#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)] +#endif +// Also the twiddles have "memory-aware" load/stores. #define FOLDED_TWIDDLES -#define N_FFTs_ROW 1 -#define N_FFTs_COL 1 -#include "kernel/mempool_checks.h" -#include "kernel/mempool_radix4_cfft_butterfly_f16.h" -#include "kernel/mempool_radix4_cfft_f16p.h" -#include "kernel/mempool_radix4_cfft_q16_bitreversal.h" +#include "baremetal/mempool_cfft_q16_bitreversal.h" +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_radix4_cfft_butterfly_f16.h" +#include "baremetal/mempool_radix4_cfft_f16p.h" + +#if (defined(SINGLE) || defined(PARALLEL)) +__fp16 l1_pSrc[2 * N_CSAMPLES] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +__fp16 l1_pDst[2 * N_CSAMPLES] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_src[2 * 3 * N_CSAMPLES / 4] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +__fp16 l1_twiddleCoef_f16_dst[2 * 3 * N_CSAMPLES / 4] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] + __attribute__((aligned(sizeof(int32_t)), section(".l1_prio"))); +#endif +#if (defined(SCHEDULED) || defined(FOLDED)) __fp16 l1_pSrc[N_FFTs_ROW * 8 * N_BANKS] __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); __fp16 l1_pDst[N_FFTs_ROW * 8 * N_BANKS] @@ -49,49 +67,44 @@ __fp16 l1_twiddleCoef_f16_dst[8 * N_BANKS] __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH] __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); +#endif int main() { - uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - mempool_barrier_init(core_id); __fp16 *pRes = (__fp16 *)0; + mempool_barrier_init(core_id); + + /* INITIALIZATION */ if (core_id == 0) { - // Each FFT is folded over 4 memory rows - // Each memory row is 2 * N_BANKS (real-imag) samples for (uint32_t j = 0; j < N_FFTs_ROW; j++) { - dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc, - (N_CSAMPLES * N_FFTs_COL) * sizeof(int32_t)); + for (uint32_t i = 0; i < N_FFTs_COL; i++) { + dma_memcpy_blocking(l1_pSrc + i * 2 * N_CSAMPLES + j * (8 * N_BANKS), + l2_pSrc, N_CSAMPLES * sizeof(int32_t)); + } } - dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t)); dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, - BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); - dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16, - 3 * (N_CSAMPLES / 4) * sizeof(int32_t)); + BITREVINDEXTABLE_LENGTH * sizeof(int32_t)); } -// Initialize the Twiddles folded -#ifdef FOLDED_TWIDDLES + mempool_barrier(num_cores); for (uint32_t j = 0; j < N_FFTs_COL; j++) { - uint32_t N_WORDS_COL = (N_CSAMPLES / 4); + uint32_t N_WORDS_COL = N_CSAMPLES >> 2; for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) { - *(v2h *)&l1_twiddleCoef_f16_src[2U * (i + j * N_WORDS_COL)] = - *(v2h *)&l2_twiddleCoef_f16[2U * i]; - *(v2h *)&l1_twiddleCoef_f16_src[2U * - (i + j * N_WORDS_COL + 1 * N_BANKS)] = - *(v2h *)&l2_twiddleCoef_f16[2U * (i * 2U)]; - *(v2h *)&l1_twiddleCoef_f16_src[2U * - (i + j * N_WORDS_COL + 2 * N_BANKS)] = - *(v2h *)&l2_twiddleCoef_f16[2U * (i * 3U)]; + *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL)] = + *(v2h *)&l2_twiddleCoef_f16[2 * i]; + *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 1 * N_BANKS)] = + *(v2h *)&l2_twiddleCoef_f16[2 * (i * 2U)]; + *(v2h *)&l1_twiddleCoef_f16_src[2 * (i + j * N_WORDS_COL + 2 * N_BANKS)] = + *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)]; } } -#endif if (core_id == 0) { printf("01: END INITIALIZATION\n"); } mempool_barrier(num_cores); -#if (defined(FOLDED) && defined(FOLDED_TWIDDLES)) +#ifdef FOLDED if (core_id < (N_CSAMPLES / 16)) { mempool_start_benchmark(); mempool_radix4_cfft_f16p_folded(l1_pSrc, l1_pDst, N_CSAMPLES, @@ -105,26 +118,28 @@ int main() { #endif #ifdef SCHEDULED - uint32_t nPE = (N_CSAMPLES / 16); - if (core_id < N_FFTs_COL * nPE) { + uint32_t CORES_USED = (N_CSAMPLES / 4) / BANKING_FACTOR; + if (core_id < N_FFTs_COL * CORES_USED) { mempool_start_benchmark(); - uint32_t N_WORDS_COL = N_CSAMPLES / 4; - uint32_t col_id = core_id / nPE; mempool_radix4_cfft_f16p_scheduler( l1_pSrc, l1_pDst, N_CSAMPLES, N_FFTs_ROW, N_FFTs_COL, - l1_twiddleCoef_f16_src + 2 * col_id * N_WORDS_COL, - l1_twiddleCoef_f16_dst + 2 * col_id * N_WORDS_COL, l1_BitRevIndexTable, - BITREVINDEXTABLE_LENGTH, 1, nPE); - pRes = l1_pDst; - mempool_log_partial_barrier(2, core_id, N_FFTs_COL * nPE); + l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable, + BITREVINDEXTABLE_LENGTH, 1, CORES_USED); + mempool_log_partial_barrier(2, core_id, N_FFTs_COL * CORES_USED); mempool_stop_benchmark(); } +#ifdef BITREVERSETABLE + pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst; +#else + pRes = ((LOG2 / 2) % 2) == 0 ? l1_pDst : l1_pSrc; +#endif #endif mempool_barrier(num_cores); if (core_id == 0) { printf("02: END COMPUTATION\n"); } + mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0); mempool_barrier(num_cores); return 0; diff --git a/software/apps/baremetal/chest_f16/main.c b/software/apps/baremetal/chest_f16/main.c index 7abfa8add..e0feb90c7 100644 --- a/software/apps/baremetal/chest_f16/main.c +++ b/software/apps/baremetal/chest_f16/main.c @@ -8,17 +8,16 @@ #include #include +#include "builtins_v2.h" #include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "builtins_v2.h" -#include "data_chest_f16.h" #include "baremetal/mempool_checks.h" -#include "baremetal/mempool_chest_f16p.h" -#include "baremetal/mempool_chest_f16s.h" +#include "baremetal/mempool_chest_f16.h" +#include "data_chest_f16.h" //#define SINGLE #define PARALLEL diff --git a/software/apps/baremetal/chest_q16/main.c b/software/apps/baremetal/chest_q16/main.c index eecac204a..9288bb4d7 100644 --- a/software/apps/baremetal/chest_q16/main.c +++ b/software/apps/baremetal/chest_q16/main.c @@ -55,8 +55,8 @@ int main() { #endif #ifdef PARALLEL mempool_start_benchmark(); - mempool_chest_q16p_unrolled4_local(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX, - N_TX, N_SAMPLES, core_id, num_cores); + mempool_chest_q16p_unrolled4(l1_HEST, l1_PilotRX, l1_PilotTX, N_RX, N_TX, + N_SAMPLES, core_id, num_cores); mempool_stop_benchmark(); mempool_barrier(num_cores); #endif diff --git a/software/apps/baremetal/cholesky_f16/main.c b/software/apps/baremetal/cholesky_f16/main.c index d27af143c..908ca99fa 100644 --- a/software/apps/baremetal/cholesky_f16/main.c +++ b/software/apps/baremetal/cholesky_f16/main.c @@ -11,7 +11,6 @@ #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "builtins_v2.h" #include "data_cholesky_f16.h" diff --git a/software/apps/baremetal/cholesky_q16/main.c b/software/apps/baremetal/cholesky_q16/main.c index 2f30ae94a..3c382c500 100644 --- a/software/apps/baremetal/cholesky_q16/main.c +++ b/software/apps/baremetal/cholesky_q16/main.c @@ -9,11 +9,10 @@ #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "xpulp/builtins_v2.h" +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_cholesky_q16s.h" #include "data_cholesky_q16.h" -#include "kernel/mempool_checks.h" -#include "kernel/mempool_cholesky_q16s.h" #define SINGLE diff --git a/software/apps/baremetal/cmatmul_f16/main.c b/software/apps/baremetal/cmatmul_f16/main.c index 2ca261599..2bfbcb144 100644 --- a/software/apps/baremetal/cmatmul_f16/main.c +++ b/software/apps/baremetal/cmatmul_f16/main.c @@ -13,9 +13,11 @@ #include "synchronization.h" #include "data_cmatmul_f16.h" + #include "baremetal/mempool_checks.h" #include "baremetal/mempool_cmatmul_f16.h" -#define PARALLEL_2x2 +#define PARALLEL_2x4 +#define TEST __fp16 matrix_a[2 * dim_M * dim_N] __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), @@ -26,7 +28,7 @@ __fp16 matrix_b[2 * dim_N * dim_P] __fp16 matrix_c[2 * dim_M * dim_P] __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), section(".l1_prio"))); -__fp16 matrix_a_folded[2 * dim_M * (4 * NUM_CORES)] +__fp16 matrix_a_folded[2 * (BANKING_FACTOR * NUM_CORES)] __attribute__((aligned(BANKING_FACTOR * NUM_CORES * sizeof(int32_t)), section(".l1_prio"))); @@ -73,7 +75,6 @@ int main() { mempool_start_benchmark(); cmatmul_2x4_f16p(matrix_a, matrix_b, matrix_c, dim_M, dim_N, dim_P, core_id, nPE); - mempool_log_partial_barrier(2, core_id, nPE); mempool_stop_benchmark(); } mempool_barrier(num_cores); diff --git a/software/apps/cmatmul_q16/main.c b/software/apps/baremetal/cmatmul_q16/main.c similarity index 93% rename from software/apps/cmatmul_q16/main.c rename to software/apps/baremetal/cmatmul_q16/main.c index b3e4e7503..f7a6bd31d 100644 --- a/software/apps/cmatmul_q16/main.c +++ b/software/apps/baremetal/cmatmul_q16/main.c @@ -12,9 +12,9 @@ #include "runtime.h" #include "synchronization.h" -#include "data/data_cmatmul_q16.h" -#include "kernel/mempool_checks.h" -#include "kernel/mempool_cmatmul_q16.h" +#include "baremetal/mempool_checks.h" +#include "baremetal/mempool_cmatmul_q16.h" +#include "data_cmatmul_q16.h" #define PARALLEL diff --git a/software/apps/baremetal/mimo_mmse_f16/main.c b/software/apps/baremetal/mimo_mmse_f16/main.c index 1ff7f43f4..4389a0f3e 100644 --- a/software/apps/baremetal/mimo_mmse_f16/main.c +++ b/software/apps/baremetal/mimo_mmse_f16/main.c @@ -11,19 +11,26 @@ #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "builtins_v2.h" -#include "data_mimo_mmse_f16.h" #include "baremetal/mempool_checks.h" #include "baremetal/mempool_cholesky_f16s.h" #include "baremetal/mempool_linearsolver_f16s.h" #include "baremetal/mempool_mimo_mmse_f16s.h" -//#define DOUBLE_BUFFERING +#include "data_mimo_mmse_f16.h" + +// #define DOUBLE_BUFFERING +// #define N_ROUNDS (1) +// #define DMA_TRANSFER2 + #ifndef DOUBLE_BUFFERING -#define SINGLE -//#define PARALLEL +/**********************************************/ +/* TEST OF THE KERNELS WITH NO DATA MOVEMENTS */ +/**********************************************/ + +//#define SINGLE +#define PARALLEL //#define FOLDED __fp16 l1_H[2 * N_TX * N_RX * N_ITR] @@ -110,7 +117,7 @@ int main() { Ptrx += 2 * itr_bg * N_TX_bg; } } - mempool_log_barrier(2, core_id); + mempool_barrier(num_cores); mempool_stop_benchmark(); #endif @@ -139,7 +146,7 @@ int main() { mempool_Ltrisol_folded_f16s(PtrL, Ptry2, Ptry3, N_TX); mempool_Lttrisol_folded_f16s(PtrL, Ptry3, Ptrx, N_TX); } - mempool_log_barrier(2, core_id); + mempool_barrier(num_cores); mempool_stop_benchmark(); #endif @@ -244,8 +251,8 @@ int main() { __fp16 *PtrL = L + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); __fp16 *Ptry3 = y3 + itr * (2 * N_TX); - mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0); - mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX); + mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); mempool_cholesky_f16vecs(PtrG, PtrL, N_TX); mempool_Ltrisol_f16s(PtrL, Ptry2, Ptry3, N_TX); mempool_Lttrisol_f16s(PtrL, Ptry3, Ptrx, N_TX); @@ -294,8 +301,8 @@ int main() { __fp16 *PtrSigma = cmpt_Sigma + itr * (2 * N_TX); __fp16 *PtrG = G + itr * (2 * N_TX * N_TX); __fp16 *Ptry2 = y2 + itr * (2 * N_TX); - mempool_hermitian_f16s(PtrH, PtrG, PtrSigma, N_RX, N_TX, 0, 0); - mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX, 0); + mempool_hermitian_f16vecs(PtrH, PtrG, PtrSigma, N_RX, N_TX); + mempool_MVP_conjtransp_f16vecs(PtrH, Ptry, Ptry2, N_RX, N_TX); } mempool_log_barrier(2, core_id); diff --git a/software/apps/baremetal/mimo_mmse_f32/main.c b/software/apps/baremetal/mimo_mmse_f32/main.c index 8e5f5f80f..aa95b1919 100644 --- a/software/apps/baremetal/mimo_mmse_f32/main.c +++ b/software/apps/baremetal/mimo_mmse_f32/main.c @@ -10,13 +10,17 @@ #include "runtime.h" #include "synchronization.h" -#include "data_mimo_mmse_f32.h" #include "baremetal/mempool_checks.h" -#include "baremetal/mempool_cholesky_f32s.h" -#include "baremetal/mempool_linearsolver_f32s.h" #include "baremetal/mempool_mimo_mmse_f32p.h" #include "baremetal/mempool_mimo_mmse_f32s.h" +#if defined(__XDIVSQRT) +#include "baremetal/mempool_cholesky_f32s.h" +#include "baremetal/mempool_linearsolver_f32s.h" +#endif + +#include "data_mimo_mmse_f32.h" + //#define SINGLE //#define JACOBI #define PARALLEL @@ -52,7 +56,7 @@ int main() { } mempool_barrier(num_cores); -#ifdef SINGLE +#if defined(SINGLE) && defined(__XDIVSQRT) /* Benchmark */ if (core_id == 0) { mempool_start_benchmark(); @@ -80,7 +84,7 @@ int main() { mempool_barrier(num_cores); #endif -#ifdef PARALLEL +#if defined(PARALLEL) && defined(__XDIVSQRT) // Each iteration is assigned to a processor mempool_start_benchmark(); for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { @@ -104,7 +108,7 @@ int main() { mempool_stop_benchmark(); #endif -#ifdef PARALLEL_HERMITIAN +#if defined(PARALLEL_HERMITIAN) && defined(__XDIVSQRT) mempool_start_benchmark(); // Each iteration is assigned to a pool of processors // In a pool each PE gets a column of the H matrix, accumulating a row of the @@ -139,7 +143,7 @@ int main() { mempool_stop_benchmark(); #endif -#ifdef FOLDED +#if defined(FOLDED) && defined(__XDIVSQRT) mempool_start_benchmark(); for (uint32_t itr = core_id; itr < N_ITR; itr += num_cores) { // Inputs diff --git a/software/apps/baremetal/mimo_mmse_q16/main.c b/software/apps/baremetal/mimo_mmse_q16/main.c index dff61adf8..c7dcda78d 100644 --- a/software/apps/baremetal/mimo_mmse_q16/main.c +++ b/software/apps/baremetal/mimo_mmse_q16/main.c @@ -9,12 +9,12 @@ #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "xpulp/builtins_v2.h" -#include "data/data_mimo_mmse_q16.h" -#include "kernel/mempool_cholesky_q16s.h" -#include "kernel/mempool_linearsolver_q16s.h" -#include "kernel/mempool_mimo_mmse_q16s.h" +#include "data_mimo_mmse_q16.h" + +#include "baremetal/mempool_cholesky_q16s.h" +#include "baremetal/mempool_linearsolver_q16s.h" +#include "baremetal/mempool_mimo_mmse_q16s.h" #define PARALLEL diff --git a/software/apps/ofdm/main.c b/software/apps/baremetal/ofdm/main.c similarity index 74% rename from software/apps/ofdm/main.c rename to software/apps/baremetal/ofdm/main.c index 8408c1035..59b1835d7 100644 --- a/software/apps/ofdm/main.c +++ b/software/apps/baremetal/ofdm/main.c @@ -10,14 +10,14 @@ #include /* Mempool runtime libraries */ +#include "builtins_v2.h" #include "dma.h" #include "encoding.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#include "xpulp/builtins_v2.h" -#include "data/data_ofdm.h" +#include "data_ofdm.h" // CFFT Parameters #define SCHEDULED @@ -28,17 +28,20 @@ #define N_FFTs_ROW (N_RX / N_FFTs_COL) // CMATMUL Parameters #define NUM_COPIES (N_BANKS / (N_BEAMS * N_RX)) +#define dim_M (N_BEAMS) +#define dim_N (N_RX) +#define dim_P (N_SC) #define ROUNDS 3 -dump(prova, 1); +dump(checkpoint, 1); -#include "kernel/mempool_cmatmul_f16.h" -#include "kernel/mempool_radix4_cfft_butterfly_f16.h" -#include "kernel/mempool_radix4_cfft_f16p.h" -#include "kernel/mempool_radix4_cfft_q16_bitreversal.h" +#include "baremetal/mempool_cfft_q16_bitreversal.h" +#include "baremetal/mempool_cmatmul_f16.h" +#include "baremetal/mempool_radix4_cfft_butterfly_f16.h" +#include "baremetal/mempool_radix4_cfft_f16p.h" uint32_t arrival_index __attribute__((section(".l1_prio"))); -__fp16 l1_pBF_Coef_folded[2 * N_BEAMS * N_RX * NUM_COPIES] +__fp16 l1_pBF_Coef_folded[2 * BANKING_FACTOR * NUM_CORES] __attribute__((aligned(4 * N_BANKS), section(".l1_prio"))); __fp16 l1_pFFT_Src[N_FFTs_ROW * 8 * N_BANKS] @@ -69,9 +72,9 @@ int main() { (N_RX * N_SC) * sizeof(int32_t)); dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable, BITREVINDEXTABLE_LENGTH * sizeof(int16_t)); - for (uint32_t i = 0; i < NUM_COPIES; i++) { - dma_memcpy_blocking(l1_pBF_Coef_folded + i * (N_BEAMS * N_RX), - l2_pBF_Coef, (N_BEAMS * N_RX) * sizeof(int32_t)); + for (uint32_t i = 0; i < BANKING_FACTOR * NUM_CORES; i += dim_M * dim_N) { + dma_memcpy_blocking(&l1_pBF_Coef_folded[2 * i], l2_pBF_Coef, + dim_M * dim_N * sizeof(int32_t)); } for (uint32_t i = 0; i < N_FFTs_COL; i++) { dma_memcpy_blocking(l1_twiddleCoef_f16_src + (2 * i * N_BANKS), @@ -80,31 +83,27 @@ int main() { } mempool_barrier(num_cores); mempool_stop_benchmark(); - dump_prova(0); - - // // Start of the iterations - // for (uint32_t round = 0; round < ROUNDS; round++) { + dump_checkpoint(0); /* FFT */ mempool_start_benchmark(); - uint32_t col_fftLen = N_SC / 4; - uint32_t col_id = core_id / (N_SC / 16); + uint32_t CORES_USED = (N_SC / 4) / BANKING_FACTOR; // Distribute FFTs over columns mempool_radix4_cfft_f16p_scheduler( - l1_pFFT_Src, l1_pFFT_Dst, N_SC, - l1_twiddleCoef_f16_src + 2 * col_id * col_fftLen, - l1_twiddleCoef_f16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable, - BITREVINDEXTABLE_LENGTH, 1, (N_SC / 16)); + l1_pFFT_Src, l1_pFFT_Dst, N_SC, N_FFTs_ROW, N_FFTs_COL, + l1_twiddleCoef_f16_src, l1_twiddleCoef_f16_dst, l1_BitRevIndexTable, + BITREVINDEXTABLE_LENGTH, 1, CORES_USED); mempool_log_barrier(2, core_id); mempool_stop_benchmark(); - dump_prova(1); + dump_checkpoint(1); /* BEAMFORMING */ mempool_start_benchmark(); - cmatmul_2x4_folded_f16p(l1_pBF_Coef_folded, l1_pBF_Coef_folded, l1_pFFT_Src, - l1_pFFT_Dst, N_BEAMS, N_RX, N_SC, core_id, num_cores); + cmatmul_4x4_f16p((int32_t *)l1_pBF_Coef_folded, (int32_t *)l1_pFFT_Src, + (int32_t *)l1_pFFT_Dst, dim_M, dim_N, dim_P, core_id, + num_cores); mempool_stop_benchmark(); - dump_prova(2); + dump_checkpoint(2); mempool_start_benchmark(); // Transfer and synchronization @@ -124,9 +123,7 @@ int main() { } mempool_wfi(); mempool_stop_benchmark(); - dump_prova(3); - - // } + dump_checkpoint(3); return 0; } diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c deleted file mode 100644 index e69de29bb..000000000 diff --git a/software/apps/chest_q16/main.c b/software/apps/chest_q16/main.c deleted file mode 100644 index e69de29bb..000000000 diff --git a/software/runtime/data/data_cfft_f16.h.tpl b/software/data/data_cfft_f16.h.tpl similarity index 100% rename from software/runtime/data/data_cfft_f16.h.tpl rename to software/data/data_cfft_f16.h.tpl diff --git a/software/runtime/data/data_cfft_q16.h.tpl b/software/data/data_cfft_q16.h.tpl similarity index 100% rename from software/runtime/data/data_cfft_q16.h.tpl rename to software/data/data_cfft_q16.h.tpl diff --git a/software/data/data_cfft_radix2_q16.h.tpl b/software/data/data_cfft_radix2_q16.h.tpl deleted file mode 100644 index 6044e424d..000000000 --- a/software/data/data_cfft_radix2_q16.h.tpl +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_cfft_radix2_q16.py - -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_TWIDDLES (3 * N_CSAMPLES / 4) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -// Tolerance for correctness check -#define TOLERANCE (${tolerance}) - -% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']): - -// Data arrays for matrix ${m_str} -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)}; - -% endfor \ - -// Twiddles -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)}; - -// Bitreversal -uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)}; diff --git a/software/data/data_cfft_radix2_q16.py b/software/data/data_cfft_radix2_q16.py deleted file mode 100644 index e1615e53e..000000000 --- a/software/data/data_cfft_radix2_q16.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the cfft kernel. -# Author: Marco Bertuletti - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - - -################## -# compute_result # -################## - - -def compute_result(inp, len): - """ - Funciton to generate the expected result of the testcase. - - Arguments - --------- - input: numpy array of inputs - env: Length of the input transform. - """ - - # Q16: - # len=16: Q1.15 -> Q5.11 - # len=32: Q1.15 -> Q6.10 - # len=64: Q1.15 -> Q7.9 - # len=128: Q1.15 -> Q8.8 - # len=256: Q1.15 -> Q9.7 - # len=512: Q1.15 -> Q10.6 - # len=1024: Q1.15 -> Q11.5 - # len=2048: Q1.15 -> Q12.4 - # len=4096: Q1.15 -> Q13.3 - bit_shift_dict_q16 = { - 16: 11, - 32: 10, - 64: 9, - 128: 8, - 256: 7, - 512: 6, - 1024: 5, - 2048: 4, - 4096: 3} - my_type = np.int16 - my_fixpoint = 15 - bit_shift_dict = bit_shift_dict_q16 - a = inp.astype(my_type) - result = np.zeros(a.size, dtype=my_type) - complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) - complex_result = np.zeros(a.size >> 1, dtype=np.csingle) - for i in range(a.size >> 1): - complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( - a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j - complex_result = np.fft.fft(complex_a) - for i in range(int(a.size / 2)): - result[2 * i] = (np.real(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - result[2 * i + 1] = (np.imag(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - - return result - - -def compute_twiddles(length): - PI = 3.14159265358979 - N = length - twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) - for i in range(0, (int)(3 * N / 4)): - twiddleCoefq15_cos = M.cos(i * 2 * PI / N) - twiddleCoefq15_sin = M.sin(i * 2 * PI / N) - twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) - twiddleCoefq15[2 * i + - 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) - return twiddleCoefq15 - - -def compute_bitreversal(N, R): - - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - - return tps - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cfft_radix2_q16.h.tpl", - help='Path to mako template') - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-d", - "--dimension", - type=int, - required=False, - default=64, - help='Input dimension' - ) - - args = parser.parse_args() - - # Create sparse matrix - Len = args.dimension - Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) - Result = compute_result(Input, Len) - Twiddles = compute_twiddles(Len) - Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) - - tolerance = { - 16: 16, - 32: 20, - 64: 24, - 128: 28, - 256: 32, - 512: 48, - 1024: 64, - 2048: 96, - 4096: 128} - - kwargs = {'name': 'data_cfft_radix2_q16', - 'vector_inp': Input, - 'vector_res': Result, - 'vector_twi': Twiddles, - 'vector_bitrev': Bitreversal, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': int(2 * len(Bitreversal)), - 'tolerance': tolerance[int(Len)]} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cfft_radix4_f16.h.tpl b/software/data/data_cfft_radix4_f16.h.tpl deleted file mode 100644 index 883049a44..000000000 --- a/software/data/data_cfft_radix4_f16.h.tpl +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(__fp16){:0.4}f, '.format(a) - i += 1 - if i % 8 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_RSAMPLES (2 * N_CSAMPLES) -#define N_TWIDDLES (3 * N_CSAMPLES / 4) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -__fp16 l2_pSrc[${2 * Len}] = ${array_to_cstr(src)}; - -__fp16 l2_pRes[${2 * Len}] = ${array_to_cstr(dst)}; - -__fp16 l2_twiddleCoef_f16[${2 * Len}] = ${array_to_cstr(twi)}; - -// Bitreversal -uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(bitrev)}; diff --git a/software/data/data_cfft_radix4_f16.py b/software/data/data_cfft_radix4_f16.py deleted file mode 100644 index ca90265c8..000000000 --- a/software/data/data_cfft_radix4_f16.py +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 FFT. -# Author: Marco Bertuletti - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - - -def compute_bitreversal(N, R): - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - return tps - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cfft_radix4_f16.h.tpl", - help='Path to mako template') - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-d", - "--dimension", - type=int, - required=False, - default=4096, - help='FFT dimension' - ) - - args = parser.parse_args() - Len = args.dimension - - src = np.random.rand(Len).astype(np.float16) - src = src + 1.j * np.random.rand(Len).astype(np.float16) - dst = np.fft.fft(src) - src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten() - dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten() - Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) - - twi = np.zeros(int(2 * 3 * Len / 4), np.float16) - for i in range(0, int(3 * Len / 4)): - twi[2 * i] = np.sin(i * 2 * np.pi / Len).astype(np.float16) - twi[2 * i + 1] = np.cos(i * 2 * np.pi / Len).astype(np.float16) - - kwargs = {'name': 'data_cfft_radix4_f16', - 'src': src, - 'dst': dst, - 'twi': twi, - 'bitrev': Bitreversal, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': len(Bitreversal)} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cfft_radix4_q16.h.tpl b/software/data/data_cfft_radix4_q16.h.tpl deleted file mode 100644 index 3af1b764d..000000000 --- a/software/data/data_cfft_radix4_q16.h.tpl +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Automatically generated by: -// data/data_cfft_radix4_q16.py - -\ -<% def array_to_cstr(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '(int16_t) 0X{:04X}, '.format(a&0xffff) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -<% def array_to_str(array): - out = '{' - i = 0 - out += '\n' - for a in array: - out += '{}, '.format(a) - i += 1 - if i % 16 == 0: - out += '\n' - out = out[:-2] + '}' - return out -%> \ - -#define LOG2 (${Log2Len}) -#define N_CSAMPLES (${Len}) -#define N_TWIDDLES (3 * N_CSAMPLES / 4) -#define N_BANKS (NUM_CORES * BANKING_FACTOR) -#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) - -// Maximum number of independent FFT columns allowed -#define MAX_COL (N_BANKS / (N_CSAMPLES / 4)) -// Tolerance for correctness check -#define TOLERANCE (${tolerance}) - -% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']): - -// Data arrays for matrix ${m_str} -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) ${m_str}[${2*Len}] = ${array_to_cstr(m)}; - -% endfor \ - -// Twiddles -int16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)}; - -// Bitreversal -uint16_t __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)}; diff --git a/software/data/data_cfft_radix4_q16.py b/software/data/data_cfft_radix4_q16.py deleted file mode 100755 index b394a2884..000000000 --- a/software/data/data_cfft_radix4_q16.py +++ /dev/null @@ -1,200 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the cfft kernel. -# Author: Marco Bertuletti - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - - -################## -# compute_result # -################## - - -def compute_result(inp, len): - """ - Funciton to generate the expected result of the testcase. - - Arguments - --------- - input: numpy array of inputs - env: Length of the input transform. - """ - - # Q16: - # len=16: Q1.15 -> Q5.11 - # len=32: Q1.15 -> Q6.10 - # len=64: Q1.15 -> Q7.9 - # len=128: Q1.15 -> Q8.8 - # len=256: Q1.15 -> Q9.7 - # len=512: Q1.15 -> Q10.6 - # len=1024: Q1.15 -> Q11.5 - # len=2048: Q1.15 -> Q12.4 - # len=4096: Q1.15 -> Q13.3 - bit_shift_dict_q16 = { - 16: 11, - 32: 10, - 64: 9, - 128: 8, - 256: 7, - 512: 6, - 1024: 5, - 2048: 4, - 4096: 3} - my_type = np.int16 - my_fixpoint = 15 - bit_shift_dict = bit_shift_dict_q16 - a = inp.astype(my_type) - result = np.zeros(a.size, dtype=my_type) - complex_a = np.zeros(int(a.size / 2), dtype=np.csingle) - complex_result = np.zeros(a.size >> 1, dtype=np.csingle) - for i in range(a.size >> 1): - complex_a[i] = a[2 * i].astype(np.csingle) / (2**(my_fixpoint)) + ( - a[2 * i + 1].astype(np.csingle) / (2**(my_fixpoint))) * 1j - complex_result = np.fft.fft(complex_a) - for i in range(int(a.size / 2)): - result[2 * i] = (np.real(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - result[2 * i + 1] = (np.imag(complex_result[i]) * - (2**(bit_shift_dict[int(a.size / 2)])) - ).astype(my_type) - - return result - - -def compute_twiddles(length): - PI = 3.14159265358979 - N = length - twiddleCoefq15 = np.zeros((int)(2 * 3 * N / 4), np.int16) - for i in range(0, (int)(3 * N / 4)): - twiddleCoefq15_cos = M.cos(i * 2 * PI / N) - twiddleCoefq15_sin = M.sin(i * 2 * PI / N) - twiddleCoefq15[2 * i] = int(round(twiddleCoefq15_cos * (2**15 - 1))) - twiddleCoefq15[2 * i + - 1] = int(round(twiddleCoefq15_sin * (2**15 - 1))) - return twiddleCoefq15 - - -def compute_bitreversal(N, R): - - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - - return tps - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cfft_radix4_q16.h.tpl", - help='Path to mako template') - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-d", - "--dimension", - type=int, - required=False, - default=64, - help='Input dimension' - ) - - args = parser.parse_args() - - # Create sparse matrix - Len = args.dimension - Input = np.random.randint(-2**(15), 2**(15) - 1, 2 * Len, dtype=np.int16) - Result = compute_result(Input, Len) - Twiddles = compute_twiddles(Len) - Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(Len, 2))) - - tolerance = { - 16: 16, - 32: 20, - 64: 24, - 128: 28, - 256: 32, - 512: 48, - 1024: 64, - 2048: 96, - 4096: 128} - - kwargs = {'name': 'data_cfft_radix4_q16', - 'vector_inp': Input, - 'vector_res': Result, - 'vector_twi': Twiddles, - 'vector_bitrev': Bitreversal, - 'Len': Len, - 'Log2Len': int(np.log2(Len)), - 'BitrevLen': len(Bitreversal), - 'tolerance': tolerance[int(Len)]} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_chest_f16.py b/software/data/data_chest_f16.py deleted file mode 100644 index 29c19e4a3..000000000 --- a/software/data/data_chest_f16.py +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the Channel estimation. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib - -from mako.template import Template - -################## -# write_result # -################## - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_chest_f16.h.tpl", - help='Path to mako template') - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-b", - "--num_beams", - type=int, - required=False, - default=4, - help='Number beams' - ) - parser.add_argument( - "-l", - "--num_layers", - type=int, - required=False, - default=4, - help='Number layers' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=32, - help='Number samples' - ) - - args = parser.parse_args() - nb_rx = args.num_beams - nb_tx = args.num_layers - nb_samples = args.num_samples - - H = np.random.randn(nb_rx, nb_tx) + 1j * np.random.randn(nb_rx, nb_tx) - - vector_pilot_tx = [] - vector_pilot_rx = [] - vector_Hest = [] - for k in range(nb_samples): - - # Compute data - pilot_tx = 1 * np.exp(1j * np.random.randn(nb_tx)) - pilot_rx = np.dot(H, pilot_tx) - Hest = pilot_rx[:, np.newaxis] / pilot_tx[np.newaxis, :] - - # Interleaved real and imaginary parts - pilot_tx = np.column_stack( - (pilot_tx.real, pilot_tx.imag)).astype(np.float16).flatten() - pilot_rx = np.column_stack( - (pilot_rx.real, pilot_rx.imag)).astype(np.float16).flatten() - Hest = Hest.flatten() - Hest = np.column_stack((Hest.real, Hest.imag) - ).astype(np.float16).flatten() - - # Output vectors - vector_pilot_tx.append(pilot_tx) - vector_pilot_rx.append(pilot_rx) - vector_Hest.append(Hest) - - vector_pilot_rx = np.concatenate(vector_pilot_rx, axis=0) - vector_pilot_tx = np.concatenate(vector_pilot_tx, axis=0) - vector_Hest = np.concatenate(vector_Hest, axis=0) - - kwargs = {'name': 'data_chest_f16', - 'pilot_rx': vector_pilot_rx, - 'pilot_tx': vector_pilot_tx, - 'Hest': vector_Hest, - 'nb_tx': nb_tx, - 'nb_rx': nb_rx, - 'nb_samples': nb_samples} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_chest_q16.py b/software/data/data_chest_q16.py deleted file mode 100755 index e1fca8649..000000000 --- a/software/data/data_chest_q16.py +++ /dev/null @@ -1,160 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the Channel estimation. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib - -from mako.template import Template - -################## -# write_result # -################## - - -def gen_data_header_file( - outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), - **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - -###################### -# Fixpoint Functions # -###################### - - -def q_sat(x): - if x > 2**15 - 1: - return x - 2**16 - elif x < -2**15: - return x + 2**16 - else: - return x - - -def compute_chest_q16(in_rx, in_tx, p): - n_rx = in_rx.size - n_tx = in_tx.size - result = np.zeros(2 * (n_tx * n_rx), dtype=np.int16) - for i in range(n_rx): - a_r = in_rx[i].real - a_i = in_rx[i].imag - for j in range(n_tx): - b_r = in_tx[j].real - b_i = in_tx[j].imag - -# # Compute data division -# den = (2**16) // (b_r * b_r + b_i * b_i) -# num_r = (a_r * b_r) + (a_i * b_i) -# num_i = (a_i * b_r) - (a_r * b_i) -# result[2 * (i * n_tx + j)] = q_sat((num_r * den) // 2**p) -# result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p) - - # Compute data multiplication - num_r = (a_r * b_r) - (a_i * b_i) - num_i = (a_i * b_r) + (a_r * b_i) - result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p) - result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p) - return result - - -def generate_chest_q16(nb_tx, nb_rx, nb_samples): - FIXED_POINT = 8 - MAX = 2**7 - - qvector_pilot_tx = [] - qvector_pilot_rx = [] - qvector_Hest = [] - for k in range(nb_samples): - # Create pilots - pilot_rx = np.random.randint(-MAX, MAX - 1, size=nb_rx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_rx) - pilot_tx = np.random.randint(-MAX, MAX - 1, size=nb_tx) + 1j * \ - np.random.randint(-MAX, MAX - 1, size=nb_tx) - # Compute Hest - Hest = compute_chest_q16(pilot_rx, pilot_tx, FIXED_POINT) - - pilot_tx = np.column_stack( - (pilot_tx.imag, pilot_tx.real)).astype( - np.int16).flatten() - pilot_rx = np.column_stack( - (pilot_rx.imag, pilot_rx.real)).astype( - np.int16).flatten() - qvector_pilot_tx.append(pilot_tx) - qvector_pilot_rx.append(pilot_rx) - qvector_Hest.append(Hest) - - qvector_pilot_tx = np.reshape(qvector_pilot_tx, [2 * nb_tx * nb_samples]) - qvector_pilot_rx = np.reshape(qvector_pilot_rx, [2 * nb_rx * nb_samples]) - qvector_Hest = np.reshape(qvector_Hest, [2 * nb_tx * nb_rx * nb_samples]) - return qvector_pilot_tx, qvector_pilot_rx, qvector_Hest - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-b", - "--num_rx", - type=int, - required=False, - default=32, - help='Number beams' - ) - parser.add_argument( - "-l", - "--num_tx", - type=int, - required=False, - default=4, - help='Number layers' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=32, - help='Number samples' - ) - - args = parser.parse_args() - nb_tx = args.num_tx - nb_rx = args.num_rx - nb_samples = args.num_samples - - pilot_tx, pilot_rx, Hest = generate_chest_q16(nb_tx, nb_rx, nb_samples) - tpl = pathlib.Path(__file__).parent.absolute() / "data_chest_q16.h.tpl" - kwargs = {'name': 'data_chest_q16', - 'pilot_tx': pilot_tx, - 'pilot_rx': pilot_rx, - 'Hest': Hest, - 'nb_tx': nb_tx, - 'nb_rx': nb_rx, - 'nb_samples': nb_samples} - gen_data_header_file(args.outdir, tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cholesky_f16.py b/software/data/data_cholesky_f16.py deleted file mode 100644 index 32dfa8df9..000000000 --- a/software/data/data_cholesky_f16.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 cholesky. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cholesky_f16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-n", - "--dimension", - type=int, - required=False, - default=4, - help='Matrix dimension' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=256, - help='Number samples' - ) - - args = parser.parse_args() - n_matrix = args.dimension - n_samples = args.num_samples - - vector_G = [] - vector_L = [] - for k in range(n_samples): - # Create hermitian matrix - H = np.random.rand(n_matrix, n_matrix) + 1.j * \ - np.random.rand(n_matrix, n_matrix) - # Matrix to be inverted - # H_H = np.asmatrix(H).H - G = np.matmul(H, np.asmatrix(H).H) - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Reshape - G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C') - L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C') - G = np.column_stack((G.real, G.imag)).astype(np.float16).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.float16).flatten() - # Output vectors - vector_G.append(G) - vector_L.append(L) - - vector_G = np.concatenate(vector_G, axis=0) - vector_L = np.concatenate(vector_L, axis=0) - - kwargs = {'name': 'data_cholesky_f16', 'G': vector_G, - 'L': vector_L, 'n_matrix': n_matrix, 'n_samples': n_samples} - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cholesky_q16.py b/software/data/data_cholesky_q16.py deleted file mode 100644 index d342f3fb9..000000000 --- a/software/data/data_cholesky_q16.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 cholesky. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cholesky_q16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-n", - "--dimension", - type=int, - required=False, - default=4, - help='Matrix dimension' - ) - parser.add_argument( - "-s", - "--num_samples", - type=int, - required=False, - default=256, - help='Number samples' - ) - - args = parser.parse_args() - n_matrix = args.dimension - n_samples = args.num_samples - - vector_G = [] - vector_L = [] - for k in range(n_samples): - # Create hermitian matrix - H = np.random.randint(-2**(15), 2**(15) - 1, - n_matrix * n_matrix, dtype=np.int16) \ - + 1.j * np.random.randint(-2**(15), 2**(15) - 1, - n_matrix * n_matrix, dtype=np.int16) - H = H.reshape(n_matrix, n_matrix) - # Matrix to be inverted - H_h = (np.asmatrix(H).H) - # H_H = np.asmatrix(H).H - G = H_h * H - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Reshape - G = np.reshape(np.asarray(G), (n_matrix * n_matrix), order='C') - L = np.reshape(np.asarray(L), (n_matrix * n_matrix), order='C') - G = np.column_stack((G.real, G.imag)).astype(np.int16).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.int16).flatten() - # Output vectors - vector_G.append(G) - vector_L.append(L) - - vector_G = np.concatenate(vector_G, axis=0) - vector_L = np.concatenate(vector_L, axis=0) - - kwargs = {'name': 'data_cholesky_q16', - 'G': vector_G, - 'L': vector_L, - 'n_matrix': n_matrix, - 'n_samples': n_samples} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cholesky_q32.py b/software/data/data_cholesky_q32.py deleted file mode 100644 index acadcc135..000000000 --- a/software/data/data_cholesky_q32.py +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 cholesky. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from scipy.linalg import solve_triangular -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cholesky_q32.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-n", - "--dimension", - type=int, - required=False, - default=4, - help='Matrix dimension' - ) - - args = parser.parse_args() - n_matrix = args.dimension - - # Create hermitian matrix - L = np.random.randint(-2**(15), 2**(15) - 1, - size=(n_matrix, n_matrix), dtype=np.int32) - L = np.tril(L).astype(np.int32) - G = np.dot(np.asmatrix(L), np.asmatrix(L).transpose()) - - y = np.random.randint(-2**(15), 2**(15) - 1, n_matrix, dtype=np.int32) - - # Linear system solution - y = solve_triangular(L, y, lower=True) - # x = solve_triangular(np.asmatrix(L).T, y) - - # Reshape - G = np.reshape( - np.asarray(G), - (n_matrix * n_matrix), - order='C').astype( - np.int32) - L = np.reshape( - np.asarray(L), - (n_matrix * n_matrix), - order='C').astype( - np.int32) - y = np.reshape(np.asarray(y), (n_matrix), order='C').astype(np.int32) - - kwargs = {'name': 'data_cholesky_q32', - 'G': G, - 'L': L, - 'y': y, - 'n_matrix': n_matrix} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_cmatmul_f16.py b/software/data/data_cmatmul_f16.py deleted file mode 100644 index b3010977b..000000000 --- a/software/data/data_cmatmul_f16.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 matmul. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_cmatmul_f16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' - ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create sparse matrix - A = np.random.rand(matrix_M, matrix_N) + 1j * \ - np.random.rand(matrix_M, matrix_N) - B = np.random.rand(matrix_N, matrix_P) + 1j * \ - np.random.rand(matrix_N, matrix_P) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C') - B = np.reshape(B, (matrix_N * matrix_P), order='C') - C = np.reshape(C, (matrix_M * matrix_P), order='C') - - A = np.column_stack((A.imag, A.real)).astype(np.float16).flatten() - B = np.column_stack((B.imag, B.real)).astype(np.float16).flatten() - C = np.column_stack((C.imag, C.real)).astype(np.float16).flatten() - - kwargs = { - 'name': 'data_cmatmul_f16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/runtime/data/data_cmatmul_q16.h.tpl b/software/data/data_cmatmul_q16.h.tpl similarity index 100% rename from software/runtime/data/data_cmatmul_q16.h.tpl rename to software/data/data_cmatmul_q16.h.tpl diff --git a/software/data/data_matmulf16.py b/software/data/data_matmulf16.py deleted file mode 100644 index 2c362208b..000000000 --- a/software/data/data_matmulf16.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 matmul. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_matmul_f16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' - ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create matrix - A = (np.random.rand(matrix_M, matrix_N) - 0.5).astype(np.float16) - B = (np.random.rand(matrix_N, matrix_P) - 0.5).astype(np.float16) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float16) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float16) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float16) - - kwargs = { - 'name': 'data_matmul_f16', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_matmulf32.py b/software/data/data_matmulf32.py deleted file mode 100644 index 15086d0fc..000000000 --- a/software/data/data_matmulf32.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp32 matmul. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_matmul_f32.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - - parser.add_argument( - "-m", - "--dim_m", - type=int, - required=False, - default=16, - help='First dimension.' - ) - parser.add_argument( - "-n", - "--dim_n", - type=int, - required=False, - default=16, - help='Second dimension.' - ) - parser.add_argument( - "-p", - "--dim_p", - type=int, - required=False, - default=16, - help='Third dimension.' - ) - - args = parser.parse_args() - - matrix_M = args.dim_m - matrix_N = args.dim_n - matrix_P = args.dim_p - - # Create matrix - A = np.random.rand(matrix_M, matrix_N) - B = np.random.rand(matrix_N, matrix_P) - C = np.matmul(A, B) - - A = np.reshape(A, (matrix_M * matrix_N), order='C').astype(np.float32) - B = np.reshape(B, (matrix_N * matrix_P), order='C').astype(np.float32) - C = np.reshape(C, (matrix_M * matrix_P), order='C').astype(np.float32) - - kwargs = { - 'name': 'data_matmul_f32', - 'A': A, - 'B': B, - 'C': C, - 'matrix_M': matrix_M, - 'matrix_N': matrix_N, - 'matrix_P': matrix_P} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_mimo_mmse_f16.py b/software/data/data_mimo_mmse_f16.py deleted file mode 100644 index ff41e7e18..000000000 --- a/software/data/data_mimo_mmse_f16.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 mmse. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template -from scipy.linalg import solve_triangular - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def gen_input_data(N_rx, N_tx, y): - # Create channel matrix - H = np.random.rand(N_rx, N_tx).astype(np.float16) + 1.j * \ - np.random.rand(N_rx, N_tx).astype(np.float16) - # Generate noise variance - sigma = np.diag(np.random.rand(N_tx, N_tx).astype(np.float16)) - - # Matrix to be inverted in MMSE estimator - H_h = (np.asmatrix(H).H) - - G = H_h * H - G = G + np.diag(sigma) - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Linear system solution - y1 = np.transpose(np.dot(H_h, y)) - y2 = solve_triangular(L, y1, lower=True) - x = solve_triangular(np.asmatrix(L).H, y2) - - sigma = sigma + 0j - H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C') - G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C') - L = np.reshape(np.asarray(L), (N_tx * N_tx), order='C') - sigma = np.column_stack((sigma.real, sigma.imag) - ).astype(np.float16).flatten() - H = np.column_stack((H.real, H.imag)).astype(np.float16).flatten() - G = np.column_stack((G.real, G.imag)).astype(np.float16).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.float16).flatten() - - y = np.column_stack((y.real, y.imag)).astype(np.float16).flatten() - x = np.column_stack((x.real, x.imag)).astype(np.float16).flatten() - - return sigma, H, G, y, x - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_mimo_mmse_f16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-n", - "--transmitters", - type=int, - required=False, - default=4, - help='First dimension.' - ) - parser.add_argument( - "-m", - "--receivers", - type=int, - required=False, - default=32, - help='First dimension.' - ) - parser.add_argument( - "-k", - "--iterations", - type=int, - required=False, - default=256, - help='Iterations.' - ) - parser.add_argument( - "-r", - "--randomize", - type=int, - required=False, - default=0, - help='Randomizes the number of beamgroups on each subcarrier.' - ) - - args = parser.parse_args() - N_tx = args.transmitters - N_rx = args.receivers - N_itr = args.iterations - - sigma = np.zeros([N_itr, 2 * N_tx]) - H_RI = np.zeros([N_itr, 2 * N_tx * N_rx]) - G_RI = np.zeros([N_itr, 2 * N_tx * N_tx]) - y_RI = np.zeros([N_itr, 2 * N_rx]) - x_RI = np.zeros([N_itr, 2 * N_tx]) - beamgroups = np.zeros(N_itr) - - for k in range(N_itr): - - # Create input vector - y_bg = np.random.rand(N_rx).astype(np.float16) + 1.j * \ - np.random.rand(N_rx).astype(np.float16) - if (args.randomize == 1): - N_beamgroups = 2 ** np.random.randint(0, np.log2(2 * N_tx)) - else: - N_beamgroups = 1 - N_tx_itr = N_tx // N_beamgroups - beamgroups[k] = N_beamgroups - - for i in range(N_beamgroups): - - sigma_itr, H_itr, G_itr, y_itr, x_itr = gen_input_data( - N_rx, N_tx_itr, y_bg) - sigma[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = sigma_itr - H_RI[k, (i * 2 * N_tx_itr * N_rx) - :((i + 1) * 2 * N_tx_itr * N_rx)] = H_itr - G_RI[k, (i * 2 * N_tx_itr * N_tx_itr) - :((i + 1) * 2 * N_tx_itr * N_tx_itr)] = G_itr - y_RI[k, :] = y_itr - x_RI[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = x_itr - - sigma = np.reshape(sigma, (2 * N_tx * N_itr)).astype(np.float16) - H_RI = np.reshape(H_RI, (2 * N_rx * N_tx * N_itr)).astype(np.float16) - G_RI = np.reshape(G_RI, (2 * N_tx * N_tx * N_itr)).astype(np.float16) - y_RI = np.reshape(y_RI, (2 * N_rx * N_itr)).astype(np.float16) - x_RI = np.reshape(x_RI, (2 * N_tx * N_itr)).astype(np.float16) - beamgroups = beamgroups.astype(np.int32) - - kwargs = {'name': 'data_mimo_mmse_f16', - 'H': H_RI, - 'G': G_RI, - 'sigma': sigma, - 'y': y_RI, - 'x': x_RI, - 'beamgroups': beamgroups, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': N_itr} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_mimo_mmse_f32.py b/software/data/data_mimo_mmse_f32.py deleted file mode 100644 index 26515e03d..000000000 --- a/software/data/data_mimo_mmse_f32.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp32 mmse. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template -from scipy.linalg import solve_triangular - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def gen_input_data(N_rx, N_tx): - - # Create channel matrix - H = np.random.rand(N_rx, N_tx).astype(np.float32) + 1.j * \ - np.random.rand(N_rx, N_tx).astype(np.float32) - # Create input vector - y = np.random.rand(N_rx).astype(np.float32) + 1.j * \ - np.random.rand(N_rx).astype(np.float32) - # Generate noise variance - sigma = np.diag(np.random.rand(N_tx, N_tx).astype(np.float32)) - - # Matrix to be inverted in MMSE estimator - H_h = np.asmatrix(H).H - - G = H_h * H - G = G + np.diag(sigma) - # Cholesky decomposition - L = np.linalg.cholesky(G) - # Linear system solution - y1 = np.transpose(np.dot(H_h, y)) - y2 = solve_triangular(L, y1, lower=True) - x = solve_triangular(np.asmatrix(L).H, y2) - - H = np.reshape(np.asarray(H), (N_tx * N_rx), order='C') - G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C') - L = np.reshape(np.asarray(L), (N_tx * N_tx), order='C') - H = np.column_stack((H.real, H.imag)).astype(np.float32).flatten() - G = np.column_stack((G.real, G.imag)).astype(np.float32).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.float32).flatten() - - y = np.column_stack((y.real, y.imag)).astype(np.float32).flatten() - x = np.column_stack((x.real, x.imag)).astype(np.float32).flatten() - - return sigma, H, G, y, x - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_mimo_mmse_f32.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-n", - "--transmitters", - type=int, - required=False, - default=4, - help='First dimension.' - ) - parser.add_argument( - "-m", - "--receivers", - type=int, - required=False, - default=32, - help='Second dimension.' - ) - parser.add_argument( - "-k", - "--iterations", - type=int, - required=False, - default=256, - help='Iterations.' - ) - - args = parser.parse_args() - N_tx = args.transmitters - N_rx = args.receivers - itr = args.iterations - - sigma = np.zeros([itr, N_tx]) - H_RI = np.zeros([itr, 2 * N_tx * N_rx]) - G_RI = np.zeros([itr, 2 * N_tx * N_tx]) - y_RI = np.zeros([itr, 2 * N_rx]) - x_RI = np.zeros([itr, 2 * N_tx]) - for k in range(itr): - sigma[k, :], H_RI[k, :], G_RI[k, :], \ - y_RI[k, :], x_RI[k, :] = gen_input_data(N_rx, N_tx) - - sigma = np.reshape(sigma, (N_tx * itr)) - H_RI = np.reshape(H_RI, (2 * N_rx * N_tx * itr)) - G_RI = np.reshape(G_RI, (2 * N_tx * N_tx * itr)) - y_RI = np.reshape(y_RI, (2 * N_rx * itr)) - x_RI = np.reshape(x_RI, (2 * N_tx * itr)) - - kwargs = {'name': 'data_mimo_mmse_f32', - 'H': H_RI, - 'G': G_RI, - 'sigma': sigma, - 'y': y_RI, - 'x': x_RI, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': itr} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_mimo_mmse_q16.py b/software/data/data_mimo_mmse_q16.py deleted file mode 100644 index 718978824..000000000 --- a/software/data/data_mimo_mmse_q16.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Solderpad Hardware License, Version 0.51, see LICENSE for details. -# SPDX-License-Identifier: SHL-0.51 - -# This script generates data for the fp16 mmse. -# Author: Marco Bertuletti - -import numpy as np -import argparse -import pathlib -from mako.template import Template -from scipy.linalg import solve_triangular - - -################## -# compute_result # -################## - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def gen_input_data(N_rx, N_tx): - - # Create channel matrix - H = np.random.randint(-2**(15), 2**(15) - 1, - N_rx * N_tx, dtype=np.int16) \ - + 1.j * np.random.randint(-2**(15), 2**(15) - 1, - N_rx * N_tx, dtype=np.int16) - H = H.reshape(N_rx, N_tx) - # Create input vector - y = np.random.randint(-2**(15), 2**(15) - 1, N_rx, dtype=np.int16) + \ - 1.j * np.random.randint(-2**(15), 2**(15) - 1, N_rx, dtype=np.int16) - # Generate noise variance - sigma = np.random.randint(-2**(15), 2**(15) - 1, N_tx, dtype=np.int16) - - # Matrix to be inverted in MMSE estimator - H_h = (np.asmatrix(H).H) - - # Hermitian - G = H_h * H + np.diag(sigma) - # Matrix vector product - y1 = np.transpose(np.dot(H_h, y)) - - # Cholesky decomposition - # L = np.linalg.cholesky(G) - L = G - # Linear system solution - y2 = solve_triangular(L, y1, lower=True) - x = solve_triangular(np.asmatrix(L).H, y2) - - sigma = sigma + 0j - H = np.reshape(np.asarray(H), (N_rx * N_tx), order='C') - G = np.reshape(np.asarray(G), (N_tx * N_tx), order='C') - L = np.reshape(np.asarray(L), (N_tx * N_tx), order='C') - sigma = np.column_stack( - (sigma.real, sigma.imag)).astype( - np.int16).flatten() - H = np.column_stack((H.real, H.imag)).astype(np.int16).flatten() - G = np.column_stack((G.real, G.imag)).astype(np.int16).flatten() - L = np.column_stack((L.real, L.imag)).astype(np.int16).flatten() - y = np.column_stack((y.real, y.imag)).astype(np.int16).flatten() - x = np.column_stack((x.real, x.imag)).astype(np.int16).flatten() - - return sigma, H, G, y, x - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / - "data_mimo_mmse_q16.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-n", - "--transmitters", - type=int, - required=False, - default=4, - help='First dimension.' - ) - parser.add_argument( - "-m", - "--receivers", - type=int, - required=False, - default=32, - help='First dimension.' - ) - parser.add_argument( - "-k", - "--iterations", - type=int, - required=False, - default=1, - help='Iterations.' - ) - - args = parser.parse_args() - N_tx = args.transmitters - N_rx = args.receivers - itr = args.iterations - - sigma = np.zeros([itr, 2 * N_tx], dtype=np.int16) - H_RI = np.zeros([itr, 2 * N_tx * N_rx], dtype=np.int16) - G_RI = np.zeros([itr, 2 * N_tx * N_tx], dtype=np.int16) - y_RI = np.zeros([itr, 2 * N_rx], dtype=np.int16) - x_RI = np.zeros([itr, 2 * N_tx], dtype=np.int16) - for k in range(itr): - [sigma[k, :], - H_RI[k, :], - G_RI[k, :], - y_RI[k, :], - x_RI[k, :]] = gen_input_data(N_rx, N_tx) - - sigma = np.reshape(sigma, (2 * N_tx * itr)).astype(np.int16) - H_RI = np.reshape(H_RI, (2 * N_rx * N_tx * itr)).astype(np.int16) - G_RI = np.reshape(G_RI, (2 * N_tx * N_tx * itr)).astype(np.int16) - y_RI = np.reshape(y_RI, (2 * N_rx * itr)).astype(np.int16) - x_RI = np.reshape(x_RI, (2 * N_tx * itr)).astype(np.int16) - - kwargs = {'name': 'data_mimo_mmse_q16', - 'H': H_RI, - 'G': G_RI, - 'sigma': sigma, - 'y': y_RI, - 'x': x_RI, - 'N_tx': N_tx, - 'N_rx': N_rx, - 'N_itr': itr} - - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/data/data_ofdm.py b/software/data/data_ofdm.py deleted file mode 100644 index 64b0a7ca6..000000000 --- a/software/data/data_ofdm.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2022 ETH Zurich and University of Bologna. -# Licensed under the Apache License, Version 2.0, see LICENSE for details. -# SPDX-License-Identifier: Apache-2.0 - -# Author: Marco Bertuletti, ETH Zurich - -import numpy as np -import math as M -import argparse -import pathlib -from mako.template import Template -from sympy.combinatorics import Permutation - -################## -# compute_result # -################## - - -def compute_bitreversal(N, R): - # Decompose - logR2 = [] - idx = N - while (idx >= R): - logR2.append(int(M.log2(R))) - idx = idx // R - if (idx > 1): - logR2.append(int(M.log2(idx))) - # Bitreversal - indexes = [] - for x in range(N): - result = 0 - for bits in logR2: - mask = (0xffffffff >> (32 - bits)) - result = (result << bits) | (x & mask) - x = x >> bits - indexes.append(result) - - # Create transpositions table - tps = [] - for c in Permutation.from_sequence(indexes).cyclic_form: - for i in range(len(c) - 1): - tps.append([c[i] * 8, c[-1] * 8]) - return tps - - -def gen_data_header_file(outdir: pathlib.Path.cwd(), - tpl: pathlib.Path.cwd(), **kwargs): - - file = outdir / f"data_{kwargs['name']}.h" - - print(tpl, outdir, kwargs['name']) - - template = Template(filename=str(tpl)) - with file.open('w') as f: - f.write(template.render(**kwargs)) - - -def main(): - - parser = argparse.ArgumentParser(description='Generate data for kernels') - parser.add_argument( - "-o", - "--outdir", - type=pathlib.Path, - default=pathlib.Path(__file__).parent.absolute(), - required=False, - help='Select out directory of generated data files' - ) - parser.add_argument( - "-t", - "--tpl", - type=pathlib.Path, - required=False, - default=pathlib.Path(__file__).parent.absolute() / "data_ofdm.h.tpl", - help='Path to mako template' - ) - parser.add_argument( - "-v", - "--verbose", - action='store_true', - help='Set verbose' - ) - parser.add_argument( - "-rx", - "--receivers", - type=int, - required=False, - default=64, - help='First dimension.' - ) - parser.add_argument( - "-bs", - "--beams", - type=int, - required=False, - default=32, - help='Second dimension.' - ) - parser.add_argument( - "-sc", - "--subcarriers", - type=int, - required=False, - default=4096, - help='Iterations.' - ) - - args = parser.parse_args() - N_rx = args.receivers - N_bs = args.beams - N_sc = args.subcarriers - - pFFT_src = (np.random.rand(2 * N_rx * N_sc)).astype(np.float16) - pTw_coef = (np.random.rand(int(3 * N_sc / 4))).astype(np.float16) - pBF_coef = (np.random.rand(2 * N_rx * N_bs)).astype(np.float16) - pBF_dst = (np.random.rand(2 * N_bs * N_sc)).astype(np.float16) - - Bitreversal = np.ndarray.flatten(np.array(compute_bitreversal(N_sc, 2))) - - kwargs = {'name': 'ofdm', - 'pFFT_src': pFFT_src, - 'pTw_coef': pTw_coef, - 'pBF_coef': pBF_coef, - 'pBF_dst': pBF_dst, - 'bitrev': Bitreversal, - 'N_rx': N_rx, - 'N_bs': N_bs, - 'N_sc': N_sc, - 'Log2Len': int(np.log2(N_sc)), - 'BitrevLen': len(Bitreversal)} - gen_data_header_file(args.outdir, args.tpl, **kwargs) - - -if __name__ == "__main__": - main() diff --git a/software/runtime/data/generate_cfft.py b/software/data/generate_cfft.py similarity index 100% rename from software/runtime/data/generate_cfft.py rename to software/data/generate_cfft.py diff --git a/software/runtime/data/generate_chest.py b/software/data/generate_chest.py similarity index 98% rename from software/runtime/data/generate_chest.py rename to software/data/generate_chest.py index 058ce2abf..e11eb8b62 100755 --- a/software/runtime/data/generate_chest.py +++ b/software/data/generate_chest.py @@ -103,8 +103,8 @@ def compute_chest_q16(in_rx, in_tx, p): # result[2 * (i * n_tx + j) + 1] = q_sat((num_i * den) // 2**p) # Compute data multiplication - num_r = (a_r * b_r) + (a_i * b_i) - num_i = (a_i * b_r) - (a_r * b_i) + num_r = (a_r * b_r) - (a_i * b_i) + num_i = (a_i * b_r) + (a_r * b_i) result[2 * (i * n_tx + j)] = q_sat(num_r // 2**p) result[2 * (i * n_tx + j) + 1] = q_sat(num_i // 2**p) return result diff --git a/software/runtime/data/generate_cholesky.py b/software/data/generate_cholesky.py similarity index 95% rename from software/runtime/data/generate_cholesky.py rename to software/data/generate_cholesky.py index a72dc210f..1a25c4206 100644 --- a/software/runtime/data/generate_cholesky.py +++ b/software/data/generate_cholesky.py @@ -64,11 +64,10 @@ def generate_cholesky_q16(n_matrix, n_samples): vector_L = [] for k in range(n_samples): # Create hermitian matrix - H = np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix, dtype=np.int16) \ - + 1.j * np.random.randint(-2**(15), - 2**(15) - 1, - n_matrix * n_matrix, - dtype=np.int16) + H = np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix, + dtype=np.int16) + \ + 1.j * np.random.randint(-2**(15), 2**(15) - 1, n_matrix * n_matrix, + dtype=np.int16) H = H.reshape(n_matrix, n_matrix) # Matrix to be inverted H_h = (np.asmatrix(H).H) diff --git a/software/runtime/data/generate_matmul.py b/software/data/generate_matmul.py similarity index 100% rename from software/runtime/data/generate_matmul.py rename to software/data/generate_matmul.py diff --git a/software/runtime/data/generate_mimo_mmse.py b/software/data/generate_mimo_mmse.py similarity index 93% rename from software/runtime/data/generate_mimo_mmse.py rename to software/data/generate_mimo_mmse.py index 454976f27..5e95d3ef6 100644 --- a/software/runtime/data/generate_mimo_mmse.py +++ b/software/data/generate_mimo_mmse.py @@ -137,9 +137,10 @@ def generate_mimo_mmse_f16(N_tx, N_rx, N_itr, randomize): x = np.column_stack((x.real, x.imag)).astype(np.float16).flatten() vSigma[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = sigma - vH[k, (i * 2 * N_tx_itr * N_rx):((i + 1) * 2 * N_tx_itr * N_rx)] = H - vG[k, (i * 2 * N_tx_itr * N_tx_itr) - :((i + 1) * 2 * N_tx_itr * N_tx_itr)] = G + vH[k, (i * 2 * N_tx_itr * N_rx):( + (i + 1) * 2 * N_tx_itr * N_rx)] = H + vG[k, (i * 2 * N_tx_itr * N_tx_itr):( + (i + 1) * 2 * N_tx_itr * N_tx_itr)] = G vy[k, :] = y vx[k, (i * 2 * N_tx_itr):((i + 1) * 2 * N_tx_itr)] = x @@ -162,13 +163,15 @@ def generate_mimo_mmse_q16(N_tx, N_rx, N_itr): vx = np.zeros([N_itr, 2 * N_tx], dtype=np.int16) for k in range(N_itr): # Create channel matrix - H = np.random.randint(-2**(15), 2**(15) - 1, N_rx * N_tx, dtype=np.int16) \ - + 1.j * np.random.randint(-2**(15), 2 ** - (15) - 1, N_rx * N_tx, dtype=np.int16) + H = np.random.randint(-2**(15), 2**(15) - 1, N_rx * N_tx, + dtype=np.int16) + \ + 1.j * np.random.randint(-2**(15), 2 ** (15) - 1, + N_rx * N_tx, dtype=np.int16) # Create input vector - y = np.random.randint(-2**(15), 2**(15) - 1, N_rx, dtype=np.int16) \ - + 1.j * np.random.randint(-2**(15), 2 ** - (15) - 1, N_rx, dtype=np.int16) + y = np.random.randint(-2**(15), 2**(15) - 1, N_rx, + dtype=np.int16) + \ + 1.j * np.random.randint(-2**(15), 2 ** (15) - 1, N_rx, + dtype=np.int16) # Generate noise variance sigma = np.random.randint(-2**(15), 2**(15) - 1, N_tx, dtype=np.int16) @@ -238,7 +241,7 @@ def main(): "--iterations", type=int, required=False, - default=1, + default=32, help='Iterations.' ) @@ -261,7 +264,7 @@ def main(): gen_data_header_file(args.outdir, tpl, **kwargs) vSigma, vH, vG, vy, vx, beamgroups = generate_mimo_mmse_f16( - N_tx, N_rx, N_itr, 1) + N_tx, N_rx, N_itr, 0) tpl = pathlib.Path(__file__).parent.absolute() / "data_mimo_mmse_f16.h.tpl" kwargs = {'name': 'data_mimo_mmse_f16', 'H': vH, diff --git a/software/runtime/data/generate_ofdm.py b/software/data/generate_ofdm.py similarity index 100% rename from software/runtime/data/generate_ofdm.py rename to software/data/generate_ofdm.py diff --git a/software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h b/software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h deleted file mode 100644 index 5196fc30d..000000000 --- a/software/kernels/baremetal/mempool_cfft_radix4_butterfly_f16.h +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "xpulp/builtins_v2.h" - -/** - @brief First butterfly stage. - @param[in] pIn points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pOut points to output buffer of 16b data, Re and Im parts are - interleaved - @param[in] i0 points to the first element to be processed - @param[in] n2 number of elements in the first wing of the butterfly - @param[in] CoSi1 packed cosine and sine first twiddle - @param[in] CoSi2 packed cosine and sine second twiddle - @param[in] CoSi3 packed cosine and sine third twiddle - @param[in] C1 packed sine and cosine first twiddle - @param[in] C2 packed sine and cosine second twiddle - @param[in] C3 packed sine and cosine third twiddle - @return none -*/ -static inline void radix4_butterfly(__fp16 *pIn, __fp16 *pOut, - uint32_t i0, uint32_t n2, v2h CoSi1, - v2h CoSi2, v2h CoSi3, v2h C1, v2h C2, - v2h C3) { - uint32_t i1, i2, i3; - __fp16 t0, t1, t2, t3, t4, t5; - v2h A, B, C, D, E, F, G, H; - -#if defined(FOLDED) || defined(SCHEDULED) - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; - uint32_t n2_store = n2 >> 2U; - uint32_t i0_store = - (i0 % n2_store) + (i0 / n2) * n2 + ((i0 % n2) / n2_store) * N_BANKS; - uint32_t i1_store = i0_store + n2_store; - uint32_t i2_store = i1_store + n2_store; - uint32_t i3_store = i2_store + n2_store; -#else - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; -#endif - /* Read ya (real), xa (imag) input */ - A = *(v2h *)&pIn[i0 * 2U]; - /* Read yb (real), xb(imag) input */ - B = *(v2h *)&pIn[i1 * 2U]; - /* Read yc (real), xc(imag) input */ - C = *(v2h *)&pIn[i2 * 2U]; - /* Read yd (real), xd(imag) input */ - D = *(v2h *)&pIn[i3 * 2U]; - asm volatile( - // xa + xc, ya + yc - "vfadd.h %[E],%[A],%[C];" - // xa - xc, ya - yc - "vfsub.h %[F],%[A],%[C];" - // xb + xd, yd + yd - "vfadd.h %[G],%[B],%[D];" - // xb - xd, yb - yd - "vfsub.h %[H],%[B],%[D];" - "pv.extract.h %[t0],%[H],0;" - "pv.extract.h %[t1],%[H],1;" - "fsub.h %[t3],zero,%[t1];" - "fsub.h %[t4],zero,%[t0];" - // yd - yb, xb - xd - "pv.pack.h %[C],%[t0],%[t3];" - // yb - yd, xd - xb - "pv.pack.h %[D],%[t4],%[t1];" - // xa + xc + xb + xd, ya + yb + yc + yd - "vfadd.h %[A],%[E],%[G];" - // xa - xc + yb - yd, ya - yc + xd - xb - "vfadd.h %[D],%[F],%[D];" - // xa + xc - xb - xd, ya + yc - yb - yd - "vfsub.h %[B],%[E],%[G];" - // xa - xc - yb + yd, ya - yc + xb - xd - "vfadd.h %[C],%[F],%[C];" - "vfdotpex.s.h %[t0],%[CoSi1],%[D];" - "vfdotpex.s.h %[t2],%[CoSi2],%[B];" - "vfdotpex.s.h %[t4],%[CoSi3],%[C];" - "vfdotpex.s.h %[t1],%[C1],%[D];" - "vfdotpex.s.h %[t3],%[C1],%[B];" - "vfdotpex.s.h %[t5],%[C3],%[C];" - "fcvt.h.s %[t0],%[t0];" - "fcvt.h.s %[t1],%[t1];" - "fcvt.h.s %[t2],%[t2];" - "fcvt.h.s %[t3],%[t3];" - "fcvt.h.s %[t4],%[t4];" - "fcvt.h.s %[t5],%[t5];" - "pv.pack.h %[E],%[t1],%[t0];" - "pv.pack.h %[F],%[t3],%[t2];" - "pv.pack.h %[G],%[t5],%[t4];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), - [E] "=&r"(E), [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1), - [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); -#if defined(FOLDED) || defined(SCHEDULED) - *((v2h *)&pOut[i0_store * 2U]) = A; - *((v2h *)&pOut[i1_store * 2U]) = E; - *((v2h *)&pOut[i2_store * 2U]) = F; - *((v2h *)&pOut[i3_store * 2U]) = G; -#else - *((v2h *)&pOut[i0 * 2U]) = A; - *((v2h *)&pOut[i1 * 2U]) = E; - *((v2h *)&pOut[i2 * 2U]) = F; - *((v2h *)&pOut[i3 * 2U]) = G; -#endif - -} - -/** - @brief Last butterfly stage. - @param[in] pIn points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pOut points to output buffer of 16b data, Re and Im parts are - interleaved - @param[in] i0 points to the first element to be processed - @return none -*/ -static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut, - uint32_t i0) { - __fp16 t0, t1; - uint32_t i1, i2, i3; - v2h A, B, C, D, E, F, G, H; - -#if defined(FOLDED) || defined(SCHEDULED) - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], - pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + N_BANKS; - i2 = i1 + N_BANKS; - i3 = i2 + N_BANKS; -#ifndef SCHEDULED - uint32_t i0_store = i0 * 4; - uint32_t i1_store = i0_store + 1; - uint32_t i2_store = i1_store + 1; - uint32_t i3_store = i2_store + 1; -#endif -#else - /* index calculation for the input as, */ - /* pIn[i0 + 0], pIn[i0 + fftLen/4], - pIn[i0 + fftLen/2], pIn[i0 + 3fftLen/4] */ - i1 = i0 + 1U; - i2 = i1 + 1U; - i3 = i2 + 1U; -#endif - - /* Read ya (real), xa(imag) input */ - A = *(v2h *)&pIn[i0 * 2U]; - /* Read yb (real), xb(imag) input */ - B = *(v2h *)&pIn[i1 * 2U]; - /* Read yc (real), xc(imag) input */ - C = *(v2h *)&pIn[i2 * 2U]; - /* Read yd (real), xd(imag) input */ - D = *(v2h *)&pIn[i3 * 2U]; - __fp16 t2, t3; - asm volatile( - "vfsub.h %[H],%[B],%[D];" - "vfadd.h %[G],%[B],%[D];" - "vfadd.h %[E],%[A],%[C];" - "vfsub.h %[F],%[A],%[C];" - "pv.extract.h %[t0],%[H],0;" - "pv.extract.h %[t1],%[H],1;" - "fsub.h %[t2], zero, %[t0];" - "fsub.h %[t3], zero, %[t1];" - "pv.pack.h %[A],%[t2],%[t1];" - "pv.pack.h %[B],%[t0],%[t3];" - "vfadd.h %[H],%[E],%[G];" - "vfsub.h %[E],%[E],%[G];" - "vfadd.h %[A],%[F],%[A];" - "vfadd.h %[B],%[F],%[B];" - : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E), - [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3) - : - :); -#if defined(FOLDED) - *((v2h *)&pOut[i0_store * 2U]) = H; - *((v2h *)&pOut[i1_store * 2U]) = E; - *((v2h *)&pOut[i2_store * 2U]) = A; - *((v2h *)&pOut[i3_store * 2U]) = B; -#else - *((v2h *)&pOut[i0 * 2U]) = H; - *((v2h *)&pOut[i1 * 2U]) = E; - *((v2h *)&pOut[i2 * 2U]) = A; - *((v2h *)&pOut[i3 * 2U]) = B; -#endif - -} diff --git a/software/kernels/baremetal/mempool_cfft_radix4_f16p.h b/software/kernels/baremetal/mempool_cfft_radix4_f16p.h deleted file mode 100644 index d2220d090..000000000 --- a/software/kernels/baremetal/mempool_cfft_radix4_f16p.h +++ /dev/null @@ -1,526 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#include "xpulp/builtins_v2.h" -#define MIN(x, y) (((x) < (y)) ? (x) : (y)) - -/** - @brief Folding in local memory function - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[in] fftLen Length of the complex input vector - @param[in] nPE Number of PE - @return none -*/ - -static inline void fold_radix4(__fp16 *pSrc16, uint32_t fftLen, - uint32_t core_id, uint32_t nPE) { - uint32_t n2, i0, i1, i2, i3; - uint32_t i1_store, i2_store, i3_store; - volatile v2h A, B, C; - n2 = fftLen >> 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) { - i1 = i0 + n2; - i2 = i1 + n2; - i3 = i2 + n2; - A = *(v2h *)&pSrc16[i1 * 2U]; - B = *(v2h *)&pSrc16[i2 * 2U]; - C = *(v2h *)&pSrc16[i3 * 2U]; - i1_store = i0 + N_BANKS; - i2_store = i1_store + N_BANKS; - i3_store = i2_store + N_BANKS; - *(v2h *)&pSrc16[i1_store * 2U] = A; - *(v2h *)&pSrc16[i2_store * 2U] = B; - *(v2h *)&pSrc16[i3_store * 2U] = C; - } - mempool_log_partial_barrier(2 * WU_STRIDE, WU_STRIDE * core_id, - nPE * WU_STRIDE); -} - -#ifdef FOLDED_TWIDDLES -/** - @brief Full FFT butterfly - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pDst16 points to output buffer of 16b data, Re and Im parts - are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] pCoef_dst Auxiliary twiddle coefficients vector - @param[in] nPE Number of PE - @return pointer to output vector -*/ -__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16, - uint32_t fftLen, __fp16 *pCoef_src, - __fp16 *pCoef_dst, uint32_t nPE) -#else -/** - Twiddles are not folded in memory - @brief Full FFT butterfly - @param[in] pSrc16 points to input buffer of 16b data, Re and Im parts are - interleaved - @param[out] pDst16 points to output buffer of 16b data, Re and Im parts - are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] nPE Number of PE - @return pointer to output vector -*/ -__fp16 *mempool_radix4_cfft_q16p_folded(__fp16 *pSrc16, __fp16 *pDst16, - uint32_t fftLen, __fp16 *pCoef_src, - uint32_t nPE) -#endif -{ - -#ifdef FOLDED_TWIDDLES - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - __fp16 t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - uint32_t n1, n2, n2_store, i0, j, k; - uint32_t ic, offset, wing_idx; - __fp16 *pTmp; -#else - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - __fp16 t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - uint32_t n1, n2, n2_store, i0, j, k; - uint32_t ic, offset, wing_id, bank_id; - __fp16 *pTmp; - uint32_t twidCoefModifier = 1U; -#endif - - if (fftLen <= N_BANKS) - fold_radix4(pSrc16, fftLen, core_id, nPE); - - /* START OF FIRST STAGE PROCESS */ - n1 = fftLen; - n2 = n1 >> 2U; - n2_store = n2 >> 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, n2); i0++) { - -#ifdef FOLDED_TWIDDLES - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)]; - if (i0 % 4 == 0) { - ic = i0 >> 2U; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } -#else - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 * 2U)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 * 3U)]; -#endif - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; -#ifdef FOLDED_TWIDDLES - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; -#else - twidCoefModifier <<= 2U; -#endif - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); - /* END OF FIRST STAGE PROCESSING */ - - /* START OF MIDDLE STAGE PROCESS */ - for (k = fftLen / 4U; k > 4U; k >>= 2U) { - n1 = n2; - n2 >>= 2U; - n2_store = n2 >> 2U; - -#ifdef FOLDED_TWIDDLES - for (j = core_id * STEP; j < core_id * STEP + STEP; j++) { - CoSi1 = *(v2h *)&pCoef_src[2U * j]; - CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)]; - if (j % 4 == 0) { - wing_idx = j % n2; - offset = (j / n2); - ic = wing_idx >> 2U; - ic += offset * n2; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } -#else - bank_id = core_id / n2_store; - wing_id = core_id % n2_store; - offset = bank_id * n2; - for (j = wing_id * 4; j < MIN(wing_id * 4 + 4, n2); j++) { - ic = j * twidCoefModifier; - CoSi1 = *(v2h *)&pCoef_src[2U * ic]; - CoSi2 = *(v2h *)&pCoef_src[2U * (ic * 2U)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (ic * 3U)]; -#endif - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack %[C1],%[t1],%[t0];" - "pv.pack %[C2],%[t3],%[t2];" - "pv.pack %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), - [t3] "=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); -#ifdef FOLDED_TWIDDLES - i0 = j; - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, - C2, C3); - } -#else - i0 = offset + j; - radix4_butterfly(pSrc16, pDst16, i0, n2, CoSi1, CoSi2, CoSi3, C1, - C2, C3); - } -#endif - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; -#ifdef FOLDED_TWIDDLES - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; -#else - twidCoefModifier <<= 2U; -#endif - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, - nPE * WU_STRIDE); - } - /* END OF MIDDLE STAGE PROCESSING */ - - /* START OF LAST STAGE PROCESSING */ - n1 = n2; - n2 >>= 2U; - for (i0 = core_id * STEP; i0 < MIN(core_id * STEP + STEP, fftLen >> 2U); - i0++) { - radix4_butterfly_last(pSrc16, pDst16, i0); - } - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); - /* END OF LAST STAGE PROCESSING */ - - return pDst16; -} - -/** - SCHEDULER OF MULTIPLE FOLDED FFTS - Memory: - - 1st row of FFTS - - col_idx1 col_idx2 col_idx3 - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - - 2nd row of FFTS - - col_idx1 col_idx2 col_idx3 - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - xxxxxxxxxxxx xxxxxxxxxxxx xxxxxxxxxxxx ... - - ... - - @brief Scheduler of folded FFTs - @param[in] column index of the current FFT - @param[in] pSrc16 input buffer of 16b data, Re and Im are interleaved - @param[out] pDst16 output buffer of 16b data, Re and Im are interleaved - @param[in] fftLen Length of the complex input vector - @param[in] pCoef_src Twiddle coefficients vector - @param[in] pCoef_dst Twiddle coefficients vector - @param[in] pBitRevTable Bitreversal table - @param[in] bitReverseLen Length of bitreversal table - @param[in] bitReverseFlag Flag for bitreversal - @param[in] nPE Number of PE - @return void -*/ - -void mempool_radix4_cfft_q16p_scheduler(uint32_t col_id, __fp16 *pSrc16, - __fp16 *pDst16, uint32_t fftLen, - __fp16 *pCoef_src, __fp16 *pCoef_dst, - __attribute__((unused)) - uint16_t *pBitRevTable, - __attribute__((unused)) - uint16_t bitReverseLen, - uint8_t bitReverseFlag, uint32_t nPE) { - - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id % (fftLen >> 4U); - - uint32_t n1, n2, i0, ic, j, k; - uint32_t n2_store; - uint32_t offset, wing_idx; - __fp16 *pTmp; - int32_t t0, t1, t2, t3, t4, t5; - v2h CoSi1, CoSi2, CoSi3; - v2h C1, C2, C3; - - /* FIRST STAGE */ - n1 = fftLen; - n2 = n1 >> 2U; - n2_store = n2 >> 2U; - for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, n2); i0++) { - CoSi1 = *(v2h *)&pCoef_src[2U * i0]; - CoSi2 = *(v2h *)&pCoef_src[2U * (i0 + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (i0 + 2 * N_BANKS)]; - if (i0 % 4 == 0) { - ic = i0 / 4; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), [t0] "=&r"(t0), - [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), - [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - /* MIDDLE STAGE */ - for (k = fftLen / 4U; k > 4U; k >>= 2U) { - n1 = n2; - n2 >>= 2U; - n2_store = n2 >> 2U; - - for (j = core_id * 4; j < core_id * 4 + 4; j++) { - CoSi1 = *(v2h *)&pCoef_src[2U * (j)]; - CoSi2 = *(v2h *)&pCoef_src[2U * (j + 1 * N_BANKS)]; - CoSi3 = *(v2h *)&pCoef_src[2U * (j + 2 * N_BANKS)]; - if (j % 4 == 0) { - - wing_idx = j % n2; - offset = (j / n2); - ic = wing_idx >> 2U; - ic += offset * n2; - - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi1; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi1; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi2; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi2; - ic += N_BANKS; - *((v2h *)&pCoef_dst[2U * (ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 1 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 2 + ic)]) = CoSi3; - *((v2h *)&pCoef_dst[2U * (n2_store * 3 + ic)]) = CoSi3; - } - asm volatile("pv.extract.h %[t1],%[CoSi1],0;" - "pv.extract.h %[t3],%[CoSi2],0;" - "pv.extract.h %[t5],%[CoSi3],0;" - "pv.extract.h %[t0],%[CoSi1],1;" - "pv.extract.h %[t2],%[CoSi2],1;" - "pv.extract.h %[t4],%[CoSi3],1;" - "fsub.h %[t0],zero,%[t0];" - "fsub.h %[t2],zero,%[t2];" - "fsub.h %[t4],zero,%[t4];" - "pv.pack.h %[C1],%[t1],%[t0];" - "pv.pack.h %[C2],%[t3],%[t2];" - "pv.pack.h %[C3],%[t5],%[t4];" - : [C1] "=r"(C1), [C2] "=r"(C2), [C3] "=r"(C3), - [t0] "=&r"(t0), [t1] "=&r"(t1), [t2] "=&r"(t2), - [t3] "=&r"(t3), [t4] "=&r"(t4), [t5] "=&r"(t5) - : [CoSi1] "r"(CoSi1), [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3) - :); - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly(pIn, pOut, j, n2, CoSi1, CoSi2, CoSi3, C1, C2, - C3); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - pTmp = pCoef_src; - pCoef_src = pCoef_dst; - pCoef_dst = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - } - - /* LAST STAGE */ - n1 = n2; - n2 >>= 2U; - for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8); - __fp16 *pOut = pDst16 + idx_row * (N_BANKS * 8); - radix4_butterfly_last(pIn, pOut, i0); - } - } - pTmp = pSrc16; - pSrc16 = pDst16; - pDst16 = pTmp; - mempool_log_partial_barrier(2, absolute_core_id, nPE); - - mempool_stop_benchmark(); - mempool_start_benchmark(); - - /* BITREVERSAL */ - // Bitreversal stage stores in the sequential addresses - if (bitReverseFlag) { -#ifdef BITREVERSETABLE - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2))); - for (j = 2 * core_id; j < bitReverseLen; j += 2 * nPE) { - v2h addr, tmpa, tmpb; - addr = __SRA2(*(v2h *)&pBitRevTable[j], ((v2h){2, 2})); - for (int32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - int32_t a0 = addr[0] / 4 + (addr[0] % 4) * N_BANKS; - int32_t a1 = addr[1] / 4 + (addr[0] % 4) * N_BANKS; - tmpa = *(v2h *)&ptr1[a0 + idx_row * (N_BANKS * 8)]; - tmpb = *(v2h *)&ptr1[a1 + idx_row * (N_BANKS * 8)]; - *((v2h *)&ptr2[addr[0] + idx_row * (N_BANKS * 8)]) = tmpb; - *((v2h *)&ptr2[addr[1] + idx_row * (N_BANKS * 8)]) = tmpa; - } - } -#else - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen >> 2U)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * (3 * (fftLen >> 2))); - for (j = core_id * 16; j < MIN(core_id * 16 + 16, fftLen >> 2U); j += 4) { - uint32_t idx0 = j; - uint32_t idx1 = j + 1; - uint32_t idx2 = j + 2; - uint32_t idx3 = j + 3; - uint32_t idx_result0 = 0; - uint32_t idx_result1 = 0; - uint32_t idx_result2 = 0; - uint32_t idx_result3 = 0; - for (k = 0; k < LOG2; k++) { - idx_result0 = (idx_result0 << 1U) | (idx0 & 1U); - idx_result1 = (idx_result1 << 1U) | (idx1 & 1U); - idx_result2 = (idx_result2 << 1U) | (idx2 & 1U); - idx_result3 = (idx_result3 << 1U) | (idx3 & 1U); - idx0 = idx0 >> 1U; - idx1 = idx1 >> 1U; - idx2 = idx2 >> 1U; - idx3 = idx3 >> 1U; - } - for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { - uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS; - uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS; - uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS; - uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS; - uint32_t addr_dst0 = idx_result0; - uint32_t addr_dst1 = idx_result1; - uint32_t addr_dst2 = idx_result2; - uint32_t addr_dst3 = idx_result3; - addr_src0 += idx_row * (N_BANKS * 8); - addr_src1 += idx_row * (N_BANKS * 8); - addr_src2 += idx_row * (N_BANKS * 8); - addr_src3 += idx_row * (N_BANKS * 8); - addr_dst0 += idx_row * (N_BANKS * 8); - addr_dst1 += idx_row * (N_BANKS * 8); - addr_dst2 += idx_row * (N_BANKS * 8); - addr_dst3 += idx_row * (N_BANKS * 8); - *((uint32_t *)&ptr2[addr_dst0]) = (uint32_t)ptr1[addr_src0]; - *((uint32_t *)&ptr2[addr_dst1]) = (uint32_t)ptr1[addr_src1]; - *((uint32_t *)&ptr2[addr_dst2]) = (uint32_t)ptr1[addr_src2]; - *((uint32_t *)&ptr2[addr_dst3]) = (uint32_t)ptr1[addr_src3]; - } - } -#endif - } - mempool_log_partial_barrier(2, absolute_core_id, nPE); -} diff --git a/software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h b/software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h deleted file mode 100644 index 32f7a5265..000000000 --- a/software/kernels/baremetal/mempool_cfft_radix4_q16_bitreversal.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -void mempool_bitrev_q16p_xpulpimg(uint16_t *pSrc, uint16_t *pDst, - const uint16_t fftLen, const uint32_t nPE) { - uint32_t absolute_core_id = mempool_get_core_id(); - uint32_t core_id = absolute_core_id / WU_STRIDE; - uint32_t idx_result, idx, i, j; - for (i = core_id; i < fftLen; i += nPE) { - idx_result = 0; - idx = i; - for (j = 0; j < LOG2; j++) { - idx_result = (idx_result << 1U) | (idx & 1U); - idx = idx >> 1U; - } - pDst[2 * idx_result] = pSrc[2 * i]; - pDst[2 * idx_result + 1] = pSrc[2 * i + 1]; - } - mempool_log_partial_barrier(2 * WU_STRIDE, absolute_core_id, nPE * WU_STRIDE); -} diff --git a/software/kernels/baremetal/mempool_chest_f16.h b/software/kernels/baremetal/mempool_chest_f16.h new file mode 100644 index 000000000..7d53afc65 --- /dev/null +++ b/software/kernels/baremetal/mempool_chest_f16.h @@ -0,0 +1,382 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Author: Marco Bertuletti, ETH Zurich + +#pragma once +#define __CDOTP +#define __MUL + +/* a[i] = ar[i] + i * ai[j] + + out[i][j] = a[i] / c[j] + out[i][j + 1] = a[i] / c[j + 1h + out[i][j + 2] = a[i] / c[j + 2] + out[i][j + 3] = a[i] / c[j + 3]*/ + +#ifdef __XDIVSQRT +#define DIV_LOOP(ab, ab_n, i) \ + { \ + re0 = 0; \ + re1 = 0; \ + re2 = 0; \ + re3 = 0; \ + im0 = 0; \ + im1 = 0; \ + im2 = 0; \ + im3 = 0; \ + D0 = 0; \ + D1 = 0; \ + D2 = 0; \ + D3 = 0; \ + cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ + cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ + cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ + cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ + asm volatile("vfdotpex.s.h %[D0], %[cd0], %[cd0];" \ + "vfdotpex.s.h %[D1], %[cd1], %[cd1];" \ + "vfdotpex.s.h %[D2], %[cd2], %[cd2];" \ + "vfdotpex.s.h %[D3], %[cd3], %[cd3];" \ + "vfdotpex.s.h %[re0], %[x], %[cd0];" \ + "vfdotpex.s.h %[re1], %[x], %[cd1];" \ + "vfdotpex.s.h %[re2], %[x], %[cd2];" \ + "vfdotpex.s.h %[re3], %[x], %[cd3];" \ + "vfdotpex.s.h %[im0], %[y], %[cd0];" \ + "vfdotpex.s.h %[im1], %[y], %[cd1];" \ + "vfdotpex.s.h %[im2], %[y], %[cd2];" \ + "vfdotpex.s.h %[im3], %[y], %[cd3];" \ + "fdiv.s %[re0], %[re0], %[D0];" \ + "fdiv.s %[re1], %[re1], %[D1];" \ + "fdiv.s %[re2], %[re2], %[D2];" \ + "fdiv.s %[re3], %[re3], %[D3];" \ + "fdiv.s %[im0], %[im0], %[D0];" \ + "fdiv.s %[im1], %[im1], %[D1];" \ + "fdiv.s %[im2], %[im2], %[D2];" \ + "fdiv.s %[im3], %[im3], %[D3];" \ + "vfcpka.h.s %[re0], %[re0], %[im0];" \ + "vfcpka.h.s %[re1], %[re1], %[im1];" \ + "vfcpka.h.s %[re2], %[re2], %[im2];" \ + "vfcpka.h.s %[re3], %[re3], %[im3];" \ + : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), \ + [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1), \ + [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0), \ + [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) \ + : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ + [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ + :); \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3; \ + } +#else +#define DIV_LOOP(ab, ab_n, i) \ + { \ + re0 = 0; \ + re1 = 0; \ + re2 = 0; \ + re3 = 0; \ + im0 = 0; \ + im1 = 0; \ + im2 = 0; \ + im3 = 0; \ + D0 = 0; \ + D1 = 0; \ + D2 = 0; \ + D3 = 0; \ + cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ + cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ + cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ + cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ + asm volatile("vfdotpex.s.h %[D0], %[cd0], %[cd0];" \ + "vfdotpex.s.h %[D1], %[cd1], %[cd1];" \ + "vfdotpex.s.h %[D2], %[cd2], %[cd2];" \ + "vfdotpex.s.h %[D3], %[cd3], %[cd3];" \ + "vfdotpex.s.h %[re0], %[x], %[cd0];" \ + "vfdotpex.s.h %[re1], %[x], %[cd1];" \ + "vfdotpex.s.h %[re2], %[x], %[cd2];" \ + "vfdotpex.s.h %[re3], %[x], %[cd3];" \ + "vfdotpex.s.h %[im0], %[y], %[cd0];" \ + "vfdotpex.s.h %[im1], %[y], %[cd1];" \ + "vfdotpex.s.h %[im2], %[y], %[cd2];" \ + "vfdotpex.s.h %[im3], %[y], %[cd3];" \ + : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), \ + [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1), \ + [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0), \ + [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) \ + : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ + [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ + :); \ + re0 = re0 / D0; \ + re1 = re1 / D1; \ + re2 = re2 / D2; \ + re3 = re3 / D3; \ + im0 = im0 / D0; \ + im1 = im1 / D1; \ + im2 = im2 / D2; \ + im3 = im3 / D3; \ + asm volatile("vfcpka.h.s %[re0], %[re0], %[im0];" \ + "vfcpka.h.s %[re1], %[re1], %[im1];" \ + "vfcpka.h.s %[re2], %[re2], %[im2];" \ + "vfcpka.h.s %[re3], %[re3], %[im3];" \ + : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), \ + [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1), \ + [im2] "+&r"(im2), [im3] "+&r"(im3) \ + : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ + [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ + :); \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3; \ + } +#endif + +/* a[i] = ar[i] + i * ai[j] + + out[i][j] = a[i] * c[j] + out[i][j + 1] = a[i] * c[j + 1] + out[i][j + 2] = a[i] * c[j + 2] + out[i][j + 3] = a[i] * c[j + 3]*/ + +#define MUL_LOOP(ab, ab_n, i) \ + { \ + re0 = 0; \ + re1 = 0; \ + re2 = 0; \ + re3 = 0; \ + im0 = 0; \ + im1 = 0; \ + im2 = 0; \ + im3 = 0; \ + cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ + cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ + cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ + cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ + asm volatile("vfdotpex.s.h %[re0], %[x], %[cd0];" \ + "vfdotpex.s.h %[re1], %[x], %[cd1];" \ + "vfdotpex.s.h %[re2], %[x], %[cd2];" \ + "vfdotpex.s.h %[re3], %[x], %[cd3];" \ + "vfdotpex.s.h %[im0], %[y], %[cd0];" \ + "vfdotpex.s.h %[im1], %[y], %[cd1];" \ + "vfdotpex.s.h %[im2], %[y], %[cd2];" \ + "vfdotpex.s.h %[im3], %[y], %[cd3];" \ + : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), \ + [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1), \ + [im2] "+&r"(im2), [im3] "+&r"(im3) \ + : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ + [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ + :); \ + asm volatile( \ + "vfcpka.h.s %[re0], %[re0], %[im0];" \ + "vfcpka.h.s %[re1], %[re1], %[im1];" \ + "vfcpka.h.s %[re2], %[re2], %[im2];" \ + "vfcpka.h.s %[re3], %[re3], %[im3];" \ + : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), \ + [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1), \ + [im2] "+&r"(im2), [im3] "+&r"(im3) \ + : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3) \ + :); \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3; \ + } + +#define CMUL_LOOP(ab, i) \ + { \ + sum0 = 0; \ + sum1 = 0; \ + sum2 = 0; \ + sum3 = 0; \ + cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ + cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ + cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ + cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ + asm volatile("fcdotpex.s.h %[sum0], %[x], %[cd0];" \ + "fcdotpex.s.h %[sum1], %[x], %[cd1];" \ + "fcdotpex.s.h %[sum2], %[x], %[cd2];" \ + "fcdotpex.s.h %[sum3], %[x], %[cd3];" \ + : [sum0] "+&r"(sum0), [sum1] "+&r"(sum1), [sum2] "+&r"(sum2), \ + [sum3] "+&r"(sum3) \ + : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ + [cd3] "r"(cd3), [x] "r"(ab) \ + :); \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = sum0; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = sum1; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = sum2; \ + *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = sum3; \ + } + +#define SHUFFLE_A \ + { \ + asm volatile( \ + "xor %[ab_n0], %[ab0], %[neg_mask];" \ + "xor %[ab_n1], %[ab1], %[neg_mask];" \ + "xor %[ab_n2], %[ab2], %[neg_mask];" \ + "xor %[ab_n3], %[ab3], %[neg_mask];" \ + "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];" \ + "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];" \ + "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];" \ + "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];" \ + : [ab_n0] "+&r"(ab_n0), [ab_n1] "+&r"(ab_n1), [ab_n2] "+&r"(ab_n2), \ + [ab_n3] "+&r"(ab_n3) \ + : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3), \ + [neg_mask] "r"(0x00008000), [mask] "r"(0x00020003) \ + :); \ + } + +/** + @brief Block-type channel estimation. + @param[in] pH points to output channel + @param[in] pPilotRX points to received symbol + @param[in] pPilotTX points to sent pilot + @param[in] nTX Number of transmitters + @param[in] nRX Number of receivers + @param[in] nSc Number of Subcarriers + @return none +*/ +void mempool_chest_f16s_unrolled4(__fp16 *pH, __fp16 *pPilotRX, + __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX, + uint32_t nSc) { + + uint32_t ab0, ab1, ab2, ab3; + uint32_t cd0, cd1, cd2, cd3; + uint32_t re0, re1, re2, re3; + uint32_t im0, im1, im2, im3; + uint32_t D0, D1, D2, D3; + uint32_t ab_n0, ab_n1, ab_n2, ab_n3; + __fp16 *pPilotTX_itr; + __fp16 *pPilotRX_itr; + __fp16 *pH_itr; + + for (uint32_t k = 0; k < nSc; k++) { + pPilotTX_itr = pPilotTX + k * (2 * nTX); + pPilotRX_itr = pPilotRX + k * (2 * nRX); + pH_itr = pH + k * 2 * (nTX * nRX); + for (uint32_t i = 0; i < nRX; i++) { + ab0 = *(uint32_t *)&pPilotRX_itr[2U * i]; + ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)]; + ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)]; + ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)]; + SHUFFLE_A; + for (uint32_t j = 0; j < nTX; j += 4) { + DIV_LOOP(ab0, ab_n0, i); + DIV_LOOP(ab1, ab_n1, i + 1); + DIV_LOOP(ab2, ab_n2, i + 2); + DIV_LOOP(ab3, ab_n3, i + 3); + } + } + } + return; +} + +/** + @brief Block-type channel estimation. + @param[in] pH points to output channel + @param[in] pPilotRX points to received symbol + @param[in] pPilotTX points to sent pilot + @param[in] nTX Number of transmitters + @param[in] nRX Number of receivers + @param[in] nSc Number of Subcarriers + @param[in] core_id ID of the PE + @param[in] nPE Number of PEs + @return none +*/ +void mempool_chest_f16p_unrolled4(__fp16 *pH, __fp16 *pPilotRX, + __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX, + uint32_t nSc, uint32_t core_id, + uint32_t nPE) { + uint32_t ab0, ab1, ab2, ab3; + uint32_t cd0, cd1, cd2, cd3; +#ifndef __CDOTP + uint32_t ab_n0, ab_n1, ab_n2, ab_n3; + uint32_t re0, re1, re2, re3; + uint32_t im0, im1, im2, im3; +#else + uint32_t sum0, sum1, sum2, sum3; +#endif + +#ifndef __MUL + uint32_t D0, D1, D2, D3; +#endif + + __fp16 *pPilotTX_itr; + __fp16 *pPilotRX_itr; + __fp16 *pH_itr; + + for (uint32_t k = core_id; k < nSc; k += nPE) { + pPilotTX_itr = pPilotTX + k * (2 * nTX); + pPilotRX_itr = pPilotRX + k * (2 * nRX); + pH_itr = pH + k * 2 * (nTX * nRX); + for (uint32_t i = 0; i < nRX; i += 4) { + ab0 = *(uint32_t *)&pPilotRX_itr[2U * i]; + ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)]; + ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)]; + ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)]; +#ifndef __CDOTP + SHUFFLE_A; +#endif + + for (uint32_t j = 0; j < nTX; j += 4) { +#if (defined(__CDOTP) && defined(__MUL)) + CMUL_LOOP(ab0, i); + CMUL_LOOP(ab1, i + 1); + CMUL_LOOP(ab2, i + 2); + CMUL_LOOP(ab3, i + 3); +#elif (!defined(__CDOTP) && defined(__MUL)) + MUL_LOOP(ab0, ab_n0, i); + MUL_LOOP(ab1, ab_n1, i + 1); + MUL_LOOP(ab2, ab_n2, i + 2); + MUL_LOOP(ab3, ab_n3, i + 3); +#else + DIV_LOOP(ab0, ab_n0, i) + DIV_LOOP(ab1, ab_n1, i + 1) + DIV_LOOP(ab2, ab_n2, i + 2) + DIV_LOOP(ab3, ab_n3, i + 3) +#endif + } + } + } + return; +} + +void mempool_chest_f16p_unrolled4_local(__fp16 *volatile pH, + __fp16 *volatile pPilotRX, + __fp16 *volatile pPilotTX, uint32_t nRX, + uint32_t nTX, uint32_t nSc, + uint32_t core_id, uint32_t nPE) { + uint32_t ab0, ab1, ab2, ab3; + uint32_t cd0, cd1, cd2, cd3; + uint32_t sum0, sum1, sum2, sum3; + __fp16 *pPilotTX_itr; + __fp16 *pPilotRX_itr; + __fp16 *pH_itr; + uint32_t itr, i, j; + + // Cores Loop over the received pilots vector + for (itr = core_id * 4; itr < (nSc * nRX); + itr += (BANKING_FACTOR * NUM_CORES)) { + // Received pilots are aligned to cores + uint32_t sc_RX = itr / nRX; + pPilotTX_itr = pPilotTX + sc_RX * (2 * nTX); + pPilotRX_itr = pPilotRX + sc_RX * (2 * nRX); + pH_itr = pH + sc_RX * 2 * (nTX * nRX); + + // Load received pilots + i = itr % nRX; + ab0 = *(uint32_t *)&pPilotRX_itr[2U * i]; + ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)]; + ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)]; + ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)]; + for (j = 0; j < nTX; j += 4) { + CMUL_LOOP(ab0, i); + CMUL_LOOP(ab1, i + 1); + CMUL_LOOP(ab2, i + 2); + CMUL_LOOP(ab3, i + 3); + } + } + mempool_barrier(nPE); + return; +} diff --git a/software/kernels/baremetal/mempool_chest_f16p.h b/software/kernels/baremetal/mempool_chest_f16p.h deleted file mode 100644 index 835b26237..000000000 --- a/software/kernels/baremetal/mempool_chest_f16p.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -// Includes inner loop -#include "mempool_chest_f16s.h" - -/** - @brief Block-type channel estimation. - @param[in] pH points to output channel - @param[in] pPilotRX points to received symbol - @param[in] pPilotTX points to sent pilot - @param[in] nTX Number of transmitters - @param[in] nRX Number of receivers - @param[in] nSc Number of Subcarriers - @return none -*/ -void mempool_chest_f16p_unrolled4(__fp16 *pH, __fp16 *pPilotRX, - __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX, - uint32_t nSc, uint32_t core_id, - uint32_t nPE) { - uint32_t ab0, ab1, ab2, ab3; - uint32_t ab_n0, ab_n1, ab_n2, ab_n3; - __fp16 *pTX; - __fp16 *pRX; - __fp16 *pOut; - for (uint32_t k = core_id; k < nSc; k += nPE) { - pTX = pPilotTX + k * (2 * nTX); - pRX = pPilotRX + k * (2 * nRX); - pOut = pH + k * 2 * (nTX * nRX); - for (uint32_t i = 0; i < nRX; i += 4) { - ab0 = *(uint32_t *)&pRX[2U * i]; - ab1 = *(uint32_t *)&pRX[2U * (i + 1)]; - ab2 = *(uint32_t *)&pRX[2U * (i + 2)]; - ab3 = *(uint32_t *)&pRX[2U * (i + 3)]; - asm volatile( - "xor %[ab_n0], %[ab0], %[neg_mask];" - "xor %[ab_n1], %[ab1], %[neg_mask];" - "xor %[ab_n2], %[ab2], %[neg_mask];" - "xor %[ab_n3], %[ab3], %[neg_mask];" - "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];" - "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];" - "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];" - "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];" - : [ab_n0] "=&r"(ab_n0), [ab_n1] "=&r"(ab_n1), [ab_n2] "=&r"(ab_n2), - [ab_n3] "=&r"(ab_n3) - : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3), - [neg_mask] "r"(0x00008000), [mask] "r"(0x00020001) - :); - for (uint32_t j = 0; j < nTX; j += 4) { - chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab0, ab_n0, i, j); - chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab1, ab_n1, i + 1, j); - chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab2, ab_n2, i + 2, j); - chest_unrolled4_inner_loop_f16(pTX, pOut, nTX, ab3, ab_n3, i + 3, j); - } - } - } - mempool_log_partial_barrier(2, core_id, nPE); - return; -} diff --git a/software/kernels/baremetal/mempool_chest_f16s.h b/software/kernels/baremetal/mempool_chest_f16s.h deleted file mode 100644 index 4830ebc6f..000000000 --- a/software/kernels/baremetal/mempool_chest_f16s.h +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#pragma once - -#ifdef __XDIVSQRT - -/* a[i] = ar[i] + i * ai[j] - - out[i][j] = a[i] / c[j] - out[i][j + 1] = a[i] / c[j + 1h - out[i][j + 2] = a[i] / c[j + 2] - out[i][j + 3] = a[i] / c[j + 3]*/ - -static inline void chest_unrolled4_inner_loop_f16(__fp16 *pPilotTX, __fp16 *pH, - uint32_t nTX, uint32_t ab, - uint32_t ab_n, uint32_t i, - uint32_t j) { - - uint32_t cd0, cd1, cd2, cd3; - float re0 = 0.0f, re1 = 0.0f, re2 = 0.0f, re3 = 0.0f; - float im0 = 0.0f, im1 = 0.0f, im2 = 0.0f, im3 = 0.0f; - float D0 = 0.0f, D1 = 0.0f, D2 = 0.0f, D3 = 0.0f; - cd0 = *(uint32_t *)&pPilotTX[2U * j]; - cd1 = *(uint32_t *)&pPilotTX[2U * (j + 1)]; - cd2 = *(uint32_t *)&pPilotTX[2U * (j + 2)]; - cd3 = *(uint32_t *)&pPilotTX[2U * (j + 3)]; - asm volatile( - // Compute denominator - "vfdotpex.s.h %[D0], %[cd0], %[cd0];" - "vfdotpex.s.h %[D1], %[cd1], %[cd1];" - "vfdotpex.s.h %[D2], %[cd2], %[cd2];" - "vfdotpex.s.h %[D3], %[cd3], %[cd3];" - // Compute numerator - "vfdotpex.s.h %[re0], %[ab], %[cd0];" - "vfdotpex.s.h %[re1], %[ab], %[cd1];" - "vfdotpex.s.h %[re2], %[ab], %[cd2];" - "vfdotpex.s.h %[re3], %[ab], %[cd3];" - "vfdotpex.s.h %[im0], %[ab_n], %[cd0];" - "vfdotpex.s.h %[im1], %[ab_n], %[cd1];" - "vfdotpex.s.h %[im2], %[ab_n], %[cd2];" - "vfdotpex.s.h %[im3], %[ab_n], %[cd3];" - "fdiv.s %[re0], %[re0], %[D0];" - "fdiv.s %[re1], %[re1], %[D1];" - "fdiv.s %[re2], %[re2], %[D2];" - "fdiv.s %[re3], %[re3], %[D3];" - "fdiv.s %[im0], %[im0], %[D0];" - "fdiv.s %[im1], %[im1], %[D1];" - "fdiv.s %[im2], %[im2], %[D2];" - "fdiv.s %[im3], %[im3], %[D3];" - // Pack in 32b word - "vfcpka.h.s %[re0], %[re0], %[im0];" - "vfcpka.h.s %[re1], %[re1], %[im1];" - "vfcpka.h.s %[re2], %[re2], %[im2];" - "vfcpka.h.s %[re3], %[re3], %[im3];" - : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), [D3] "+&r"(D3), - [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), [re3] "+&r"(re3), - [im0] "+&r"(im0), [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3), - [ab] "r"(ab), [ab_n] "r"(ab_n) - :); - *((uint32_t *)&pH[2 * (i * nTX + j)]) = *(uint32_t *)&re0; - *((uint32_t *)&pH[2 * (i * nTX + j + 1)]) = *(uint32_t *)&re1; - *((uint32_t *)&pH[2 * (i * nTX + j + 2)]) = *(uint32_t *)&re2; - *((uint32_t *)&pH[2 * (i * nTX + j + 3)]) = *(uint32_t *)&re3; - return; -} - -#else - -/* a[i] = ar[i] + i * ai[j] - - out[i][j] = a[i] / c[j] - out[i][j + 1] = a[i] / c[j + 1h - out[i][j + 2] = a[i] / c[j + 2] - out[i][j + 3] = a[i] / c[j + 3]*/ - -static inline void chest_unrolled4_inner_loop_f16(__fp16 *pPilotTX, __fp16 *pH, - uint32_t nTX, uint32_t ab, - uint32_t ab_n, uint32_t i, - uint32_t j) { - - uint32_t cd0, cd1, cd2, cd3; - float re0 = 0.0f, re1 = 0.0f, re2 = 0.0f, re3 = 0.0f; - float im0 = 0.0f, im1 = 0.0f, im2 = 0.0f, im3 = 0.0f; - float D0 = 0.0f, D1 = 0.0f, D2 = 0.0f, D3 = 0.0f; - cd0 = *(uint32_t *)&pPilotTX[2U * j]; - cd1 = *(uint32_t *)&pPilotTX[2U * (j + 1)]; - cd2 = *(uint32_t *)&pPilotTX[2U * (j + 2)]; - cd3 = *(uint32_t *)&pPilotTX[2U * (j + 3)]; - asm volatile( - // Compute denominator - "vfdotpex.s.h %[D0], %[cd0], %[cd0];" - "vfdotpex.s.h %[D1], %[cd1], %[cd1];" - "vfdotpex.s.h %[D2], %[cd2], %[cd2];" - "vfdotpex.s.h %[D3], %[cd3], %[cd3];" - // Compute numerator - "vfdotpex.s.h %[re0], %[ab], %[cd0];" - "vfdotpex.s.h %[re1], %[ab], %[cd1];" - "vfdotpex.s.h %[re2], %[ab], %[cd2];" - "vfdotpex.s.h %[re3], %[ab], %[cd3];" - "vfdotpex.s.h %[im0], %[ab_n], %[cd0];" - "vfdotpex.s.h %[im1], %[ab_n], %[cd1];" - "vfdotpex.s.h %[im2], %[ab_n], %[cd2];" - "vfdotpex.s.h %[im3], %[ab_n], %[cd3];" - : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), [D3] "+&r"(D3), - [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), [re3] "+&r"(re3), - [im0] "+&r"(im0), [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3), - [ab] "r"(ab), [ab_n] "r"(ab_n) - :); - re0 = re0 / D0; - re1 = re1 / D1; - re2 = re2 / D2; - re3 = re3 / D3; - im0 = im0 / D0; - im1 = im1 / D1; - im2 = im2 / D2; - im3 = im3 / D3; - asm volatile( - // Pack in 32b word - "vfcpka.h.s %[re0], %[re0], %[im0];" - "vfcpka.h.s %[re1], %[re1], %[im1];" - "vfcpka.h.s %[re2], %[re2], %[im2];" - "vfcpka.h.s %[re3], %[re3], %[im3];" - : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), [D3] "+&r"(D3), - [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), [re3] "+&r"(re3), - [im0] "+&r"(im0), [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3), - [ab] "r"(ab), [ab_n] "r"(ab_n) - :); - - *((uint32_t *)&pH[2 * (i * nTX + j)]) = *(uint32_t *)&re0; - *((uint32_t *)&pH[2 * (i * nTX + j + 1)]) = *(uint32_t *)&re1; - *((uint32_t *)&pH[2 * (i * nTX + j + 2)]) = *(uint32_t *)&re2; - *((uint32_t *)&pH[2 * (i * nTX + j + 3)]) = *(uint32_t *)&re3; - return; -} - -#endif - -/** - @brief Block-type channel estimation. - @param[in] pH points to output channel - @param[in] pPilotRX points to received symbol - @param[in] pPilotTX points to sent pilot - @param[in] nTX Number of transmitters - @param[in] nRX Number of receivers - @param[in] nSc Number of Subcarriers - @return none -*/ -void mempool_chest_f16s_unrolled4(__fp16 *pH, __fp16 *pPilotRX, - __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX, - uint32_t nSc) { - - uint32_t ab0, ab1, ab2, ab3; - uint32_t ab_n0, ab_n1, ab_n2, ab_n3; - - for (uint32_t k = 0; k < nSc; k++) { - for (uint32_t i = 0; i < nRX; i++) { - ab0 = *(uint32_t *)&pPilotRX[2U * i]; - ab1 = *(uint32_t *)&pPilotRX[2U * (i + 1)]; - ab2 = *(uint32_t *)&pPilotRX[2U * (i + 2)]; - ab3 = *(uint32_t *)&pPilotRX[2U * (i + 3)]; - asm volatile( - "xor %[ab_n0], %[ab0], %[neg_mask];" - "xor %[ab_n1], %[ab1], %[neg_mask];" - "xor %[ab_n2], %[ab2], %[neg_mask];" - "xor %[ab_n3], %[ab3], %[neg_mask];" - "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];" - "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];" - "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];" - "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];" - : [ab_n0] "+&r"(ab_n0), [ab_n1] "+&r"(ab_n1), [ab_n2] "+&r"(ab_n2), - [ab_n3] "+&r"(ab_n3) - : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3), - [neg_mask] "r"(0x00008000), [mask] "r"(0x00020003) - :); - for (uint32_t j = 0; j < nTX; j += 4) { - chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab0, ab_n0, i, j); - chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab1, ab_n1, i + 1, j); - chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab2, ab_n2, i + 2, j); - chest_unrolled4_inner_loop_f16(pPilotTX, pH, nTX, ab3, ab_n3, i + 3, j); - } - } - pPilotTX += 2 * nTX; - pPilotRX += 2 * nRX; - pH += 2 * (nTX * nRX); - } - return; -} diff --git a/software/kernels/baremetal/mempool_chest_q16.h b/software/kernels/baremetal/mempool_chest_q16.h index 914a8e09e..a91aa0097 100644 --- a/software/kernels/baremetal/mempool_chest_q16.h +++ b/software/kernels/baremetal/mempool_chest_q16.h @@ -307,7 +307,7 @@ void mempool_chest_q16p_unrolled4(int16_t *volatile pH, } } } - mempool_barrier(nPE); + mempool_log_partial_barrier(2, core_id, nPE); return; } diff --git a/software/kernels/baremetal/mempool_cholesky_f16s.h b/software/kernels/baremetal/mempool_cholesky_f16s.h index c870782da..bb6143ed7 100644 --- a/software/kernels/baremetal/mempool_cholesky_f16s.h +++ b/software/kernels/baremetal/mempool_cholesky_f16s.h @@ -6,6 +6,7 @@ // Author: Bowen Wang, ETH Zurich #pragma once +#include "builtins_v2.h" #define N_BANKS (NUM_CORES * BANKING_FACTOR) #ifdef __XDIVSQRT diff --git a/software/kernels/baremetal/mempool_cholesky_f32s.h b/software/kernels/baremetal/mempool_cholesky_f32s.h index 6cd54fe69..2bad891f5 100644 --- a/software/kernels/baremetal/mempool_cholesky_f32s.h +++ b/software/kernels/baremetal/mempool_cholesky_f32s.h @@ -165,4 +165,8 @@ void mempool_cholesky_folded_f32s(float *pSrc, float *pL, const uint32_t n) { return; } +#else + +#error "ERROR: f32 MMSE functions available only for __XDIVSQRT." + #endif diff --git a/software/runtime/kernel/mempool_cholesky_q16s.h b/software/kernels/baremetal/mempool_cholesky_q16s.h similarity index 97% rename from software/runtime/kernel/mempool_cholesky_q16s.h rename to software/kernels/baremetal/mempool_cholesky_q16s.h index 65f8e0b42..dc20a2b94 100644 --- a/software/runtime/kernel/mempool_cholesky_q16s.h +++ b/software/kernels/baremetal/mempool_cholesky_q16s.h @@ -5,7 +5,8 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#include "kernel/mempool_sqrt_q32s.h" +#include "baremetal/mempool_sqrt_q32s.h" +#include "builtins_v2.h" #define N_BANKS (NUM_CORES * BANKING_FACTOR) /** VECTORIZED CODE diff --git a/software/kernels/baremetal/mempool_cmatmul_f16.h b/software/kernels/baremetal/mempool_cmatmul_f16.h index c83443076..f795a82ba 100644 --- a/software/kernels/baremetal/mempool_cmatmul_f16.h +++ b/software/kernels/baremetal/mempool_cmatmul_f16.h @@ -14,318 +14,453 @@ #include "builtins_v2.h" #define CMATMUL_2x2_LOOP \ - float sum00_real = 0.0f; \ - float sum01_real = 0.0f; \ - float sum10_real = 0.0f; \ - float sum11_real = 0.0f; \ - float sum00_imag = 0.0f; \ - float sum01_imag = 0.0f; \ - float sum10_imag = 0.0f; \ - float sum11_imag = 0.0f; \ - v2h a00s, a01s, a10s, a11s; \ - v2h res00, res01, res10, res11; \ - for (j = 0; j < N; j += 2) { \ - v2h a00 = *(v2h *)&A[2 * ((i + 0) * N + (j + 0))]; \ - v2h a01 = *(v2h *)&A[2 * ((i + 0) * N + (j + 1))]; \ - v2h a10 = *(v2h *)&A[2 * ((i + 1) * N + (j + 0))]; \ - v2h a11 = *(v2h *)&A[2 * ((i + 1) * N + (j + 1))]; \ - v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))]; \ - v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))]; \ - v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))]; \ - v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))]; \ - asm volatile("pv.shuffle2.h %[a00s], %[a00], %[mask];" \ - "pv.shuffle2.h %[a10s], %[a10], %[mask];" \ - "pv.shuffle2.h %[a01s], %[a01], %[mask];" \ - "pv.shuffle2.h %[a11s], %[a11], %[mask];" \ - : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \ - [a11s] "=&r"(a11s) \ - : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), \ - [a11] "r"(a11), [mask] "r"(0x00020003) \ + { \ + float sum00_real = 0.0f; \ + float sum01_real = 0.0f; \ + float sum10_real = 0.0f; \ + float sum11_real = 0.0f; \ + float sum00_imag = 0.0f; \ + float sum01_imag = 0.0f; \ + float sum10_imag = 0.0f; \ + float sum11_imag = 0.0f; \ + v2h a00s, a01s, a10s, a11s; \ + v2h res00, res01, res10, res11; \ + for (j = 0; j < N; j += 2) { \ + v2h a00 = *(v2h *)&A[2 * ((i + 0) * N + (j + 0))]; \ + v2h a01 = *(v2h *)&A[2 * ((i + 0) * N + (j + 1))]; \ + v2h a10 = *(v2h *)&A[2 * ((i + 1) * N + (j + 0))]; \ + v2h a11 = *(v2h *)&A[2 * ((i + 1) * N + (j + 1))]; \ + v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))]; \ + v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))]; \ + v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))]; \ + v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))]; \ + asm volatile("pv.shuffle2.h %[a00s], %[a00], %[mask];" \ + "pv.shuffle2.h %[a10s], %[a10], %[mask];" \ + "pv.shuffle2.h %[a01s], %[a01], %[mask];" \ + "pv.shuffle2.h %[a11s], %[a11], %[mask];" \ + : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), \ + [a10s] "=&r"(a10s), [a11s] "=&r"(a11s) \ + : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), \ + [a11] "r"(a11), [mask] "r"(0x00020003) \ + :); \ + asm volatile( \ + "vfdotpex.s.h %[sum00_imag], %[a00s], %[b00];" \ + "vfdotpex.s.h %[sum10_imag], %[a10s], %[b00];" \ + "vfdotpex.s.h %[sum01_imag], %[a00s], %[b01];" \ + "vfdotpex.s.h %[sum11_imag], %[a10s], %[b01];" \ + "vfdotpex.s.h %[sum00_imag], %[a01s], %[b10];" \ + "vfdotpex.s.h %[sum10_imag], %[a11s], %[b10];" \ + "vfdotpex.s.h %[sum01_imag], %[a01s], %[b11];" \ + "vfdotpex.s.h %[sum11_imag], %[a11s], %[b11];" \ + : [sum00_imag] "+&r"(sum00_imag), [sum01_imag] "+&r"(sum01_imag), \ + [sum10_imag] "+&r"(sum10_imag), [sum11_imag] "+&r"(sum11_imag) \ + : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s), \ + [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), \ + [b11] "r"(b11) \ + :); \ + asm volatile("xor %[a00s], %[a00], %[mask];" \ + "xor %[a10s], %[a10], %[mask];" \ + "xor %[a01s], %[a01], %[mask];" \ + "xor %[a11s], %[a11], %[mask];" \ + : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), \ + [a10s] "=&r"(a10s), [a11s] "=&r"(a11s) \ + : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), \ + [a11] "r"(a11), [mask] "r"(0x00008000) \ + :); \ + asm volatile( \ + "vfdotpex.s.h %[sum00_real], %[a00s], %[b00];" \ + "vfdotpex.s.h %[sum10_real], %[a10s], %[b00];" \ + "vfdotpex.s.h %[sum01_real], %[a00s], %[b01];" \ + "vfdotpex.s.h %[sum11_real], %[a10s], %[b01];" \ + "vfdotpex.s.h %[sum00_real], %[a01s], %[b10];" \ + "vfdotpex.s.h %[sum10_real], %[a11s], %[b10];" \ + "vfdotpex.s.h %[sum01_real], %[a01s], %[b11];" \ + "vfdotpex.s.h %[sum11_real], %[a11s], %[b11];" \ + : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real), \ + [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real) \ + : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s), \ + [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), \ + [b11] "r"(b11) \ + :); \ + } \ + asm volatile("vfcpka.h.s %[res00], %[sum00_imag], %[sum00_real];" \ + "vfcpka.h.s %[res01], %[sum01_imag], %[sum01_real];" \ + "vfcpka.h.s %[res10], %[sum10_imag], %[sum10_real];" \ + "vfcpka.h.s %[res11], %[sum11_imag], %[sum11_real];" \ + : [res00] "=r"(res00), [res01] "=r"(res01), \ + [res10] "=r"(res10), [res11] "=r"(res11) \ + : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag), \ + [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag), \ + [sum00_real] "r"(sum00_real), [sum01_real] "r"(sum01_real), \ + [sum10_real] "r"(sum10_real), [sum11_real] "r"(sum11_real) \ :); \ - asm volatile( \ - "vfdotpex.s.h %[sum00_imag], %[a00s], %[b00];" \ - "vfdotpex.s.h %[sum10_imag], %[a10s], %[b00];" \ - "vfdotpex.s.h %[sum01_imag], %[a00s], %[b01];" \ - "vfdotpex.s.h %[sum11_imag], %[a10s], %[b01];" \ - "vfdotpex.s.h %[sum00_imag], %[a01s], %[b10];" \ - "vfdotpex.s.h %[sum10_imag], %[a11s], %[b10];" \ - "vfdotpex.s.h %[sum01_imag], %[a01s], %[b11];" \ - "vfdotpex.s.h %[sum11_imag], %[a11s], %[b11];" \ - : [sum00_imag] "+&r"(sum00_imag), [sum01_imag] "+&r"(sum01_imag), \ - [sum10_imag] "+&r"(sum10_imag), [sum11_imag] "+&r"(sum11_imag) \ - : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s), \ - [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), \ - [b11] "r"(b11) \ - :); \ - asm volatile("xor %[a00s], %[a00], %[mask];" \ - "xor %[a10s], %[a10], %[mask];" \ - "xor %[a01s], %[a01], %[mask];" \ - "xor %[a11s], %[a11], %[mask];" \ - : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \ - [a11s] "=&r"(a11s) \ - : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), \ - [a11] "r"(a11), [mask] "r"(0x00008000) \ - :); \ - asm volatile( \ - "vfdotpex.s.h %[sum00_real], %[a00s], %[b00];" \ - "vfdotpex.s.h %[sum10_real], %[a10s], %[b00];" \ - "vfdotpex.s.h %[sum01_real], %[a00s], %[b01];" \ - "vfdotpex.s.h %[sum11_real], %[a10s], %[b01];" \ - "vfdotpex.s.h %[sum00_real], %[a01s], %[b10];" \ - "vfdotpex.s.h %[sum10_real], %[a11s], %[b10];" \ - "vfdotpex.s.h %[sum01_real], %[a01s], %[b11];" \ - "vfdotpex.s.h %[sum11_real], %[a11s], %[b11];" \ - : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real), \ - [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real) \ - : [a00s] "r"(a00s), [a01s] "r"(a01s), [a10s] "r"(a10s), \ - [a11s] "r"(a11s), [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), \ - [b11] "r"(b11) \ - :); \ - } \ - asm volatile("vfcpka.h.s %[res00], %[sum00_imag], %[sum00_real];" \ - "vfcpka.h.s %[res01], %[sum01_imag], %[sum01_real];" \ - "vfcpka.h.s %[res10], %[sum10_imag], %[sum10_real];" \ - "vfcpka.h.s %[res11], %[sum11_imag], %[sum11_real];" \ - : [res00] "=r"(res00), [res01] "=r"(res01), \ - [res10] "=r"(res10), [res11] "=r"(res11) \ - : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag), \ - [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag), \ - [sum00_real] "r"(sum00_real), [sum01_real] "r"(sum01_real), \ - [sum10_real] "r"(sum10_real), [sum11_real] "r"(sum11_real) \ - :); \ - (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = res00; \ - (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = res01; \ - (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = res10; \ - (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = res11; + (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = res00; \ + (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = res01; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = res10; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = res11; \ + } #define CMATMUL_2x4_LOOP \ - float register volatile sum00_real = 0.0f; \ - float register volatile sum01_real = 0.0f; \ - float register volatile sum02_real = 0.0f; \ - float register volatile sum03_real = 0.0f; \ - float register volatile sum10_real = 0.0f; \ - float register volatile sum11_real = 0.0f; \ - float register volatile sum12_real = 0.0f; \ - float register volatile sum13_real = 0.0f; \ - float register volatile sum00_imag = 0.0f; \ - float register volatile sum01_imag = 0.0f; \ - float register volatile sum02_imag = 0.0f; \ - float register volatile sum03_imag = 0.0f; \ - float register volatile sum10_imag = 0.0f; \ - float register volatile sum11_imag = 0.0f; \ - float register volatile sum12_imag = 0.0f; \ - float register volatile sum13_imag = 0.0f; \ - v2h a00s, a01s, a10s, a11s; \ - for (j = 0; j < N; j += 2) { \ - v2h a00 = A[(i + 0) * N + (j + 0)]; \ - v2h a01 = A[(i + 0) * N + (j + 1)]; \ - v2h a10 = A[(i + 1) * N + (j + 0)]; \ - v2h a11 = A[(i + 1) * N + (j + 1)]; \ - v2h b00 = B[(j + 0) * P + (k + 0)]; \ - v2h b01 = B[(j + 0) * P + (k + 1)]; \ - v2h b02 = B[(j + 0) * P + (k + 2)]; \ - v2h b03 = B[(j + 0) * P + (k + 3)]; \ - v2h b10 = B[(j + 1) * P + (k + 0)]; \ - v2h b11 = B[(j + 1) * P + (k + 1)]; \ - v2h b12 = B[(j + 1) * P + (k + 2)]; \ - v2h b13 = B[(j + 1) * P + (k + 3)]; \ + { \ + float register volatile sum00_real = 0.0f; \ + float register volatile sum01_real = 0.0f; \ + float register volatile sum02_real = 0.0f; \ + float register volatile sum03_real = 0.0f; \ + float register volatile sum10_real = 0.0f; \ + float register volatile sum11_real = 0.0f; \ + float register volatile sum12_real = 0.0f; \ + float register volatile sum13_real = 0.0f; \ + float register volatile sum00_imag = 0.0f; \ + float register volatile sum01_imag = 0.0f; \ + float register volatile sum02_imag = 0.0f; \ + float register volatile sum03_imag = 0.0f; \ + float register volatile sum10_imag = 0.0f; \ + float register volatile sum11_imag = 0.0f; \ + float register volatile sum12_imag = 0.0f; \ + float register volatile sum13_imag = 0.0f; \ + v2h a00s, a01s, a10s, a11s; \ + for (j = 0; j < N; j += 2) { \ + v2h a00 = A[(i + 0) * N + (j + 0)]; \ + v2h a01 = A[(i + 0) * N + (j + 1)]; \ + v2h a10 = A[(i + 1) * N + (j + 0)]; \ + v2h a11 = A[(i + 1) * N + (j + 1)]; \ + v2h b00 = B[(j + 0) * P + (k + 0)]; \ + v2h b01 = B[(j + 0) * P + (k + 1)]; \ + v2h b02 = B[(j + 0) * P + (k + 2)]; \ + v2h b03 = B[(j + 0) * P + (k + 3)]; \ + v2h b10 = B[(j + 1) * P + (k + 0)]; \ + v2h b11 = B[(j + 1) * P + (k + 1)]; \ + v2h b12 = B[(j + 1) * P + (k + 2)]; \ + v2h b13 = B[(j + 1) * P + (k + 3)]; \ + asm volatile( \ + "pv.shuffle2.h %[a00s], %[a00], %[mask];" \ + "pv.shuffle2.h %[a10s], %[a10], %[mask];" \ + "pv.shuffle2.h %[a01s], %[a01], %[mask];" \ + "pv.shuffle2.h %[a11s], %[a11], %[mask];" \ + "vfdotpex.s.h %[sum00_imag], %[a00s], %[b00];" \ + "vfdotpex.s.h %[sum10_imag], %[a10s], %[b00];" \ + "vfdotpex.s.h %[sum01_imag], %[a00s], %[b01];" \ + "vfdotpex.s.h %[sum11_imag], %[a10s], %[b01];" \ + "vfdotpex.s.h %[sum02_imag], %[a00s], %[b02];" \ + "vfdotpex.s.h %[sum12_imag], %[a10s], %[b02];" \ + "vfdotpex.s.h %[sum03_imag], %[a00s], %[b03];" \ + "vfdotpex.s.h %[sum13_imag], %[a10s], %[b03];" \ + "vfdotpex.s.h %[sum00_imag], %[a01s], %[b10];" \ + "vfdotpex.s.h %[sum10_imag], %[a11s], %[b10];" \ + "vfdotpex.s.h %[sum01_imag], %[a01s], %[b11];" \ + "vfdotpex.s.h %[sum11_imag], %[a11s], %[b11];" \ + "vfdotpex.s.h %[sum02_imag], %[a01s], %[b12];" \ + "vfdotpex.s.h %[sum12_imag], %[a11s], %[b12];" \ + "vfdotpex.s.h %[sum03_imag], %[a01s], %[b13];" \ + "vfdotpex.s.h %[sum13_imag], %[a11s], %[b13];" \ + : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \ + [a11s] "=&r"(a11s), [sum00_imag] "+&r"(sum00_imag), \ + [sum01_imag] "+&r"(sum01_imag), [sum02_imag] "+&r"(sum02_imag), \ + [sum03_imag] "+&r"(sum03_imag), [sum10_imag] "+&r"(sum10_imag), \ + [sum11_imag] "+&r"(sum11_imag), [sum12_imag] "+&r"(sum12_imag), \ + [sum13_imag] "+&r"(sum13_imag) \ + : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ + [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03), \ + [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13), \ + [mask] "r"(0x00020003) \ + :); \ + asm volatile( \ + "xor %[a00s], %[a00], %[maskn];" \ + "xor %[a10s], %[a10], %[maskn];" \ + "xor %[a01s], %[a01], %[maskn];" \ + "xor %[a11s], %[a11], %[maskn];" \ + "vfdotpex.s.h %[sum00_real], %[a00s], %[b00];" \ + "vfdotpex.s.h %[sum10_real], %[a10s], %[b00];" \ + "vfdotpex.s.h %[sum01_real], %[a00s], %[b01];" \ + "vfdotpex.s.h %[sum11_real], %[a10s], %[b01];" \ + "vfdotpex.s.h %[sum02_real], %[a00s], %[b02];" \ + "vfdotpex.s.h %[sum12_real], %[a10s], %[b02];" \ + "vfdotpex.s.h %[sum03_real], %[a00s], %[b03];" \ + "vfdotpex.s.h %[sum13_real], %[a10s], %[b03];" \ + "vfdotpex.s.h %[sum00_real], %[a01s], %[b10];" \ + "vfdotpex.s.h %[sum10_real], %[a11s], %[b10];" \ + "vfdotpex.s.h %[sum01_real], %[a01s], %[b11];" \ + "vfdotpex.s.h %[sum11_real], %[a11s], %[b11];" \ + "vfdotpex.s.h %[sum02_real], %[a01s], %[b12];" \ + "vfdotpex.s.h %[sum12_real], %[a11s], %[b12];" \ + "vfdotpex.s.h %[sum03_real], %[a01s], %[b13];" \ + "vfdotpex.s.h %[sum13_real], %[a11s], %[b13];" \ + : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \ + [a11s] "=&r"(a11s), [sum00_real] "+&r"(sum00_real), \ + [sum01_real] "+&r"(sum01_real), [sum02_real] "+&r"(sum02_real), \ + [sum03_real] "+&r"(sum03_real), [sum10_real] "+&r"(sum10_real), \ + [sum11_real] "+&r"(sum11_real), [sum12_real] "+&r"(sum12_real), \ + [sum13_real] "+&r"(sum13_real) \ + : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ + [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03), \ + [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13), \ + [maskn] "r"(0x00008000) \ + :); \ + } \ asm volatile( \ - "pv.shuffle2.h %[a00s], %[a00], %[mask];" \ - "pv.shuffle2.h %[a10s], %[a10], %[mask];" \ - "pv.shuffle2.h %[a01s], %[a01], %[mask];" \ - "pv.shuffle2.h %[a11s], %[a11], %[mask];" \ - "vfdotpex.s.h %[sum00_imag], %[a00s], %[b00];" \ - "vfdotpex.s.h %[sum10_imag], %[a10s], %[b00];" \ - "vfdotpex.s.h %[sum01_imag], %[a00s], %[b01];" \ - "vfdotpex.s.h %[sum11_imag], %[a10s], %[b01];" \ - "vfdotpex.s.h %[sum02_imag], %[a00s], %[b02];" \ - "vfdotpex.s.h %[sum12_imag], %[a10s], %[b02];" \ - "vfdotpex.s.h %[sum03_imag], %[a00s], %[b03];" \ - "vfdotpex.s.h %[sum13_imag], %[a10s], %[b03];" \ - "vfdotpex.s.h %[sum00_imag], %[a01s], %[b10];" \ - "vfdotpex.s.h %[sum10_imag], %[a11s], %[b10];" \ - "vfdotpex.s.h %[sum01_imag], %[a01s], %[b11];" \ - "vfdotpex.s.h %[sum11_imag], %[a11s], %[b11];" \ - "vfdotpex.s.h %[sum02_imag], %[a01s], %[b12];" \ - "vfdotpex.s.h %[sum12_imag], %[a11s], %[b12];" \ - "vfdotpex.s.h %[sum03_imag], %[a01s], %[b13];" \ - "vfdotpex.s.h %[sum13_imag], %[a11s], %[b13];" \ - : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \ - [a11s] "=&r"(a11s), [sum00_imag] "+&r"(sum00_imag), \ - [sum01_imag] "+&r"(sum01_imag), [sum02_imag] "+&r"(sum02_imag), \ - [sum03_imag] "+&r"(sum03_imag), [sum10_imag] "+&r"(sum10_imag), \ - [sum11_imag] "+&r"(sum11_imag), [sum12_imag] "+&r"(sum12_imag), \ - [sum13_imag] "+&r"(sum13_imag) \ - : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ - [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03), \ - [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13), \ - [mask] "r"(0x00020003) \ - :); \ - asm volatile( \ - "xor %[a00s], %[a00], %[maskn];" \ - "xor %[a10s], %[a10], %[maskn];" \ - "xor %[a01s], %[a01], %[maskn];" \ - "xor %[a11s], %[a11], %[maskn];" \ - "vfdotpex.s.h %[sum00_real], %[a00s], %[b00];" \ - "vfdotpex.s.h %[sum10_real], %[a10s], %[b00];" \ - "vfdotpex.s.h %[sum01_real], %[a00s], %[b01];" \ - "vfdotpex.s.h %[sum11_real], %[a10s], %[b01];" \ - "vfdotpex.s.h %[sum02_real], %[a00s], %[b02];" \ - "vfdotpex.s.h %[sum12_real], %[a10s], %[b02];" \ - "vfdotpex.s.h %[sum03_real], %[a00s], %[b03];" \ - "vfdotpex.s.h %[sum13_real], %[a10s], %[b03];" \ - "vfdotpex.s.h %[sum00_real], %[a01s], %[b10];" \ - "vfdotpex.s.h %[sum10_real], %[a11s], %[b10];" \ - "vfdotpex.s.h %[sum01_real], %[a01s], %[b11];" \ - "vfdotpex.s.h %[sum11_real], %[a11s], %[b11];" \ - "vfdotpex.s.h %[sum02_real], %[a01s], %[b12];" \ - "vfdotpex.s.h %[sum12_real], %[a11s], %[b12];" \ - "vfdotpex.s.h %[sum03_real], %[a01s], %[b13];" \ - "vfdotpex.s.h %[sum13_real], %[a11s], %[b13];" \ - : [a00s] "=&r"(a00s), [a01s] "=&r"(a01s), [a10s] "=&r"(a10s), \ - [a11s] "=&r"(a11s), [sum00_real] "+&r"(sum00_real), \ - [sum01_real] "+&r"(sum01_real), [sum02_real] "+&r"(sum02_real), \ - [sum03_real] "+&r"(sum03_real), [sum10_real] "+&r"(sum10_real), \ - [sum11_real] "+&r"(sum11_real), [sum12_real] "+&r"(sum12_real), \ - [sum13_real] "+&r"(sum13_real) \ - : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ - [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03), \ - [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13), \ - [maskn] "r"(0x00008000) \ + "vfcpka.h.s %[sum00_real], %[sum00_imag], %[sum00_real];" \ + "vfcpka.h.s %[sum01_real], %[sum01_imag], %[sum01_real];" \ + "vfcpka.h.s %[sum02_real], %[sum02_imag], %[sum02_real];" \ + "vfcpka.h.s %[sum03_real], %[sum03_imag], %[sum03_real];" \ + "vfcpka.h.s %[sum10_real], %[sum10_imag], %[sum10_real];" \ + "vfcpka.h.s %[sum11_real], %[sum11_imag], %[sum11_real];" \ + "vfcpka.h.s %[sum12_real], %[sum12_imag], %[sum12_real];" \ + "vfcpka.h.s %[sum13_real], %[sum13_imag], %[sum13_real];" \ + : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real), \ + [sum02_real] "+&r"(sum02_real), [sum03_real] "+&r"(sum03_real), \ + [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real), \ + [sum12_real] "+&r"(sum12_real), [sum13_real] "+&r"(sum13_real) \ + : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag), \ + [sum02_imag] "r"(sum02_imag), [sum03_imag] "r"(sum03_imag), \ + [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag), \ + [sum12_imag] "r"(sum12_imag), [sum13_imag] "r"(sum13_imag) \ :); \ - } \ - asm volatile( \ - "vfcpka.h.s %[sum00_real], %[sum00_imag], %[sum00_real];" \ - "vfcpka.h.s %[sum01_real], %[sum01_imag], %[sum01_real];" \ - "vfcpka.h.s %[sum02_real], %[sum02_imag], %[sum02_real];" \ - "vfcpka.h.s %[sum03_real], %[sum03_imag], %[sum03_real];" \ - "vfcpka.h.s %[sum10_real], %[sum10_imag], %[sum10_real];" \ - "vfcpka.h.s %[sum11_real], %[sum11_imag], %[sum11_real];" \ - "vfcpka.h.s %[sum12_real], %[sum12_imag], %[sum12_real];" \ - "vfcpka.h.s %[sum13_real], %[sum13_imag], %[sum13_real];" \ - : [sum00_real] "+&r"(sum00_real), [sum01_real] "+&r"(sum01_real), \ - [sum02_real] "+&r"(sum02_real), [sum03_real] "+&r"(sum03_real), \ - [sum10_real] "+&r"(sum10_real), [sum11_real] "+&r"(sum11_real), \ - [sum12_real] "+&r"(sum12_real), [sum13_real] "+&r"(sum13_real) \ - : [sum00_imag] "r"(sum00_imag), [sum01_imag] "r"(sum01_imag), \ - [sum02_imag] "r"(sum02_imag), [sum03_imag] "r"(sum03_imag), \ - [sum10_imag] "r"(sum10_imag), [sum11_imag] "r"(sum11_imag), \ - [sum12_imag] "r"(sum12_imag), [sum13_imag] "r"(sum13_imag) \ - :); \ - C[(i + 0) * P + k + 0] = (v2h)sum00_real; \ - C[(i + 0) * P + k + 1] = (v2h)sum01_real; \ - C[(i + 0) * P + k + 2] = (v2h)sum02_real; \ - C[(i + 0) * P + k + 3] = (v2h)sum03_real; \ - C[(i + 1) * P + k + 0] = (v2h)sum10_real; \ - C[(i + 1) * P + k + 1] = (v2h)sum11_real; \ - C[(i + 1) * P + k + 2] = (v2h)sum12_real; \ - C[(i + 1) * P + k + 3] = (v2h)sum13_real; + C[(i + 0) * P + k + 0] = (v2h)sum00_real; \ + C[(i + 0) * P + k + 1] = (v2h)sum01_real; \ + C[(i + 0) * P + k + 2] = (v2h)sum02_real; \ + C[(i + 0) * P + k + 3] = (v2h)sum03_real; \ + C[(i + 1) * P + k + 0] = (v2h)sum10_real; \ + C[(i + 1) * P + k + 1] = (v2h)sum11_real; \ + C[(i + 1) * P + k + 2] = (v2h)sum12_real; \ + C[(i + 1) * P + k + 3] = (v2h)sum13_real; \ + } /**************************************************************************/ /**************************************************************************/ // COMPLEX DOTP INSTRUCTIONS #define CMATMUL_CDOTP_1x1_LOOP \ - v2h sum = (v2h)0.0f; \ - for (j = 0; j < N; j++) { \ - v2h a = *(v2h *)&A[2 * (i * M + j)]; \ - v2h b = *(v2h *)&B[2 * (j * P + k)]; \ - asm volatile("fcdotpex.s.h %[sum], %[a], %[b];" \ - : [sum] "+&r"(sum) \ - : [a] "r"(a), [b] "r"(b) \ - :); \ - } \ - (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum; + { \ + v2h sum = (v2h)0.0f; \ + for (j = 0; j < N; j++) { \ + v2h a = *(v2h *)&A[2 * (i * M + j)]; \ + v2h b = *(v2h *)&B[2 * (j * P + k)]; \ + asm volatile("fcdotpex.s.h %[sum], %[a], %[b];" \ + : [sum] "+&r"(sum) \ + : [a] "r"(a), [b] "r"(b) \ + :); \ + } \ + (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum; \ + } #define CMATMUL_CDOTP_2x2_LOOP \ - v2h sum00 = (v2h)0.0f; \ - v2h sum01 = (v2h)0.0f; \ - v2h sum10 = (v2h)0.0f; \ - v2h sum11 = (v2h)0.0f; \ - for (j = 0; j < N; j += 2) { \ - v2h a00 = *(v2h *)&A[2 * ((i + 0) * M + (j + 0))]; \ - v2h a01 = *(v2h *)&A[2 * ((i + 0) * M + (j + 1))]; \ - v2h a10 = *(v2h *)&A[2 * ((i + 1) * M + (j + 0))]; \ - v2h a11 = *(v2h *)&A[2 * ((i + 1) * M + (j + 1))]; \ - v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))]; \ - v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))]; \ - v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))]; \ - v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))]; \ - asm volatile( \ - "fcdotpex.s.h %[sum00], %[a00], %[b00];" \ - "fcdotpex.s.h %[sum10], %[a10], %[b00];" \ - "fcdotpex.s.h %[sum01], %[a00], %[b01];" \ - "fcdotpex.s.h %[sum11], %[a10], %[b01];" \ - "fcdotpex.s.h %[sum00], %[a01], %[b10];" \ - "fcdotpex.s.h %[sum10], %[a11], %[b10];" \ - "fcdotpex.s.h %[sum01], %[a01], %[b11];" \ - "fcdotpex.s.h %[sum11], %[a11], %[b11];" \ - : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum10] "+&r"(sum10), \ - [sum11] "+&r"(sum11) \ - : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ - [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), [b11] "r"(b11) \ - :); \ - } \ - (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum00; \ - (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = sum01; \ - (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = sum10; \ - (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = sum11; + { \ + v2h sum00 = (v2h)0.0f; \ + v2h sum01 = (v2h)0.0f; \ + v2h sum10 = (v2h)0.0f; \ + v2h sum11 = (v2h)0.0f; \ + for (j = 0; j < N; j += 2) { \ + v2h a00 = *(v2h *)&A[2 * ((i + 0) * M + (j + 0))]; \ + v2h a01 = *(v2h *)&A[2 * ((i + 0) * M + (j + 1))]; \ + v2h a10 = *(v2h *)&A[2 * ((i + 1) * M + (j + 0))]; \ + v2h a11 = *(v2h *)&A[2 * ((i + 1) * M + (j + 1))]; \ + v2h b00 = *(v2h *)&B[2 * ((j + 0) * P + (k + 0))]; \ + v2h b01 = *(v2h *)&B[2 * ((j + 0) * P + (k + 1))]; \ + v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + (k + 0))]; \ + v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + (k + 1))]; \ + asm volatile( \ + "fcdotpex.s.h %[sum00], %[a00], %[b00];" \ + "fcdotpex.s.h %[sum10], %[a10], %[b00];" \ + "fcdotpex.s.h %[sum01], %[a00], %[b01];" \ + "fcdotpex.s.h %[sum11], %[a10], %[b01];" \ + "fcdotpex.s.h %[sum00], %[a01], %[b10];" \ + "fcdotpex.s.h %[sum10], %[a11], %[b10];" \ + "fcdotpex.s.h %[sum01], %[a01], %[b11];" \ + "fcdotpex.s.h %[sum11], %[a11], %[b11];" \ + : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum10] "+&r"(sum10), \ + [sum11] "+&r"(sum11) \ + : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ + [b00] "r"(b00), [b01] "r"(b01), [b10] "r"(b10), [b11] "r"(b11) \ + :); \ + } \ + (*(v2h *)&C[2 * ((i + 0) * P + k + 0)]) = sum00; \ + (*(v2h *)&C[2 * ((i + 0) * P + k + 1)]) = sum01; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = sum10; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = sum11; \ + } #define CMATMUL_CDOTP_2x4_LOOP \ - v2h sum00 = (v2h)0.0f; \ - v2h sum01 = (v2h)0.0f; \ - v2h sum02 = (v2h)0.0f; \ - v2h sum03 = (v2h)0.0f; \ - v2h sum10 = (v2h)0.0f; \ - v2h sum11 = (v2h)0.0f; \ - v2h sum12 = (v2h)0.0f; \ - v2h sum13 = (v2h)0.0f; \ - for (j = 0; j < N; j += 2) { \ - v2h a00 = A[i * M + j + 0]; \ - v2h a01 = A[i * M + j + 1]; \ - v2h a10 = A[(i + 1) * M + j + 0]; \ - v2h a11 = A[(i + 1) * M + j + 1]; \ - v2h b00 = B[j * P + k + 0]; \ - v2h b01 = B[j * P + k + 1]; \ - v2h b02 = B[j * P + k + 2]; \ - v2h b03 = B[j * P + k + 3]; \ - v2h b10 = B[(j + 1) * P + k + 0]; \ - v2h b11 = B[(j + 1) * P + k + 1]; \ - v2h b12 = B[(j + 1) * P + k + 2]; \ - v2h b13 = B[(j + 1) * P + k + 3]; \ - asm volatile( \ - "fcdotpex.s.h %[sum00], %[a00], %[b00];" \ - "fcdotpex.s.h %[sum10], %[a10], %[b00];" \ - "fcdotpex.s.h %[sum01], %[a00], %[b01];" \ - "fcdotpex.s.h %[sum11], %[a10], %[b01];" \ - "fcdotpex.s.h %[sum02], %[a00], %[b02];" \ - "fcdotpex.s.h %[sum12], %[a10], %[b02];" \ - "fcdotpex.s.h %[sum03], %[a00], %[b03];" \ - "fcdotpex.s.h %[sum13], %[a10], %[b03];" \ - "fcdotpex.s.h %[sum00], %[a01], %[b10];" \ - "fcdotpex.s.h %[sum10], %[a11], %[b10];" \ - "fcdotpex.s.h %[sum01], %[a01], %[b11];" \ - "fcdotpex.s.h %[sum11], %[a11], %[b11];" \ - "fcdotpex.s.h %[sum02], %[a01], %[b12];" \ - "fcdotpex.s.h %[sum12], %[a11], %[b12];" \ - "fcdotpex.s.h %[sum03], %[a01], %[b13];" \ - "fcdotpex.s.h %[sum13], %[a11], %[b13];" \ - : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum02] "+&r"(sum02), \ - [sum03] "+&r"(sum03), [sum10] "+&r"(sum10), [sum11] "+&r"(sum11), \ - [sum12] "+&r"(sum12), [sum13] "+&r"(sum13) \ - : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ - [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03), \ - [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13) \ - :); \ - } \ - C[i * P + k + 0] = sum00; \ - C[i * P + k + 1] = sum01; \ - C[i * P + k + 2] = sum02; \ - C[i * P + k + 3] = sum03; \ - C[(i + 1) * P + k + 0] = sum10; \ - C[(i + 1) * P + k + 1] = sum11; \ - C[(i + 1) * P + k + 2] = sum12; \ - C[(i + 1) * P + k + 3] = sum13; + { \ + v2h sum00 = (v2h)0.0f; \ + v2h sum01 = (v2h)0.0f; \ + v2h sum02 = (v2h)0.0f; \ + v2h sum03 = (v2h)0.0f; \ + v2h sum10 = (v2h)0.0f; \ + v2h sum11 = (v2h)0.0f; \ + v2h sum12 = (v2h)0.0f; \ + v2h sum13 = (v2h)0.0f; \ + for (j = 0; j < N; j += 2) { \ + v2h a00 = *(v2h *)&A[2 * (i * M + j + 0)]; \ + v2h a01 = *(v2h *)&A[2 * (i * M + j + 1)]; \ + v2h a10 = *(v2h *)&A[2 * ((i + 1) * M + j + 0)]; \ + v2h a11 = *(v2h *)&A[2 * ((i + 1) * M + j + 1)]; \ + v2h b00 = *(v2h *)&B[2 * (j * P + k + 0)]; \ + v2h b01 = *(v2h *)&B[2 * (j * P + k + 1)]; \ + v2h b02 = *(v2h *)&B[2 * (j * P + k + 2)]; \ + v2h b03 = *(v2h *)&B[2 * (j * P + k + 3)]; \ + v2h b10 = *(v2h *)&B[2 * ((j + 1) * P + k + 0)]; \ + v2h b11 = *(v2h *)&B[2 * ((j + 1) * P + k + 1)]; \ + v2h b12 = *(v2h *)&B[2 * ((j + 1) * P + k + 2)]; \ + v2h b13 = *(v2h *)&B[2 * ((j + 1) * P + k + 3)]; \ + asm volatile( \ + "fcdotpex.s.h %[sum00], %[a00], %[b00];" \ + "fcdotpex.s.h %[sum10], %[a10], %[b00];" \ + "fcdotpex.s.h %[sum01], %[a00], %[b01];" \ + "fcdotpex.s.h %[sum11], %[a10], %[b01];" \ + "fcdotpex.s.h %[sum02], %[a00], %[b02];" \ + "fcdotpex.s.h %[sum12], %[a10], %[b02];" \ + "fcdotpex.s.h %[sum03], %[a00], %[b03];" \ + "fcdotpex.s.h %[sum13], %[a10], %[b03];" \ + "fcdotpex.s.h %[sum00], %[a01], %[b10];" \ + "fcdotpex.s.h %[sum10], %[a11], %[b10];" \ + "fcdotpex.s.h %[sum01], %[a01], %[b11];" \ + "fcdotpex.s.h %[sum11], %[a11], %[b11];" \ + "fcdotpex.s.h %[sum02], %[a01], %[b12];" \ + "fcdotpex.s.h %[sum12], %[a11], %[b12];" \ + "fcdotpex.s.h %[sum03], %[a01], %[b13];" \ + "fcdotpex.s.h %[sum13], %[a11], %[b13];" \ + : [sum00] "+&r"(sum00), [sum01] "+&r"(sum01), [sum02] "+&r"(sum02), \ + [sum03] "+&r"(sum03), [sum10] "+&r"(sum10), [sum11] "+&r"(sum11), \ + [sum12] "+&r"(sum12), [sum13] "+&r"(sum13) \ + : [a00] "r"(a00), [a01] "r"(a01), [a10] "r"(a10), [a11] "r"(a11), \ + [b00] "r"(b00), [b01] "r"(b01), [b02] "r"(b02), [b03] "r"(b03), \ + [b10] "r"(b10), [b11] "r"(b11), [b12] "r"(b12), [b13] "r"(b13) \ + :); \ + } \ + (*(v2h *)&C[2 * (i * P + k + 0)]) = sum00; \ + (*(v2h *)&C[2 * (i * P + k + 1)]) = sum01; \ + (*(v2h *)&C[2 * (i * P + k + 2)]) = sum02; \ + (*(v2h *)&C[2 * (i * P + k + 3)]) = sum03; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 0)]) = sum10; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 1)]) = sum11; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 2)]) = sum12; \ + (*(v2h *)&C[2 * ((i + 1) * P + k + 3)]) = sum13; \ + } + +#define CMATMUL_CDOTP_4x4_LOOP \ + { \ + int32_t const *addr_a = &A[i * N]; \ + int32_t const *addr_b = &B[j]; \ + int32_t const *end_b = &B[N * P + j]; \ + int32_t const *addr_c = &C[i * P + j]; \ + int32_t const P3 = ((int32_t)P - 3) * 4; \ + int32_t const N31 = (-3 * (int32_t)N + 1) * 4; \ + register int32_t k asm("x1") = (int32_t)end_b; \ + __asm__ volatile( \ + ".balign 16 \n\t" \ + "p.lw x3, %[N](%[addr_a]!) \n\t" \ + "p.lw x12, 4(%[addr_b]!) \n\t" \ + "p.lw x13, 4(%[addr_b]!) \n\t" \ + "p.lw x14, 4(%[addr_b]!) \n\t" \ + "p.lw x15, %[P3](%[addr_b]!) \n\t" \ + "p.lw x4, %[N](%[addr_a]!) \n\t" \ + "p.lw x10, %[N](%[addr_a]!) \n\t" \ + "p.lw x11, %[N31](%[addr_a]!) \n\t" \ + "mv x16, zero \n\t" \ + "mv x17, zero \n\t" \ + "mv x18, zero \n\t" \ + "mv x19, zero \n\t" \ + "mv x20, zero \n\t" \ + "mv x21, zero \n\t" \ + "mv x22, zero \n\t" \ + "mv x23, zero \n\t" \ + "mv x24, zero \n\t" \ + "mv x25, zero \n\t" \ + "mv x26, zero \n\t" \ + "mv x27, zero \n\t" \ + "mv x28, zero \n\t" \ + "mv x29, zero \n\t" \ + "mv x30, zero \n\t" \ + "mv x31, zero \n\t" \ + "fcdotpex.s.h x16, x3, x12 \n\t" \ + "fcdotpex.s.h x17, x3, x13 \n\t" \ + "fcdotpex.s.h x18, x3, x14 \n\t" \ + "fcdotpex.s.h x19, x3, x15 \n\t" \ + "p.lw x3, %[N](%[addr_a]!) \n\t" \ + "fcdotpex.s.h x20, x4, x12 \n\t" \ + "fcdotpex.s.h x21, x4, x13 \n\t" \ + "fcdotpex.s.h x22, x4, x14 \n\t" \ + "fcdotpex.s.h x23, x4, x15 \n\t" \ + "p.lw x4, %[N](%[addr_a]!) \n\t" \ + "fcdotpex.s.h x24, x10, x12 \n\t" \ + "fcdotpex.s.h x25, x10, x13 \n\t" \ + "fcdotpex.s.h x26, x10, x14 \n\t" \ + "fcdotpex.s.h x27, x10, x15 \n\t" \ + "p.lw x10, %[N](%[addr_a]!) \n\t" \ + "fcdotpex.s.h x28, x11, x12 \n\t" \ + "p.lw x12, 4(%[addr_b]!) \n\t" \ + "fcdotpex.s.h x29, x11, x13 \n\t" \ + "p.lw x13, 4(%[addr_b]!) \n\t" \ + "fcdotpex.s.h x30, x11, x14 \n\t" \ + "p.lw x14, 4(%[addr_b]!) \n\t" \ + "fcdotpex.s.h x31, x11, x15 \n\t" \ + "p.lw x15, %[P3](%[addr_b]!) \n\t" \ + "p.lw x11, %[N31](%[addr_a]!) \n\t" \ + "1: \n\t" \ + "fcdotpex.s.h x16, x3, x12 \n\t" \ + "fcdotpex.s.h x17, x3, x13 \n\t" \ + "fcdotpex.s.h x20, x4, x12 \n\t" \ + "fcdotpex.s.h x21, x4, x13 \n\t" \ + "fcdotpex.s.h x18, x3, x14 \n\t" \ + "fcdotpex.s.h x22, x4, x14 \n\t" \ + "fcdotpex.s.h x19, x3, x15 \n\t" \ + "p.lw x3, %[N](%[addr_a]!) \n\t" \ + "fcdotpex.s.h x23, x4, x15 \n\t" \ + "p.lw x4, %[N](%[addr_a]!) \n\t" \ + "fcdotpex.s.h x24, x10, x12 \n\t" \ + "fcdotpex.s.h x28, x11, x12 \n\t" \ + "p.lw x12, 4(%[addr_b]!) \n\t" \ + "fcdotpex.s.h x25, x10, x13 \n\t" \ + "fcdotpex.s.h x29, x11, x13 \n\t" \ + "p.lw x13, 4(%[addr_b]!) \n\t" \ + "fcdotpex.s.h x26, x10, x14 \n\t" \ + "fcdotpex.s.h x30, x11, x14 \n\t" \ + "p.lw x14, 4(%[addr_b]!) \n\t" \ + "fcdotpex.s.h x27, x10, x15 \n\t" \ + "fcdotpex.s.h x31, x11, x15 \n\t" \ + "p.lw x15, %[P3](%[addr_b]!) \n\t" \ + "p.lw x10, %[N](%[addr_a]!) \n\t" \ + "p.lw x11, %[N31](%[addr_a]!) \n\t" \ + "bne %[addr_b], x1, 1b \n\t" \ + "fcdotpex.s.h x16, x3, x12 \n\t" \ + "fcdotpex.s.h x17, x3, x13 \n\t" \ + "fcdotpex.s.h x18, x3, x14 \n\t" \ + "p.sw x16, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x19, x3, x15 \n\t" \ + "p.sw x17, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x20, x4, x12 \n\t" \ + "p.sw x18, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x21, x4, x13 \n\t" \ + "p.sw x19, %[P3](%[addr_c]!) \n\t" \ + "fcdotpex.s.h x22, x4, x14 \n\t" \ + "p.sw x20, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x23, x4, x15 \n\t" \ + "p.sw x21, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x24, x10, x12 \n\t" \ + "p.sw x22, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x25, x10, x13 \n\t" \ + "p.sw x23, %[P3](%[addr_c]!) \n\t" \ + "fcdotpex.s.h x26, x10, x14 \n\t" \ + "p.sw x24, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x27, x10, x15 \n\t" \ + "p.sw x25, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x28, x11, x12 \n\t" \ + "p.sw x26, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x29, x11, x13 \n\t" \ + "p.sw x27, %[P3](%[addr_c]!) \n\t" \ + "fcdotpex.s.h x30, x11, x14 \n\t" \ + "p.sw x28, 4(%[addr_c]!) \n\t" \ + "fcdotpex.s.h x31, x11, x15 \n\t" \ + "p.sw x29, 4(%[addr_c]!) \n\t" \ + "p.sw x30, 4(%[addr_c]!) \n\t" \ + "p.sw x31, %[P3](%[addr_c]!) \n\t" \ + : \ + [addr_a] "+&r"(addr_a), [addr_b] "+&r"(addr_b), [addr_c] "+&r"(addr_c) \ + : [N31] "r"(N31), [P3] "r"(P3), [x1] "r"(k), [N] "I"(dim_N * 4) \ + : "x3", "x4", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", \ + "x18", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", \ + "x27", "x28", "x29", "x30", "x31", "memory"); \ + } #define __CDOTP void cmatmul_2x2_f16s(__fp16 const *__restrict__ A, @@ -364,14 +499,13 @@ void cmatmul_2x2_f16p(__fp16 const *__restrict__ A, #endif } } - mempool_log_partial_barrier(2, core_id, numThreads); return; } #define __SHIFT_A -void cmatmul_2x4_f16p(v2h *__restrict__ A, v2h const *__restrict__ B, - v2h *__restrict__ C, uint32_t M, uint32_t N, uint32_t P, - uint32_t core_id, uint32_t numThreads) { +void cmatmul_2x4_f16p(__fp16 *__restrict__ A, __fp16 const *__restrict__ B, + __fp16 *__restrict__ C, uint32_t M, uint32_t N, + uint32_t P, uint32_t core_id, uint32_t numThreads) { uint32_t i = 0; // loop counter for M uint32_t j = 0; // loop counter for N uint32_t k = 0; // loop counter for P @@ -386,7 +520,7 @@ void cmatmul_2x4_f16p(v2h *__restrict__ A, v2h const *__restrict__ B, } } #else - uint32_t shift_id = 2 * (core_id % NUM_CORES_PER_TILE); + uint32_t shift_id = (2 * (core_id % NUM_CORES_PER_TILE)) % M; for (k = core_id * 4; k < P; k += 4 * numThreads) { for (i = shift_id; i < M; i += 2) { #ifdef __CDOTP @@ -404,62 +538,42 @@ void cmatmul_2x4_f16p(v2h *__restrict__ A, v2h const *__restrict__ B, } } #endif - mempool_log_partial_barrier(2, core_id, numThreads); return; } -void cmatmul_2x4_folded_f16p(v2h *A, v2h const *__restrict__ B, - v2h *__restrict__ A_folded, v2h *__restrict__ C, - uint32_t M, uint32_t N, uint32_t P, - uint32_t core_id, uint32_t numThreads) { - uint32_t i = 0; // loop counter for M - uint32_t j = 0; // loop counter for N - uint32_t k = 0; // loop counter for P - // Copy multiple A matrices in memory - uint32_t num_copy = NUM_BANKS / (N * M); - for (k = core_id * 4; k < N * M; k += 4 * numThreads) { - v2h a0 = A[k]; - v2h a1 = A[k + 1]; - v2h a2 = A[k + 2]; - v2h a3 = A[k + 3]; - i = k / N; // row_index - j = k % N; // col_index - for (uint32_t idx_copy = 0; idx_copy < num_copy; idx_copy++) { - A_folded[idx_copy * N * M + i * N + j] = a0; - A_folded[idx_copy * N * M + i * N + j + 1] = a1; - A_folded[idx_copy * N * M + i * N + j + 2] = a2; - A_folded[idx_copy * N * M + i * N + j + 3] = a3; +// 4x4 MATMUL +void cmatmul_4x4_f16p(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, int32_t *__restrict__ C, + uint32_t M, uint32_t N, uint32_t P, uint32_t id, + uint32_t numThreads) { + uint32_t shift_id = (4 * (id % NUM_CORES_PER_TILE)) % M; + for (uint32_t j = 4 * id; j < P; j += 4 * numThreads) { + for (uint32_t i = shift_id; i < M; i += 4) { + CMATMUL_CDOTP_4x4_LOOP } - } - A = A_folded + (N * M) * ((core_id * BANKING_FACTOR) / (N * M)); - mempool_log_partial_barrier(2, core_id, numThreads); - // Compute -#ifndef __SHIFT_A - for (k = core_id * 4; k < P; k += 4 * numThreads) { - for (i = 0; i < M; i += 2) { -#ifdef __CDOTP - CMATMUL_CDOTP_2x4_LOOP; -#else - CMATMUL_2x4_LOOP; -#endif + for (uint32_t i = 0; i < shift_id; i += 4) { + CMATMUL_CDOTP_4x4_LOOP } } -#else - uint32_t shift_id = 2 * (core_id % NUM_CORES_PER_TILE); - for (k = core_id * 4; k < P; k += 4 * numThreads) { - for (i = shift_id; i < M; i += 2) { - // CMATMUL_2x4_LOOP; - CMATMUL_CDOTP_2x4_LOOP; - } - for (i = 0; i < shift_id; i += 2) { -#ifdef __CDOTP - CMATMUL_CDOTP_2x4_LOOP; -#else - CMATMUL_2x4_LOOP; -#endif + mempool_log_partial_barrier(2, id, numThreads); + return; +} + +void cmatmul_4x4_folded_f16p(int32_t *A_l2, int32_t *A_folded, int32_t *const B, + int32_t *C, uint32_t M, uint32_t N, uint32_t P, + uint32_t core_id, uint32_t numThreads) { + + // Copy multiple A matrices in memory + if (core_id == 0) { + for (uint32_t idx_copy = 0; idx_copy < (BANKING_FACTOR * NUM_CORES); + idx_copy += (M * N)) { + dma_memcpy_blocking(&A_folded[idx_copy], A_l2, M * N * sizeof(int32_t)); } } -#endif + // Cores only fetch from local A + int32_t *A_shifted = A_folded; + A_shifted += (N * M) * ((core_id * BANKING_FACTOR) / (N * M)); mempool_log_partial_barrier(2, core_id, numThreads); + cmatmul_4x4_f16p(A_shifted, B, C, M, N, P, core_id, numThreads); return; } diff --git a/software/runtime/kernel/mempool_cmatmul_q16.h b/software/kernels/baremetal/mempool_cmatmul_q16.h similarity index 99% rename from software/runtime/kernel/mempool_cmatmul_q16.h rename to software/kernels/baremetal/mempool_cmatmul_q16.h index fc020619d..53b84d80c 100644 --- a/software/runtime/kernel/mempool_cmatmul_q16.h +++ b/software/kernels/baremetal/mempool_cmatmul_q16.h @@ -12,7 +12,7 @@ */ #pragma once -#include "xpulp/builtins_v2.h" +#include "builtins_v2.h" #define NUM_BANKS (NUM_CORES * BANKING_FACTOR) #define CMATMUL_1x1_LOOP \ diff --git a/software/kernels/baremetal/mempool_linearsolver_f32s.h b/software/kernels/baremetal/mempool_linearsolver_f32s.h index 18e4ec94e..c3f3b6ce1 100644 --- a/software/kernels/baremetal/mempool_linearsolver_f32s.h +++ b/software/kernels/baremetal/mempool_linearsolver_f32s.h @@ -222,4 +222,8 @@ void mempool_Lttrisol_folded_f32s(float *pL, float *in, float *x, return; } +#else + +#error "ERROR: f32 MMSE functions available only for __XDIVSQRT." + #endif diff --git a/software/runtime/kernel/mempool_linearsolver_q16s.h b/software/kernels/baremetal/mempool_linearsolver_q16s.h similarity index 100% rename from software/runtime/kernel/mempool_linearsolver_q16s.h rename to software/kernels/baremetal/mempool_linearsolver_q16s.h diff --git a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h index e639918ce..e47ff133a 100644 --- a/software/kernels/baremetal/mempool_mimo_mmse_f16s.h +++ b/software/kernels/baremetal/mempool_mimo_mmse_f16s.h @@ -6,6 +6,7 @@ // Author: Aofeng Aoshen, ETH Zurich #pragma once +#include "builtins_v2.h" #define N_BANKS (NUM_CORES * BANKING_FACTOR) /** diff --git a/software/runtime/kernel/mempool_mimo_mmse_q16s.h b/software/kernels/baremetal/mempool_mimo_mmse_q16s.h similarity index 100% rename from software/runtime/kernel/mempool_mimo_mmse_q16s.h rename to software/kernels/baremetal/mempool_mimo_mmse_q16s.h diff --git a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h similarity index 99% rename from software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h rename to software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h index 7c305b222..edf7ea735 100644 --- a/software/runtime/kernel/mempool_radix4_cfft_butterfly_f16.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h @@ -5,7 +5,7 @@ // Author: Marco Bertuletti, ETH Zurich #pragma once -#include "xpulp/builtins_v2.h" +#include "builtins_v2.h" /** @brief First butterfly stage. diff --git a/software/runtime/kernel/mempool_radix4_cfft_f16p.h b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h similarity index 72% rename from software/runtime/kernel/mempool_radix4_cfft_f16p.h rename to software/kernels/baremetal/mempool_radix4_cfft_f16p.h index 5699480fb..c82684995 100644 --- a/software/runtime/kernel/mempool_radix4_cfft_f16p.h +++ b/software/kernels/baremetal/mempool_radix4_cfft_f16p.h @@ -6,7 +6,7 @@ #pragma once #define BITREVERSETABLE -#include "xpulp/builtins_v2.h" +#include "builtins_v2.h" #define MIN(x, y) (((x) < (y)) ? (x) : (y)) // CoSi: (Si, Co) -> C: (Co, -Si) @@ -215,6 +215,10 @@ void mempool_radix4_cfft_f16p_scheduler( __fp16 t0, t1, t2, t3, t4, t5; v2h CoSi1, CoSi2, CoSi3; v2h C1, C2, C3; + __fp16 *pIn; + __fp16 *pOut; + __fp16 *pTmp; + #ifdef FOLDED_TWIDDLES uint32_t n1, n2, n2_store; uint32_t i0, k, ic, ic_store; @@ -223,7 +227,6 @@ void mempool_radix4_cfft_f16p_scheduler( uint32_t i0, k, ic; uint32_t twidCoefModifier = 1U; #endif - __fp16 *pTmp; /* FIRST STAGE */ n1 = fftLen; @@ -237,9 +240,8 @@ void mempool_radix4_cfft_f16p_scheduler( LOAD_STORE_TWIDDLEFACT; SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - __fp16 *pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; - __fp16 *pOut = - pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * fftLen; + pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_first(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -268,10 +270,8 @@ void mempool_radix4_cfft_f16p_scheduler( SHUFFLE_TWIDDLEFACT; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - __fp16 *pIn = - pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - __fp16 *pOut = - pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); radix4_butterfly_middle(pIn, pOut, i0, n2, CoSi1, CoSi2, CoSi3, C1, C2, C3); } @@ -288,10 +288,15 @@ void mempool_radix4_cfft_f16p_scheduler( /* LAST STAGE */ for (i0 = core_id * 4; i0 < MIN(core_id * 4 + 4, fftLen >> 2U); i0++) { for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - __fp16 *pIn = - pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); - __fp16 *pOut = - pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + +#if defined(BITREVERSETABLE) + uint32_t col_shift = fftLen; +#else + uint32_t col_shift = fftLen / 4; +#endif + + pIn = pSrc16 + idx_row * (N_BANKS * 8) + 2 * col_id * (fftLen / 4); + pOut = pDst16 + idx_row * (N_BANKS * 8) + 2 * col_id * col_shift; radix4_butterfly_last(pIn, pOut, i0); } } @@ -300,22 +305,19 @@ void mempool_radix4_cfft_f16p_scheduler( pDst16 = pTmp; mempool_log_partial_barrier(2, absolute_core_id, n_FFTs_COL * nPE); mempool_stop_benchmark(); - mempool_start_benchmark(); - /* BITREVERSAL */ - // Bitreversal stage stores in the sequential addresses + if (bitReverseFlag) { #ifdef BITREVERSETABLE - pSrc16 = pSrc16 + 2 * col_id * (fftLen / 4); - pDst16 = pDst16 + 2 * col_id * fftLen; + /* BITREVERSAL */ + mempool_start_benchmark(); + pIn = pSrc16 + 2 * col_id * fftLen; + uint32_t addr1, addr2, addr3, addr4; + uint32_t s2 = 0x00020002; + uint32_t tmpa1, tmpa2, tmpa3, tmpa4; + uint32_t tmpb1, tmpb2, tmpb3, tmpb4; + int32_t a1, a2, a3, a4; + int32_t b1, b2, b3, b4; for (ic = 8 * core_id; ic < bitReverseLen; ic += 8 * nPE) { - uint32_t addr1, addr2, addr3, addr4; - uint32_t tmpa1, tmpa2, tmpa3, tmpa4; - uint32_t tmpb1, tmpb2, tmpb3, tmpb4; - uint32_t a1, a2, a3, a4; - uint32_t b1, b2, b3, b4; - uint32_t a1_load, a2_load, a3_load, a4_load; - uint32_t b1_load, b2_load, b3_load, b4_load; - uint32_t s2 = 0x00020002; addr1 = *(uint32_t *)&pBitRevTable[ic]; addr2 = *(uint32_t *)&pBitRevTable[ic + 2]; addr3 = *(uint32_t *)&pBitRevTable[ic + 4]; @@ -324,67 +326,59 @@ void mempool_radix4_cfft_f16p_scheduler( "pv.sra.h %[addr2],%[addr2],%[s2];" "pv.sra.h %[addr3],%[addr3],%[s2];" "pv.sra.h %[addr4],%[addr4],%[s2];" - "pv.extract.h %[a1],%[addr1],0;" - "pv.extract.h %[a2],%[addr2],0;" - "pv.extract.h %[a3],%[addr3],0;" - "pv.extract.h %[a4],%[addr4],0;" - "pv.extract.h %[b1],%[addr1],1;" - "pv.extract.h %[b2],%[addr2],1;" - "pv.extract.h %[b3],%[addr3],1;" - "pv.extract.h %[b4],%[addr4],1;" + "pv.extract.h %[a1],%[addr1],1;" + "pv.extract.h %[a2],%[addr2],1;" + "pv.extract.h %[a3],%[addr3],1;" + "pv.extract.h %[a4],%[addr4],1;" + "pv.extract.h %[b1],%[addr1],0;" + "pv.extract.h %[b2],%[addr2],0;" + "pv.extract.h %[b3],%[addr3],0;" + "pv.extract.h %[b4],%[addr4],0;" : [a1] "=r"(a1), [a2] "=r"(a2), [a3] "=r"(a3), [a4] "=r"(a4), [b1] "=r"(b1), [b2] "=r"(b2), [b3] "=r"(b3), [b4] "=r"(b4), [addr1] "+&r"(addr1), [addr2] "+&r"(addr2), [addr3] "+&r"(addr3), [addr4] "+&r"(addr4) : [s2] "r"(s2) :); - // Compute the local addresses from the natural order ones - a1_load = (a1 % 4) * 2 * N_BANKS + 2 * (a1 / 4); - a2_load = (a2 % 4) * 2 * N_BANKS + 2 * (a2 / 4); - a3_load = (a3 % 4) * 2 * N_BANKS + 2 * (a3 / 4); - a4_load = (a4 % 4) * 2 * N_BANKS + 2 * (a4 / 4); - b1_load = (b1 % 4) * 2 * N_BANKS + 2 * (b1 / 4); - b2_load = (b2 % 4) * 2 * N_BANKS + 2 * (b2 / 4); - b3_load = (b3 % 4) * 2 * N_BANKS + 2 * (b3 / 4); - b4_load = (b4 % 4) * 2 * N_BANKS + 2 * (b4 / 4); - for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - uint16_t *ptr1 = (uint16_t *)(pSrc16 + idx_row * (N_BANKS * 8)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + idx_row * (N_BANKS * 8)); + for (uint32_t idx_row = 0; idx_row < N_FFTs_ROW; idx_row++) { + uint16_t *ptr = (uint16_t *)(pIn + idx_row * (N_BANKS * 8)); // Load at address a - tmpa1 = *(uint32_t *)&ptr1[a1_load]; - tmpa2 = *(uint32_t *)&ptr1[a2_load]; - tmpa3 = *(uint32_t *)&ptr1[a3_load]; - tmpa4 = *(uint32_t *)&ptr1[a4_load]; + tmpa1 = *(uint32_t *)&ptr[a1]; + tmpa2 = *(uint32_t *)&ptr[a2]; + tmpa3 = *(uint32_t *)&ptr[a3]; + tmpa4 = *(uint32_t *)&ptr[a4]; // Load at address b - tmpb1 = *(uint32_t *)&ptr1[b1_load]; - tmpb2 = *(uint32_t *)&ptr1[b2_load]; - tmpb3 = *(uint32_t *)&ptr1[b3_load]; - tmpb4 = *(uint32_t *)&ptr1[b4_load]; + tmpb1 = *(uint32_t *)&ptr[b1]; + tmpb2 = *(uint32_t *)&ptr[b2]; + tmpb3 = *(uint32_t *)&ptr[b3]; + tmpb4 = *(uint32_t *)&ptr[b4]; // Swap a with b - *((uint32_t *)&ptr2[b1]) = tmpa1; - *((uint32_t *)&ptr2[b2]) = tmpa2; - *((uint32_t *)&ptr2[b3]) = tmpa3; - *((uint32_t *)&ptr2[b4]) = tmpa4; + *((uint32_t *)&ptr[b1]) = tmpa1; + *((uint32_t *)&ptr[b2]) = tmpa2; + *((uint32_t *)&ptr[b3]) = tmpa3; + *((uint32_t *)&ptr[b4]) = tmpa4; // Swap b with a - *((uint32_t *)&ptr2[a1]) = tmpb1; - *((uint32_t *)&ptr2[a2]) = tmpb2; - *((uint32_t *)&ptr2[a3]) = tmpb3; - *((uint32_t *)&ptr2[a4]) = tmpb4; + *((uint32_t *)&ptr[a1]) = tmpb1; + *((uint32_t *)&ptr[a2]) = tmpb2; + *((uint32_t *)&ptr[a3]) = tmpb3; + *((uint32_t *)&ptr[a4]) = tmpb4; } } #else - uint16_t *ptr1 = (uint16_t *)(pSrc16 + 2 * col_id * (fftLen / 4)); - uint16_t *ptr2 = (uint16_t *)(pDst16 + 2 * col_id * fftLen); - for (ic = core_id * 16; ic < MIN(core_id * 16 + 16, fftLen >> 2U); - ic += 4) { - uint32_t idx0 = ic; - uint32_t idx1 = ic + 1; - uint32_t idx2 = ic + 2; - uint32_t idx3 = ic + 3; - uint32_t idx_result0 = 0; - uint32_t idx_result1 = 0; - uint32_t idx_result2 = 0; - uint32_t idx_result3 = 0; + mempool_start_benchmark(); + int16_t *ptr1; + int16_t *ptr2; + uint32_t idx0, idx1, idx2, idx3; + uint32_t idx_result0, idx_result1, idx_result2, idx_result3; + for (ic = core_id * 4; ic < fftLen; ic += nPE * 4) { + idx_result0 = 0; + idx_result1 = 0; + idx_result2 = 0; + idx_result3 = 0; + idx0 = ic; + idx1 = ic + 1; + idx2 = ic + 2; + idx3 = ic + 3; for (k = 0; k < LOG2; k++) { idx_result0 = (idx_result0 << 1U) | (idx0 & 1U); idx_result1 = (idx_result1 << 1U) | (idx1 & 1U); @@ -395,29 +389,20 @@ void mempool_radix4_cfft_f16p_scheduler( idx2 = idx2 >> 1U; idx3 = idx3 >> 1U; } + idx0 = ic / 4; + idx1 = ic / 4 + N_BANKS; + idx2 = ic / 4 + 2 * N_BANKS; + idx3 = ic / 4 + 3 * N_BANKS; for (uint32_t idx_row = 0; idx_row < n_FFTs_ROW; idx_row++) { - uint32_t addr_src0 = (idx0 / 4) + (idx0 % 4) * N_BANKS; - uint32_t addr_src1 = (idx1 / 4) + (idx1 % 4) * N_BANKS; - uint32_t addr_src2 = (idx2 / 4) + (idx2 % 4) * N_BANKS; - uint32_t addr_src3 = (idx3 / 4) + (idx3 % 4) * N_BANKS; - uint32_t addr_dst0 = idx_result0; - uint32_t addr_dst1 = idx_result1; - uint32_t addr_dst2 = idx_result2; - uint32_t addr_dst3 = idx_result3; - addr_src0 += idx_row * (N_BANKS * 8); - addr_src1 += idx_row * (N_BANKS * 8); - addr_src2 += idx_row * (N_BANKS * 8); - addr_src3 += idx_row * (N_BANKS * 8); - addr_dst0 += idx_row * (N_BANKS * 8); - addr_dst1 += idx_row * (N_BANKS * 8); - addr_dst2 += idx_row * (N_BANKS * 8); - addr_dst3 += idx_row * (N_BANKS * 8); - *((uint32_t *)&ptr2[addr_dst0]) = (uint32_t)ptr1[addr_src0]; - *((uint32_t *)&ptr2[addr_dst1]) = (uint32_t)ptr1[addr_src1]; - *((uint32_t *)&ptr2[addr_dst2]) = (uint32_t)ptr1[addr_src2]; - *((uint32_t *)&ptr2[addr_dst3]) = (uint32_t)ptr1[addr_src3]; + ptr1 = pSrc16 + 2 * col_id * (fftLen / 4) + idx_row * (N_BANKS * 8); + ptr2 = pDst16 + 2 * col_id * fftLen + idx_row * (N_BANKS * 8); + *((uint32_t *)&ptr2[2 * idx_result0]) = *((uint32_t *)&ptr1[2 * idx0]); + *((uint32_t *)&ptr2[2 * idx_result1]) = *((uint32_t *)&ptr1[2 * idx1]); + *((uint32_t *)&ptr2[2 * idx_result2]) = *((uint32_t *)&ptr1[2 * idx2]); + *((uint32_t *)&ptr2[2 * idx_result3]) = *((uint32_t *)&ptr1[2 * idx3]); } } + mempool_stop_benchmark(); #endif } mempool_log_partial_barrier(2, absolute_core_id, nPE); diff --git a/software/runtime/data/data_cfft_radix4_f16.h.tpl b/software/runtime/data/data_cfft_radix4_f16.h.tpl deleted file mode 100644 index 8b1378917..000000000 --- a/software/runtime/data/data_cfft_radix4_f16.h.tpl +++ /dev/null @@ -1 +0,0 @@ - diff --git a/software/runtime/data/data_cfft_radix4_q16.h.tpl b/software/runtime/data/data_cfft_radix4_q16.h.tpl deleted file mode 100644 index 8b1378917..000000000 --- a/software/runtime/data/data_cfft_radix4_q16.h.tpl +++ /dev/null @@ -1 +0,0 @@ - diff --git a/software/runtime/data/data_ofdm.py b/software/runtime/data/data_ofdm.py deleted file mode 100644 index 8b1378917..000000000 --- a/software/runtime/data/data_ofdm.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/software/runtime/kernel/mempool_checks.h b/software/runtime/kernel/mempool_checks.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/software/runtime/kernel/mempool_chest_f16.h b/software/runtime/kernel/mempool_chest_f16.h deleted file mode 100644 index ba99a9e3f..000000000 --- a/software/runtime/kernel/mempool_chest_f16.h +++ /dev/null @@ -1,372 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#pragma once -#define __CDOTP -#define __MUL - -/* a[i] = ar[i] + i * ai[j] - - out[i][j] = a[i] / c[j] - out[i][j + 1] = a[i] / c[j + 1h - out[i][j + 2] = a[i] / c[j + 2] - out[i][j + 3] = a[i] / c[j + 3]*/ - -#ifdef __XDIVSQRT -#define DIV_LOOP(ab, ab_n, i) \ - re0 = 0; \ - re1 = 0; \ - re2 = 0; \ - re3 = 0; \ - im0 = 0; \ - im1 = 0; \ - im2 = 0; \ - im3 = 0; \ - D0 = 0; \ - D1 = 0; \ - D2 = 0; \ - D3 = 0; \ - cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ - cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ - cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ - cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ - asm volatile("vfdotpex.s.h %[D0], %[cd0], %[cd0];" \ - "vfdotpex.s.h %[D1], %[cd1], %[cd1];" \ - "vfdotpex.s.h %[D2], %[cd2], %[cd2];" \ - "vfdotpex.s.h %[D3], %[cd3], %[cd3];" \ - "vfdotpex.s.h %[re0], %[x], %[cd0];" \ - "vfdotpex.s.h %[re1], %[x], %[cd1];" \ - "vfdotpex.s.h %[re2], %[x], %[cd2];" \ - "vfdotpex.s.h %[re3], %[x], %[cd3];" \ - "vfdotpex.s.h %[im0], %[y], %[cd0];" \ - "vfdotpex.s.h %[im1], %[y], %[cd1];" \ - "vfdotpex.s.h %[im2], %[y], %[cd2];" \ - "vfdotpex.s.h %[im3], %[y], %[cd3];" \ - "fdiv.s %[re0], %[re0], %[D0];" \ - "fdiv.s %[re1], %[re1], %[D1];" \ - "fdiv.s %[re2], %[re2], %[D2];" \ - "fdiv.s %[re3], %[re3], %[D3];" \ - "fdiv.s %[im0], %[im0], %[D0];" \ - "fdiv.s %[im1], %[im1], %[D1];" \ - "fdiv.s %[im2], %[im2], %[D2];" \ - "fdiv.s %[im3], %[im3], %[D3];" \ - "vfcpka.h.s %[re0], %[re0], %[im0];" \ - "vfcpka.h.s %[re1], %[re1], %[im1];" \ - "vfcpka.h.s %[re2], %[re2], %[im2];" \ - "vfcpka.h.s %[re3], %[re3], %[im3];" \ - : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), \ - [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1), \ - [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0), \ - [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) \ - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ - [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ - :); \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3; -#else -#define DIV_LOOP(ab, ab_n, i) \ - re0 = 0; \ - re1 = 0; \ - re2 = 0; \ - re3 = 0; \ - im0 = 0; \ - im1 = 0; \ - im2 = 0; \ - im3 = 0; \ - D0 = 0; \ - D1 = 0; \ - D2 = 0; \ - D3 = 0; \ - cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ - cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ - cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ - cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ - asm volatile("vfdotpex.s.h %[D0], %[cd0], %[cd0];" \ - "vfdotpex.s.h %[D1], %[cd1], %[cd1];" \ - "vfdotpex.s.h %[D2], %[cd2], %[cd2];" \ - "vfdotpex.s.h %[D3], %[cd3], %[cd3];" \ - "vfdotpex.s.h %[re0], %[x], %[cd0];" \ - "vfdotpex.s.h %[re1], %[x], %[cd1];" \ - "vfdotpex.s.h %[re2], %[x], %[cd2];" \ - "vfdotpex.s.h %[re3], %[x], %[cd3];" \ - "vfdotpex.s.h %[im0], %[y], %[cd0];" \ - "vfdotpex.s.h %[im1], %[y], %[cd1];" \ - "vfdotpex.s.h %[im2], %[y], %[cd2];" \ - "vfdotpex.s.h %[im3], %[y], %[cd3];" \ - : [D0] "+&r"(D0), [D1] "+&r"(D1), [D2] "+&r"(D2), \ - [D3] "+&r"(D3), [re0] "+&r"(re0), [re1] "+&r"(re1), \ - [re2] "+&r"(re2), [re3] "+&r"(re3), [im0] "+&r"(im0), \ - [im1] "+&r"(im1), [im2] "+&r"(im2), [im3] "+&r"(im3) \ - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ - [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ - :); \ - re0 = re0 / D0; \ - re1 = re1 / D1; \ - re2 = re2 / D2; \ - re3 = re3 / D3; \ - im0 = im0 / D0; \ - im1 = im1 / D1; \ - im2 = im2 / D2; \ - im3 = im3 / D3; \ - asm volatile("vfcpka.h.s %[re0], %[re0], %[im0];" \ - "vfcpka.h.s %[re1], %[re1], %[im1];" \ - "vfcpka.h.s %[re2], %[re2], %[im2];" \ - "vfcpka.h.s %[re3], %[re3], %[im3];" \ - : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), \ - [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1), \ - [im2] "+&r"(im2), [im3] "+&r"(im3) \ - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ - [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ - :); \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3; -#endif - -/* a[i] = ar[i] + i * ai[j] - - out[i][j] = a[i] * c[j] - out[i][j + 1] = a[i] * c[j + 1] - out[i][j + 2] = a[i] * c[j + 2] - out[i][j + 3] = a[i] * c[j + 3]*/ - -#define MUL_LOOP(ab, ab_n, i) \ - re0 = 0; \ - re1 = 0; \ - re2 = 0; \ - re3 = 0; \ - im0 = 0; \ - im1 = 0; \ - im2 = 0; \ - im3 = 0; \ - cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ - cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ - cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ - cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ - asm volatile("vfdotpex.s.h %[re0], %[x], %[cd0];" \ - "vfdotpex.s.h %[re1], %[x], %[cd1];" \ - "vfdotpex.s.h %[re2], %[x], %[cd2];" \ - "vfdotpex.s.h %[re3], %[x], %[cd3];" \ - "vfdotpex.s.h %[im0], %[y], %[cd0];" \ - "vfdotpex.s.h %[im1], %[y], %[cd1];" \ - "vfdotpex.s.h %[im2], %[y], %[cd2];" \ - "vfdotpex.s.h %[im3], %[y], %[cd3];" \ - : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), \ - [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1), \ - [im2] "+&r"(im2), [im3] "+&r"(im3) \ - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ - [cd3] "r"(cd3), [x] "r"(ab), [y] "r"(ab_n) \ - :); \ - asm volatile( \ - "vfcpka.h.s %[re0], %[re0], %[im0];" \ - "vfcpka.h.s %[re1], %[re1], %[im1];" \ - "vfcpka.h.s %[re2], %[re2], %[im2];" \ - "vfcpka.h.s %[re3], %[re3], %[im3];" \ - : [re0] "+&r"(re0), [re1] "+&r"(re1), [re2] "+&r"(re2), \ - [re3] "+&r"(re3), [im0] "+&r"(im0), [im1] "+&r"(im1), \ - [im2] "+&r"(im2), [im3] "+&r"(im3) \ - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), [cd3] "r"(cd3) \ - :); \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = re0; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = re1; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = re2; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = re3; - -#define CMUL_LOOP(ab, i) \ - sum0 = 0; \ - sum1 = 0; \ - sum2 = 0; \ - sum3 = 0; \ - cd0 = *(uint32_t *)&pPilotTX_itr[2U * j]; \ - cd1 = *(uint32_t *)&pPilotTX_itr[2U * (j + 1)]; \ - cd2 = *(uint32_t *)&pPilotTX_itr[2U * (j + 2)]; \ - cd3 = *(uint32_t *)&pPilotTX_itr[2U * (j + 3)]; \ - asm volatile("fcdotpex.s.h %[sum0], %[x], %[cd0];" \ - "fcdotpex.s.h %[sum1], %[x], %[cd1];" \ - "fcdotpex.s.h %[sum2], %[x], %[cd2];" \ - "fcdotpex.s.h %[sum3], %[x], %[cd3];" \ - : [sum0] "+&r"(sum0), [sum1] "+&r"(sum1), [sum2] "+&r"(sum2), \ - [sum3] "+&r"(sum3) \ - : [cd0] "r"(cd0), [cd1] "r"(cd1), [cd2] "r"(cd2), \ - [cd3] "r"(cd3), [x] "r"(ab) \ - :); \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j)]) = sum0; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 1)]) = sum1; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 2)]) = sum2; \ - *((uint32_t *)&pH_itr[2 * (i * nTX + j + 3)]) = sum3; - -#define SHUFFLE_A \ - asm volatile( \ - "xor %[ab_n0], %[ab0], %[neg_mask];" \ - "xor %[ab_n1], %[ab1], %[neg_mask];" \ - "xor %[ab_n2], %[ab2], %[neg_mask];" \ - "xor %[ab_n3], %[ab3], %[neg_mask];" \ - "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];" \ - "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];" \ - "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];" \ - "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];" \ - : [ab_n0] "+&r"(ab_n0), [ab_n1] "+&r"(ab_n1), [ab_n2] "+&r"(ab_n2), \ - [ab_n3] "+&r"(ab_n3) \ - : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3), \ - [neg_mask] "r"(0x00008000), [mask] "r"(0x00020003) \ - :); \ -/** \ - @brief Block-type channel estimation. \ - @param[in] pH points to output channel \ - @param[in] pPilotRX points to received symbol \ - @param[in] pPilotTX points to sent pilot \ - @param[in] nTX Number of transmitters \ - @param[in] nRX Number of receivers \ - @param[in] nSc Number of Subcarriers \ - @return none \ -*/ -void mempool_chest_f16s_unrolled4(__fp16 *pH, __fp16 *pPilotRX, - __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX, - uint32_t nSc) { - - uint32_t ab0, ab1, ab2, ab3; - uint32_t cd0, cd1, cd2, cd3; - uint32_t re0, re1, re2, re3; - uint32_t im0, im1, im2, im3; - uint32_t D0, D1, D2, D3; - uint32_t ab_n0, ab_n1, ab_n2, ab_n3; - __fp16 *pPilotTX_itr; - __fp16 *pPilotRX_itr; - __fp16 *pH_itr; - - for (uint32_t k = 0; k < nSc; k++) { - pPilotTX_itr = pPilotTX + k * (2 * nTX); - pPilotRX_itr = pPilotRX + k * (2 * nRX); - pH_itr = pH + k * 2 * (nTX * nRX); - for (uint32_t i = 0; i < nRX; i++) { - ab0 = *(uint32_t *)&pPilotRX_itr[2U * i]; - ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)]; - ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)]; - ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)]; - SHUFFLE_A; - for (uint32_t j = 0; j < nTX; j += 4) { - DIV_LOOP(ab0, ab_n0, i); - DIV_LOOP(ab1, ab_n1, i + 1); - DIV_LOOP(ab2, ab_n2, i + 2); - DIV_LOOP(ab3, ab_n3, i + 3); - } - } - } - return; -} - -/** - @brief Block-type channel estimation. - @param[in] pH points to output channel - @param[in] pPilotRX points to received symbol - @param[in] pPilotTX points to sent pilot - @param[in] nTX Number of transmitters - @param[in] nRX Number of receivers - @param[in] nSc Number of Subcarriers - @param[in] core_id ID of the PE - @param[in] nPE Number of PEs - @return none -*/ -void mempool_chest_f16p_unrolled4(__fp16 *pH, __fp16 *pPilotRX, - __fp16 *pPilotTX, uint32_t nRX, uint32_t nTX, - uint32_t nSc, uint32_t core_id, - uint32_t nPE) { - uint32_t ab0, ab1, ab2, ab3; - uint32_t cd0, cd1, cd2, cd3; -#ifndef __CDOTP - uint32_t ab_n0, ab_n1, ab_n2, ab_n3; - uint32_t re0, re1, re2, re3; - uint32_t im0, im1, im2, im3; -#else - uint32_t sum0, sum1, sum2, sum3; -#endif - -#ifndef __MUL - uint32_t D0, D1, D2, D3; -#endif - - __fp16 *pPilotTX_itr; - __fp16 *pPilotRX_itr; - __fp16 *pH_itr; - - for (uint32_t k = core_id; k < nSc; k += nPE) { - pPilotTX_itr = pPilotTX + k * (2 * nTX); - pPilotRX_itr = pPilotRX + k * (2 * nRX); - pH_itr = pH + k * 2 * (nTX * nRX); - for (uint32_t i = 0; i < nRX; i += 4) { - ab0 = *(uint32_t *)&pPilotRX_itr[2U * i]; - ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)]; - ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)]; - ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)]; -#ifndef __CDOTP - SHUFFLE_A; -#endif - - for (uint32_t j = 0; j < nTX; j += 4) { -#if (defined(__CDOTP) && defined(__MUL)) - CMUL_LOOP(ab0, i); - CMUL_LOOP(ab1, i + 1); - CMUL_LOOP(ab2, i + 2); - CMUL_LOOP(ab3, i + 3); -#elif (!defined(__CDOTP) && defined(__MUL)) - MUL_LOOP(ab0, ab_n0, i); - MUL_LOOP(ab1, ab_n1, i + 1); - MUL_LOOP(ab2, ab_n2, i + 2); - MUL_LOOP(ab3, ab_n3, i + 3); -#else - DIV_LOOP(ab0, ab_n0, i) - DIV_LOOP(ab1, ab_n1, i + 1) - DIV_LOOP(ab2, ab_n2, i + 2) - DIV_LOOP(ab3, ab_n3, i + 3) -#endif - } - } - } - mempool_barrier(nPE); - return; -} - -void mempool_chest_f16p_unrolled4_local(__fp16 *volatile pH, - __fp16 *volatile pPilotRX, - __fp16 *volatile pPilotTX, uint32_t nRX, - uint32_t nTX, uint32_t nSc, - uint32_t core_id, uint32_t nPE) { - uint32_t ab0, ab1, ab2, ab3; - uint32_t cd0, cd1, cd2, cd3; - uint32_t sum0, sum1, sum2, sum3; - __fp16 *pPilotTX_itr; - __fp16 *pPilotRX_itr; - __fp16 *pH_itr; - uint32_t itr, i, j; - - // Cores Loop over the received pilots vector - for (itr = core_id * 4; itr < (nSc * nRX); - itr += (BANKING_FACTOR * NUM_CORES)) { - // Received pilots are aligned to cores - uint32_t sc_RX = itr / nRX; - pPilotTX_itr = pPilotTX + sc_RX * (2 * nTX); - pPilotRX_itr = pPilotRX + sc_RX * (2 * nRX); - pH_itr = pH + sc_RX * 2 * (nTX * nRX); - - // Load received pilots - i = itr % nRX; - ab0 = *(uint32_t *)&pPilotRX_itr[2U * i]; - ab1 = *(uint32_t *)&pPilotRX_itr[2U * (i + 1)]; - ab2 = *(uint32_t *)&pPilotRX_itr[2U * (i + 2)]; - ab3 = *(uint32_t *)&pPilotRX_itr[2U * (i + 3)]; - for (j = 0; j < nTX; j += 4) { - CMUL_LOOP(ab0, i); - CMUL_LOOP(ab1, i + 1); - CMUL_LOOP(ab2, i + 2); - CMUL_LOOP(ab3, i + 3); - } - } - mempool_barrier(nPE); - return; -} diff --git a/software/runtime/kernel/mempool_chest_q16.h b/software/runtime/kernel/mempool_chest_q16.h deleted file mode 100644 index c66aa0537..000000000 --- a/software/runtime/kernel/mempool_chest_q16.h +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -// Author: Marco Bertuletti, ETH Zurich - -#pragma once -#include "xpulp/builtins_v2.h" -#define __MUL - -/* a[i] = ar[i] + i * ai[j] - out[i][j] = a[i] / c[j] - out[i][j + 1] = a[i] / c[j + 1] - out[i][j + 2] = a[i] / c[j + 2] - out[i][j + 3] = a[i] / c[j + 3]*/ - -#define DIV_LOOP(ab, ab_n, i) \ - cd0 = *(v2s *)&pPilotTX_itr[2U * j]; \ - cd1 = *(v2s *)&pPilotTX_itr[2U * (j + 1)]; \ - cd2 = *(v2s *)&pPilotTX_itr[2U * (j + 2)]; \ - cd3 = *(v2s *)&pPilotTX_itr[2U * (j + 3)]; \ - D0 = (1 << 16U) / __DOTP2(cd0, cd0); \ - D1 = (1 << 16U) / __DOTP2(cd1, cd1); \ - D2 = (1 << 16U) / __DOTP2(cd2, cd2); \ - D3 = (1 << 16U) / __DOTP2(cd3, cd3); \ - re0 = __DOTP2(ab, cd0); \ - re1 = __DOTP2(ab, cd1); \ - re2 = __DOTP2(ab, cd2); \ - re3 = __DOTP2(ab, cd3); \ - im0 = __DOTP2(ab_n, cd0); \ - im1 = __DOTP2(ab_n, cd1); \ - im2 = __DOTP2(ab_n, cd2); \ - im3 = __DOTP2(ab_n, cd3); \ - re0 = __CLIP((re0 * D0) >> 8, 16); \ - re1 = __CLIP((re1 * D1) >> 8, 16); \ - re2 = __CLIP((re2 * D2) >> 8, 16); \ - re3 = __CLIP((re3 * D3) >> 8, 16); \ - im0 = __CLIP((im0 * D0) >> 8, 16); \ - im1 = __CLIP((im1 * D1) >> 8, 16); \ - im2 = __CLIP((im2 * D2) >> 8, 16); \ - im3 = __CLIP((im3 * D3) >> 8, 16); \ - re0 = (int32_t)(__PACK2(re0, im0)); \ - re1 = (int32_t)(__PACK2(re1, im1)); \ - re2 = (int32_t)(__PACK2(re2, im2)); \ - re3 = (int32_t)(__PACK2(re3, im3)); \ - *((v2s *)&pH_itr[2 * (i * nTX + j)]) = (v2s)re0; \ - *((v2s *)&pH_itr[2 * (i * nTX + j + 1)]) = (v2s)re1; \ - *((v2s *)&pH_itr[2 * (i * nTX + j + 2)]) = (v2s)re2; \ - *((v2s *)&pH_itr[2 * (i * nTX + j + 3)]) = (v2s)re3; - -/* a[i] = ar[i] + i * ai[j] - out[i][j] = a[i] * c[j] - out[i][j + 1] = a[i] * c[j + 1] - out[i][j + 2] = a[i] * c[j + 2] - out[i][j + 3] = a[i] * c[j + 3]*/ - -#define MUL_LOOP(ab, ab_n, i) \ - cd0 = *(v2s *)&pPilotTX_itr[2U * j]; \ - cd1 = *(v2s *)&pPilotTX_itr[2U * (j + 1)]; \ - cd2 = *(v2s *)&pPilotTX_itr[2U * (j + 2)]; \ - cd3 = *(v2s *)&pPilotTX_itr[2U * (j + 3)]; \ - re0 = __DOTP2(ab, cd0); \ - re1 = __DOTP2(ab, cd1); \ - re2 = __DOTP2(ab, cd2); \ - re3 = __DOTP2(ab, cd3); \ - im0 = __DOTP2(ab_n, cd0); \ - im1 = __DOTP2(ab_n, cd1); \ - im2 = __DOTP2(ab_n, cd2); \ - im3 = __DOTP2(ab_n, cd3); \ - re0 = __CLIP(re0 >> 8, 16); \ - re1 = __CLIP(re1 >> 8, 16); \ - re2 = __CLIP(re2 >> 8, 16); \ - re3 = __CLIP(re3 >> 8, 16); \ - im0 = __CLIP(im0 >> 8, 16); \ - im1 = __CLIP(im1 >> 8, 16); \ - im2 = __CLIP(im2 >> 8, 16); \ - im3 = __CLIP(im3 >> 8, 16); \ - re0 = (int32_t)(__PACK2(re0, im0)); \ - re1 = (int32_t)(__PACK2(re1, im1)); \ - re2 = (int32_t)(__PACK2(re2, im2)); \ - re3 = (int32_t)(__PACK2(re3, im3)); \ - *((v2s *)&pH_itr[2 * (i * nTX + j)]) = (v2s)re0; \ - *((v2s *)&pH_itr[2 * (i * nTX + j + 1)]) = (v2s)re1; \ - *((v2s *)&pH_itr[2 * (i * nTX + j + 2)]) = (v2s)re2; \ - *((v2s *)&pH_itr[2 * (i * nTX + j + 3)]) = (v2s)re3; - -#define SHUFFLE_A \ - asm volatile( \ - "pv.sub.h %[ab_n0], %[zero], %[ab0];" \ - "pv.sub.h %[ab_n1], %[zero], %[ab1];" \ - "pv.sub.h %[ab_n2], %[zero], %[ab2];" \ - "pv.sub.h %[ab_n3], %[zero], %[ab3];" \ - "pv.shuffle2.h %[ab_n0], %[ab_n0], %[mask];" \ - "pv.shuffle2.h %[ab_n1], %[ab_n1], %[mask];" \ - "pv.shuffle2.h %[ab_n2], %[ab_n2], %[mask];" \ - "pv.shuffle2.h %[ab_n3], %[ab_n3], %[mask];" \ - : [ab_n0] "=&r"(ab_n0), [ab_n1] "=&r"(ab_n1), [ab_n2] "=&r"(ab_n2), \ - [ab_n3] "=&r"(ab_n3) \ - : [ab0] "r"(ab0), [ab1] "r"(ab1), [ab2] "r"(ab2), [ab3] "r"(ab3), \ - [zero] "r"(0x00000000), [mask] "r"(0x00020001) \ - :); - -/** - @brief Block-type channel estimation. - @param[in] pH points to output channel - @param[in] pPilotRX points to received symbol - @param[in] pPilotTX points to sent pilot - @param[in] nTX Number of transmitters - @param[in] nRX Number of receivers - @param[in] nSc Number of Subcarriers - @return none -*/ -void mempool_chest_q16s_unrolled4(int16_t *pH, int16_t *pPilotRX, - int16_t *pPilotTX, uint32_t nRX, uint32_t nTX, - uint32_t nSc) { - - v2s ab0, ab1, ab2, ab3; - v2s ab_n0, ab_n1, ab_n2, ab_n3; - v2s cd0, cd1, cd2, cd3; - int32_t re0, re1, re2, re3; - int32_t im0, im1, im2, im3; - int32_t D0, D1, D2, D3; - - int16_t *pPilotTX_itr; - int16_t *pPilotRX_itr; - int16_t *pH_itr; - for (uint32_t k = 0; k < nSc; k++) { - pPilotTX_itr = pPilotTX + k * (2 * nTX); - pPilotRX_itr = pPilotRX + k * (2 * nRX); - pH_itr = pH + k * 2 * (nTX * nRX); - for (uint32_t i = 0; i < nRX; i += 4) { - ab0 = *(v2s *)&pPilotRX_itr[2U * i]; - ab1 = *(v2s *)&pPilotRX_itr[2U * (i + 1)]; - ab2 = *(v2s *)&pPilotRX_itr[2U * (i + 2)]; - ab3 = *(v2s *)&pPilotRX_itr[2U * (i + 3)]; - SHUFFLE_A; - for (uint32_t j = 0; j < nTX; j += 4) { - DIV_LOOP(ab0, ab_n0, i); - DIV_LOOP(ab1, ab_n1, i + 1); - DIV_LOOP(ab2, ab_n2, i + 2); - DIV_LOOP(ab3, ab_n3, i + 3); - } - } - } - return; -} - -/** - @brief Block-type channel estimation. - @param[in] pH points to output channel - @param[in] pPilotRX points to received symbol - @param[in] pPilotTX points to sent pilot - @param[in] nTX Number of transmitters - @param[in] nRX Number of receivers - @param[in] nSc Number of Subcarriers - @return none -*/ -void mempool_chest_q16p_unrolled4(int16_t *volatile pH, - int16_t *volatile pPilotRX, - int16_t *volatile pPilotTX, uint32_t nRX, - uint32_t nTX, uint32_t nSc, uint32_t core_id, - uint32_t nPE) { - - v2s ab0, ab1, ab2, ab3; - v2s ab_n0, ab_n1, ab_n2, ab_n3; - v2s cd0, cd1, cd2, cd3; - int32_t re0, re1, re2, re3; - int32_t im0, im1, im2, im3; -#ifndef __MUL - int32_t D0, D1, D2, D3; -#endif - - int16_t *pPilotTX_itr; - int16_t *pPilotRX_itr; - int16_t *pH_itr; - for (uint32_t k = core_id; k < nSc; k += nPE) { - pPilotTX_itr = pPilotTX + k * (2 * nTX); - pPilotRX_itr = pPilotRX + k * (2 * nRX); - pH_itr = pH + k * 2 * (nTX * nRX); - for (uint32_t i = 0; i < nRX; i += 4) { - ab0 = *(v2s *)&pPilotRX_itr[2U * i]; - ab1 = *(v2s *)&pPilotRX_itr[2U * (i + 1)]; - ab2 = *(v2s *)&pPilotRX_itr[2U * (i + 2)]; - ab3 = *(v2s *)&pPilotRX_itr[2U * (i + 3)]; - SHUFFLE_A; - for (uint32_t j = 0; j < nTX; j += 4) { -#ifdef __MUL - MUL_LOOP(ab0, ab_n0, i); - MUL_LOOP(ab1, ab_n1, i + 1); - MUL_LOOP(ab2, ab_n2, i + 2); - MUL_LOOP(ab3, ab_n3, i + 3); -#else - DIV_LOOP(ab0, ab_n0, i); - DIV_LOOP(ab1, ab_n1, i + 1); - DIV_LOOP(ab2, ab_n2, i + 2); - DIV_LOOP(ab3, ab_n3, i + 3); -#endif - } - } - } - mempool_barrier(nPE); - return; -} - -void mempool_chest_q16p_unrolled4_local(int16_t *volatile pH, - int16_t *volatile pPilotRX, - int16_t *volatile pPilotTX, - uint32_t nRX, uint32_t nTX, - uint32_t nSc, uint32_t core_id, - uint32_t nPE) { - v2s ab0, ab1, ab2, ab3; - v2s ab_n0, ab_n1, ab_n2, ab_n3; - v2s cd0, cd1, cd2, cd3; - int32_t re0, re1, re2, re3; - int32_t im0, im1, im2, im3; - int16_t *pPilotTX_itr; - int16_t *pPilotRX_itr; - int16_t *pH_itr; - uint32_t itr, i, j; - - // Cores Loop over the received pilots vector - for (itr = core_id * 4; itr < (nSc * nRX); - itr += (BANKING_FACTOR * NUM_CORES)) { - // Received pilots are aligned to cores - uint32_t sc_RX = itr / nRX; - pPilotTX_itr = pPilotTX + sc_RX * (2 * nTX); - pPilotRX_itr = pPilotRX + sc_RX * (2 * nRX); - pH_itr = pH + sc_RX * 2 * (nTX * nRX); - // Load received pilots - i = itr % nRX; - ab0 = *(v2s *)&pPilotRX_itr[2U * i]; - ab1 = *(v2s *)&pPilotRX_itr[2U * (i + 1)]; - ab2 = *(v2s *)&pPilotRX_itr[2U * (i + 2)]; - ab3 = *(v2s *)&pPilotRX_itr[2U * (i + 3)]; - SHUFFLE_A; - for (j = 0; j < nTX; j += 4) { - MUL_LOOP(ab0, ab_n0, i); - MUL_LOOP(ab1, ab_n1, i + 1); - MUL_LOOP(ab2, ab_n2, i + 2); - MUL_LOOP(ab3, ab_n3, i + 3); - } - } - mempool_barrier(nPE); - return; -} diff --git a/software/runtime/kernel/mempool_chest_q16p.h b/software/runtime/kernel/mempool_chest_q16p.h deleted file mode 100644 index 8b1378917..000000000 --- a/software/runtime/kernel/mempool_chest_q16p.h +++ /dev/null @@ -1 +0,0 @@ - diff --git a/software/runtime/kernel/mempool_chest_q16s.h b/software/runtime/kernel/mempool_chest_q16s.h deleted file mode 100644 index 8b1378917..000000000 --- a/software/runtime/kernel/mempool_chest_q16s.h +++ /dev/null @@ -1 +0,0 @@ - diff --git a/software/runtime/kernel/mempool_radix2_cfft_q16s.h b/software/runtime/kernel/mempool_radix2_cfft_q16s.h deleted file mode 100644 index e69de29bb..000000000