Merge pull request #84 from pulp-platform/cfft

Add cfft radix-4 and radix-2 kernels
pulp-platform · Oct 12, 2023 · 715fa0b · 715fa0b
2 parents d371350 + 3023765
commit 715fa0b
Show file tree

Hide file tree

Showing 16 changed files with 2,985 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Update Bender to version 0.27.3
 - Update default Questasim version to 2022.3
 - Decrease stack size to 128 words
+- Add CFFT radix-4 and radix-2 kernels
 
 ### Fixed
 - Fix type issue in `snitch_addr_demux`

diff --git a/python-requirements.txt b/python-requirements.txt
@@ -13,3 +13,4 @@ numpy
 pandas
 progressbar2
 tabulate
+sympy
diff --git a/software/.gitignore b/software/.gitignore
@@ -26,4 +26,4 @@ runtime/arch.ld
 
 # Generated data files
 data.h
-runtime/data/*.h
+runtime/data/data*.h
diff --git a/software/apps/Makefile b/software/apps/Makefile
@@ -25,7 +25,7 @@ else
 	ALL := $(filter-out systolic/%,$(APPS))
 endif
 
-ALL_LLVM := $(filter-out chest_q16, $(ALL))
+ALL_LLVM := $(filter-out chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL))
 
 # Make all applications
 all: $(ALL)

diff --git a/software/apps/cfft_radix2_q16/main.c b/software/apps/cfft_radix2_q16/main.c
@@ -0,0 +1,81 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Mempool runtime libraries */
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/builtins_v2.h"
+
+/* CFFT mempool libraries */
+#include "data/data_cfft_radix2_q16.h"
+#include "kernel/mempool_radix2_cfft_q16p.h"
+#include "kernel/mempool_radix2_cfft_q16s.h"
+
+#define PARALLEL // Enables the multi-core CFFT section of main()
+#define SINGLE // Enables the single-core CFFT section of main()
+
+/* CFFT mempool data: buffers placed in the ".l1_prio" section and aligned to
+ * 4 * N_BANKS bytes. main() fills them from the matching l2_* arrays by DMA. */
+int16_t l1_pSrc[N_RSAMPLES]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16[6 * N_CSAMPLES / 4]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+
+/**
+ * Benchmark entry point, executed by every core.
+ *
+ * Core 0 copies the input samples, the twiddle factors and the bit-reversal
+ * index table from the l2_* arrays into the l1_* buffers by DMA. After a
+ * barrier, the radix-2 CFFT runs on core 0 only (when SINGLE is defined) and
+ * then on all cores (when PARALLEL is defined). Finally core 0 prints
+ * l1_pSrc as pairs of values.
+ *
+ * NOTE(review): with both SINGLE and PARALLEL defined, the second kernel is
+ * handed the same l1_pSrc buffer without reloading the input in between --
+ * confirm this is intended for the printed result.
+ * NOTE(review): the first kernel argument is the literal 16, presumably the
+ * FFT length -- confirm whether N_CSAMPLES should be passed instead.
+ *
+ * @return Always 0.
+ */
+int main() {
+
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
+
+  if (core_id == 0) {
+    // l1_pSrc holds N_RSAMPLES = 2 * N_CSAMPLES int16_t values, i.e. one
+    // 32-bit word per complex sample.
+    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+    dma_memcpy_blocking(l1_twiddleCoef_q16, l2_twiddleCoef_q16,
+                        (3 * N_CSAMPLES / 4) * sizeof(int32_t));
+    // The table entries are uint16_t: size the transfer from the destination
+    // array. Counting 32-bit words here would copy twice the table size and
+    // run past the end of both the source and the destination buffers.
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        sizeof(l1_BitRevIndexTable));
+  }
+  // Make the DMA-loaded data visible to every core before any kernel starts.
+  mempool_barrier(num_cores);
+
+  /* SINGLE-CORE */
+#ifdef SINGLE
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_radix2_cfft_q16s((uint16_t)16, l1_twiddleCoef_q16,
+                             l1_BitRevIndexTable, l1_pSrc,
+                             BITREVINDEXTABLE_LENGTH, 0, 0);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+  /* PARALLEL-CORE */
+#ifdef PARALLEL
+  mempool_start_benchmark();
+  mempool_radix2_cfft_q16p((uint16_t)16, l1_twiddleCoef_q16,
+                           l1_BitRevIndexTable, l1_pSrc,
+                           BITREVINDEXTABLE_LENGTH, 0, 0, num_cores);
+  mempool_stop_benchmark();
+#endif
+
+  if (core_id == 0) {
+    // Dump the buffer two int16_t values per line.
+    for (uint32_t i = 0; i < N_RSAMPLES; i += 2) {
+      printf("{%6d;%6d } \n", l1_pSrc[i], l1_pSrc[i + 1]);
+    }
+    printf("Done!\n");
+  }
+  mempool_barrier(num_cores);
+  return 0;
+}
diff --git a/software/apps/cfft_radix4_q16/main.c b/software/apps/cfft_radix4_q16/main.c
@@ -0,0 +1,195 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Mempool runtime libraries */
+#include "dma.h"
+#include "encoding.h"
+#include "printf.h"
+#include "runtime.h"
+#include "synchronization.h"
+#include "xpulp/builtins_v2.h"
+
+/* CFFT data libraries */
+#include "data/data_cfft_radix4_q16.h"
+
+/*
+  CHOOSE ONE
+   - SINGLE:    Single core FFT
+   - PARALLEL:  Parallel FFT not "memory-aware"
+   - FOLDED:    Parallel FFT with "memory-aware" load/store scheme
+   - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
+  load/store scheme
+      - N_FFTs_COL: Independent FFTs scheduled on one row (default 2)
+      - N_FFTs_ROW: Independent FFTs scheduled on columns (default 2)
+      (OPTIONALLY ENABLE)
+      - FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores
+      - BITREVERSETABLE: The bitreversal indices are loaded from a table
+      - ASM:             Use asm_volatile statements
+*/
+
+#define SCHEDULED
+#define FOLDED_TWIDDLES
+#define BITREVERSETABLE
+#define ASM // Use asm_volatile statements
+
+// Number of independent FFTs used by the SCHEDULED variant. Each value may be
+// predefined (e.g. with -D); it is only given a default when it is missing.
+// Guarding the two macros separately avoids redefining one that was supplied.
+#ifndef N_FFTs_ROW
+#define N_FFTs_ROW 2
+#endif
+#ifndef N_FFTs_COL
+#define N_FFTs_COL 2
+#endif
+
+// Absolute value. The argument is parenthesized in the negation as well, so
+// that an expression argument such as ABS(a - b) is negated as a whole.
+// The argument is evaluated twice: do not pass expressions with side effects.
+#define ABS(x) (((x) < 0) ? (-(x)) : (x))
+#include "kernel/mempool_radix4_cfft_butterfly_q16.h"
+#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
+#include "kernel/mempool_radix4_cfft_q16p.h"
+#include "kernel/mempool_radix4_cfft_q16s.h"
+
+/* L1 buffers, placed in the ".l1_prio" section and aligned to 4 * N_BANKS
+ * bytes. The sample buffers reserve 8 * N_BANKS int16_t values for each of
+ * the N_FFTs_ROW groups of FFTs. */
+int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
+    __attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MAIN */
+/**
+ * Benchmark entry point, executed by every core.
+ *
+ * Core 0 loads the input samples, the bit-reversal index table and the
+ * twiddle factors into the l1_* buffers by DMA. When FOLDED_TWIDDLES is
+ * defined, all cores then rewrite the twiddles into l1_twiddleCoef_q16_src
+ * in three groups spaced N_BANKS words apart. The FFT section selected by
+ * SINGLE / PARALLEL / FOLDED / SCHEDULED is run, and for the first three
+ * variants core 0 compares the result with l2_pRes within TOLERANCE.
+ *
+ * @return Always 0.
+ */
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  mempool_barrier_init(core_id);
+
+  ///////////////////////////////////////////////////////////////////////////////////////////////////
+  /* INITIALIZATION */
+  if (core_id == 0) {
+    // Each FFT is folded over 4 memory rows
+    // Each memory row is 2 * N_BANKS samples
+    // NOTE(review): every iteration reads (N_RSAMPLES * N_FFTs_COL) 32-bit
+    // words starting at l2_pSrc. Confirm against the generated data header
+    // that l2_pSrc is that large and that the destination row can hold it.
+    for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
+      dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
+                          (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t));
+    }
+    // The table entries are uint16_t: size the transfer from the destination
+    // array. Counting 32-bit words here would write twice the table size and
+    // run past the end of l1_BitRevIndexTable.
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        sizeof(l1_BitRevIndexTable));
+    dma_memcpy_blocking(l1_twiddleCoef_q16_src, l2_twiddleCoef_q16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+  }
+  // Initialize the Twiddles folded
+#ifdef FOLDED_TWIDDLES
+  // The stores below target l1_twiddleCoef_q16_src, which core 0 has just
+  // filled by DMA. Wait for core 0 so that its copy cannot overwrite folded
+  // twiddles already stored by faster cores.
+  mempool_barrier(num_cores);
+  for (uint32_t j = 0; j < N_FFTs_COL; j++) {
+    uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
+    for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4))] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * i];
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
+                                            1 * N_BANKS)] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 2U)];
+      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
+                                            2 * N_BANKS)] =
+          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 3U)];
+    }
+  }
+#endif
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    printf("On the run...\n");
+  }
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* SINGLE-CORE */
+#ifdef SINGLE
+  int16_t *pRes; // Result pointer
+  if (core_id == 0) {
+    mempool_start_benchmark();
+    mempool_radix4_cfft_q16s_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
+                                      l1_twiddleCoef_q16_src, 1);
+    mempool_bitrevtable_q16s_xpulpimg(
+        (uint16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable);
+    pRes = l1_pSrc;
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE */
+#ifdef PARALLEL
+  int16_t *pRes; // Result pointer
+  mempool_start_benchmark();
+  mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src, 1, num_cores);
+  mempool_bitrevtable_q16p_xpulpimg((uint16_t *)l1_pSrc,
+                                    BITREVINDEXTABLE_LENGTH,
+                                    l1_BitRevIndexTable, num_cores);
+  pRes = l1_pSrc;
+  mempool_stop_benchmark();
+#endif
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE FOLDED */
+#ifdef FOLDED
+  int16_t *pRes; // Result pointer
+  if (core_id < (N_CSAMPLES / 16)) {
+    mempool_start_benchmark();
+#ifdef FOLDED_TWIDDLES
+    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src,
+                                    l1_twiddleCoef_q16_dst, (N_CSAMPLES / 16));
+#else
+    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
+                                    l1_twiddleCoef_q16_src, (N_CSAMPLES / 16));
+#endif
+    // The result buffer is chosen from the parity of LOG2 / 2.
+    pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
+    // Use the table loaded during initialization, as the PARALLEL variant
+    // does; no other bit-reversal table is declared in this file.
+    mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
+                                      l1_BitRevIndexTable, (N_CSAMPLES / 16));
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+#endif
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* MULTI-CORE SCHEDULED */
+#ifdef SCHEDULED
+  // N_CSAMPLES / 16 cores work on each of the N_FFTs_COL columns.
+  if (core_id < N_FFTs_COL * (N_CSAMPLES >> 4U)) {
+    mempool_start_benchmark();
+    uint32_t col_fftLen = N_CSAMPLES >> 2U;
+    uint32_t col_id = core_id / (N_CSAMPLES >> 4U);
+    // Each column gets its own slice of the twiddle buffers.
+    mempool_radix4_cfft_q16p_scheduler(
+        l1_pSrc, l1_pDst, N_CSAMPLES,
+        l1_twiddleCoef_q16_src + 2 * col_id * col_fftLen,
+        l1_twiddleCoef_q16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable,
+        BITREVINDEXTABLE_LENGTH, 1, N_CSAMPLES >> 4U);
+    mempool_log_partial_barrier(2, core_id, N_CSAMPLES >> 4U);
+    mempool_stop_benchmark();
+  }
+#endif
+  mempool_barrier(num_cores);
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+/* CHECK */
+#if defined(SINGLE) || defined(PARALLEL) || defined(FOLDED)
+  if (core_id == 0) {
+    printf("Done!\n");
+    // Report every element that differs from the reference by more than
+    // TOLERANCE.
+    for (uint32_t i = 0; i < N_RSAMPLES; i++) {
+      if (ABS(((int32_t)pRes[i] - (int32_t)l2_pRes[i])) > TOLERANCE)
+        printf("ERROR!!! Result[%d]: %6d Expected[%d]: %6d\n", i, pRes[i], i,
+               l2_pRes[i]);
+    }
+  }
+  mempool_barrier(num_cores);
+#endif
+
+  return 0;
+}
diff --git a/software/runtime/data/data_cfft_radix2_q16.h.tpl b/software/runtime/data/data_cfft_radix2_q16.h.tpl
@@ -0,0 +1,56 @@
+// Copyright 2022 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Automatically generated by:
+// data/data_cfft_radix2_q16.py
+
+\
+<% def array_to_cstr(array):
+    # Render the values as a C initializer list of 16-bit hex literals,
+    # starting a new line after every 16th literal.
+    chunks = ['{\n']
+    for count, value in enumerate(array, start=1):
+        chunks.append('(int16_t) 0X{:04X}, '.format(value & 0xffff))
+        if count % 16 == 0:
+            chunks.append('\n')
+    # Drop the last two characters of the text before closing the brace.
+    return ''.join(chunks)[:-2] + '}'
+%> \
+
+<% def array_to_str(array):
+    # Render the values as a C initializer list using their default string
+    # form, starting a new line after every 16th value.
+    chunks = ['{\n']
+    for count, value in enumerate(array, start=1):
+        chunks.append('{}, '.format(value))
+        if count % 16 == 0:
+            chunks.append('\n')
+    # Drop the last two characters of the text before closing the brace.
+    return ''.join(chunks)[:-2] + '}'
+%> \
+
+#define LOG2 (${Log2Len}) // presumably log2(N_CSAMPLES) -- TODO confirm
+#define N_CSAMPLES (${Len}) // FFT length filled in by the generator
+#define N_RSAMPLES (2 * N_CSAMPLES) // int16_t elements in l2_pSrc / l2_pRes
+#define N_TWIDDLES (3 * N_CSAMPLES / 4) // half the int16_t twiddle count
+#define N_BANKS (NUM_CORES * BANKING_FACTOR)
+#define BITREVINDEXTABLE_LENGTH (${BitrevLen}) // l2_BitRevIndexTable entries
+
+// Tolerance for correctness check
+#define TOLERANCE (${tolerance})
+
+% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):
+
+// Data array for vector ${m_str}
+int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};
+
+% endfor \
+
+// Twiddles
+int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
+
+// Bitreversal
+uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};