Skip to content

Commit

Permalink
[software] Load data with DMA
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Oct 11, 2023
1 parent 1a9d877 commit 65bbbd3
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 176 deletions.
38 changes: 22 additions & 16 deletions software/apps/cfft_radix2_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <string.h>

/* Mempool runtime libraries */
#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
Expand All @@ -26,30 +27,35 @@
#define SINGLE

/* CFFT mempool data */
int16_t pSrc[N_RSAMPLES] __attribute__((aligned(N_CSAMPLES), section(".l1")));

/* Copy the first N_el entries of vector_inp into pSrc.
 *
 * The work is interleaved across cores: the calling core starts at its own
 * core id and advances by the total core count, so every index below N_el is
 * written by exactly one core when all cores call this function.
 *
 * pSrc  destination buffer, at least N_el elements long
 * N_el  number of int16_t elements to copy
 */
void initialize_l1(int16_t *pSrc, uint32_t N_el) {
  uint32_t idx = mempool_get_core_id();
  const uint32_t stride = mempool_get_core_count();
  while (idx < N_el) {
    pSrc[idx] = vector_inp[idx];
    idx += stride;
  }
}
/* L1 working copies of the FFT operands. Every array is placed in the
 * ".l1_prio" section and aligned to 4 * N_BANKS bytes by its attribute. */

/* Input/output samples: N_RSAMPLES int16_t values. */
int16_t l1_pSrc[N_RSAMPLES]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Twiddle factors: 6 * N_CSAMPLES / 4 int16_t values. */
int16_t l1_twiddleCoef_q16[6 * N_CSAMPLES / 4]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Bit-reversal index table: BITREVINDEXTABLE_LENGTH uint16_t entries. */
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));

int main() {

uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
mempool_barrier_init(core_id);
initialize_l1(pSrc, N_RSAMPLES);

/* Core 0 stages the FFT operands from L2 into L1 with blocking DMA
 * transfers. The third argument of each transfer is a byte count. */
if (core_id == 0) {
  /* N_CSAMPLES 32-bit words == N_RSAMPLES int16_t values. */
  dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
  /* 3 * N_CSAMPLES / 4 32-bit words == 6 * N_CSAMPLES / 4 int16_t values. */
  dma_memcpy_blocking(l1_twiddleCoef_q16, l2_twiddleCoef_q16,
                      (3 * N_CSAMPLES / 4) * sizeof(int32_t));
  /* The table holds BITREVINDEXTABLE_LENGTH uint16_t entries, so the length
   * is taken from the destination array itself; multiplying the entry count
   * by sizeof(int32_t) would copy twice the size of both arrays. */
  dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                      sizeof(l1_BitRevIndexTable));
}
mempool_barrier(num_cores);

/* SINGLE-CORE */
#ifdef SINGLE
if (core_id == 0) {
mempool_start_benchmark();
mempool_radix2_cfft_q16s((uint16_t)16, twiddleCoef_q16, BitRevIndexTable,
pSrc, BITREVINDEXTABLE_FIXED_TABLE_LENGTH, 0, 0);
mempool_radix2_cfft_q16s((uint16_t)16, l1_twiddleCoef_q16,
l1_BitRevIndexTable, l1_pSrc,
BITREVINDEXTABLE_LENGTH, 0, 0);
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
Expand All @@ -58,15 +64,15 @@ int main() {
/* PARALLEL-CORE */
#ifdef PARALLEL
mempool_start_benchmark();
mempool_radix2_cfft_q16p((uint16_t)16, twiddleCoef_q16, BitRevIndexTable,
pSrc, BITREVINDEXTABLE_FIXED_TABLE_LENGTH, 0, 0,
num_cores);
mempool_radix2_cfft_q16p((uint16_t)16, l1_twiddleCoef_q16,
l1_BitRevIndexTable, l1_pSrc,
BITREVINDEXTABLE_LENGTH, 0, 0, num_cores);
mempool_stop_benchmark();
#endif

if (core_id == 0) {
for (uint32_t i = 0; i < N_RSAMPLES; i += 2) {
printf("{%6d;%6d } \n", pSrc[i], pSrc[i + 1]);
printf("{%6d;%6d } \n", l1_pSrc[i], l1_pSrc[i + 1]);
}
printf("Done!\n");
}
Expand Down
165 changes: 73 additions & 92 deletions software/apps/cfft_radix4_q16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <string.h>

/* Mempool runtime libraries */
#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
Expand All @@ -34,14 +35,14 @@
- ASM: Use asm_volatile statements
*/

#define FOLDED
#define SCHEDULED
#define FOLDED_TWIDDLES
#define BITREVERSETABLE
#define ASM // Use asm_volatile statements

/* Use the default batch dimensions unless both N_FFTs_ROW and N_FFTs_COL were
 * supplied externally (e.g. on the compiler command line). */
#if !(defined(N_FFTs_ROW) && defined(N_FFTs_COL))
#define N_FFTs_ROW 1
#define N_FFTs_COL 1
#define N_FFTs_ROW 2
#define N_FFTs_COL 2
#endif

/* Absolute value of x. The argument is parenthesized in every position so
 * that expression arguments such as ABS(a - b) negate the whole expression.
 * Note: x is evaluated twice; do not pass arguments with side effects. */
#define ABS(x) (((x) < 0) ? (-(x)) : (x))
Expand All @@ -50,76 +51,56 @@
#include "kernel/mempool_radix4_cfft_q16p.h"
#include "kernel/mempool_radix4_cfft_q16s.h"

int16_t pSrc[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(N_FFTs_ROW * 8 * N_BANKS), section(".l1")));
int16_t pDst[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(N_FFTs_ROW * 8 * N_BANKS), section(".l1")));
int16_t pCoef16_src[8 * N_BANKS]
__attribute__((aligned(8 * N_BANKS), section(".l1")));
int16_t pCoef16_dst[8 * N_BANKS]
__attribute__((aligned(8 * N_BANKS), section(".l1")));
uint16_t pRevT16[BITREVINDEXTABLE_FIXED_TABLE_LENGTH]
__attribute__((aligned(N_BANKS), section(".l1")));
/* L1 buffers used by the radix-4 kernels. Every array is placed in the
 * ".l1_prio" section and aligned to 4 * N_BANKS bytes by its attribute. */

/* Input samples: 8 * N_BANKS int16_t values per FFT row, N_FFTs_ROW rows. */
int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Second sample buffer with the same layout, passed to the kernels as pDst. */
int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Twiddle factors (source buffer): 8 * N_BANKS int16_t values. */
int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Twiddle factors (destination buffer): 8 * N_BANKS int16_t values. */
int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Bit-reversal index table: BITREVINDEXTABLE_LENGTH uint16_t entries. */
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));

///////////////////////////////////////////////////////////////////////////////////////////////////
/* INITIALIZATION FUNCTIONS*/

void initialize_l1() {
/* MAIN */
int main() {
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
mempool_barrier_init(core_id);

// Initialize the inputs and results from the data.h file
for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
for (uint32_t i = core_id; i < (8 * N_BANKS); i += num_cores) {
if (i < N_RSAMPLES * N_FFTs_COL) {
pSrc[j * (8 * N_BANKS) + i] = (int16_t)vector_inp[i % N_RSAMPLES];
} else {
pSrc[j * (8 * N_BANKS) + i] = (int16_t)0;
}
pDst[j * (8 * N_BANKS) + i] = (int16_t)0;
///////////////////////////////////////////////////////////////////////////////////////////////////
/* INITIALIZATION */
/* Core 0 stages the operands from L2 into L1 with blocking DMA transfers.
 * The third argument of each transfer is a byte count. */
if (core_id == 0) {
  // Each FFT is folded over 4 memory rows
  // Each memory row is 2 * N_BANKS samples
  for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
    /* NOTE(review): this transfers N_RSAMPLES * N_FFTs_COL 32-bit words per
     * row region of 8 * N_BANKS int16_t values; confirm against the size of
     * l2_pSrc and of one row region. */
    dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
                        (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t));
  }
  /* The destination holds BITREVINDEXTABLE_LENGTH uint16_t entries, so the
   * length is taken from the array itself; multiplying the entry count by
   * sizeof(int32_t) would write twice the size of l1_BitRevIndexTable. */
  dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                      sizeof(l1_BitRevIndexTable));
  dma_memcpy_blocking(l1_twiddleCoef_q16_src, l2_twiddleCoef_q16,
                      3 * (N_CSAMPLES / 4) * sizeof(int32_t));
}
// Initialize the Bitreversal table
for (uint32_t i = core_id; i < BITREVINDEXTABLE_FIXED_TABLE_LENGTH;
i += num_cores) {
*(v2s *)&pRevT16[2U * i] = *(v2s *)&BitRevIndexTable[2U * i];
}
mempool_barrier(num_cores);

// Initialize the Twiddles
// Initialize the Twiddles folded
#ifdef FOLDED_TWIDDLES
for (uint32_t i = core_id; i < 8 * N_BANKS; i += num_cores) {
pCoef16_src[i] = (int16_t)0;
pCoef16_dst[i] = (int16_t)0;
}
mempool_barrier(num_cores);
for (uint32_t j = 0; j < N_FFTs_COL; j++) {
uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
*(v2s *)&pCoef16_src[2U * (i + j * (N_CSAMPLES / 4))] =
*(v2s *)&twiddleCoef_q16[2U * i];
*(v2s *)&pCoef16_src[2U * (i + j * (N_CSAMPLES / 4) + 1 * N_BANKS)] =
*(v2s *)&twiddleCoef_q16[2U * (i * 2U)];
*(v2s *)&pCoef16_src[2U * (i + j * (N_CSAMPLES / 4) + 2 * N_BANKS)] =
*(v2s *)&twiddleCoef_q16[2U * (i * 3U)];
*(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4))] =
*(v2s *)&l2_twiddleCoef_q16[2U * i];
*(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
1 * N_BANKS)] =
*(v2s *)&l2_twiddleCoef_q16[2U * (i * 2U)];
*(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
2 * N_BANKS)] =
*(v2s *)&l2_twiddleCoef_q16[2U * (i * 3U)];
}
}
#else
for (uint32_t i = core_id; i < 6 * (N_CSAMPLES / 4); i += num_cores) {
pCoef16_src[i] = twiddleCoef_q16[i];
}
#endif
mempool_barrier(num_cores);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
/* MAIN */
int main() {
uint32_t core_id = mempool_get_core_id();
uint32_t num_cores = mempool_get_core_count();
mempool_barrier_init(core_id);

initialize_l1();
if (core_id == 0) {
printf("On the run...\n");
}
Expand All @@ -131,11 +112,11 @@ int main() {
int16_t *pRes; // Result pointer
if (core_id == 0) {
mempool_start_benchmark();
mempool_radix4_cfft_q16s_xpulpimg(pSrc, (uint16_t)N_CSAMPLES, pCoef16_src,
1);
mempool_radix4_cfft_q16s_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
l1_twiddleCoef_q16_src, 1);
mempool_bitrevtable_q16s_xpulpimg(
(uint16_t *)pSrc, BITREVINDEXTABLE_FIXED_TABLE_LENGTH, pRevT16);
pRes = pSrc;
(uint16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable);
pRes = l1_pSrc;
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
Expand All @@ -146,12 +127,12 @@ int main() {
#ifdef PARALLEL
int16_t *pRes; // Result pointer
mempool_start_benchmark();
mempool_radix4_cfft_q16p_xpulpimg(pSrc, (uint16_t)N_CSAMPLES, pCoef16_src, 1,
num_cores);
mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pSrc,
BITREVINDEXTABLE_FIXED_TABLE_LENGTH,
pRevT16, num_cores);
pRes = pSrc;
mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
l1_twiddleCoef_q16_src, 1, num_cores);
mempool_bitrevtable_q16p_xpulpimg((uint16_t *)l1_pSrc,
BITREVINDEXTABLE_LENGTH,
l1_BitRevIndexTable, num_cores);
pRes = l1_pSrc;
mempool_stop_benchmark();
#endif
mempool_barrier(num_cores);
Expand All @@ -163,36 +144,21 @@ int main() {
if (core_id < (N_CSAMPLES / 16)) {
mempool_start_benchmark();
#ifdef FOLDED_TWIDDLES
mempool_radix4_cfft_q16p_folded(pSrc, pDst, (uint16_t)N_CSAMPLES,
pCoef16_src, pCoef16_dst,
(N_CSAMPLES / 16));
mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
l1_twiddleCoef_q16_src,
l1_twiddleCoef_q16_dst, (N_CSAMPLES / 16));
#else
mempool_radix4_cfft_q16p_folded(pSrc, pDst, (uint16_t)N_CSAMPLES,
pCoef16_src, (N_CSAMPLES / 16));
mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
l1_twiddleCoef_q16_src, (N_CSAMPLES / 16));
#endif
pRes = ((LOG2 / 2) % 2) == 0 ? pSrc : pDst;
mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes,
BITREVINDEXTABLE_FIXED_TABLE_LENGTH,
pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
pRevT16, (N_CSAMPLES / 16));
mempool_stop_benchmark();
}
mempool_barrier(num_cores);
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////
/* CHECK */
#if defined(SINGLE) || defined(PARALLEL) || defined(FOLDED)
if (core_id == 0) {
printf("Done!\n");
for (uint32_t i = 0; i < N_RSAMPLES; i++) {
if (ABS(((int32_t)pRes[i] - (int32_t)vector_res[i])) > TOLERANCE)
printf("ERROR!!! Result[%d]: %6d Expected[%d]: %6d\n", i, pSrc[i], i,
vector_res[i]);
}
}
mempool_barrier(num_cores);
#endif

///////////////////////////////////////////////////////////////////////////////////////////////////
/* MULTI-CORE SCHEDULED */
#ifdef SCHEDULED
Expand All @@ -201,14 +167,29 @@ int main() {
uint32_t col_fftLen = N_CSAMPLES >> 2U;
uint32_t col_id = core_id / (N_CSAMPLES >> 4U);
mempool_radix4_cfft_q16p_scheduler(
pSrc, pDst, N_CSAMPLES, pCoef16_src + 2 * col_id * col_fftLen,
pCoef16_dst + 2 * col_id * col_fftLen, pRevT16,
BITREVINDEXTABLE_FIXED_TABLE_LENGTH, 1, N_CSAMPLES >> 4U);
l1_pSrc, l1_pDst, N_CSAMPLES,
l1_twiddleCoef_q16_src + 2 * col_id * col_fftLen,
l1_twiddleCoef_q16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH, 1, N_CSAMPLES >> 4U);
mempool_log_partial_barrier(2, core_id, N_CSAMPLES >> 4U);
mempool_stop_benchmark();
}
#endif
mempool_barrier(num_cores);

///////////////////////////////////////////////////////////////////////////////////////////////////
/* CHECK */
#if defined(SINGLE) || defined(PARALLEL) || defined(FOLDED)
if (core_id == 0) {
printf("Done!\n");
for (uint32_t i = 0; i < N_RSAMPLES; i++) {
if (ABS(((int32_t)pRes[i] - (int32_t)l2_pRes[i])) > TOLERANCE)
printf("ERROR!!! Result[%d]: %6d Expected[%d]: %6d\n", i, pRes[i], i,
l2_pRes[i]);
}
}
mempool_barrier(num_cores);
#endif

return 0;
}
9 changes: 5 additions & 4 deletions software/runtime/data/data_cfft_radix2_q16.h.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,21 @@
#define N_CSAMPLES (${Len})
#define N_RSAMPLES (2 * N_CSAMPLES)
#define N_TWIDDLES (3 * N_CSAMPLES / 4)
#define BITREVINDEXTABLE_FIXED_TABLE_LENGTH (${BitrevLen})
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define BITREVINDEXTABLE_LENGTH (${BitrevLen})

// Tolerance for correctness check
#define TOLERANCE (${tolerance})

% for m, m_str in zip([vector_inp, vector_res], ['vector_inp', 'vector_res']):
% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):

// Data arrays for matrix ${m_str}
int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};

% endfor \

// Twiddles
int16_t twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};
int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};

// Bitreversal
uint16_t BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
30 changes: 0 additions & 30 deletions software/runtime/data/data_cfft_radix2_q16.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,33 +198,3 @@ def main():

if __name__ == "__main__":
main()

######################
# Fixpoint Functions #
######################


def q_sat(x):
    """Fold x towards the signed 32-bit range with one 2**32 correction.

    Values above 2**31 - 1 have 2**32 subtracted, values below -2**31 have
    2**32 added, and in-range values are returned unchanged. Despite the
    name, out-of-range values are wrapped, not clamped to the range limits.
    """
    upper = 2**31 - 1
    lower = -2**31
    if x > upper:
        return x - 2**32
    if x < lower:
        return x + 2**32
    return x


def q_add(a, b):
    """Return a + b passed through q_sat."""
    total = a + b
    return q_sat(total)


def q_sub(a, b):
    """Return a - b passed through q_sat."""
    difference = a - b
    return q_sat(difference)


def q_mul(a, b, p):
    """Multiply a by b, then round and shift the product right by p bits.

    The rounding, shifting and final range folding are delegated to
    q_roundnorm.
    """
    product = a * b
    return q_roundnorm(product, p)


def q_roundnorm(a, p):
    """Shift a right by p bits after adding half of the dropped range.

    The bias 1 << (p - 1) is added before the shift so the result is rounded
    rather than truncated; the shifted value is then passed through q_sat.
    """
    half = 1 << (p - 1)
    shifted = (a + half) >> p
    return q_sat(shifted)
Loading

0 comments on commit 65bbbd3

Please sign in to comment.