Skip to content

Commit

Permalink
Merge pull request #84 from pulp-platform/cfft
Browse files Browse the repository at this point in the history
Add cfft radix-4 and radix-2 kernels
  • Loading branch information
mbertuletti authored Oct 12, 2023
2 parents d371350 + 3023765 commit 715fa0b
Show file tree
Hide file tree
Showing 16 changed files with 2,985 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Update Bender to version 0.27.3
- Update default Questasim version to 2022.3
- Decrease stack size to 128 words
- Add CFFT radix-4 and radix-2 kernels

### Fixed
- Fix type issue in `snitch_addr_demux`
Expand Down
1 change: 1 addition & 0 deletions python-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ numpy
pandas
progressbar2
tabulate
sympy
2 changes: 1 addition & 1 deletion software/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ runtime/arch.ld

# Generated data files
data.h
runtime/data/*.h
runtime/data/data*.h
2 changes: 1 addition & 1 deletion software/apps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ else
ALL := $(filter-out systolic/%,$(APPS))
endif

ALL_LLVM := $(filter-out chest_q16, $(ALL))
ALL_LLVM := $(filter-out chest_q16 cfft_radix2_q16 cfft_radix4_q16, $(ALL))

# Make all applications
all: $(ALL)
Expand Down
81 changes: 81 additions & 0 deletions software/apps/cfft_radix2_q16/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mempool runtime libraries */
#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "xpulp/builtins_v2.h"

/* CFFT mempool libraries */
#include "data/data_cfft_radix2_q16.h"
#include "kernel/mempool_radix2_cfft_q16p.h"
#include "kernel/mempool_radix2_cfft_q16s.h"

/* Build switches read by main(): SINGLE enables the run on core 0 only,
 * PARALLEL enables the run on all cores. Both are defined here, so main()
 * executes the two runs back to back, each one being handed l1_pSrc. */
#define PARALLEL
#define SINGLE

/* CFFT mempool data */
/* All three buffers are placed in the ".l1_prio" section and aligned to
 * 4 * N_BANKS bytes. main() fills them from their l2_* counterparts. */
/* Sample buffer handed to the kernels: N_RSAMPLES int16_t values. */
int16_t l1_pSrc[N_RSAMPLES]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Twiddle coefficients: 6 * N_CSAMPLES / 4 int16_t values. */
int16_t l1_twiddleCoef_q16[6 * N_CSAMPLES / 4]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Bit-reversal index table: BITREVINDEXTABLE_LENGTH uint16_t entries. */
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));

/*
 * Entry point of the radix-2 Q16 CFFT benchmark; executed by every core.
 *
 * Core 0 copies the input samples, the twiddles and the bit-reversal table
 * from L2 to L1. Depending on the SINGLE / PARALLEL switches the single-core
 * kernel (core 0 only) and/or the parallel kernel (all cores) is then run
 * between mempool_start_benchmark() / mempool_stop_benchmark(). Finally
 * core 0 prints the content of l1_pSrc as pairs of values.
 *
 * Returns 0.
 */
int main() {

  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  mempool_barrier_init(core_id);

  /* Stage the data in L1. The sizes passed to the DMA are byte counts. */
  if (core_id == 0) {
    /* N_RSAMPLES int16_t values == N_CSAMPLES 32-bit words. */
    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
    /* 6 * N_CSAMPLES / 4 int16_t values == 3 * N_CSAMPLES / 4 words. */
    dma_memcpy_blocking(l1_twiddleCoef_q16, l2_twiddleCoef_q16,
                        (3 * N_CSAMPLES / 4) * sizeof(int32_t));
    /* The table holds uint16_t entries: sizing the copy with
     * sizeof(int32_t) would write twice the size of l1_BitRevIndexTable. */
    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                        BITREVINDEXTABLE_LENGTH * sizeof(uint16_t));
  }
  /* Nobody may touch the L1 buffers before core 0 has filled them. */
  mempool_barrier(num_cores);

/* SINGLE-CORE */
#ifdef SINGLE
  if (core_id == 0) {
    mempool_start_benchmark();
    /* NOTE(review): the first argument is hard-coded to 16 while the data
     * set is sized by N_CSAMPLES - confirm whether N_CSAMPLES is intended. */
    mempool_radix2_cfft_q16s((uint16_t)16, l1_twiddleCoef_q16,
                             l1_BitRevIndexTable, l1_pSrc,
                             BITREVINDEXTABLE_LENGTH, 0, 0);
    mempool_stop_benchmark();
  }
  mempool_barrier(num_cores);
#endif

/* PARALLEL-CORE */
#ifdef PARALLEL
  mempool_start_benchmark();
  /* NOTE(review): same hard-coded 16 as in the single-core call above. */
  mempool_radix2_cfft_q16p((uint16_t)16, l1_twiddleCoef_q16,
                           l1_BitRevIndexTable, l1_pSrc,
                           BITREVINDEXTABLE_LENGTH, 0, 0, num_cores);
  mempool_stop_benchmark();
#endif

  /* Core 0 dumps l1_pSrc two values per line. */
  if (core_id == 0) {
    for (uint32_t i = 0; i < N_RSAMPLES; i += 2) {
      printf("{%6d;%6d } \n", l1_pSrc[i], l1_pSrc[i + 1]);
    }
    printf("Done!\n");
  }
  mempool_barrier(num_cores);
  return 0;
}
195 changes: 195 additions & 0 deletions software/apps/cfft_radix4_q16/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mempool runtime libraries */
#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"
#include "xpulp/builtins_v2.h"

/* CFFT data libraries */
#include "data/data_cfft_radix4_q16.h"

/*
  CHOOSE ONE
  - SINGLE: Single core FFT
  - PARALLEL: Parallel FFT not "memory-aware"
  - FOLDED: Parallel FFT with "memory-aware" load/store scheme
  - SCHEDULED: Scheduling of multiple parallel FFTs with "memory-aware"
    load/store scheme
      - N_FFTs_COL: Independent FFTs scheduled on one row (default 2)
      - N_FFTs_ROW: Independent FFTs scheduled on columns (default 2)
  (OPTIONALLY ENABLE)
  - FOLDED_TWIDDLES: Also the twiddles have "memory-aware" load/stores
  - BITREVERSETABLE: The bitreversal indices are loaded from a table
  - ASM: Use asm_volatile statements

  Only FOLDED_TWIDDLES and the kernel selectors are tested in this file;
  BITREVERSETABLE and ASM are presumably read by the kernel headers included
  further down - verify there.
*/

#define SCHEDULED
#define FOLDED_TWIDDLES
#define BITREVERSETABLE
#define ASM // Use asm_volatile statements

/* Default number of independent FFTs per row / column. Each value can be
 * overridden independently at build time (e.g. -DN_FFTs_ROW=4); the
 * previous guard tested the misspelled N_FFT_ROW and therefore always
 * redefined both macros. */
#ifndef N_FFTs_ROW
#define N_FFTs_ROW 2
#endif
#ifndef N_FFTs_COL
#define N_FFTs_COL 2
#endif

/* Absolute value of x. The argument is fully parenthesized in every position
 * so that compound expressions such as ABS(a - b) negate the whole
 * expression. x is evaluated twice: do not pass expressions with side
 * effects. */
#define ABS(x) (((x) < 0) ? (-(x)) : (x))
#include "kernel/mempool_radix4_cfft_butterfly_q16.h"
#include "kernel/mempool_radix4_cfft_q16_bitreversal.h"
#include "kernel/mempool_radix4_cfft_q16p.h"
#include "kernel/mempool_radix4_cfft_q16s.h"

/* L1 working buffers. All are placed in the ".l1_prio" section and aligned
 * to 4 * N_BANKS bytes. */
/* Input samples: one slice of 8 * N_BANKS int16_t values per FFT row
 * (main() addresses row j at l1_pSrc + j * (8 * N_BANKS)). */
int16_t l1_pSrc[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Second sample buffer of the same size, passed to the folded and scheduled
 * kernels together with l1_pSrc. */
int16_t l1_pDst[N_FFTs_ROW * 8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Twiddle coefficients, filled by main() from l2_twiddleCoef_q16. */
int16_t l1_twiddleCoef_q16_src[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Second twiddle buffer, passed to the folded and scheduled kernels. */
int16_t l1_twiddleCoef_q16_dst[8 * N_BANKS]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));
/* Bit-reversal index table: BITREVINDEXTABLE_LENGTH uint16_t entries. */
uint16_t l1_BitRevIndexTable[BITREVINDEXTABLE_LENGTH]
__attribute__((aligned(4 * N_BANKS), section(".l1_prio")));

///////////////////////////////////////////////////////////////////////////////////////////////////
/* MAIN */
/*
 * Entry point of the radix-4 Q16 CFFT benchmark; executed by every core.
 *
 * Core 0 copies the input samples, the bit-reversal table and the twiddles
 * from L2 to L1; with FOLDED_TWIDDLES all cores then rewrite the twiddles
 * into l1_twiddleCoef_q16_src with the strided layout used below. The kernel
 * variant selected at compile time (SINGLE, PARALLEL, FOLDED or SCHEDULED)
 * is run between mempool_start_benchmark() / mempool_stop_benchmark(). For
 * SINGLE, PARALLEL and FOLDED core 0 finally compares the result with
 * l2_pRes and prints every element whose error exceeds TOLERANCE.
 *
 * Returns 0 in all cases; mismatches are only reported through printf.
 */
int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  mempool_barrier_init(core_id);

  ///////////////////////////////////////////////////////////////////////////////////////////////////
  /* INITIALIZATION */
  if (core_id == 0) {
    // Each FFT is folded over 4 memory rows
    // Each memory row is 2 * N_BANKS samples
    // NOTE(review): the byte count below combines N_RSAMPLES with
    // sizeof(int32_t) although both buffers hold int16_t values - confirm it
    // against the size of l2_pSrc and of one l1_pSrc row slice.
    for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
      dma_memcpy_blocking(l1_pSrc + j * (8 * N_BANKS), l2_pSrc,
                          (N_RSAMPLES * N_FFTs_COL) * sizeof(int32_t));
    }
    // The table holds uint16_t entries: sizing the copy with sizeof(int32_t)
    // would write twice the size of l1_BitRevIndexTable.
    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
                        BITREVINDEXTABLE_LENGTH * sizeof(uint16_t));
    dma_memcpy_blocking(l1_twiddleCoef_q16_src, l2_twiddleCoef_q16,
                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
  }
  // Initialize the Twiddles folded
#ifdef FOLDED_TWIDDLES
  // Wait until core 0 has finished its DMA into l1_twiddleCoef_q16_src: the
  // stores below target the same buffer and would otherwise race with it.
  mempool_barrier(num_cores);
  for (uint32_t j = 0; j < N_FFTs_COL; j++) {
    uint32_t N_WORDS_COL = (N_CSAMPLES / 4);
    // The words of each column are distributed over the cores; every core
    // writes three twiddles per index, N_BANKS words apart.
    for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4))] =
          *(v2s *)&l2_twiddleCoef_q16[2U * i];
      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
                                            1 * N_BANKS)] =
          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 2U)];
      *(v2s *)&l1_twiddleCoef_q16_src[2U * (i + j * (N_CSAMPLES / 4) +
                                            2 * N_BANKS)] =
          *(v2s *)&l2_twiddleCoef_q16[2U * (i * 3U)];
    }
  }
#endif
  mempool_barrier(num_cores);

  if (core_id == 0) {
    printf("On the run...\n");
  }
  mempool_barrier(num_cores);

  ///////////////////////////////////////////////////////////////////////////////////////////////////
  /* SINGLE-CORE */
#ifdef SINGLE
  int16_t *pRes; // Result pointer
  if (core_id == 0) {
    mempool_start_benchmark();
    mempool_radix4_cfft_q16s_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
                                      l1_twiddleCoef_q16_src, 1);
    mempool_bitrevtable_q16s_xpulpimg(
        (uint16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH, l1_BitRevIndexTable);
    pRes = l1_pSrc;
    mempool_stop_benchmark();
  }
  mempool_barrier(num_cores);
#endif

  ///////////////////////////////////////////////////////////////////////////////////////////////////
  /* MULTI-CORE */
#ifdef PARALLEL
  int16_t *pRes; // Result pointer
  mempool_start_benchmark();
  mempool_radix4_cfft_q16p_xpulpimg(l1_pSrc, (uint16_t)N_CSAMPLES,
                                    l1_twiddleCoef_q16_src, 1, num_cores);
  mempool_bitrevtable_q16p_xpulpimg((uint16_t *)l1_pSrc,
                                    BITREVINDEXTABLE_LENGTH,
                                    l1_BitRevIndexTable, num_cores);
  pRes = l1_pSrc;
  mempool_stop_benchmark();
#endif
  mempool_barrier(num_cores);

  ///////////////////////////////////////////////////////////////////////////////////////////////////
  /* MULTI-CORE FOLDED */
#ifdef FOLDED
  int16_t *pRes; // Result pointer
  // Only the first N_CSAMPLES / 16 cores take part in the folded FFT.
  if (core_id < (N_CSAMPLES / 16)) {
    mempool_start_benchmark();
#ifdef FOLDED_TWIDDLES
    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
                                    l1_twiddleCoef_q16_src,
                                    l1_twiddleCoef_q16_dst, (N_CSAMPLES / 16));
#else
    mempool_radix4_cfft_q16p_folded(l1_pSrc, l1_pDst, (uint16_t)N_CSAMPLES,
                                    l1_twiddleCoef_q16_src, (N_CSAMPLES / 16));
#endif
    // The result buffer is selected from the parity of LOG2 / 2.
    pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
    // Use the table staged in L1 by core 0, as the other variants do (the
    // previous identifier pRevT16 is not declared in this file).
    mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
                                      l1_BitRevIndexTable, (N_CSAMPLES / 16));
    mempool_stop_benchmark();
  }
  mempool_barrier(num_cores);
#endif

  ///////////////////////////////////////////////////////////////////////////////////////////////////
  /* MULTI-CORE SCHEDULED */
#ifdef SCHEDULED
  // N_CSAMPLES / 16 cores per column take part.
  if (core_id < N_FFTs_COL * (N_CSAMPLES >> 4U)) {
    mempool_start_benchmark();
    uint32_t col_fftLen = N_CSAMPLES >> 2U;
    uint32_t col_id = core_id / (N_CSAMPLES >> 4U);
    // Each column gets its own slice of the twiddle buffers.
    mempool_radix4_cfft_q16p_scheduler(
        l1_pSrc, l1_pDst, N_CSAMPLES,
        l1_twiddleCoef_q16_src + 2 * col_id * col_fftLen,
        l1_twiddleCoef_q16_dst + 2 * col_id * col_fftLen, l1_BitRevIndexTable,
        BITREVINDEXTABLE_LENGTH, 1, N_CSAMPLES >> 4U);
    mempool_log_partial_barrier(2, core_id, N_CSAMPLES >> 4U);
    mempool_stop_benchmark();
  }
#endif
  mempool_barrier(num_cores);

  ///////////////////////////////////////////////////////////////////////////////////////////////////
  /* CHECK */
#if defined(SINGLE) || defined(PARALLEL) || defined(FOLDED)
  if (core_id == 0) {
    printf("Done!\n");
    for (uint32_t i = 0; i < N_RSAMPLES; i++) {
      if (ABS(((int32_t)pRes[i] - (int32_t)l2_pRes[i])) > TOLERANCE)
        printf("ERROR!!! Result[%d]: %6d Expected[%d]: %6d\n", i, pRes[i], i,
               l2_pRes[i]);
    }
  }
  mempool_barrier(num_cores);
#endif

  return 0;
}
56 changes: 56 additions & 0 deletions software/runtime/data/data_cfft_radix2_q16.h.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Automatically generated by:
// data/data_cfft_radix2_q16.py

\
<% def array_to_cstr(array):
    # Render the values as the body of a C int16_t initializer list: every
    # value is masked to 16 bits and written as a 4-digit upper-case hex
    # literal with an (int16_t) cast, 16 entries per line.
    pieces = ['{\n']
    for count, value in enumerate(array, start=1):
        pieces.append('(int16_t) 0X{:04X}, '.format(value & 0xffff))
        if count % 16 == 0:
            pieces.append('\n')
    # Drop the last two characters before closing the brace. That is the
    # final ', ' separator, or ' \n' when the number of entries is a
    # multiple of 16 (a trailing comma is then left in the list).
    return ''.join(pieces)[:-2] + '}'
%> \

<% def array_to_str(array):
    # Render the values as the body of a C initializer list using their
    # default string formatting, 16 entries per line.
    pieces = ['{\n']
    for count, value in enumerate(array, start=1):
        pieces.append('{}, '.format(value))
        if count % 16 == 0:
            pieces.append('\n')
    # Drop the last two characters before closing the brace. That is the
    # final ', ' separator, or ' \n' when the number of entries is a
    # multiple of 16 (a trailing comma is then left in the list).
    return ''.join(pieces)[:-2] + '}'
%> \

## Template inputs used below: Log2Len, Len, BitrevLen, tolerance,
## vector_inp, vector_res, vector_twi and vector_bitrev.
## N_RSAMPLES is twice N_CSAMPLES, matching the 2*Len entries of the data
## arrays. N_BANKS relies on NUM_CORES and BANKING_FACTOR, which are not
## defined in this template and are presumably supplied by the build.
#define LOG2 (${Log2Len})
#define N_CSAMPLES (${Len})
#define N_RSAMPLES (2 * N_CSAMPLES)
#define N_TWIDDLES (3 * N_CSAMPLES / 4)
#define N_BANKS (NUM_CORES * BANKING_FACTOR)
#define BITREVINDEXTABLE_LENGTH (${BitrevLen})

// Tolerance for correctness check
#define TOLERANCE (${tolerance})

## Emits two int16_t arrays of 2*Len entries each: l2_pSrc from vector_inp
## and l2_pRes from vector_res.
% for m, m_str in zip([vector_inp, vector_res], ['l2_pSrc', 'l2_pRes']):

// Data arrays for matrix ${m_str}
int16_t ${m_str}[${2*Len}] = ${array_to_cstr(m)};

% endfor \

## The twiddle array has int(6*Len/4) int16_t entries, i.e. 2 * N_TWIDDLES.
// Twiddles
int16_t l2_twiddleCoef_q16[${int(6*Len/4)}] = ${array_to_cstr(vector_twi)};

## The bit-reversal table is emitted with plain decimal formatting.
// Bitreversal
uint16_t l2_BitRevIndexTable[${BitrevLen}] = ${array_to_str(vector_bitrev)};
Loading

0 comments on commit 715fa0b

Please sign in to comment.