Skip to content

Commit

Permalink
[software] Add shuffle instruction in FFT butterfly
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Jul 5, 2024
1 parent 491a050 commit b9ff155
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 80 deletions.
46 changes: 41 additions & 5 deletions software/apps/baremetal/cfft_radix4_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,16 @@
#include "data_cfft_radix4_f16.h"

/* CHOOSE ONE */
//#define SINGLE // Single core FFT.
//#define PARALLEL // Parallel FFT not "memory-aware".
//#define FOLDED // Parallel FFT with "memory-aware" load/store.
#define SCHEDULED // Folded FFTs arranged in rows and cols.'''

// Bitreversal index from table.
#define BITREVERSETABLE
// Independent FFTs scheduled on one row (default 1).
#define N_FFTs_ROW 2
#define N_FFTs_ROW 1
// Independent FFTs scheduled on columns (default 1).
#define N_FFTs_COL 2
#define N_FFTs_COL 1
#if (N_FFTs_COL > MAX_COL)
#error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
#endif
Expand Down Expand Up @@ -77,6 +76,20 @@ int main() {

/* INITIALIZATION */

#if (defined(SINGLE) || defined(PARALLEL))
if (core_id == 0) {
dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
3 * (N_CSAMPLES / 4) * sizeof(int32_t));
dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
printf("01: END INITIALIZATION\n");
}
mempool_barrier(num_cores);
#endif

#if (defined(SCHEDULED) || defined(FOLDED))

if (core_id == 0) {
for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
for (uint32_t i = 0; i < N_FFTs_COL; i++) {
Expand All @@ -88,6 +101,8 @@ int main() {
BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
}
mempool_barrier(num_cores);

#ifdef FOLDED_TWIDDLES
for (uint32_t j = 0; j < N_FFTs_COL; j++) {
uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
Expand All @@ -99,10 +114,31 @@ int main() {
*(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
}
}
#else
if (core_id == 0) {
dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
3 * (N_CSAMPLES / 4) * sizeof(int32_t));
}
#endif
mempool_barrier(num_cores);

if (core_id == 0) {
printf("01: END INITIALIZATION\n");
}
mempool_barrier(num_cores);
#endif

/* COMPUTATION */

#ifdef PARALLEL
mempool_start_benchmark();
mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
num_cores);
mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
l1_BitRevIndexTable, num_cores);
mempool_stop_benchmark();
pRes = l1_pSrc;
#endif

#ifdef FOLDED
if (core_id < (N_CSAMPLES / 16)) {
Expand All @@ -111,7 +147,7 @@ int main() {
l1_twiddleCoef_f16_src,
l1_twiddleCoef_f16_dst, (N_CSAMPLES / 16));
pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
mempool_bitrevtable_q16p_xpulpimg((int16_t *)pRes, BITREVINDEXTABLE_LENGTH,
l1_BitRevIndexTable, (N_CSAMPLES / 16));
mempool_stop_benchmark();
}
Expand Down Expand Up @@ -140,7 +176,7 @@ int main() {
printf("02: END COMPUTATION\n");
}

mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
mempool_barrier(num_cores);
return 0;
}
8 changes: 5 additions & 3 deletions software/data/generate_cfft.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ def generate_cfft_q16(N):


def generate_cfft_f16(N):
src = np.random.rand(N).astype(np.float16)
src = src + 1.j * np.random.rand(N).astype(np.float16)
# src = np.random.rand(N).astype(np.float16)
# src = src + 1.j * np.random.rand(N).astype(np.float16)
src = np.cos(np.linspace(0, N / 4, num=N)).astype(np.float16)
src = src + 1.j * np.sin(np.linspace(0, N / 4, num=N)).astype(np.float16)
dst = np.fft.fft(src)
src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten()
dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten()
Expand Down Expand Up @@ -142,7 +144,7 @@ def main():
"--dimension",
type=int,
required=False,
default=64,
default=256,
help='Input dimension'
)

Expand Down
120 changes: 53 additions & 67 deletions software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
uint32_t i0, uint32_t n2, v2h CoSi1,
v2h CoSi2, v2h CoSi3, v2h C1, v2h C2,
v2h C3) {
__fp16 t0, t1, t2, t3;
uint32_t i1, i2, i3;
uint32_t i0_store, i1_store, i2_store, i3_store;
v2h A, B, C, D, E, F, G, H;
Expand Down Expand Up @@ -74,6 +73,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
A = *(v2h *)&pIn[i0 * 2U];
/* Read xc (real), yc(imag) input */
C = *(v2h *)&pIn[i2 * 2U];

asm volatile(
// G = (xb + xd), (yb + yd)
"vfadd.h %[G],%[B],%[D];"
Expand All @@ -85,33 +85,30 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
"vfsub.h %[F],%[A],%[C];"

// C = (yb - yd), (xd - xb)
// D = (yd - yb), (xb - xd)
"pv.extract.h %[t0],%[H],0;" // yb - yd
"pv.extract.h %[t1],%[H],1;" // xb - xd
"xor %[t2],%[t0],%[neg_mask];" // yd - yb
"xor %[t3],%[t1],%[neg_mask];" // xd - xb
"pv.pack %[C],%[t0],%[t3];"
"pv.pack %[D],%[t2],%[t1];"
"pv.shuffle2.h %[C],%[H],%[mask];"
"vfmul.h %[C],%[C],%[neg_mask];"

// xa + xb + xc + xd, ya + yc + yb + yd
"vfadd.h %[A],%[E],%[G];"
// xa + xc - xb - xd, ya + yc - yb - yd
"vfsub.h %[B],%[E],%[G];"
// xa - xc + yb - yd, ya - yc + xd - xb
"vfadd.h %[C],%[F],%[C];"
"vfadd.h %[D],%[F],%[C];"
// xa - xc + yd - yb, ya - yc + xb - xd
"vfadd.h %[D],%[F],%[D];"
"vfsub.h %[C],%[F],%[C];"

// s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
// s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
"vfdotpex.s.h %[s0],%[CoSi1],%[D];"
"vfdotpex.s.h %[s1],%[C1],%[D];"

// s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
// s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd)
"vfdotpex.s.h %[s0],%[CoSi2],%[B];"
"vfdotpex.s.h %[s1],%[C2],%[B];"
// s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd)
// s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd)
"vfdotpex.s.h %[s2],%[CoSi1],%[D];"
"vfdotpex.s.h %[s3],%[C1],%[D];"
// s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb)
// s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb)
// s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
"vfdotpex.s.h %[s2],%[CoSi2],%[B];"
"vfdotpex.s.h %[s3],%[C2],%[B];"

// s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
// s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
"vfdotpex.s.h %[s4],%[CoSi3],%[C];"
"vfdotpex.s.h %[s5],%[C3],%[C];"

Expand All @@ -121,17 +118,17 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
"vfcpka.h.s %[C], %[s3], %[s2];"
// xd', yd'
"vfcpka.h.s %[D], %[s5], %[s4];"
: [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
[F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
[t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0),
: [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
[F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
[s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
[s5] "=&r"(s5)
: [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
[CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000)
[CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [mask] "r"(0x00020003),
[neg_mask] "r"(0x3C00BC00)
:);
*((v2h *)&pOut[i0_store * 2U]) = A;
*((v2h *)&pOut[i1_store * 2U]) = B;
*((v2h *)&pOut[i2_store * 2U]) = C;
*((v2h *)&pOut[i1_store * 2U]) = C;
*((v2h *)&pOut[i2_store * 2U]) = B;
*((v2h *)&pOut[i3_store * 2U]) = D;
}

Expand All @@ -155,7 +152,6 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
uint32_t i0, uint32_t n2, v2h CoSi1,
v2h CoSi2, v2h CoSi3, v2h C1, v2h C2,
v2h C3) {
__fp16 t0, t1, t2, t3;
uint32_t i1, i2, i3;
uint32_t i0_store, i1_store, i2_store, i3_store;
v2h A, B, C, D, E, F, G, H;
Expand Down Expand Up @@ -205,6 +201,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
A = *(v2h *)&pIn[i0 * 2U];
/* Read xc (real), yc(imag) input */
C = *(v2h *)&pIn[i2 * 2U];

asm volatile(
// G = (xb + xd), (yb + yd)
"vfadd.h %[G],%[B],%[D];"
Expand All @@ -216,33 +213,30 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
"vfsub.h %[F],%[A],%[C];"

// C = (yb - yd), (xd - xb)
// D = (yd - yb), (xb - xd)
"pv.extract.h %[t0],%[H],0;" // yb - yd
"pv.extract.h %[t1],%[H],1;" // xb - xd
"xor %[t2],%[t0],%[neg_mask];" // yd - yb
"xor %[t3],%[t1],%[neg_mask];" // xd - xb
"pv.pack %[C],%[t0],%[t3];"
"pv.pack %[D],%[t2],%[t1];"
"pv.shuffle2.h %[C],%[H],%[mask];"
"vfmul.h %[C],%[C],%[neg_mask];"

// xa + xb + xc + xd, ya + yc + yb + yd
"vfadd.h %[A],%[E],%[G];"
// xa + xc - xb - xd, ya + yc - yb - yd
"vfsub.h %[B],%[E],%[G];"
// xa - xc + yb - yd, ya - yc + xd - xb
"vfadd.h %[C],%[F],%[C];"
"vfadd.h %[D],%[F],%[C];"
// xa - xc + yd - yb, ya - yc + xb - xd
"vfadd.h %[D],%[F],%[D];"
"vfsub.h %[C],%[F],%[C];"

// s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
// s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
"vfdotpex.s.h %[s0],%[CoSi1],%[D];"
"vfdotpex.s.h %[s1],%[C1],%[D];"

// s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
// s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd)
"vfdotpex.s.h %[s0],%[CoSi2],%[B];"
"vfdotpex.s.h %[s1],%[C2],%[B];"
// s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd)
// s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd)
"vfdotpex.s.h %[s2],%[CoSi1],%[D];"
"vfdotpex.s.h %[s3],%[C1],%[D];"
// s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb)
// s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb)
// s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
"vfdotpex.s.h %[s2],%[CoSi2],%[B];"
"vfdotpex.s.h %[s3],%[C2],%[B];"

// s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
// s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
"vfdotpex.s.h %[s4],%[CoSi3],%[C];"
"vfdotpex.s.h %[s5],%[C3],%[C];"

Expand All @@ -252,18 +246,17 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
"vfcpka.h.s %[C], %[s3], %[s2];"
// xd', yd'
"vfcpka.h.s %[D], %[s5], %[s4];"
: [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
[F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
[t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0),
: [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
[F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
[s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
[s5] "=&r"(s5)
: [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
[CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000)
[CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [mask] "r"(0x00020003),
[neg_mask] "r"(0x3C00BC00)
:);

*((v2h *)&pOut[i0_store * 2U]) = A;
*((v2h *)&pOut[i1_store * 2U]) = B;
*((v2h *)&pOut[i2_store * 2U]) = C;
*((v2h *)&pOut[i1_store * 2U]) = C;
*((v2h *)&pOut[i2_store * 2U]) = B;
*((v2h *)&pOut[i3_store * 2U]) = D;
}

Expand All @@ -278,7 +271,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
*/
static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
uint32_t i0) {
__fp16 t0, t1;
__fp16 t0, t1, t2, t3;
uint32_t i1, i2, i3;
uint32_t i0_store, i1_store, i2_store, i3_store;
v2h A, B, C, D, E, F, G, H;
Expand All @@ -300,7 +293,7 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
i3 = i2 + 1U;
#endif
// STORE INDEXES
#if defined(FOLDED)
#if defined(FOLDED) || (defined(SCHEDULED) && defined(BITREVERSETABLE))
i0_store = i0 * 4;
i1_store = i0_store + 1;
i2_store = i1_store + 1;
Expand All @@ -320,7 +313,6 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
A = *(v2h *)&pIn[i0 * 2U];
/* Read xc (imag), yc(real) input */
C = *(v2h *)&pIn[i2 * 2U];
__fp16 t2, t3;
asm volatile(
/* (xb - xd), (yb - yd) */
"vfsub.h %[H],%[B],%[D];"
Expand All @@ -331,32 +323,26 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
/* (xa - xc), (ya - yc) */
"vfsub.h %[F],%[A],%[C];"

"pv.extract.h %[t0],%[H],0;" // (yb - yd)
"pv.extract.h %[t1],%[H],1;" // (xb - xd)
"xor %[t2],%[t0],%[neg_mask];" // (yd - yb)
"xor %[t3],%[t1],%[neg_mask];" // (xd - xb)
/* (yd - yb), (xb - xd) */
"pv.pack %[A],%[t2],%[t1];"
/* (yb - yd), (xd - xb) */
"pv.pack %[B],%[t0],%[t3];"
"pv.shuffle2.h %[A],%[H],%[mask];"
"vfmul.h %[A],%[A],%[neg_mask];"

/* (xa + xc + xb + xd), (ya + yc + yb + yd) */
"vfadd.h %[H],%[E],%[G];"
/* (xa + xc - xb - xd), (ya + yc - yb - yd) */
"vfsub.h %[E],%[E],%[G];"
/* (xa - xc + yd - yb), (ya - yc + xb - xd) */
"vfadd.h %[A],%[F],%[A];"
"vfadd.h %[B],%[F],%[A];"
/* (xa - xc + yb - yd), (ya - yc + xd - xb) */
"vfadd.h %[B],%[F],%[B];"
"vfsub.h %[A],%[F],%[A];"

: [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
[F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
[t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3)
: [neg_mask] "r"(0x00008000)
: [mask] "r"(0x00020003), [neg_mask] "r"(0x3C00BC00)
:);

*((v2h *)&pOut[i0_store * 2U]) = H;
*((v2h *)&pOut[i1_store * 2U]) = E;
*((v2h *)&pOut[i2_store * 2U]) = A;
*((v2h *)&pOut[i3_store * 2U]) = B;
*((v2h *)&pOut[i2_store * 2U]) = B;
*((v2h *)&pOut[i3_store * 2U]) = A;
}
Loading

0 comments on commit b9ff155

Please sign in to comment.