[software] Add shuffle instruction in FFT butterfly

pulp-platform · Jul 5, 2024 · b9ff155 · b9ff155
1 parent 491a050
commit b9ff155
Show file tree

Hide file tree

Showing 4 changed files with 179 additions and 80 deletions.
diff --git a/software/apps/baremetal/cfft_radix4_f16/main.c b/software/apps/baremetal/cfft_radix4_f16/main.c
@@ -21,17 +21,16 @@
 #include "data_cfft_radix4_f16.h"
 
 /* CHOOSE ONE */
-//#define SINGLE // Single core FFT.
 //#define PARALLEL // Parallel FFT not "memory-aware".
 //#define FOLDED // Parallel FFT with "memory-aware" load/store.
 #define SCHEDULED // Folded FFTs arranged in rows and cols.'''
 
 // Bitreversal index from table.
 #define BITREVERSETABLE
 // Independent FFTs scheduled on one row (default 1).
-#define N_FFTs_ROW 2
+#define N_FFTs_ROW 1
 // Independent FFTs scheduled on columns (default 1).
-#define N_FFTs_COL 2
+#define N_FFTs_COL 1
 #if (N_FFTs_COL > MAX_COL)
 #error Parallelization not supporting N_FFTs_COL > [N_BANKS / (N_CSAMPLES / 4)]
 #endif
@@ -77,6 +76,20 @@ int main() {
 
   /* INITIALIZATION */
 
+#if (defined(SINGLE) || defined(PARALLEL))
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_pSrc, l2_pSrc, N_CSAMPLES * sizeof(int32_t));
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+    dma_memcpy_blocking(l1_BitRevIndexTable, l2_BitRevIndexTable,
+                        BITREVINDEXTABLE_LENGTH * sizeof(int16_t));
+    printf("01: END INITIALIZATION\n");
+  }
+  mempool_barrier(num_cores);
+#endif
+
+#if (defined(SCHEDULED) || defined(FOLDED))
+
   if (core_id == 0) {
     for (uint32_t j = 0; j < N_FFTs_ROW; j++) {
       for (uint32_t i = 0; i < N_FFTs_COL; i++) {
@@ -88,6 +101,8 @@ int main() {
                         BITREVINDEXTABLE_LENGTH * sizeof(int32_t));
   }
   mempool_barrier(num_cores);
+
+#ifdef FOLDED_TWIDDLES
   for (uint32_t j = 0; j < N_FFTs_COL; j++) {
     uint32_t N_WORDS_COL = N_CSAMPLES >> 2;
     for (uint32_t i = core_id; i < N_WORDS_COL; i += num_cores) {
@@ -99,10 +114,31 @@ int main() {
           *(v2h *)&l2_twiddleCoef_f16[2 * (i * 3U)];
     }
   }
+#else
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_twiddleCoef_f16_src, l2_twiddleCoef_f16,
+                        3 * (N_CSAMPLES / 4) * sizeof(int32_t));
+  }
+#endif
+  mempool_barrier(num_cores);
+
   if (core_id == 0) {
     printf("01: END INITIALIZATION\n");
   }
   mempool_barrier(num_cores);
+#endif
+
+  /* COMPUTATION */
+
+#ifdef PARALLEL
+  mempool_start_benchmark();
+  mempool_radix4_cfft_f16p(l1_pSrc, N_CSAMPLES, l1_twiddleCoef_f16_src, 1,
+                           num_cores);
+  mempool_bitrevtable_q16p_xpulpimg((int16_t *)l1_pSrc, BITREVINDEXTABLE_LENGTH,
+                                    l1_BitRevIndexTable, num_cores);
+  mempool_stop_benchmark();
+  pRes = l1_pSrc;
+#endif
 
 #ifdef FOLDED
   if (core_id < (N_CSAMPLES / 16)) {
@@ -111,7 +147,7 @@ int main() {
                                     l1_twiddleCoef_f16_src,
                                     l1_twiddleCoef_f16_dst, (N_CSAMPLES / 16));
     pRes = ((LOG2 / 2) % 2) == 0 ? l1_pSrc : l1_pDst;
-    mempool_bitrevtable_q16p_xpulpimg((uint16_t *)pRes, BITREVINDEXTABLE_LENGTH,
+    mempool_bitrevtable_q16p_xpulpimg((int16_t *)pRes, BITREVINDEXTABLE_LENGTH,
                                       l1_BitRevIndexTable, (N_CSAMPLES / 16));
     mempool_stop_benchmark();
   }
@@ -140,7 +176,7 @@ int main() {
     printf("02: END COMPUTATION\n");
   }
 
-  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.5, 0);
+  mempool_check_f16(pRes, l2_pRes, 2 * N_CSAMPLES, 0.05f, 0);
   mempool_barrier(num_cores);
   return 0;
 }
diff --git a/software/data/generate_cfft.py b/software/data/generate_cfft.py
@@ -58,8 +58,10 @@ def generate_cfft_q16(N):
 
 
 def generate_cfft_f16(N):
-    src = np.random.rand(N).astype(np.float16)
-    src = src + 1.j * np.random.rand(N).astype(np.float16)
+    # src = np.random.rand(N).astype(np.float16)
+    # src = src + 1.j * np.random.rand(N).astype(np.float16)
+    src = np.cos(np.linspace(0, N / 4, num=N)).astype(np.float16)
+    src = src + 1.j * np.sin(np.linspace(0, N / 4, num=N)).astype(np.float16)
     dst = np.fft.fft(src)
     src = np.column_stack((src.imag, src.real)).astype(np.float16).flatten()
     dst = np.column_stack((dst.imag, dst.real)).astype(np.float16).flatten()
@@ -142,7 +144,7 @@ def main():
         "--dimension",
         type=int,
         required=False,
-        default=64,
+        default=256,
         help='Input dimension'
     )
 

diff --git a/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h b/software/kernels/baremetal/mempool_radix4_cfft_butterfly_f16.h
@@ -27,7 +27,6 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
                                           uint32_t i0, uint32_t n2, v2h CoSi1,
                                           v2h CoSi2, v2h CoSi3, v2h C1, v2h C2,
                                           v2h C3) {
-  __fp16 t0, t1, t2, t3;
   uint32_t i1, i2, i3;
   uint32_t i0_store, i1_store, i2_store, i3_store;
   v2h A, B, C, D, E, F, G, H;
@@ -74,6 +73,7 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
   A = *(v2h *)&pIn[i0 * 2U];
   /* Read xc (real), yc(imag) input */
   C = *(v2h *)&pIn[i2 * 2U];
+
   asm volatile(
       // G = (xb + xd), (yb + yd)
       "vfadd.h  %[G],%[B],%[D];"
@@ -85,33 +85,30 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
       "vfsub.h  %[F],%[A],%[C];"
 
       // C = (yb - yd), (xd - xb)
-      // D = (yd - yb), (xb - xd)
-      "pv.extract.h  %[t0],%[H],0;"  // yb - yd
-      "pv.extract.h  %[t1],%[H],1;"  // xb - xd
-      "xor %[t2],%[t0],%[neg_mask];" // yd - yb
-      "xor %[t3],%[t1],%[neg_mask];" // xd - xb
-      "pv.pack  %[C],%[t0],%[t3];"
-      "pv.pack  %[D],%[t2],%[t1];"
+      "pv.shuffle2.h %[C],%[H],%[mask];"
+      "vfmul.h       %[C],%[C],%[neg_mask];"
 
       // xa + xb + xc + xd, ya + yc + yb + yd
       "vfadd.h  %[A],%[E],%[G];"
       // xa + xc - xb - xd, ya + yc - yb - yd
       "vfsub.h  %[B],%[E],%[G];"
       // xa - xc + yb - yd, ya - yc + xd - xb
-      "vfadd.h  %[C],%[F],%[C];"
+      "vfadd.h  %[D],%[F],%[C];"
       // xa - xc + yd - yb, ya - yc + xb - xd
-      "vfadd.h  %[D],%[F],%[D];"
+      "vfsub.h  %[C],%[F],%[C];"
+
+      // s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
+      // s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
+      "vfdotpex.s.h  %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.h  %[s1],%[C1],%[D];"
 
       // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
-      // s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h  %[s0],%[CoSi2],%[B];"
-      "vfdotpex.s.h  %[s1],%[C2],%[B];"
-      // s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd)
-      // s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd)
-      "vfdotpex.s.h  %[s2],%[CoSi1],%[D];"
-      "vfdotpex.s.h  %[s3],%[C1],%[D];"
-      // s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb)
-      // s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb)
+      // s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
+      "vfdotpex.s.h  %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.h  %[s3],%[C2],%[B];"
+
+      // s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
+      // s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
       "vfdotpex.s.h  %[s4],%[CoSi3],%[C];"
       "vfdotpex.s.h  %[s5],%[C3],%[C];"
 
@@ -121,17 +118,17 @@ static inline void radix4_butterfly_first(__fp16 *pIn, __fp16 *pOut,
       "vfcpka.h.s %[C], %[s3], %[s2];"
       // xd', yd'
       "vfcpka.h.s %[D], %[s5], %[s4];"
-      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
-        [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
-        [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0),
+      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
+        [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
         [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
         [s5] "=&r"(s5)
       : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
-        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000)
+        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [mask] "r"(0x00020003),
+        [neg_mask] "r"(0x3C00BC00)
       :);
   *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = B;
-  *((v2h *)&pOut[i2_store * 2U]) = C;
+  *((v2h *)&pOut[i1_store * 2U]) = C;
+  *((v2h *)&pOut[i2_store * 2U]) = B;
   *((v2h *)&pOut[i3_store * 2U]) = D;
 }
 
@@ -155,7 +152,6 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
                                            uint32_t i0, uint32_t n2, v2h CoSi1,
                                            v2h CoSi2, v2h CoSi3, v2h C1, v2h C2,
                                            v2h C3) {
-  __fp16 t0, t1, t2, t3;
   uint32_t i1, i2, i3;
   uint32_t i0_store, i1_store, i2_store, i3_store;
   v2h A, B, C, D, E, F, G, H;
@@ -205,6 +201,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
   A = *(v2h *)&pIn[i0 * 2U];
   /* Read xc (real), yc(imag) input */
   C = *(v2h *)&pIn[i2 * 2U];
+
   asm volatile(
       // G = (xb + xd), (yb + yd)
       "vfadd.h  %[G],%[B],%[D];"
@@ -216,33 +213,30 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
       "vfsub.h  %[F],%[A],%[C];"
 
       // C = (yb - yd), (xd - xb)
-      // D = (yd - yb), (xb - xd)
-      "pv.extract.h  %[t0],%[H],0;"  // yb - yd
-      "pv.extract.h  %[t1],%[H],1;"  // xb - xd
-      "xor %[t2],%[t0],%[neg_mask];" // yd - yb
-      "xor %[t3],%[t1],%[neg_mask];" // xd - xb
-      "pv.pack  %[C],%[t0],%[t3];"
-      "pv.pack  %[D],%[t2],%[t1];"
+      "pv.shuffle2.h %[C],%[H],%[mask];"
+      "vfmul.h       %[C],%[C],%[neg_mask];"
 
       // xa + xb + xc + xd, ya + yc + yb + yd
       "vfadd.h  %[A],%[E],%[G];"
       // xa + xc - xb - xd, ya + yc - yb - yd
       "vfsub.h  %[B],%[E],%[G];"
       // xa - xc + yb - yd, ya - yc + xd - xb
-      "vfadd.h  %[C],%[F],%[C];"
+      "vfadd.h  %[D],%[F],%[C];"
       // xa - xc + yd - yb, ya - yc + xb - xd
-      "vfadd.h  %[D],%[F],%[D];"
+      "vfsub.h  %[C],%[F],%[C];"
+
+      // s4 = Co1 * (xa - xc + yb - yd) + Si1 * (ya - yc + xd - xb)
+      // s5 = -Si1 * (xa - xc + yb - yd) + Co1 * (ya - yc + xd - xb)
+      "vfdotpex.s.h  %[s0],%[CoSi1],%[D];"
+      "vfdotpex.s.h  %[s1],%[C1],%[D];"
 
       // s0 = Co2 * (xa + xc - xb - xd) + Si2 * (ya + yc - yb - yd)
-      // s1 = Si2 * (xa + xc - xb - xd) - Co2 * (ya + yc - yb - yd)
-      "vfdotpex.s.h  %[s0],%[CoSi2],%[B];"
-      "vfdotpex.s.h  %[s1],%[C2],%[B];"
-      // s2 = Co1 * (xa - xc + yd - yb) + Si1 * (ya - yc + xb - xd)
-      // s3 = Si1 * (xa - xc + yd - yb) - Co1 * (ya - yc + xb - xd)
-      "vfdotpex.s.h  %[s2],%[CoSi1],%[D];"
-      "vfdotpex.s.h  %[s3],%[C1],%[D];"
-      // s4 = Co3 * (xa - xc + yb - yd) + Si3 * (ya - yc + xd - xb)
-      // s5 = Si3 * (xa - xc + yb - yd) - Co3 * (ya - yc + xd - xb)
+      // s1 = -Si2 * (xa + xc - xb - xd) + Co2 * (ya + yc - yb - yd)
+      "vfdotpex.s.h  %[s2],%[CoSi2],%[B];"
+      "vfdotpex.s.h  %[s3],%[C2],%[B];"
+
+      // s3 = Co3 * (xa - xc + yd - yb) + Si3 * (ya - yc + xb - xd)
+      // s4 = -Si3 * (xa - xc + yd - yb) + Co3 * (ya - yc + xb - xd)
       "vfdotpex.s.h  %[s4],%[CoSi3],%[C];"
       "vfdotpex.s.h  %[s5],%[C3],%[C];"
 
@@ -252,18 +246,17 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
       "vfcpka.h.s %[C], %[s3], %[s2];"
       // xd', yd'
       "vfcpka.h.s %[D], %[s5], %[s4];"
-      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
-        [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
-        [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3), [s0] "=&r"(s0),
+      : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "=&r"(E),
+        [F] "=&r"(F), [G] "=&r"(G), [H] "=&r"(H), [s0] "=&r"(s0),
         [s1] "=&r"(s1), [s2] "=&r"(s2), [s3] "=&r"(s3), [s4] "=&r"(s4),
         [s5] "=&r"(s5)
       : [C1] "r"(C1), [C2] "r"(C2), [C3] "r"(C3), [CoSi1] "r"(CoSi1),
-        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [neg_mask] "r"(0x00008000)
+        [CoSi2] "r"(CoSi2), [CoSi3] "r"(CoSi3), [mask] "r"(0x00020003),
+        [neg_mask] "r"(0x3C00BC00)
       :);
-
   *((v2h *)&pOut[i0_store * 2U]) = A;
-  *((v2h *)&pOut[i1_store * 2U]) = B;
-  *((v2h *)&pOut[i2_store * 2U]) = C;
+  *((v2h *)&pOut[i1_store * 2U]) = C;
+  *((v2h *)&pOut[i2_store * 2U]) = B;
   *((v2h *)&pOut[i3_store * 2U]) = D;
 }
 
@@ -278,7 +271,7 @@ static inline void radix4_butterfly_middle(__fp16 *pIn, __fp16 *pOut,
 */
 static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
                                          uint32_t i0) {
-  __fp16 t0, t1;
+  __fp16 t0, t1, t2, t3;
   uint32_t i1, i2, i3;
   uint32_t i0_store, i1_store, i2_store, i3_store;
   v2h A, B, C, D, E, F, G, H;
@@ -300,7 +293,7 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
   i3 = i2 + 1U;
 #endif
 // STORE INDEXES
-#if defined(FOLDED)
+#if defined(FOLDED) || (defined(SCHEDULED) && defined(BITREVERSETABLE))
   i0_store = i0 * 4;
   i1_store = i0_store + 1;
   i2_store = i1_store + 1;
@@ -320,7 +313,6 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
   A = *(v2h *)&pIn[i0 * 2U];
   /* Read xc (imag), yc(real) input */
   C = *(v2h *)&pIn[i2 * 2U];
-  __fp16 t2, t3;
   asm volatile(
       /* (xb - xd), (yb - yd) */
       "vfsub.h  %[H],%[B],%[D];"
@@ -331,32 +323,26 @@ static inline void radix4_butterfly_last(__fp16 *pIn, __fp16 *pOut,
       /* (xa - xc), (ya - yc) */
       "vfsub.h  %[F],%[A],%[C];"
 
-      "pv.extract.h  %[t0],%[H],0;"  // (yb - yd)
-      "pv.extract.h  %[t1],%[H],1;"  // (xb - xd)
-      "xor %[t2],%[t0],%[neg_mask];" // (yd - yb)
-      "xor %[t3],%[t1],%[neg_mask];" // (xd - xb)
       /* (yd - yb), (xb - xd) */
-      "pv.pack %[A],%[t2],%[t1];"
-      /* (yb - yd), (xd - xb) */
-      "pv.pack %[B],%[t0],%[t3];"
+      "pv.shuffle2.h %[A],%[H],%[mask];"
+      "vfmul.h       %[A],%[A],%[neg_mask];"
 
       /* (xa + xc + xb + xd), (ya + yc + yb + yd) */
       "vfadd.h  %[H],%[E],%[G];"
       /* (xa + xc - xb - xd), (ya + yc - yb - yd) */
       "vfsub.h  %[E],%[E],%[G];"
       /* (xa - xc + yd - yb), (ya - yc + xb - xd) */
-      "vfadd.h  %[A],%[F],%[A];"
+      "vfadd.h  %[B],%[F],%[A];"
       /* (xa - xc + yb - yd), (ya - yc + xd - xb) */
-      "vfadd.h  %[B],%[F],%[B];"
+      "vfsub.h  %[A],%[F],%[A];"
 
       : [A] "+&r"(A), [B] "+&r"(B), [C] "+&r"(C), [D] "+&r"(D), [E] "+&r"(E),
         [F] "+&r"(F), [G] "+&r"(G), [H] "+&r"(H), [t0] "+&r"(t0),
         [t1] "+&r"(t1), [t2] "+&r"(t2), [t3] "+&r"(t3)
-      : [neg_mask] "r"(0x00008000)
+      : [mask] "r"(0x00020003), [neg_mask] "r"(0x3C00BC00)
       :);
-
   *((v2h *)&pOut[i0_store * 2U]) = H;
   *((v2h *)&pOut[i1_store * 2U]) = E;
-  *((v2h *)&pOut[i2_store * 2U]) = A;
-  *((v2h *)&pOut[i3_store * 2U]) = B;
+  *((v2h *)&pOut[i2_store * 2U]) = B;
+  *((v2h *)&pOut[i3_store * 2U]) = A;
 }