[software] Improve performance of axpy and dotp

pulp-platform · Sep 13, 2024 · a1bc514
1 parent 4abcd88
commit a1bc514
Show file tree

Hide file tree

Showing 11 changed files with 221 additions and 219 deletions.
diff --git a/software/apps/baremetal/axpy_f16/main.c b/software/apps/baremetal/axpy_f16/main.c
@@ -18,9 +18,8 @@
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
 
 // Vectors for kernel computation
-__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f16.h"
 #include "baremetal/mempool_checks.h"
@@ -35,25 +34,28 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t));
-    dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
   }
+  uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF;
   mempool_barrier(num_cores);
 
   //  // SINGLE
   //  time_init = mempool_get_timer();
-  //  axpy_f16s(l1_A, l1_B, l1_C, LEN);
+  //  axpy_f16s(A, l1_X, l1_Y, LEN);
   //  time_end = mempool_get_timer();
 
   //  // PARALLEL
   //  time_init = mempool_get_timer();
-  //  axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
+  //  axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores);
   //  time_end = mempool_get_timer();
 
   // PARALLEL, LOCAL ACCESSES
   time_init = mempool_get_timer();
-  axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  // axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  mempool_start_benchmark();
+  axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
+  mempool_stop_benchmark();
   time_end = mempool_get_timer();
 
   mempool_barrier(num_cores);
@@ -62,7 +64,7 @@ int main() {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;

diff --git a/software/apps/baremetal/axpy_f32/main.c b/software/apps/baremetal/axpy_f32/main.c
@@ -16,13 +16,10 @@
 
 #include "data_axpy_f32.h"
 #define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
-#define SINGLE_CORE_REDUCTION
-// #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 
 #include "baremetal/mempool_axpy_f32.h"
 #include "baremetal/mempool_checks.h"
@@ -37,25 +34,25 @@ int main() {
   time_init = 0;
   time_end = 0;
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
-    dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t));
+    dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t));
   }
+  float register volatile a = A;
   mempool_barrier(num_cores);
 
   // PARALLEL
   time_init = mempool_get_timer();
-  // axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores);
-  // axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
-  axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN);
+  // axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
+  // axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
+  axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
   time_end = mempool_get_timer();
 
   // Check results
   if (core_id == 0) {
     uint32_t clock_cycles = (time_end - time_init);
     printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
   }
-  mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0);
+  mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
   mempool_barrier(num_cores);
 
   return 0;

diff --git a/software/apps/baremetal/dotp_f16/main.c b/software/apps/baremetal/dotp_f16/main.c
@@ -19,8 +19,8 @@
 #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+__fp16 l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+__fp16 l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 __fp16 sum[2 * NUM_BANKS]

diff --git a/software/apps/baremetal/dotp_f32/main.c b/software/apps/baremetal/dotp_f32/main.c
@@ -20,8 +20,8 @@
 #define BINARY_REDUCTION
 
 // Vectors for kernel computation
-float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
-float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
+float l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
+float l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 uint32_t red_barrier[NUM_BANKS]
     __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
 float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

diff --git a/software/data/data_axpy_f16.h.tpl b/software/data/data_axpy_f16.h.tpl
@@ -17,10 +17,10 @@
 
 #define LEN (${Len})
 
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+__fp16 __attribute__((section(".l2"))) A = ${'(__fp16){:.4f}'.format(A)};
 
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};
 
-__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};
 
 __fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/data_axpy_f32.h.tpl b/software/data/data_axpy_f32.h.tpl
@@ -15,12 +15,13 @@
     return out
 %> \
 
+
 #define LEN (${Len})
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
+float __attribute__((section(".l2"))) A = ${'(float){:.8f}'.format(A)};
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};
 
-float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
+float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};
 
 float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
diff --git a/software/data/generate_dotp.py b/software/data/generate_dotp.py
@@ -26,39 +26,39 @@ def generate_dotp_i32(Len):
 def generate_dotp_f32(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float32)
-    B = np.random.rand(Len).astype(np.float32)
+    A = np.random.randn(Len).astype(np.float32)
+    B = np.random.randn(Len).astype(np.float32)
     C = (np.dot(A, B)).astype(np.float32)
     return A, B, C
 
 
 def generate_dotp_f16(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float16)
-    B = np.random.rand(Len).astype(np.float16)
+    A = np.random.randn(Len).astype(np.float16)
+    B = np.random.randn(Len).astype(np.float16)
     C = (np.dot(A, B)).astype(np.float16)
     return A, B, C
 
 
 def generate_axpy_f32(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float32)
-    B = np.random.rand(Len).astype(np.float32)
-    C = np.random.rand(Len).astype(np.float32)
-    out = C + A * B
-    return A, B, C, out
+    X = np.random.rand(Len).astype(np.float32)
+    Y = np.random.rand(Len).astype(np.float32)
+    A = np.float32(3.14)
+    out = Y + A * X
+    return A, X, Y, out
 
 
 def generate_axpy_f16(Len):
 
     # Create matrix
-    A = np.random.rand(Len).astype(np.float16)
-    B = np.random.rand(Len).astype(np.float16)
-    C = np.random.rand(Len).astype(np.float16)
-    out = C + A * B
-    return A, B, C, out
+    X = np.random.rand(Len).astype(np.float16)
+    Y = np.random.rand(Len).astype(np.float16)
+    A = np.float16(3.14)
+    out = Y + A * X
+    return A, X, Y, out
 
 ##################
 # compute_result #
@@ -130,24 +130,24 @@ def main():
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
-    A, B, C, out = generate_axpy_f32(Len)
+    A, X, Y, out = generate_axpy_f32(Len)
     tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl"
     kwargs = {
         'name': 'data_axpy_f32',
         'A': A,
-        'B': B,
-        'C': C,
+        'X': X,
+        'Y': Y,
         'out': out,
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)
 
-    A, B, C, out = generate_axpy_f16(Len)
+    A, X, Y, out = generate_axpy_f16(Len)
     tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl"
     kwargs = {
         'name': 'data_axpy_f16',
         'A': A,
-        'B': B,
-        'C': C,
+        'X': X,
+        'Y': Y,
         'out': out,
         'Len': Len}
     gen_data_header_file(args.outdir, tpl, **kwargs)