Skip to content

Commit

Permalink
[software] Improve performance of axpy and dotp
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Sep 13, 2024
1 parent 4abcd88 commit a1bc514
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 219 deletions.
22 changes: 12 additions & 10 deletions software/apps/baremetal/axpy_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,8 @@
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation
__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
__fp16 l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_axpy_f16.h"
#include "baremetal/mempool_checks.h"
Expand All @@ -35,25 +34,28 @@ int main() {
time_init = 0;
time_end = 0;
if (core_id == 0) {
dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int16_t));
dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int16_t));
dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int16_t));
dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int16_t));
dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int16_t));
}
uint32_t register volatile a = *(uint32_t *)&(A)&0x0000FFFF;
mempool_barrier(num_cores);

// // SINGLE
// time_init = mempool_get_timer();
// axpy_f16s(l1_A, l1_B, l1_C, LEN);
// axpy_f16s(A, l1_X, l1_Y, LEN);
// time_end = mempool_get_timer();

// // PARALLEL
// time_init = mempool_get_timer();
// axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
// axpy_f16vecp_unrolled4(A, l1_X, l1_Y, LEN, num_cores);
// time_end = mempool_get_timer();

// PARALLEL, LOCAL ACCESSES
time_init = mempool_get_timer();
axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN);
// axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
mempool_start_benchmark();
axpy_f16vecp_local_unrolled4(a, l1_X, l1_Y, LEN);
mempool_stop_benchmark();
time_end = mempool_get_timer();

mempool_barrier(num_cores);
Expand All @@ -62,7 +64,7 @@ int main() {
uint32_t clock_cycles = (time_end - time_init);
printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
}
mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0);
mempool_check_f16(l1_Y, l2_out, 100, 0.1f, 0);
mempool_barrier(num_cores);

return 0;
Expand Down
21 changes: 9 additions & 12 deletions software/apps/baremetal/axpy_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,10 @@

#include "data_axpy_f32.h"
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
#define SINGLE_CORE_REDUCTION
// #define BINARY_REDUCTION

// Vectors for kernel computation
float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_X[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
float l1_Y[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));

#include "baremetal/mempool_axpy_f32.h"
#include "baremetal/mempool_checks.h"
Expand All @@ -37,25 +34,25 @@ int main() {
time_init = 0;
time_end = 0;
if (core_id == 0) {
dma_memcpy_blocking(l1_A, l2_A, LEN * sizeof(int32_t));
dma_memcpy_blocking(l1_B, l2_B, LEN * sizeof(int32_t));
dma_memcpy_blocking(l1_C, l2_C, LEN * sizeof(int32_t));
dma_memcpy_blocking(l1_X, l2_X, LEN * sizeof(int32_t));
dma_memcpy_blocking(l1_Y, l2_Y, LEN * sizeof(int32_t));
}
float register volatile a = A;
mempool_barrier(num_cores);

// PARALLEL
time_init = mempool_get_timer();
// axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores);
// axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN);
// axpy_f32p(a, l1_X, l1_Y, LEN, num_cores);
// axpy_f32p_unrolled4(a, l1_X, l1_Y, LEN, num_cores);
axpy_f32p_local_unrolled4(a, l1_X, l1_Y, LEN);
time_end = mempool_get_timer();

// Check results
if (core_id == 0) {
uint32_t clock_cycles = (time_end - time_init);
printf("\nKernel execution takes %d clock cycles\n", clock_cycles);
}
mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0);
mempool_check_f32(l1_Y, l2_out, 100, 0.1f, 0);
mempool_barrier(num_cores);

return 0;
Expand Down
4 changes: 2 additions & 2 deletions software/apps/baremetal/dotp_f16/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
#define BINARY_REDUCTION

// Vectors for kernel computation
__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
__fp16 l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
uint32_t red_barrier[NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
__fp16 sum[2 * NUM_BANKS]
Expand Down
4 changes: 2 additions & 2 deletions software/apps/baremetal/dotp_f32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#define BINARY_REDUCTION

// Vectors for kernel computation
float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_A[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
float l1_B[LEN] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
uint32_t red_barrier[NUM_BANKS]
__attribute__((aligned(NUM_BANKS), section(".l1_prio")));
float sum[NUM_BANKS] __attribute__((aligned(NUM_BANKS), section(".l1_prio")));
Expand Down
6 changes: 3 additions & 3 deletions software/data/data_axpy_f16.h.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@

#define LEN (${Len})

__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
__fp16 __attribute__((section(".l2"))) A = ${'(__fp16){:.4f}'.format(A)};

__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};

__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};

__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
7 changes: 4 additions & 3 deletions software/data/data_axpy_f32.h.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@
return out
%> \


#define LEN (${Len})

float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};
float __attribute__((section(".l2"))) A = ${'(float){:.8f}'.format(A)};

float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_X[${Len}] = ${array_to_cstr(X)};

float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_Y[${Len}] = ${array_to_cstr(Y)};

float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
40 changes: 20 additions & 20 deletions software/data/generate_dotp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,39 +26,39 @@ def generate_dotp_i32(Len):
def generate_dotp_f32(Len):
# Build float32 test data for a dot product: two 1-D inputs of length `Len`
# and their reference result.
# Returns the tuple (A, B, C), where C is np.dot(A, B) cast to float32.

# Create the input vectors (1-D arrays of length `Len`, not matrices).
# NOTE(review): A and B are each assigned twice below. This looks like the
# removed (`rand`, uniform on [0, 1)) and added (`randn`, standard normal)
# sides of a diff shown together -- confirm that only the `randn` pair
# exists in the real file. As written, the `randn` values are the ones that
# reach np.dot and the return statement.
A = np.random.rand(Len).astype(np.float32)
B = np.random.rand(Len).astype(np.float32)
A = np.random.randn(Len).astype(np.float32)
B = np.random.randn(Len).astype(np.float32)
# Reference result: inner product of the two vectors, stored as float32.
C = (np.dot(A, B)).astype(np.float32)
return A, B, C


def generate_dotp_f16(Len):
# Build float16 test data for a dot product: two 1-D inputs of length `Len`
# and their reference result.
# Returns the tuple (A, B, C), where C is np.dot(A, B) cast to float16.

# Create the input vectors (1-D arrays of length `Len`, not matrices).
# NOTE(review): A and B are each assigned twice below. This looks like the
# removed (`rand`, uniform on [0, 1)) and added (`randn`, standard normal)
# sides of a diff shown together -- confirm that only the `randn` pair
# exists in the real file. As written, the `randn` values are the ones that
# reach np.dot and the return statement.
A = np.random.rand(Len).astype(np.float16)
B = np.random.rand(Len).astype(np.float16)
A = np.random.randn(Len).astype(np.float16)
B = np.random.randn(Len).astype(np.float16)
# Reference result: inner product of the two float16 vectors, stored as
# float16.
C = (np.dot(A, B)).astype(np.float16)
return A, B, C


def generate_axpy_f32(Len):
# Build float32 test data for an AXPY-style kernel, together with the
# expected output `out`.

# Create the operands (1-D arrays of length `Len`, not matrices).
# NOTE(review): two complete variants appear back to back below, which looks
# like the removed and added sides of a diff shown together.
#   - First variant: A, B and C are all random length-`Len` vectors and
#     `out = C + A * B` is an elementwise multiply-add.
#   - Second variant: X and Y are random length-`Len` vectors, A is the
#     scalar float32 constant 3.14, and `out = Y + A * X`.
# As written, only the first `return` can execute and everything after it is
# unreachable -- confirm that the real file keeps only the second variant.
A = np.random.rand(Len).astype(np.float32)
B = np.random.rand(Len).astype(np.float32)
C = np.random.rand(Len).astype(np.float32)
out = C + A * B
return A, B, C, out
X = np.random.rand(Len).astype(np.float32)
Y = np.random.rand(Len).astype(np.float32)
A = np.float32(3.14)
out = Y + A * X
return A, X, Y, out


def generate_axpy_f16(Len):
# Build float16 test data for an AXPY-style kernel, together with the
# expected output `out`.

# Create the operands (1-D arrays of length `Len`, not matrices).
# NOTE(review): two complete variants appear back to back below, which looks
# like the removed and added sides of a diff shown together.
#   - First variant: A, B and C are all random length-`Len` vectors and
#     `out = C + A * B` is an elementwise multiply-add.
#   - Second variant: X and Y are random length-`Len` vectors, A is the
#     scalar float16 constant 3.14, and `out = Y + A * X`.
# As written, only the first `return` can execute and everything after it is
# unreachable -- confirm that the real file keeps only the second variant.
A = np.random.rand(Len).astype(np.float16)
B = np.random.rand(Len).astype(np.float16)
C = np.random.rand(Len).astype(np.float16)
out = C + A * B
return A, B, C, out
X = np.random.rand(Len).astype(np.float16)
Y = np.random.rand(Len).astype(np.float16)
A = np.float16(3.14)
out = Y + A * X
return A, X, Y, out

##################
# compute_result #
Expand Down Expand Up @@ -130,24 +130,24 @@ def main():
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)

A, B, C, out = generate_axpy_f32(Len)
A, X, Y, out = generate_axpy_f32(Len)
tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl"
kwargs = {
'name': 'data_axpy_f32',
'A': A,
'B': B,
'C': C,
'X': X,
'Y': Y,
'out': out,
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)

A, B, C, out = generate_axpy_f16(Len)
A, X, Y, out = generate_axpy_f16(Len)
tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl"
kwargs = {
'name': 'data_axpy_f16',
'A': A,
'B': B,
'C': C,
'X': X,
'Y': Y,
'out': out,
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)
Expand Down
Loading

0 comments on commit a1bc514

Please sign in to comment.