Skip to content

Commit

Permalink
[software] Add f32 and f16 axpy app
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertuletti committed Sep 2, 2024
1 parent d7b065c commit e615cf1
Show file tree
Hide file tree
Showing 10 changed files with 528 additions and 4 deletions.
14 changes: 12 additions & 2 deletions software/apps/baremetal/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,18 @@ ALLPYS := $(patsubst %.py,%.h,$(wildcard $(DATA_DIR)/*.py))
BINARIES := $(addprefix $(BIN_DIR)/,$(APPS))
ALL := $(APPS)

ALL_GCC := $(filter-out cfft_radix4_f16 chest_f16 cholesky_f16 cmatmul_f16 matmul_f16 matmul_f32 mimo_mmse_f32 mimo_mmse_f16 ofdm, $(ALL))
ALL_LLVM := $(filter-out synth_i32 cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32 cmatmul_q16 mimo_mmse_q16, $(ALL))
# Applications that are removed from the GCC application list (ALL_GCC) below.
# NOTE(review): judging by the _f16/_f32 suffixes these are the floating-point
# apps; `ofdm` carries no suffix -- confirm it belongs in this list.
FP_APPS := axpy_f16 axpy_f32
FP_APPS += cfft_radix4_f16 chest_f16 cholesky_f16
FP_APPS += cmatmul_f16 matmul_f16 matmul_f32
FP_APPS += dotp_f16 dotp_f32
FP_APPS += mimo_mmse_f32 mimo_mmse_f16 ofdm

# Applications that are removed from the LLVM application list (ALL_LLVM) below.
# NOTE(review): presumably the integer / fixed-point (_i32, _q16, _q32) apps.
I_APPS := synth_i32
I_APPS += cfft_radix2_q16 cfft_radix4_q16 chest_q16 cholesky_q16 cholesky_q32
I_APPS += cmatmul_q16 mimo_mmse_q16

# Per-toolchain application lists: everything in ALL except the apps listed
# above for that toolchain. The `all` target depends on ALL_GCC.
ALL_GCC := $(filter-out $(FP_APPS), $(ALL))
ALL_LLVM := $(filter-out $(I_APPS), $(ALL))

# Make all applications
all: $(ALL_GCC)
Expand Down
69 changes: 69 additions & 0 deletions software/apps/baremetal/axpy_f16/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"

#include "data_axpy_f16.h"
// Not referenced in this file. NOTE(review): presumably consumed by the kernel
// header included further down -- confirm.
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)

// Vectors for kernel computation.
// Each holds LEN __fp16 elements and is placed in the ".l1_prio" section.
// main() fills them from l2_A / l2_B / l2_C with DMA copies before the kernel
// runs, and l1_C is the vector handed to the result check afterwards.
// NOTE(review): aligned(LEN) reuses the element count as the alignment value,
// so LEN has to be a power of two -- confirm this is intended.
__fp16 l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
__fp16 l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));

#include "baremetal/mempool_axpy_f16.h"
#include "baremetal/mempool_checks.h"

/**
 * Benchmark driver for the half-precision AXPY kernel.
 *
 * The function branches on the core id and synchronises through
 * mempool_barrier(num_cores): core 0 loads the operands and prints the timing,
 * while the kernel and the result check are called unconditionally.
 *
 * @return always 0.
 */
int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  uint32_t time_init = 0;
  uint32_t time_end = 0;
  mempool_barrier_init(core_id);

  // Only core 0 issues the DMA transfers from the L2 images into the L1
  // vectors. The byte count comes from the destination arrays themselves, so it
  // cannot drift from their element type.
  if (core_id == 0) {
    dma_memcpy_blocking(l1_A, l2_A, sizeof(l1_A));
    dma_memcpy_blocking(l1_B, l2_B, sizeof(l1_B));
    dma_memcpy_blocking(l1_C, l2_C, sizeof(l1_C));
  }
  // Nobody starts computing before the operands are in place.
  mempool_barrier(num_cores);

  // Alternative kernel variants, kept for manual selection:
  // // SINGLE
  // time_init = mempool_get_timer();
  // axpy_f16s(l1_A, l1_B, l1_C, LEN);
  // time_end = mempool_get_timer();

  // // PARALLEL
  // time_init = mempool_get_timer();
  // axpy_f16vecp_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
  // time_end = mempool_get_timer();

  // PARALLEL, LOCAL ACCESSES
  time_init = mempool_get_timer();
  axpy_f16vecp_local_unrolled4(l1_A, l1_B, l1_C, LEN);
  time_end = mempool_get_timer();

  // Wait for every core to finish before the result is inspected.
  mempool_barrier(num_cores);

  // Check results
  if (core_id == 0) {
    uint32_t clock_cycles = (time_end - time_init);
    // %d expects an int; the cast makes the argument match the specifier.
    printf("\nKernel execution takes %d clock cycles\n", (int)clock_cycles);
  }
  // Compare the computed vector against the reference l2_out.
  // NOTE(review): 100 / 0.1f / 0 are presumably element count, tolerance and
  // a verbosity flag -- confirm against mempool_checks.h.
  mempool_check_f16(l1_C, l2_out, 100, 0.1f, 0);
  mempool_barrier(num_cores);

  return 0;
}
62 changes: 62 additions & 0 deletions software/apps/baremetal/axpy_f32/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
#include "synchronization.h"

#include "data_axpy_f32.h"
// None of the three macros below is referenced in this file.
// NOTE(review): NUM_BANKS is presumably consumed by the kernel header included
// further down; the *_REDUCTION switches look like leftovers from a reduction
// kernel and may be unused by AXPY -- confirm before removing.
#define NUM_BANKS (NUM_CORES * BANKING_FACTOR)
#define SINGLE_CORE_REDUCTION
// #define BINARY_REDUCTION

// Vectors for kernel computation.
// Each holds LEN float elements and is placed in the ".l1_prio" section.
// main() fills them from l2_A / l2_B / l2_C with DMA copies before the kernel
// runs, and l1_C is the vector handed to the result check afterwards.
// NOTE(review): aligned(LEN) reuses the element count as the alignment value,
// so LEN has to be a power of two -- confirm this is intended.
float l1_A[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_B[LEN] __attribute__((aligned(LEN), section(".l1_prio")));
float l1_C[LEN] __attribute__((aligned(LEN), section(".l1_prio")));

#include "baremetal/mempool_axpy_f32.h"
#include "baremetal/mempool_checks.h"

/**
 * Benchmark driver for the single-precision AXPY kernel.
 *
 * The function branches on the core id and synchronises through
 * mempool_barrier(num_cores): core 0 loads the operands and prints the timing,
 * while the kernel and the result check are called unconditionally.
 *
 * @return always 0.
 */
int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  uint32_t time_init = 0;
  uint32_t time_end = 0;
  mempool_barrier_init(core_id);

  // Only core 0 issues the DMA transfers from the L2 images into the L1
  // vectors. The byte count comes from the destination arrays themselves, so it
  // cannot drift from their element type.
  if (core_id == 0) {
    dma_memcpy_blocking(l1_A, l2_A, sizeof(l1_A));
    dma_memcpy_blocking(l1_B, l2_B, sizeof(l1_B));
    dma_memcpy_blocking(l1_C, l2_C, sizeof(l1_C));
  }
  // Nobody starts computing before the operands are in place.
  mempool_barrier(num_cores);

  // PARALLEL
  time_init = mempool_get_timer();
  // axpy_f32p(l1_A, l1_B, l1_C, LEN, num_cores);
  // axpy_f32p_unrolled4(l1_A, l1_B, l1_C, LEN, num_cores);
  axpy_f32p_local_unrolled4(l1_A, l1_B, l1_C, LEN);
  time_end = mempool_get_timer();

  // Wait for every core to finish before the result is inspected. The timer
  // has already been sampled, so this does not affect the reported cycles.
  mempool_barrier(num_cores);

  // Check results
  if (core_id == 0) {
    uint32_t clock_cycles = (time_end - time_init);
    // %d expects an int; the cast makes the argument match the specifier.
    printf("\nKernel execution takes %d clock cycles\n", (int)clock_cycles);
  }
  // Compare the computed vector against the reference l2_out.
  // NOTE(review): 100 / 0.1f / 0 are presumably element count, tolerance and
  // a verbosity flag -- confirm against mempool_checks.h.
  mempool_check_f32(l1_C, l2_out, 100, 0.1f, 0);
  mempool_barrier(num_cores);

  return 0;
}
2 changes: 1 addition & 1 deletion software/apps/baremetal/axpy_i32/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <stdint.h>
#include <string.h>

#include "baremetal/mempool_axpy_i32p.h"
#include "baremetal/mempool_axpy_i32.h"
#include "encoding.h"
#include "printf.h"
#include "runtime.h"
Expand Down
26 changes: 26 additions & 0 deletions software/data/data_axpy_f16.h.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
\
<% def array_to_cstr(array):
    # Render `array` as a C brace-enclosed initializer: a newline after the
    # opening brace, eight values per line, every value written as an __fp16
    # cast of a literal with four decimals.
    # The pieces are joined instead of appending ", " and trimming two
    # characters afterwards, so the text is well-formed for every length: no
    # dangling comma before the closing brace when the length is a multiple of
    # eight, no trailing blanks, and matching braces for an empty array.
    values = ['(__fp16){:.4f}'.format(a) for a in array]
    rows = [', '.join(values[k:k + 8]) for k in range(0, len(values), 8)]
    return '{\n' + ',\n'.join(rows) + '}'
%> \

// Number of elements in each of the vectors below (template variable Len).
#define LEN (${Len})

// The four arrays are placed in the ".l2" section, aligned to
// sizeof(int32_t) bytes, and initialised from the template variables
// A, B, C and out through array_to_cstr().
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};

__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};

__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};

// Reference output; the axpy_f16 application passes it to mempool_check_f16
// together with the computed vector.
__fp16 __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
26 changes: 26 additions & 0 deletions software/data/data_axpy_f32.h.tpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright 2022 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
\
<% def array_to_cstr(array):
    # Render `array` as a C brace-enclosed initializer: a newline after the
    # opening brace, eight values per line, every value written with its
    # default string formatting followed by the `f` float-literal suffix.
    # The pieces are joined instead of appending ", " and trimming two
    # characters afterwards, so the text is well-formed for every length: no
    # dangling comma before the closing brace when the length is a multiple of
    # eight, no trailing blanks, and matching braces for an empty array.
    values = ['{}f'.format(a) for a in array]
    rows = [', '.join(values[k:k + 8]) for k in range(0, len(values), 8)]
    return '{\n' + ',\n'.join(rows) + '}'
%> \

// Number of elements in each of the vectors below (template variable Len).
#define LEN (${Len})

// The four arrays are placed in the ".l2" section, aligned to
// sizeof(int32_t) bytes, and initialised from the template variables
// A, B, C and out through array_to_cstr().
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_A[${Len}] = ${array_to_cstr(A)};

float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_B[${Len}] = ${array_to_cstr(B)};

float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_C[${Len}] = ${array_to_cstr(C)};

// Reference output; the axpy_f32 application passes it to mempool_check_f32
// together with the computed vector.
float __attribute__((aligned(sizeof(int32_t)), section(".l2"))) l2_out[${Len}] = ${array_to_cstr(out)};
44 changes: 43 additions & 1 deletion software/data/generate_dotp.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,26 @@ def generate_dotp_f16(Len):
C = (np.dot(A, B)).astype(np.float16)
return A, B, C


def generate_axpy_f32(Len):
    """Draw random single-precision test data for the AXPY application.

    Returns the tuple (A, B, C, out): A, B and C are vectors of Len samples
    from np.random.rand cast to float32, and out is the element-wise
    C + A * B.
    """
    # The generator is consumed left to right, so the random draws happen in
    # the order A, B, C.
    A, B, C = (np.random.rand(Len).astype(np.float32) for _ in range(3))
    out = np.add(C, np.multiply(A, B))
    return A, B, C, out


def generate_axpy_f16(Len):
    """Draw random half-precision test data for the AXPY application.

    Returns the tuple (A, B, C, out): A, B and C are vectors of Len samples
    from np.random.rand cast to float16, and out is the element-wise
    C + A * B.
    """
    # The generator is consumed left to right, so the random draws happen in
    # the order A, B, C.
    A, B, C = (np.random.rand(Len).astype(np.float16) for _ in range(3))
    out = np.add(C, np.multiply(A, B))
    return A, B, C, out

##################
# compute_result #
##################
Expand Down Expand Up @@ -73,7 +93,7 @@ def main():
"--length",
type=int,
required=False,
default=4096,
default=1024,
help='First dimension.'
)

Expand Down Expand Up @@ -110,6 +130,28 @@ def main():
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)

A, B, C, out = generate_axpy_f32(Len)
tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f32.h.tpl"
kwargs = {
'name': 'data_axpy_f32',
'A': A,
'B': B,
'C': C,
'out': out,
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)

A, B, C, out = generate_axpy_f16(Len)
tpl = pathlib.Path(__file__).parent.absolute() / "data_axpy_f16.h.tpl"
kwargs = {
'name': 'data_axpy_f16',
'A': A,
'B': B,
'C': C,
'out': out,
'Len': Len}
gen_data_header_file(args.outdir, tpl, **kwargs)


if __name__ == "__main__":
main()
Loading

0 comments on commit e615cf1

Please sign in to comment.